In [2]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('GuangzhouPM_Explore').getOrCreate()

In [3]:
# Let's read in the data. Note that it's in the format of csv.
GuangzhouPM = spark.read.csv('i4_Dataset/GuangzhouPM20100101_20151231.csv', inferSchema=True, header=True)

In [4]:
# Data Exploration
# Visualise DataFrames by using show method to see how many columns in the dataset
GuangzhouPM.show()

+---+----+-----+---+----+------+---------------+--------------------+----------+----+----+------+----+----+---+-------------+-----+
| No|year|month|day|hour|season|PM_City Station|PM_5th Middle School|PM_US Post|DEWP|HUMI|  PRES|TEMP|cbwd|Iws|precipitation|Iprec|
+---+----+-----+---+----+------+---------------+--------------------+----------+----+----+------+----+----+---+-------------+-----+
|  1|2010|    1|  1|   0|     4|             NA|                  NA|        NA| 9.4|  76|1015.1|13.5|  NW|0.8|            0|    0|
|  2|2010|    1|  1|   1|     4|             NA|                  NA|        NA|10.2|  83|1015.2|  13|  cv|0.5|            0|    0|
|  3|2010|    1|  1|   2|     4|             NA|                  NA|        NA|10.4|  87|  1015|12.5|  NW|0.6|          0.3|  0.3|
|  4|2010|    1|  1|   3|     4|             NA|                  NA|        NA|10.2|  89|1014.9|  12|  NW|1.4|          0.6|  0.9|
|  5|2010|    1|  1|   4|     4|             NA|                  NA|       

In [5]:
# Show general statistics on the data. 
GuangzhouPM.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+-----------------+------------------+-----------------+-----------------+------------------+-----+------------------+-------------------+------------------+
|summary|               No|              year|            month|              day|             hour|            season|  PM_City Station|PM_5th Middle School|       PM_US Post|              DEWP|             HUMI|             PRES|              TEMP| cbwd|               Iws|      precipitation|             Iprec|
+-------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+-----------------+------------------+-----------------+-----------------+------------------+-----+------------------+-------------------+------------------+
|  count|            52584|             52584|         

In [6]:
# Show the data type for each variable using print schema. 
GuangzhouPM.printSchema()

root
 |-- No: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- PM_City Station: string (nullable = true)
 |-- PM_5th Middle School: string (nullable = true)
 |-- PM_US Post: string (nullable = true)
 |-- DEWP: string (nullable = true)
 |-- HUMI: string (nullable = true)
 |-- PRES: string (nullable = true)
 |-- TEMP: string (nullable = true)
 |-- cbwd: string (nullable = true)
 |-- Iws: string (nullable = true)
 |-- precipitation: string (nullable = true)
 |-- Iprec: string (nullable = true)



In [7]:
GuangzhouPM.select('PM_City Station','season').show()

+---------------+------+
|PM_City Station|season|
+---------------+------+
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
|             NA|     4|
+---------------+------+
only showing top 20 rows



In [8]:
GuangzhouPM.createOrReplaceTempView('GuangzhouPM')
results = spark.sql("SELECT season FROM GuangzhouPM")
results.show()

+------+
|season|
+------+
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
+------+
only showing top 20 rows



In [9]:
results.show()

+------+
|season|
+------+
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
|     4|
+------+
only showing top 20 rows



In [10]:
GuangzhouPM.columns

['No',
 'year',
 'month',
 'day',
 'hour',
 'season',
 'PM_City Station',
 'PM_5th Middle School',
 'PM_US Post',
 'DEWP',
 'HUMI',
 'PRES',
 'TEMP',
 'cbwd',
 'Iws',
 'precipitation',
 'Iprec']

In [11]:
from pyspark.sql.types import (StructField,StringType,IntegerType,StructType)

In [12]:
data_schema = [StructField('PM_City Station',IntegerType(),True)]

final_struct = StructType(fields=data_schema)

In [13]:
df = spark.read.csv('i4_Dataset/GuangzhouPM20100101_20151231.csv', schema=final_struct)

df.printSchema()

root
 |-- PM_City Station: integer (nullable = true)



In [14]:
import matplotlib.pyplot as plt

In [15]:
import numpy as np 