In [1]:
# Importing required libraries.
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
import seaborn as sns #visualisation
%matplotlib inline 
sns.set(color_codes=True)
import plotly.express as px
import datetime as dt

from os import listdir
from os.path import isfile, join
import glob

#conda install -c conda-forge ipyleaflet
from ipyleaflet import Map, basemaps, basemap_to_tiles, Marker, Heatmap, WidgetControl, FullScreenControl
from bqplot import Lines, Figure, LinearScale, DateScale, Axis, Tooltip
from ipywidgets import Dropdown, IntSlider, ToggleButton

import aqi
import calendar

from Cleaner_Class import *
from Derive_Class import *

In [2]:
#create a cleaner object
# contains methods to clean data
cleaner = Cleaner()

# create a derive object
# Contains methods to derive data
deriver = Derive()


### Run the clean functions and the derived data functions


In [3]:
#actually laod all the data, clean it, and put it into a dataframe
# looks in the data directory, any csv files get put into one dataframe
df_all = cleaner.load_csv_data("data")

#Runs cleaner methods
#Removes duplicates
#Removes nulls
cleaner.clean_data(df_all)

#Group the data by date and station name
#Selects the max value for each attribute for each day
df_group = deriver.group_data_by_day(df_all)
#date information is originally stored in seperate fields needs to be combined and put into
#two fields, one as a string and one as a datetime. 
#different plotting libraries use different date formats
deriver.create_date_from_string(df_group, 'month', 'day', 'year')
#calculte the aqi for PM10 and PM2.5.
#If the resulting value of aqi is greater then 500 the aqi function throws an index error
#we capture the error and assing a value for those rows of 501
deriver.calc_aqis(df_group)
#the data needs to be sorted for plotting
df = df_group.sort_values(by = 'date')
#test what we have, make sure fields are there and are in the correct format.
df.info()

**************************
Removing Duplicates
**************************
number of duplicate rows:  0
**************************
Removing Nulls
**************************
- - - - - - - - - - - - - 
Before removing nulls
- - - - - - - - - - - - - 
No              0
year            0
month           0
day             0
hour            0
PM2.5        8739
PM10         6449
SO2          9021
NO2         12116
CO          20701
O3          13277
TEMP          398
PRES          393
DEWP          403
RAIN          390
wd           1822
WSPM          318
station         0
lat        210384
long       210384
dtype: int64
- - - - - - - - - - - - - 
After removing nulls
- - - - - - - - - - - - - 
No         382168
year       382168
month      382168
day        382168
hour       382168
PM2.5      382168
PM10       382168
SO2        382168
NO2        382168
CO         382168
O3         382168
TEMP       382168
PRES       382168
DEWP       382168
RAIN       382168
wd         382168
WSPM       38216

In [None]:
df.to_csv('cleaned.csv')

# Start here if you have already prepped the data and saved it

In [None]:
parse_dates = ['date']
df = pd.read_csv('cleaned.csv',parse_dates=parse_dates)


In [None]:
df.info()

In [None]:
#quick list of all unique station names
station_names = df_all.station.unique()
station_names

In [None]:
#Select just one station to examine
#df = df_all[df_all['station'].isin(['Aotizhongxin','Changping','Dingling'])]
df_A = df[df['station'].isin(['Aotizhongxin'])]
print('length of A : {}'.format(len(df_A.index)))
df_C = df[df['station'].isin(['Changping'])]
print('length of C : {}'.format(len(df_C.index)))
df_D = df[df['station'].isin(['Dingling'])]
print('length of D : {}'.format(len(df_D.index)))    

# Some exploration

lat longs:

- Aotizhongxin :  (41.741127, 123.462775)
- Changping :  (40.220585, 116.228038)
- Dingling :  (40.289968, 116.237352)
- Dongsi :  (39.929855, 116.421619)
- Guanyuan :  (39.932482, 116.355741)
- Gucheng :  (39.907599, 116.190328)
- Huairou :  (40.605853, 116.622746)
- Nongzhanguan :  (39.945631, 116.475666)
- Shunyi :  (40.152315, 116.714525)
- Tiantan :  (39.888430, 116.409856)
- Wanliu :  (39.977951, 116.292273)
- Wanghouxigong :  (39.879796, 116.368245)


In [None]:
df_numeric = df[['PM2.5','PM10','SO2','NO2','CO','O3','TEMP','PRES','DEWP','RAIN']]
sns.boxplot(x="variable", y="value", data=pd.melt(df_numeric))

In [None]:
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3-Q1
print(IQR)

In [None]:
# Plotting a Histogram
df.CO.value_counts().nlargest(40).plot(kind='bar', figsize=(10,5))
plt.title('Number of days by CO')
plt.ylabel('Number of days')
plt.xlabel('CO');

In [None]:
# Plotting a Histogram
df.PM10.value_counts().nlargest(40).plot(kind='bar', figsize=(10,5))
plt.title('Number of days by PM10')
plt.ylabel('Number of days')
plt.xlabel('PM10');

In [None]:
# Plotting a Histogram
df['PM2.5'].value_counts().nlargest(40).plot(kind='bar', figsize=(10,5))
plt.title('Number of days by PM2.5')
plt.ylabel('Number of days')
plt.xlabel('PM2.5');

In [None]:
# Timeseries

fig = px.line(df_A, x='date_string', y='PM10')
# fig.add_scatter(x=df_all['date'], y=df_all['PM10'], mode='lines')
# fig.add_scatter(x=df_all['date'], y=df_all['SO2'], mode='lines')
#fig.add_scatter(x=df_all['date'], y=df_all['O3'], mode='lines')         

fig.show()


In [None]:
df.PM10.value_counts().nlargest(40).plot(kind='bar', figsize=(10,5))
plt.title('Number of days by PM10')
plt.ylabel('Number of days')
plt.xlabel('PM10');

In [None]:
# Finding the relations between the variables.
plt.figure(figsize=(20,10))
c = df.corr()
sns.heatmap(c,cmap='BrBG',annot=True)
c

In [None]:
# Plotting a scatter plot
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['NO2'], df['WSPM'])
ax.set_xlabel('NO2')
ax.set_ylabel('WSPM')
plt.show()

# Map Stuff

In [4]:
#some globals...not exactly the best way to do this buuut..
# global global_station_name
# global global_year
# global global_month
# global global_data_name

global_station_name = 'Aotizhongxin'
global_year = 2015
global_month = 1
global_data_name = 'CO'


y_data = df[(df['month'] == global_month) & (df['station'] == global_station_name) & (df['year'] == global_year)][global_data_name].values
x_data = df[(df['month'] == global_month) & (df['station'] == global_station_name) & (df['year'] == global_year)]['date'].values

y_scale = LinearScale(min=0, max=df[global_data_name].max())

date_start = dt.datetime(global_year, global_month, 1)
date_end = dt.datetime(global_year, global_month, 31)

date_scale = DateScale(min=date_start, max=date_end)



lines = Lines(x=x_data, y=y_data ,scales={'x': date_scale, 'y': y_scale})
label = calendar.month_name[global_month] + ' - ' + str(global_year)


ax_x = Axis(label=label, scale=date_scale, num_ticks=10, tick_format='%d')
ax_y = Axis(label=global_data_name.capitalize(), scale=y_scale, orientation='vertical', side='left')

# figure = Figure(axes=[ax_x, ax_y], title=global_station_name, marks=[lines], animation_duration=1000,
#                  layout={'max_height': '300px', 'max_width': '500px'})


figure = Figure(axes=[ax_x, ax_y], title=global_station_name, marks=[lines], animation_duration=1000,
                 layout={'width':'600px', 'height':'400px'})




In [None]:
# test_df = df[(df['month'] == 1) & (df['station'] == 'Aotizhongxin') & (df['year'] == 2015)]
# test_df = df[(df['station'] == 'Aotizhongxin')]
test_df = df[(df['station'] == 'Aotizhongxin') & (df['year'] == 2015)]
#test_df

In [5]:
def get_on_hover(marker):  
    def callback(*args, **kwargs): 
        global global_station_name
        global global_year        
        global global_month
        global global_data_name
        global_station_name = marker.name        
        update_figure(global_station_name, global_data_name, global_year, global_month)
    return callback




In [6]:
def update_figure(station_name, data_name, year, month):

    if(year == 999):
        y_data = df[(df['station'] == station_name)][data_name].values
        x_data = df[df['station'] == station_name]['date'].values
        
        year_start = 2013
        year_end = 2016
        month_start = 1
        month_end = 12
        
        date_start = dt.datetime(2013, 1, 1)
        date_end = dt.datetime(2016, 12, 31)
        
        ax_x.label = "2013 to 2016"
        ax_x.tick_format = '%y'
    
    if(month == 999 and year != 999):
        y_data = df[(df['station'] == station_name) & (df['year'] == year)][data_name].values
        x_data = df[(df['station'] == station_name) & (df['year'] == year)]['date'].values

        date_start = dt.datetime(year, 1, 1)
        date_end = dt.datetime(year, 12, 31)
        
        ax_x.label = str(year)
        ax_x.tick_format = '%m'

        
    if ((month != 999) and (year != 999) ):
        y_data = df[(df['month'] == month) & (df['station'] == station_name) & (df['year'] == year)][data_name].values
        x_data = df[(df['month'] == month) & (df['station'] == station_name) & (df['year'] == year)]['date'].values

        date_start = dt.datetime(year, month, 1)
        date_end = dt.datetime(year, month, calendar.monthrange(year, month)[1])
        
        ax_x.label = calendar.month_name[month] + " - " + str(year)
        ax_x.tick_format = '%d'

           
    lines.y = y_data
    lines.x = x_data
    
    ax_y.label = data_name.capitalize()
    figure.title = station_name  
    

    date_scale = DateScale(min=date_start, max=date_end)
    ax_x.scale = date_scale
    
    x_scale = LinearScale(min=0, max=df[global_data_name].max())
    ax_y.scale = x_scale
    
    
    lines.scales={'x': date_scale, 'y': x_scale}
    
   

In [7]:
m = Map(center=(39.987916, 116.383936), zoom=9)

stations = {
    'Aotizhongxin' : (39.987916, 116.383936),
    'Changping' : (40.220585, 116.228038),
    'Dingling' : (40.289968, 116.237352),
    'Dongsi' : (39.929855, 116.421619),
    'Guanyuan' : (39.932482, 116.355741),
    'Gucheng' : (39.907599, 116.190328),
    'Huairou' : (40.321012, 116.630901),
    'Nongzhanguan' : (39.945631, 116.475666),
    'Shunyi' : (40.136771, 116.656268),  
    'Tiantan' : (39.888430, 116.409856),
    'Wanliu' : (39.977951, 116.292273),
    'Wanshouxigong' : (39.879796, 116.368245) 
}

for station in stations.items():
    marker = Marker(location=station[1], draggable=False, title=station[0], name=station[0])
    marker.on_mouseover(get_on_hover(marker))    
    m.add_layer(marker);    
    #m.add_layer(Marker(location=station[1], draggable=False));

# marker = Marker(location=center, draggable=False)
# m.add_layer(marker);

map_layer = basemap_to_tiles(basemaps.CartoDB.Positron)
m.add_layer(map_layer)
m.add_control(FullScreenControl())
m


Map(center=[39.987916, 116.383936], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title'…

True
False
True
False


In [8]:
widget_control1 = WidgetControl(widget=figure, position='bottomright')
m.add_control(widget_control1)

In [9]:
dropdown = Dropdown(
    options=['CO', 'PM10', 'PM2.5', 'aqi_PM10', 'aqi_PM2.5', 'SO2','NO2','O3','TEMP','PRES','DEWP','RAIN'],
    value=global_data_name,
    description='Measurement:'
)



def data_on_click(change):
    global global_data_name
    global_data_name = change['new']
    update_figure(global_station_name, global_data_name, global_year, global_month)

    
dropdown.observe(data_on_click, 'value')

widget_control2 = WidgetControl(widget=dropdown, position='bottomleft')

m.add_control(widget_control2)



In [10]:
# dropdown_year = Dropdown(
#     options=[2013, 2014, 2015, 2016,'All Years'],
#     value=global_year,
#     description='Year:'
# )

def year_on_click(change):
    global global_year
    global_year = change['new']
    if global_year == 'All Years':
        global_year = 999
        global_data_month = 999
    update_figure(global_station_name, global_data_name, global_year, global_month)

# dropdown_year.observe(year_on_click, 'value')

# widget_control_year = WidgetControl(widget=dropdown_year, position='bottomleft')

# m.add_control(widget_control_year)


In [11]:
slider_year = IntSlider(
    value=global_year,
    min=2013,
    max=2016,
    step=1,
    description='Year:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

slider_year.observe(year_on_click, 'value')

widget_control_slider_year = WidgetControl(widget=slider_year, position='bottomleft')

m.add_control(widget_control_slider_year)

In [12]:
toggle_year = ToggleButton(
    value=False,
    description='Toggle Years',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Description',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)

def toggle_year_on_click(change):
    global global_year        
    
    print(change['new'])
    if(change['new']):
        slider_year.disabled = True
        slider_month.disabled = True 
        toggle_month.disabled = True
        global_year = 999
    else :
        slider_year.disabled = False
        slider_month.disabled = False        
        toggle_month.disabled = False
        global_year = slider_year.value
        
        
    update_figure(global_station_name, global_data_name, global_year, global_month)

toggle_year.observe(toggle_year_on_click, 'value')

widget_control_toggle_year = WidgetControl(widget=toggle_year, position='bottomleft')

m.add_control(widget_control_toggle_year)

In [13]:
# dropdown_month = Dropdown(
#     options=[1,2,3,4,5,6,7,8,9,10,11,12,'All Months'],
#     value=global_month,
#     description='Month:'
# )

def month_on_click(change):
    global global_month
    global_month = change['new']
    if global_month == 'All Months':
        global_month = 999
        
    update_figure(global_station_name, global_data_name, global_year, global_month)

# dropdown_month.observe(month_on_click, 'value')

# widget_control_month = WidgetControl(widget=dropdown_month, position='bottomleft')

# m.add_control(widget_control_month)


In [14]:
slider_month = IntSlider(
    value=global_month,
    min=1,
    max=12,
    step=1,
    description='Month:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

slider_month.observe(month_on_click, 'value')

widget_control_slider_month = WidgetControl(widget=slider_month, position='bottomleft')

m.add_control(widget_control_slider_month)

In [15]:
toggle_month = ToggleButton(
    value=False,
    description='Toggle Months',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Description',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)

def toggle_month_on_click(change):
    global global_month
    
    
    if(change['new']):
        slider_month.disabled = True        
        global_month = 999
    else :        
        slider_month.disabled = False        
        global_month = slider_month.value
        
        
    update_figure(global_station_name, global_data_name, global_year, global_month)

toggle_month.observe(toggle_month_on_click, 'value')

widget_control_toggle_month = WidgetControl(widget=toggle_month, position='bottomleft')

m.add_control(widget_control_toggle_month)