In [None]:
import findspark
import folium

findspark.init()

from pyspark import sql, SparkConf, SparkContext
from ipywidgets import interact

# Centre point handed to folium.Map as the initial view of the pollution map.
coordinates = (20.593684, 78.96288)
conf = SparkConf().setAppName("Read_CSV")
# getOrCreate reuses the context that is already running when this cell is
# re-executed; constructing SparkContext directly a second time in the same
# kernel raises an error.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = sql.SQLContext(sc)
m = folium.Map(location=coordinates, zoom_start=4)

# Both files are read with their first row as header. No schema is inferred,
# so the columns arrive as strings and are converted where they are used.
dfCities = sqlContext.read.csv("Steden.csv", header=True)
dfPollution = sqlContext.read.csv("Luchtvervuiling.csv", header=True)

In [None]:
from folium.plugins import MarkerCluster

typesP = ["so2", "no2", "rspm", "spm"]
# The narrowed frames get their own names. Rebinding dfPollution here would
# drop the "date" and "type" columns that the later cells filter on.
dfCityCoords = dfCities.select("lat", "lng", "city")
dfPollutantLevels = dfPollution.select("location", "so2", "no2", "rspm", "spm")
c = '#3186cc'

# Per pollutant: (readings above this are red, readings below this are green).
# Anything in between is drawn in orange.
POLLUTION_THRESHOLDS = {
    "so2": (10, 5),
    "no2": (18, 10),
    "rspm": (200, 80),
    "spm": (300, 200),
}
COLOR_HIGH = '#ff0000'
COLOR_LOW = '#7CFC00'
COLOR_MEDIUM = '#ffd27f'


def _marker_color(pollutant, value):
    """Return the marker outline colour for one reading of the given pollutant."""
    red_above, green_below = POLLUTION_THRESHOLDS[pollutant]
    if value > red_above:
        return COLOR_HIGH
    if value < green_below:
        return COLOR_LOW
    return COLOR_MEDIUM


@interact(types=typesP)
def get_pollution(types):
    """Build a map with one colour-coded circle marker per city for a pollutant.

    Parameters
    ----------
    types : str
        Pollutant column to display; one of the keys of POLLUTION_THRESHOLDS
        ("so2", "no2", "rspm" or "spm").

    Returns
    -------
    folium.Map
        A freshly created map centred on ``coordinates``. A new map is built on
        every call so that markers of a previously selected pollutant do not
        pile up underneath the current selection.

    Raises
    ------
    ValueError
        If ``types`` is not a known pollutant, or a retained reading or
        coordinate cannot be converted to float.
    """
    if types not in POLLUTION_THRESHOLDS:
        raise ValueError("Unknown pollutant: " + repr(types))

    # `types` is restricted to the whitelist above, so it is safe to place in
    # the SQL filter expression.
    usable_rows = "{0} != '' AND {0} != 'NA' AND lat != '' AND lng != ''".format(types)
    # dropDuplicates keeps a single row per city; which of that city's readings
    # survives is not controlled here.
    rows = (
        dfPollutantLevels
        .join(dfCityCoords, dfPollutantLevels.location == dfCityCoords.city, how='right')
        .filter(usable_rows)
        .dropDuplicates(['city'])
        .collect()
    )

    pollution_map = folium.Map(location=coordinates, zoom_start=4)
    for row in rows:
        # Columns are read by name so the lookup does not depend on the column
        # order produced by the join.
        reading = row[types]
        folium.CircleMarker(
            location=[float(row['lat']), float(row['lng'])],
            radius=4,
            popup=reading,
            color=_marker_color(types, float(reading)),
            fill=True,
            fill_color='#3186cc'
        ).add_to(pollution_map)
    return pollution_map

In [None]:
import datetime
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql.functions import avg, col, to_date, lit
import ipywidgets as ipy
from IPython.display import display, clear_output
from ipywidgets import Output, VBox, widgets

MONTHS = ['january','february','march','april','may','june','july','august','september','october','november','december']

# Collect the distinct years found in the date column (newest first) for the
# year filter. distinct() runs in Spark so only unique dates reach the driver;
# to_date yields None for values it cannot parse, and those are skipped
# explicitly instead of being hidden behind a bare except.
date_rows = dfPollution.select(to_date(dfPollution.date).alias('to_date')).distinct().collect()
years = sorted(
    {row.to_date.year for row in date_rows if row.to_date is not None},
    reverse=True,
)

# Collect the distinct values of the location column (sorted) for the
# location filter.
location_rows = dfPollution.select("location").distinct().collect()
locations = sorted(str(row['location']) for row in location_rows)

# Fail with a readable message rather than an IndexError on locations[0] /
# years[0] when the data yields nothing to choose from.
if not locations or not years:
    raise ValueError("No locations or parseable dates found in the pollution data")

cityFilter = widgets.Dropdown(
    options = locations,
    value = locations[0],
    description = 'Location:',
    disabled = False,
)
yearFilter = widgets.Dropdown(
    options = years,
    value = years[0],
    description = 'Year:',
    disabled = False,
)
display(widgets.HBox((cityFilter, yearFilter)))

def update_plot():
    """Plot the monthly average so2 reading for the selected location and year.

    Reads the current values of ``cityFilter`` and ``yearFilter``, fetches the
    matching readings from ``dfPollution`` and draws one point per month.
    Months without any usable reading are plotted as 0, so the y-axis always
    has exactly one value per entry in ``MONTHS``.
    """
    selected_year = str(yearFilter.value)
    selected_city = cityFilter.value

    # One Spark job for the whole year instead of a count() plus a collect()
    # for each of the twelve months.
    readings = (
        dfPollution
        .filter(
            col("date").between(selected_year + "-01-01", selected_year + "-12-31")
            & (col("location") == selected_city)
            & (col("so2") != 'NA')
        )
        .select("date", "so2")
        .collect()
    )

    x_axis = MONTHS
    y_axis = []
    # Compute the average so2 per month for the selected year and location.
    for month_number in range(1, 13):
        month = "%02d" % month_number
        first_day = selected_year + "-" + month + "-01"
        last_day = selected_year + "-" + month + "-31"
        month_so2 = []
        for row in readings:
            # Same string-range test on the date column as the year filter
            # above, narrowed to this month.
            if not (first_day <= row['date'] <= last_day):
                continue
            try:
                # float, not int: int() rejects decimal strings such as "4.8".
                month_so2.append(float(row['so2']))
            except (TypeError, ValueError):
                # Skip a single unreadable value instead of losing the month.
                continue
        y_axis.append(sum(month_so2) / len(month_so2) if month_so2 else 0)

    # Plot
    fig1 = plt.figure(figsize=(12, 7))
    fig1.suptitle('Air Pollution measurements in ' + str(cityFilter.value) + " (" + str(yearFilter.value) + ")", fontsize=14)
    ax = fig1.add_subplot(111)
    ax.set_xlabel('month')
    ax.set_ylabel('so2 (sulphur dioxide ppm)')
    ax.plot(x_axis, y_axis, 'go-', linewidth=2)
    plt.show()

# Draw the chart once for the initial dropdown selection.
update_plot()
# Redraw after a filter change
def on_change(change):
    """Refresh the filter row and the chart when a dropdown's value changes.

    ``change`` is the change dictionary passed by the widget; events for
    traits other than 'value', or whose new value does not differ from the
    old one, are ignored.
    """
    if change['name'] != 'value':
        return
    value_changed = change['new'] != change['old']
    if not value_changed:
        return
    clear_output()
    filter_row = widgets.HBox((cityFilter, yearFilter))
    display(filter_row)
    update_plot()
# Link the filters to the update handler. names='value' limits the callbacks
# to changes of the selected value rather than every trait of the dropdown.
cityFilter.observe(on_change, names='value')
yearFilter.observe(on_change, names='value')


In [None]:
# Collect the distinct years found in the date column, oldest first, for the
# x-axis. distinct() runs in Spark so only unique dates reach the driver;
# to_date yields None for values it cannot parse, and those are skipped
# explicitly instead of being hidden behind a bare except.
date_rows = dfPollution.select(to_date(dfPollution.date).alias('to_date')).distinct().collect()
years = sorted({row.to_date.year for row in date_rows if row.to_date is not None})

x_axis = years

# Spellings of the "type" column that are counted as each kind of area.
INDUSTRIAL_TYPES = ['Industrial', 'Industrial Area', 'Industrial Areas']
RESIDENTIAL_TYPES = ['Residential', 'Residential and others', 'Residential, Rural and other Areas']


def _yearly_so2_average(year, area_types):
    """Return the mean so2 reading of one year for the given area types.

    Readings that cannot be converted to float are skipped one by one.
    Returns 0 when the year has no usable reading for these area types, so a
    gap in the data no longer ends in a division by zero.
    """
    rows = (
        dfPollution
        .filter(
            # The upper bound is the last day of the year (month before day,
            # like the lower bound).
            col("date").between(str(year) + "-01-01", str(year) + "-12-31")
            & (col("so2") != 'NA')
            & col("type").isin(area_types)
        )
        .select("so2")
        .collect()
    )
    values = []
    for row in rows:
        try:
            values.append(float(row['so2']))
        except (TypeError, ValueError):
            # Skip only the unreadable value; never fall back to the numbers
            # of a previous year.
            continue
    return sum(values) / len(values) if values else 0


industry_y_axis = [_yearly_so2_average(year, INDUSTRIAL_TYPES) for year in years]
residential_y_axis = [_yearly_so2_average(year, RESIDENTIAL_TYPES) for year in years]
    
# Plot both yearly so2 series on one shared axis.
fig2, ax2 = plt.subplots(figsize=(12, 7))
fig2.suptitle('industrial vs residential so2 measurements (yearly averages)', fontsize=14)
ax2.set_xlabel('year')
ax2.set_ylabel('so2 ppm')
for series, line_format, series_label in (
    (industry_y_axis, 'ro-', 'industrial'),
    (residential_y_axis, 'bo-', 'residential'),
):
    ax2.plot(x_axis, series, line_format, linewidth=2, label=series_label)
ax2.legend()
plt.show()