In [None]:
import findspark
findspark.init()

import datetime
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from pyspark import sql, SparkConf, SparkContext
from pyspark.sql.functions import avg, col, to_date, lit
import ipywidgets as ipy
from IPython.display import display, clear_output
from ipywidgets import Output, VBox, widgets, interact

In [None]:
# Reuse the running SparkContext when this cell is executed again: constructing a
# second SparkContext in the same kernel raises an error, whereas getOrCreate()
# returns the context that already exists.
conf = SparkConf().setAppName("Read_CSV")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = sql.SQLContext(sc)

# header=True takes the column names from the first line of the file. No schema is
# supplied or inferred, so the values are presumably loaded as strings (the later
# cells compare against "NA" and convert with float()).
dfPollution = sqlContext.read.csv("Luchtvervuiling.csv", header=True)

In [None]:
# Month labels, in calendar order, used as the x-axis of the monthly plot.
MONTHS = [
    "january", "february", "march",
    "april", "may", "june",
    "july", "august", "september",
    "october", "november", "december",
]
# Column names offered by the "Type" dropdowns.
TYPES_P = ["so2", "no2", "rspm", "spm"]

In [None]:
# Collect the distinct years found in the "date" column; they feed the year filter.
# distinct() is applied before collect(), so only one row per unique date is
# brought to the driver instead of one row per measurement.
date_rows = dfPollution.select(to_date(dfPollution.date).alias("to_date")).distinct().collect()
# Rows whose date could not be converted carry None; skip them explicitly instead
# of hiding every possible error behind a bare except.
years = sorted({row.to_date.year for row in date_rows if row.to_date is not None}, reverse=True)

# Collect the distinct values of the "location" column; they feed the location filter.
location_rows = dfPollution.select("location").distinct().collect()
locations = sorted(str(row["location"]) for row in location_rows)

In [None]:
# Dropdown filters for the monthly plot; each starts on the first entry of its options.
cityFilter = widgets.Dropdown(
    description="Location:",
    options=locations,
    value=locations[0],
    disabled=False,
)
yearFilter = widgets.Dropdown(
    description="Year:",
    options=years,
    value=years[0],
    disabled=False,
)
typesPMonthlyFilter = widgets.Dropdown(
    description="Type:",
    options=TYPES_P,
    value=TYPES_P[0],
    disabled=False,
)
# Show the three filters side by side above the plot.
display(widgets.HBox((cityFilter, yearFilter, typesPMonthlyFilter)))

def update_monthly_plot():
    """Plot the monthly averages of one pollutant for the selected location and year.

    Reads the current values of ``cityFilter``, ``yearFilter`` and
    ``typesPMonthlyFilter``, averages the selected column of ``dfPollution`` for
    each calendar month and draws the twelve averages as a line chart.
    A month without any usable measurement is plotted as 0.

    Errors raised by Spark are no longer swallowed: previously a failed month was
    silently skipped, which left fewer than twelve y-values and made the plot
    call fail with an unrelated shape error.
    """
    # Read the current filter values.
    selectedCity = cityFilter.value
    selectedYear = str(yearFilter.value)
    selectedPType = typesPMonthlyFilter.value

    x_axis = MONTHS
    y_axis = []
    # One average per month, restricted to the selected year, location and pollutant.
    for month_number in range(1, 13):
        month = str(month_number).zfill(2)
        # "-31" is the upper bound for every month, including the shorter ones.
        # NOTE(review): this relies on "date" holding YYYY-MM-DD text - confirm.
        month_rows = dfPollution.filter(
            (col("date").between(selectedYear + "-" + month + "-01",
                                 selectedYear + "-" + month + "-31"))
            & (col("location") == selectedCity)
            & (col(selectedPType) != "NA")
        )
        # A single collect() per month; an empty result simply yields no values.
        month_values = []
        for row in month_rows.select(selectedPType).collect():
            try:
                month_values.append(float(row[selectedPType]))
            except (TypeError, ValueError):
                # Skip a value that is not a number; never fall back to the
                # values of a previous month.
                continue
        if month_values:
            y_axis.append(sum(month_values) / len(month_values))
        else:
            y_axis.append(0)

    # Plot
    fig1 = plt.figure(figsize=(12, 7))
    fig1.suptitle("Air Pollution measurements in " + selectedCity + " (" + selectedYear + ")", fontsize=14)
    ax = fig1.add_subplot(111)
    ax.set_xlabel("month")
    fig1YLabel = "Concentration of " + selectedPType + " (μg/m³)"
    ax.set_ylabel(fig1YLabel)
    ax.plot(x_axis, y_axis, "go-", linewidth=2)
    plt.show()

# Draw the monthly plot once with the filter values as they are right now.
update_monthly_plot()
# Redraw after a filter change.
def on_change_monthly_plot_filter(change):
    """Refresh the filter row and the monthly plot when a dropdown value changes.

    ``change`` is the dictionary handed over by ``observe``. Nothing happens
    unless its "name" is "value" and its "new" entry differs from "old".
    """
    if change["name"] != "value" or change["new"] == change["old"]:
        return
    # Wipe the previous output, then show the filters and a fresh plot again.
    clear_output()
    display(widgets.HBox((cityFilter, yearFilter, typesPMonthlyFilter)))
    update_monthly_plot()

# Link every monthly filter to the update handler.
for monthlyFilter in (cityFilter, yearFilter, typesPMonthlyFilter):
    monthlyFilter.observe(on_change_monthly_plot_filter)

In [None]:
# Dropdown selecting the column shown in the yearly plot; starts on the first type.
typesPYearlyFilter = widgets.Dropdown(
    description="Type:",
    options=TYPES_P,
    value=TYPES_P[0],
    disabled=False,
)
# Show the filter above the plot.
display(widgets.HBox((typesPYearlyFilter,)))

def _yearly_type_average(year, pollutant, area_types):
    """Return the average of ``pollutant`` in ``year`` for rows whose "type" is in ``area_types``.

    Rows whose pollutant value equals "NA" are excluded by the filter, and values
    that cannot be converted to float are skipped. Returns 0 when no usable
    value remains.
    """
    # The upper bound is written as YYYY-MM-DD so it matches the lower bound.
    # NOTE(review): this relies on "date" holding YYYY-MM-DD text - confirm.
    year_rows = dfPollution.filter(
        (col("date").between(str(year) + "-01-01", str(year) + "-12-31"))
        & (col(pollutant) != "NA")
        & (col("type").isin(*area_types))
    )
    values = []
    for row in year_rows.select(pollutant).collect():
        try:
            values.append(float(row[pollutant]))
        except (TypeError, ValueError):
            # Skip a value that is not a number; never reuse another year's values.
            continue
    return sum(values) / len(values) if values else 0


def update_yearly_plot():
    """Plot yearly averages of one pollutant for industrial versus residential areas.

    Reads the current value of ``typesPYearlyFilter`` and computes, for every
    entry of ``years``, the average of that column of ``dfPollution`` separately
    for the industrial and the residential "type" labels. Both series are drawn
    in one line chart; a year without usable measurements is plotted as 0.
    """
    # Read the current filter value.
    selectedPType = typesPYearlyFilter.value

    # The spellings of the "type" column that count as each kind of area.
    industrial_types = ("Industrial", "Industrial Area", "Industrial Areas")
    residential_types = ("Residential", "Residential and others",
                         "Residential, Rural and other Areas")

    x_axis = years
    # One average per year and area kind for the selected pollutant.
    industry_y_axis = [_yearly_type_average(year, selectedPType, industrial_types)
                       for year in years]
    residential_y_axis = [_yearly_type_average(year, selectedPType, residential_types)
                          for year in years]

    # Plot
    fig2 = plt.figure(figsize=(12, 7))
    fig2.suptitle("industrial vs residential " + selectedPType + " measurements (yearly averages)", fontsize=14)
    ax2 = fig2.add_subplot(111)
    ax2.set_xlabel("year")
    fig2YLabel = "Concentration of " + selectedPType + " (μg/m³)"
    ax2.set_ylabel(fig2YLabel)
    ax2.plot(x_axis, industry_y_axis, "ro-", linewidth=2, label = "industrial")
    ax2.plot(x_axis, residential_y_axis, "bo-", linewidth=2, label = "residential")
    ax2.legend()
    plt.show()

# Draw the yearly plot once with the filter value as it is right now.
update_yearly_plot()
# Redraw after a filter change.
def on_change_yearly_plot_filter(change):
    """Refresh the filter row and the yearly plot when the dropdown value changes.

    ``change`` is the dictionary handed over by ``observe``. Nothing happens
    unless its "name" is "value" and its "new" entry differs from "old".
    """
    if change["name"] != "value" or change["new"] == change["old"]:
        return
    # Wipe the previous output, then show the filter and a fresh plot again.
    clear_output()
    display(widgets.HBox((typesPYearlyFilter,)))
    update_yearly_plot()

# Link the filter to the update handler.
typesPYearlyFilter.observe(on_change_yearly_plot_filter)