In [None]:
import findspark
findspark.init()

import datetime
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from pyspark import sql, SparkConf, SparkContext
from pyspark.sql.functions import avg, col, to_date, lit
import ipywidgets as ipy
from IPython.display import display, clear_output
from ipywidgets import Output, VBox, widgets, interact

In [None]:
# Create the Spark entry points. getOrCreate() reuses an already running
# SparkContext, so re-executing this cell in the same kernel does not fail
# with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("Read_CSV")
sc = SparkContext.getOrCreate(conf=conf)
sql_context = sql.SQLContext(sc)

# Load the air pollution measurements; the first CSV line holds the column names.
df_pollution = sql_context.read.csv("Luchtvervuiling.csv", header=True)

In [None]:
# Month labels used on the x-axis of the monthly plot, in calendar order.
MONTHS = [
    "january", "february", "march",
    "april", "may", "june",
    "july", "august", "september",
    "october", "november", "december",
]
# Pollutant columns that can be selected in the "Type" dropdowns.
TYPES_P = [
    "so2",
    "no2",
    "rspm",
    "spm",
]

In [None]:
# Build the list of years offered by the year filter from the "date" column.
# distinct() runs in Spark, so only the unique dates are sent to the driver
# instead of one row per measurement.
date_rows = df_pollution.select(to_date(df_pollution.date).alias("to_date")).distinct().collect()
# Rows whose date is missing come back as None; skip them explicitly
# rather than hiding every possible error behind a bare except.
years = sorted({row.to_date.year for row in date_rows if row.to_date is not None}, reverse=True)

# Build the alphabetically sorted list of distinct locations for the location filter.
location_rows = df_pollution.select("location").distinct().collect()
locations = sorted(str(row["location"]) for row in location_rows)

In [None]:
# Dropdown filters for the monthly plot; each one starts on its first option.
city_filter = widgets.Dropdown(options=locations, value=locations[0], description="Location:", disabled=False)
year_filter = widgets.Dropdown(options=years, value=years[0], description="Year:", disabled=False)
types_p_monthly_filter = widgets.Dropdown(options=TYPES_P, value=TYPES_P[0], description="Type:", disabled=False)

# Show the three filters next to each other above the plot.
display(
    widgets.HBox(
        (city_filter, year_filter, types_p_monthly_filter)
    )
)

def update_monthly_plot():
    """Plot the monthly averages of one pollutant for one location and year.

    Reads the current values of ``city_filter``, ``year_filter`` and
    ``types_p_monthly_filter``, computes one average per calendar month from
    ``df_pollution`` and draws a line chart with ``MONTHS`` on the x-axis.
    A month without any usable measurement is plotted as 0, so the y-axis
    always holds exactly twelve values.
    """
    # Read the current filter values
    selected_city = city_filter.value
    selected_year = str(year_filter.value)
    selected_p_type = types_p_monthly_filter.value

    x_axis = MONTHS
    y_axis = []
    # The measurement column holds text (missing values are the string "NA").
    # Casting in Spark lets avg() run on the cluster instead of collecting the
    # raw rows; with Spark's legacy (non-ANSI) cast rules a value that is not
    # numeric becomes NULL and is skipped by avg().
    measurement = col(selected_p_type).cast("double")
    # Compute the average per month for the selected year, location and pollutant
    for month_number in range(1, 13):
        month = "{:02d}".format(month_number)
        month_rows = df_pollution.filter(
            col("date").between(selected_year + "-" + month + "-01",
                                selected_year + "-" + month + "-31")
            & (col("location") == selected_city)
            & (col(selected_p_type) != "NA")
        )
        # A global aggregation always yields one row; its value is None when
        # no measurement matched the filters.
        monthly_p_type_avg = month_rows.agg(avg(measurement).alias("monthly_avg")).first()["monthly_avg"]
        y_axis.append(monthly_p_type_avg if monthly_p_type_avg is not None else 0)

    # Plot
    fig1, ax = plt.subplots(figsize=(12, 7))
    fig1.suptitle("Air Pollution measurements in " + selected_city + " (" + selected_year + ")", fontsize=14)
    ax.set_xlabel("month")
    fig1_y_label = "Concentration of " + selected_p_type + " (μg/m³)"
    ax.set_ylabel(fig1_y_label)
    ax.plot(x_axis, y_axis, "go-", linewidth=2)
    plt.show()

# Draw the monthly plot once with the initial filter values.
update_monthly_plot()
# Redraw after a filter change
def on_change_monthly_plot_filter(change):
    """Redraw the filter row and the monthly plot when a filter value changes.

    Parameters
    ----------
    change : dict
        Change notification passed by ``observe``; the ``"name"``, ``"new"``
        and ``"old"`` entries are read. Notifications for other traits, or
        where the value did not actually change, are ignored.
    """
    if change["name"] == "value" and (change["new"] != change["old"]):
        # wait=True delays the clearing until the new output is available,
        # which avoids the widgets flickering while the plot is recomputed.
        clear_output(wait=True)
        display(widgets.HBox((city_filter, year_filter, types_p_monthly_filter)))
        update_monthly_plot()

# Link the filters to the update handler. names="value" restricts the
# notifications to changes of the selected value, so the handler is not
# invoked for every other trait of the dropdowns.
city_filter.observe(on_change_monthly_plot_filter, names="value")
year_filter.observe(on_change_monthly_plot_filter, names="value")
types_p_monthly_filter.observe(on_change_monthly_plot_filter, names="value")

In [None]:
# Dropdown for the pollutant shown in the yearly plot; starts on the first type.
types_p_yearly_filter = widgets.Dropdown(options=TYPES_P, value=TYPES_P[0], description="Type:", disabled=False)

# Show the filter above the plot.
display(
    widgets.HBox(
        (types_p_yearly_filter,)
    )
)

def _yearly_area_average(year, p_type, area_types):
    """Return the average of ``p_type`` in ``year`` for the given area types.

    Rows of ``df_pollution`` are kept when their ``date`` lies within the
    year, their ``p_type`` value is not "NA" and their ``type`` is one of
    ``area_types``. Returns 0 when no usable measurement matches.
    """
    year_rows = df_pollution.filter(
        col("date").between(str(year) + "-01-01", str(year) + "-12-31")
        & (col(p_type) != "NA")
        & col("type").isin(*area_types)
    )
    # The measurement column holds text; cast it so avg() can run in Spark
    # instead of collecting every row to the driver. With Spark's legacy
    # (non-ANSI) cast rules non-numeric values become NULL and are skipped.
    average = year_rows.agg(avg(col(p_type).cast("double")).alias("yearly_avg")).first()["yearly_avg"]
    return average if average is not None else 0


def update_yearly_plot():
    """Plot yearly averages of one pollutant for industrial vs residential areas.

    Reads the pollutant from ``types_p_yearly_filter`` and, for every entry
    of ``years``, computes one average for the industrial area types and one
    for the residential area types. A year without usable measurements for
    an area is plotted as 0.
    """
    # Read the current filter value
    selected_p_type = types_p_yearly_filter.value

    # Spellings of the "type" column that are counted as each kind of area
    industrial_types = ("Industrial", "Industrial Area", "Industrial Areas")
    residential_types = ("Residential", "Residential and others",
                         "Residential, Rural and other Areas")

    x_axis = years
    # Per year the average for industrial and residential areas
    industry_y_axis = [_yearly_area_average(year, selected_p_type, industrial_types) for year in years]
    residential_y_axis = [_yearly_area_average(year, selected_p_type, residential_types) for year in years]

    # Plot
    fig2, ax2 = plt.subplots(figsize=(12, 7))
    fig2.suptitle("industrial vs residential " + selected_p_type + " measurements (yearly averages)", fontsize=14)
    ax2.set_xlabel("year")
    fig2_y_label = "Concentration of " + selected_p_type + " (μg/m³)"
    ax2.set_ylabel(fig2_y_label)
    ax2.plot(x_axis, industry_y_axis, "ro-", linewidth=2, label="industrial")
    ax2.plot(x_axis, residential_y_axis, "bo-", linewidth=2, label="residential")
    ax2.legend()
    plt.show()

# Draw the yearly plot once with the initial filter value.
update_yearly_plot()
# Redraw after a filter change
def on_change_yearly_plot_filter(change):
    """Redraw the filter and the yearly plot when the filter value changes.

    Parameters
    ----------
    change : dict
        Change notification passed by ``observe``; the ``"name"``, ``"new"``
        and ``"old"`` entries are read. Notifications for other traits, or
        where the value did not actually change, are ignored.
    """
    if change["name"] == "value" and (change["new"] != change["old"]):
        # wait=True delays the clearing until the new output is available,
        # which avoids the widget flickering while the plot is recomputed.
        clear_output(wait=True)
        display(widgets.HBox((types_p_yearly_filter,)))
        update_yearly_plot()

# Link the filter to the update handler. names="value" restricts the
# notifications to changes of the selected value, so the handler is not
# invoked for every other trait of the dropdown.
types_p_yearly_filter.observe(on_change_yearly_plot_filter, names="value")