In [None]:
import findspark
findspark.init()

import datetime
import pyparsing as pp
import ipywidgets as ipy

from pyspark import sql, SparkConf, SparkContext
from pyspark.sql.functions import col, to_date
from IPython.display import display, clear_output
from ipywidgets import Output, VBox, widgets, interact
from prettytable import PrettyTable

In [None]:
# Reuse the SparkContext that is already running when this cell is executed
# again: constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("Read_CSV")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = sql.SQLContext(sc)

# Load the measurements; header=True takes the column names from the first
# line of the file. No schema inference is requested here.
dfPollution = sqlContext.read.csv("Luchtvervuiling.csv", header=True)

In [None]:
# Options of the month filter. Index 0 is the "no specific month" entry, so the
# list index of every real month equals its calendar month number (1-12).
MONTHS = [
    'all months',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# Build the list of distinct years found in the date column; it feeds the year filter.
# Only the distinct parsed dates are collected to the driver, not one entry per row.
date_rows = dfPollution.select(to_date(dfPollution.date).alias('to_date')).distinct().collect()
# to_date yields null (None) for values it cannot parse; those rows are skipped
# explicitly instead of swallowing every exception with a bare except.
years = sorted({row.to_date.year for row in date_rows if row.to_date is not None}, reverse=True)

# Collect the distinct values of the location column, converted to strings and
# sorted ascending, to populate the city filter.
location_rows = dfPollution.select("location").distinct().collect()
locations = sorted(str(row['location']) for row in location_rows)

# Filters: one dropdown per dimension, each preselecting its first option.
cityFilter = widgets.Dropdown(options=locations, value=locations[0], description='City:', disabled=False)
yearFilter = widgets.Dropdown(options=years, value=years[0], description='Year:', disabled=False)
monthFilter = widgets.Dropdown(options=MONTHS, value=MONTHS[0], description='Month:', disabled=False)

# Show the three dropdowns side by side.
display(widgets.HBox((cityFilter, yearFilter, monthFilter)))

def _date_bounds(year, month_index):
    """Return the inclusive (start, end) date strings for the selected period.

    month_index 0 stands for the whole year; 1-12 select that calendar month.
    """
    if month_index == 0:
        return "{}-01-01".format(year), "{}-12-31".format(year)
    # Day 31 is used as the upper bound for every month, shorter months included.
    month_prefix = "{}-{:02d}".format(year, month_index)
    return month_prefix + "-01", month_prefix + "-31"


def _no_measurements_message(city, year, month_name):
    """Build the message printed when the selected filters match no rows."""
    message = "No measurements have taken place at " + city + " during "
    if month_name != "all months":
        message += month_name + " of "
    return message + "the year " + str(year) + "."


def update_table():
    """Print the measurements matching the current city, year and month filters.

    Reads the selected values of cityFilter, yearFilter and monthFilter, keeps
    the rows of dfPollution for that location whose date lies within the
    selected period, and shows all of them sorted by date. When nothing
    matches, a message naming the selected filters is printed instead.
    """
    # Copy the columns of dfPollution that have to be shown into dfTable.
    dfTable = dfPollution.select('date', 'state', 'location', 'type', 'so2', 'no2', 'rspm', 'spm')

    # The position in MONTHS doubles as the month number (0 = "all months").
    start_date, end_date = _date_bounds(yearFilter.value, MONTHS.index(monthFilter.value))
    dfTable = dfTable.filter((col("date").between(start_date, end_date))
                             & (col("location") == cityFilter.value))

    # Count once and reuse the result: every count() call starts a Spark job.
    count = dfTable.count()
    if count == 0:
        print(_no_measurements_message(cityFilter.value, yearFilter.value, monthFilter.value))
    else:
        dfTable.sort(col("date")).show(count)

# Show the table once for the initially selected filter values.
update_table()
# Refresh the output whenever a filter is adjusted
def on_change(change):
    """Redraw the filter bar and the table after a filter's value has changed.

    change is the dictionary passed to an observe callback. Notifications for
    any trait other than 'value', and notifications whose new value does not
    differ from the old one, are ignored.
    """
    if change['name'] != 'value':
        return
    value_changed = change['new'] != change['old']
    if not value_changed:
        return
    # Clearing the output also removes the dropdowns, so they are displayed
    # again before the table is rendered.
    clear_output()
    display(widgets.HBox((cityFilter, yearFilter, monthFilter)))
    update_table()

# Link the filters to the on_change function. names='value' limits the
# notifications to changes of the selected value, so the callback is no longer
# invoked for every other trait of the dropdown.
for filterWidget in (cityFilter, yearFilter, monthFilter):
    filterWidget.observe(on_change, names='value')

In [None]:
#Open the csv file
#pollution = open("Luchtvervuiling.csv", 'r')

#Read the csv file
#pollution = pollution.readlines()

#Split the data on commas, leaving commas inside quotes intact
#def splitDataOnComma(str):
#    splitStr = pp.commaSeparatedList.copy().addParseAction(pp.tokenMap(lambda s: s.strip('"')))
#    splitStr = splitStr.parseString(str).asList()
#    return splitStr

#Split the headers
#headersStr = pollution[0]
#headers = splitDataOnComma(headersStr)

#Headers for the table
#tableHeaders = PrettyTable([headers[12], headers[2], headers[3], headers[5], headers[6], headers[7], headers[8], headers[9]])

# Data toevoegen aan tabel
#for i in range(1, 1000) :
#    row = splitDataOnComma(pollution[i])
#    tableHeaders.add_row([row[12], row[2], row[3], row[5], row[6], row[7], row[8], row[9]])

#print(tableHeaders)