# MLBD : Assignment

### TODO: Switch methods from pandas to Spark when possible
### TODO: Work on a way to acquire data multiple times a day

In [None]:
# Imports:

import numpy as np
from urllib.request import urlopen
from pyspark.sql.session import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import *
import urllib.request, json, datetime

In [None]:
# Spark session: reuse the active session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)
sc = spark.sparkContext
# Last expression of the cell, so the notebook displays the Spark web UI address.
sc.uiWebUrl

In [None]:
# Locally save instances of the data:
# Waits until the scheduled import time, then downloads the 5-minute and 24-hour
# snapshots and writes each one to a timestamped JSON file under output/.
import os
import time

url5min = 'https://data.sensor.community/static/v2/data.json'
url24h = 'https://data.sensor.community/static/v2/data.24h.json'

# Sleep until a specific time (e.g. 5pm) before downloading.
today = datetime.datetime.now()
exactImportTime = datetime.datetime(today.year, today.month, today.day, 17, 0, 0)
awaitingTime = exactImportTime - today
# time.sleep() raises ValueError for a negative duration, so when today's import
# time has already passed the download starts immediately instead of crashing.
time.sleep(max(awaitingTime.total_seconds(), 0))

# Taken after the wait so the file names carry the actual download time.
today = datetime.datetime.now()
timestamp = '{}-{}-{}_{}h{}'.format(today.year, today.month, today.day, today.hour, str(today.minute).zfill(2))

# open(..., 'w') does not create missing parent directories.
os.makedirs('output/5min', exist_ok=True)
os.makedirs('output/24h', exist_ok=True)

with urllib.request.urlopen(url5min) as url:
    data5min = json.load(url)
with open('output/5min/data5min_{}.json'.format(timestamp), 'w') as outfile:
    json.dump(data5min, outfile)
with urllib.request.urlopen(url24h) as url:
    data24h = json.load(url)
with open('output/24h/data24h_{}.json'.format(timestamp), 'w') as outfile:
    json.dump(data24h, outfile)

In [None]:
# Load data from local files and load them into Spark DataFrames:
file1 = 'output/data24h_2022-10-31.json'
file2 = 'output/data24h_2022-11-01.json'


def _read_snapshot(path, view_name):
    """Register `path` with Spark and read it back as a JSON DataFrame.

    The file is added to the Spark context, resolved through SparkFiles by its
    base name, read with spark.read.json, exposed as the temp view `view_name`,
    and its schema is printed.

    Returns a (resolved_path, dataframe) tuple.
    """
    spark.sparkContext.addFile(path)
    resolved_path = SparkFiles.get(path.split('/')[-1])
    frame = spark.read.json(resolved_path)
    frame.createOrReplaceTempView(view_name)
    frame.printSchema()
    return resolved_path, frame


# File 1:
filename_1, df1 = _read_snapshot(file1, "df1")

# File 2:
filename_2, df2 = _read_snapshot(file2, "df2")

In [None]:
# Spark dataframes used for tasks :
# Index 0 is the frame read from file1, index 1 the frame read from file2.
# The task cells below replace the entries of this list in place.
dfs = [df1, df2]

In [None]:
# AQI Map:
# Maps an index (1-10) to [level label, [low, high] range, [low, high] range].
# Both ranges are inclusive integer bounds; the last index uses 10000000 as an
# open-ended upper bound.
# NOTE(review): the two ranges look like the PM10 and PM2.5 bands respectively - confirm.
air = {
    1: ["Low", [0, 16], [0, 11]],
    2: ["Low", [17, 33], [12, 23]],
    3: ["Low", [34, 50], [24, 35]],
    4: ["Medium", [51, 58], [36, 41]],
    5: ["Medium", [59, 66], [42, 47]],
    6: ["Medium", [67, 75], [48, 53]],
    7: ["High", [76, 83], [54, 58]],
    8: ["High", [84, 91], [59, 64]],
    9: ["High", [92, 100], [65, 70]],
    10: ["Very High", [101, 10000000], [71, 10000000]],
}

# Spark implementation & tasks:

### Task 1: Identify the top 10 countries in terms of average air quality improvement over the previous 24 hours as well as the current averaged air quality indices of each. As far as possible use the country field in the sensor data to identify the country.

In [None]:
# Preprocessing (P1 and P2 filtering):
# For every snapshot, keep only the P1/P2 entries of each record's sensordatavalues
# array and drop records that have neither. Row counts are printed after each step.
for idx, snapshot in enumerate(dfs):
    print("Raw dataset count: ", snapshot.count())

    # One row per individual reading (pyspark.sql.functions.explode):
    readings = snapshot.withColumn('sensordatavalues', explode('sensordatavalues'))
    print("Dataset count after exploding sensordatavalues: ", readings.count())

    # Keep only the P1 and P2 readings:
    readings = readings.filter(readings.sensordatavalues.value_type.isin(['P1', 'P2']))
    print("Dataset count after removing rows that aren't P1 or P2: ", readings.count())

    # Collect the remaining readings back into one array per record id:
    filtered_values = readings.groupby('id').agg(collect_list('sensordatavalues').alias('sensordatavalues'))

    # Swap the unfiltered sensordatavalues column for the filtered one; the inner
    # join on id drops records that had no P1/P2 reading at all:
    dfs[idx] = snapshot.drop('sensordatavalues').join(filtered_values, on='id', how='inner')
    print("Dataset count after joining the new sensordatavalues column to the old dataframe: ", dfs[idx].count())

    dfs[idx].show(5)
    # dfs[i].select('id','location.country','location.id', 'sensordatavalues.value_type','sensordatavalues.value').sort('location.country', 'location.id').show(5, False)

In [None]:
# Per-country aggregation: for every snapshot, average the P1 and P2 readings of
# each country, map both averages to an AQI index and keep the worse of the two.
for i in range(len(dfs)):
    # Replace the location struct with its country field:
    df_ = dfs[i].withColumn('location', dfs[i]['location.country'])
    # One row per reading, then gather all readings of a country into one array.
    # (No sort beforehand: the groupby does not preserve row order anyway.)
    df_ = df_.withColumn('sensordatavalues', explode('sensordatavalues'))
    df_ = df_.groupby('location').agg(collect_list('sensordatavalues').alias('sensordatavalues'))

    # Each step below derives one column through an RDD map and joins it back on
    # 'location'. The join key comes first and new columns are appended, so the
    # positional row layout is:
    #   0 location, 1 sensordatavalues, 2 P1, 3 P2, 4 avgP1, 5 avgP2, 6 P1_AQI, 7 P2_AQI

    # Lists of the P1 and P2 values of each country:
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [float(y['value']) for y in x[1] if y['value_type'] == 'P1'])).toDF(['location', 'P1']), on='location', how='inner')
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [float(y['value']) for y in x[1] if y['value_type'] == 'P2'])).toDF(['location', 'P2']), on='location', how='inner')

    # Average P1 and P2 per country, rounded to 2 decimals:
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], float(np.round(np.mean(x[2]), 2)))).toDF(['location', 'avgP1']), on='location', how='inner')
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], float(np.round(np.mean(x[3]), 2)))).toDF(['location', 'avgP2']), on='location', how='inner')

    # Associate the P1 and P2 averages with their AQI index. sensor.community reports
    # PM10 as P1 and PM2.5 as P2, so P1 is matched against the PM10 bands (air[y][1])
    # and P2 against the PM2.5 bands (air[y][2]).
    # NOTE(review): a country without any P1 (or P2) reading has a NaN average, no
    # band matches it and the [0] lookup raises IndexError.
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [y for y in air if air[y][1][0] <= np.round(x[4]) <= air[y][1][1]][0])).toDF(['location', 'P1_AQI']), on='location', how='inner')
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [y for y in air if air[y][2][0] <= np.round(x[5]) <= air[y][2][1]][0])).toDF(['location', 'P2_AQI']), on='location', how='inner')

    # The AQI of a country is the worse (higher) of its two pollutant indices:
    df_ = df_.withColumn('maxAQI', when(df_.P1_AQI > df_.P2_AQI, df_.P1_AQI).otherwise(df_.P2_AQI))

    dfs[i] = df_
    dfs[i].show(10)

In [None]:
# Task 1: Identify the top 10 countries in terms of average air quality improvement over the previous 24 hours as well as the current averaged air quality indices of each. As far as possible use the country field in the sensor data to identify the country.
# The per-country frames get their own names here: df1 and df2 must keep pointing
# at the raw snapshots, because the Task 2 cell rebuilds `dfs` from them.
country_aqi_1 = dfs[0].select('location', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_1')
country_aqi_2 = dfs[1].select('location', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_2')
df_diff = country_aqi_1.join(country_aqi_2, on='location', how='inner')
# diffAQI = second snapshot - first snapshot, so a negative value means the index went down.
df_diff = df_diff.withColumn('diffAQI', df_diff.maxAQI_2 - df_diff.maxAQI_1).select('location', 'diffAQI')
# Ascending order puts the largest decrease first:
df_diff = df_diff.sort('diffAQI', ascending=True)
df_diff.show(12)

#

### Task 2: Using the geo-coordinates from the sensor data, group the data into smaller regions using an appropriate clustering algorithm. Then determine the top 50 regions in terms of air quality improvement over the previous 24 hours.

In [None]:
from pyspark.mllib.clustering import KMeans

# Task 2 starts again from the raw snapshots.
# NOTE: df1 and df2 must still be the frames read from the JSON files at this point.
dfs = [df1, df2]
for i in range(len(dfs)):
    # Same process as task 1, we filter the dataframe to keep only P1 and P2 values
    df_ = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    df_ = df_[df_.sensordatavalues.value_type.isin(['P1', 'P2'])]
    dfs[i] = dfs[i].drop('sensordatavalues').join(df_.groupby('id').agg(collect_list('sensordatavalues').alias('sensordatavalues')), on='id', how='inner')
    # After this select the 'id' column is the sensor id:
    dfs[i] = dfs[i].select('sensor.id','location.latitude','location.longitude', 'location.altitude', 'sensordatavalues').sort('sensor.id')

# Create a RDD with the sensor id, and a tuple of the latitude and longitude, using the first dataframe:
rdd = dfs[0].rdd.map(lambda x: (x[0], (float(x[1]), float(x[2]))))
# Create a KMeans model with 200 clusters using latitude and longitude values from the first dataframe:
model = KMeans.train(rdd.map(lambda x: x[1]), 200, maxIterations=10, initializationMode="k-means||", seed=23)
# (The same model assigns the clusters of both dataframes, so cluster ids are comparable between them.)

# Store each cluster center in a dictionary (each center is a latitude/longitude pair), rounded to 2 decimals.
# It only depends on the model, so it is built once, outside the loop:
centers = {c: np.round(np.array(model.clusterCenters[c]), 2) for c in range(len(model.clusterCenters))}

for i in range(len(dfs)):
    # Create a RDD with the sensor id, and a tuple of the latitude and longitude using the current dataframe:
    rdd = dfs[i].rdd.map(lambda x: (x[0], (float(x[1]), float(x[2]))))
    # Create a RDD collection with both the sensor id and the corresponding predicted cluster:
    rdd_clusters = rdd.map(lambda x: (x[0], model.predict(x[1])))
    # Create a RDD collection with the cluster id, the sensor id, and the cluster center:
    rdd_clusters = rdd_clusters.map(lambda x: (x[1], x[0], centers[x[1]].tolist()))
    # Convert the RDD to a DataFrame and join it to the current dataframe on the sensor id:
    dfs[i] = dfs[i].join(rdd_clusters.toDF(['cluster_id', 'id', 'cluster_center']), on='id', how='inner')
    # Group by cluster, keeping the cluster center and the sensordatavalues:
    dfs[i] = dfs[i].groupby('cluster_id').agg(collect_list('cluster_center')[0].alias('cluster_center'), collect_list('sensordatavalues').alias('sensordatavalues'))
    # sensordatavalues is now a list of lists of readings; explode it twice to get one row per reading:
    dfs[i] = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    dfs[i] = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    # Then regroup by cluster, keeping the cluster center and a flat list of readings for each cluster id:
    dfs[i] = dfs[i].groupby('cluster_id').agg(collect_list('cluster_center')[0].alias('cluster_center'), collect_list('sensordatavalues').alias('sensordatavalues'))

    # Once again, just like task 1, we compute the max AQI, this time for each cluster.
    # Each step derives one column through an RDD map and joins it back on 'cluster_id'.
    # The join key comes first and new columns are appended, so the positional row layout is:
    #   0 cluster_id, 1 cluster_center, 2 sensordatavalues, 3 P1, 4 P2, 5 avgP1, 6 avgP2, 7 P1_AQI, 8 P2_AQI

    # Lists of the P1 and P2 values of each cluster:
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [float(y['value']) for y in x[2] if y['value_type'] == 'P1'])).toDF(['cluster_id', 'P1']), on='cluster_id', how='inner')
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [float(y['value']) for y in x[2] if y['value_type'] == 'P2'])).toDF(['cluster_id', 'P2']), on='cluster_id', how='inner')
    # Average P1 and P2 per cluster, rounded to 2 decimals:
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], float(np.round(np.mean(x[3]), 2)))).toDF(['cluster_id', 'avgP1']), on='cluster_id', how='inner')
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], float(np.round(np.mean(x[4]), 2)))).toDF(['cluster_id', 'avgP2']), on='cluster_id', how='inner')
    # Associate the P1 and P2 averages with their AQI index. sensor.community reports
    # PM10 as P1 and PM2.5 as P2, so P1 is matched against the PM10 bands (air[y][1])
    # and P2 against the PM2.5 bands (air[y][2]).
    # NOTE(review): a cluster without any P1 (or P2) reading has a NaN average, no
    # band matches it and the [0] lookup raises IndexError.
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [y for y in air if air[y][1][0] <= np.round(x[5]) <= air[y][1][1]][0])).toDF(['cluster_id', 'P1_AQI']), on='cluster_id', how='inner')
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [y for y in air if air[y][2][0] <= np.round(x[6]) <= air[y][2][1]][0])).toDF(['cluster_id', 'P2_AQI']), on='cluster_id', how='inner')
    # The AQI of a cluster is the worse (higher) of its two pollutant indices:
    dfs[i] = dfs[i].withColumn('maxAQI', when(dfs[i].P1_AQI > dfs[i].P2_AQI, dfs[i].P1_AQI).otherwise(dfs[i].P2_AQI))
    dfs[i] = dfs[i].select('cluster_id', 'cluster_center', 'maxAQI')
    dfs[i].show(10)

In [None]:
# Per-cluster AQI of both snapshots side by side, then the change between them.
df1 = dfs[0].select('cluster_id', 'cluster_center', dfs[0]['maxAQI'].alias('maxAQI_1'))
df2 = dfs[1].select('cluster_id', dfs[1]['maxAQI'].alias('maxAQI_2'))
aqi_by_cluster = df1.join(df2, on='cluster_id', how='inner')
# diffAQI = second snapshot - first snapshot; ascending order lists the largest decrease first.
df_diff = (
    aqi_by_cluster
    .withColumn('diffAQI', aqi_by_cluster['maxAQI_2'] - aqi_by_cluster['maxAQI_1'])
    .orderBy('diffAQI', ascending=True)
)
df_diff.show(50)

In [None]:
# Stop the Spark session; cells above need a new session before they can run again.
spark.stop()