# MLBD : Assignment

### TODO: Switch methods from pandas to Spark when possible
### TODO: Work on a way to acquire data multiple times a day

In [None]:
# Imports:

import pandas as pd
import numpy as np
import json
from urllib.request import urlopen
from pyspark.sql.session import SparkSession
from pyspark import SparkFiles
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.functions import col, array_contains

In [None]:
# Spark session builder:
# getOrCreate() hands back the session that is already running, if any,
# and only starts a new one otherwise.
spark = SparkSession.builder.getOrCreate()
# Shortcut to the SparkContext that belongs to this session.
sc = spark.sparkContext
# Bare expression: as the last statement of a notebook cell its value
# (the address of the Spark web UI) is displayed as the cell output.
sc.uiWebUrl

In [None]:
# Locally save instances of the data:
# Downloads the current 5-minute and 24-hour JSON feeds and writes each of
# them to a time-stamped file below ./output.
import datetime
import json
import os
import urllib.request

# Take ONE timestamp and derive the date from it, so the date and the
# hour/minute used in the file name can never disagree around midnight.
now = datetime.datetime.now()
today = now.date()
url5min = 'https://data.sensor.community/static/v2/data.json'
url24h = 'https://data.sensor.community/static/v2/data.24h.json'

# open(..., 'w') does not create missing folders; make sure the target exists.
os.makedirs('output', exist_ok=True)

with urllib.request.urlopen(url5min) as url:
    data = json.load(url)
with urllib.request.urlopen(url24h) as url:
    data24h = json.load(url)

# 5-minute snapshot: one file per download, named <date>_<hour>h<minute>
# (minute zero-padded to two digits, hour left as-is).
with open('output/data5min_{}_{}h{}.json'.format(today, now.hour, str(now.minute).zfill(2)), 'w') as outfile:
    json.dump(data, outfile)
# 24-hour snapshot: one file per day, overwritten by later runs on the same date.
with open('output/data24h_{}.json'.format(today), 'w') as outfile:
    json.dump(data24h, outfile)

In [None]:
# Load data from local files and urls into Spark DataFrames:
# Three sources are loaded the same way: the 24-hour feed, the 5-minute feed
# and a previously saved 5-minute snapshot that serves as comparison point.
url24h = 'https://data.sensor.community/static/v2/data.24h.json'
url5min = 'https://data.sensor.community/static/v2/data.json'
fileToCompare = 'output/data5min_2022-10-31_16h08.json'


def _load_json_source(source, view_name):
    """Read one JSON source into a Spark DataFrame and register it as a view.

    `source` (a URL or a local path) is handed to SparkContext.addFile, the
    resulting copy is looked up through SparkFiles by its base name, parsed
    with spark.read.json, registered as temp view `view_name`, and its schema
    is printed.

    Returns a `(path, dataframe)` tuple: the path reported by SparkFiles and
    the DataFrame read from it.
    """
    spark.sparkContext.addFile(source)
    base_name = source.split('/')[-1]
    local_path = SparkFiles.get(base_name)

    frame = spark.read.json(local_path)
    frame.createOrReplaceTempView(view_name)
    frame.printSchema()
    return local_path, frame


# 1: 24-hour feed
filename24, df24h = _load_json_source(url24h, "df24h")

# 2: current 5-minute feed
filename5, df5min = _load_json_source(url5min, "df5min")

# 3: create spark dataframe using local json file (for comparison)
filename5_2, df5min2 = _load_json_source(fileToCompare, "df5min2")

In [None]:
# Spark dataframes used for tasks :
# Index 0 is the DataFrame read from the live 5-minute feed, index 1 the one
# read from the locally saved 5-minute snapshot. The cells below overwrite the
# entries of this list in place and later read them back by position.
dfs = [df5min, df5min2]

# Spark implementation & tasks:

### Task 1: Identify the top 10 countries in terms of average air quality improvement over the previous 24 hours as well as the current averaged air quality indices of each. As far as possible use the country field in the sensor data to identify the country.

In [None]:
# Remove sensor values whose sensor value_type is not 'P1' or 'P2':
# Each entry of `dfs` is replaced by a version in which every row carries
# only its P1 / P2 readings; rows without any such reading are dropped.
for position, frame in enumerate(dfs):
    # Row-level filter: keep a row when at least one of its readings is P1 or P2.
    value_types = col('sensordatavalues.value_type')
    has_p1 = array_contains(value_types, 'P1')
    has_p2 = array_contains(value_types, 'P2')
    frame = frame.where(has_p1 | has_p2)

    # Remove everything but P1 and P2 in df, by creating a new df using pandas and exploding the sensordatavalues column:
    # one pandas row per reading -> drop the unwanted readings -> collect the
    # survivors back into one list per row id.
    pdf = frame.toPandas().explode('sensordatavalues')
    is_pm_reading = pdf['sensordatavalues'].apply(
        lambda reading: reading['value_type'] in ['P1', 'P2']
    )
    pdf = pdf[is_pm_reading]
    pdf = pdf.groupby('id').agg({'sensordatavalues': lambda readings: list(readings)})
    # groupby moved `id` into the index; copy it back into a regular column
    # so it is available as the join key below.
    pdf['id'] = pdf.index

    df_sdv = spark.createDataFrame(pdf)

    # Swap the original readings column for the filtered one.
    frame = frame.drop('sensordatavalues').join(df_sdv, on='id')
    dfs[position] = frame

    # Preview of the result, ordered by country and location id.
    preview = frame.select(
        'id',
        'location.country',
        'location.id',
        'sensordatavalues.value_type',
        'sensordatavalues.value',
    )
    preview.sort('location.country', 'location.id').show(30, False)

In [None]:
# Explode the sensordatavalues column, and make it so that P1 and P2 are in individual rows, for a given country:
# Each entry of `dfs` is replaced by a DataFrame with one row per `location`
# value and the columns location, sensordatavalues, P1 and P2.
for position, frame in enumerate(dfs):
    pdf = frame.toPandas()

    # Both structs are reduced to a single field, picked by position.
    # NOTE(review): presumably field 1 of `location` is the country and
    # field 0 of `sensor` its id -- confirm against the printed schema.
    pdf['location'] = pdf['location'].apply(lambda place: place[1])
    pdf['sensor'] = pdf['sensor'].apply(lambda device: device[0])

    # One row per reading, then gather sensors and readings per location.
    pdf = pdf.sort_values(by=['location']).explode('sensordatavalues')
    pdf = pdf.groupby('location').agg({
        'location': lambda names: list(names)[0],
        'sensor': lambda devices: list(devices),
        'sensordatavalues': lambda readings: list(readings),
    })

    # Pair every reading with the sensor it came from:
    # [sensor as str, field 1 of the reading as str, field 2 of the reading].
    # The two lists are built from the same rows, so they line up one-to-one.
    pdf['sensordatavalues'] = pdf.apply(
        lambda row: [
            [str(device), str(reading[1]), reading[2]]
            for device, reading in zip(row['sensor'], row['sensordatavalues'])
        ],
        axis=1,
    )

    # Split the numeric values by the type tag stored in the third slot.
    pdf['P1'] = pdf['sensordatavalues'].apply(
        lambda triples: [float(triple[1]) for triple in triples if triple[2] == 'P1']
    )
    pdf['P2'] = pdf['sensordatavalues'].apply(
        lambda triples: [float(triple[1]) for triple in triples if triple[2] == 'P2']
    )

    # The sensor ids now live inside the triples; the helper column can go.
    pdf = pdf.drop(['sensor'], axis=1)

    sdf = spark.createDataFrame(pdf)
    dfs[position] = sdf
    sdf.show(10)

In [None]:
# Create a map to store air quality indexes: Bulk method
# Key: index band 1..10.
# Value: [label, [low, high] bounds matched against the mean P1 value,
#                [low, high] bounds matched against the mean P2 value].
# Bounds are inclusive integers; the last band uses 10000000 as an
# effectively open upper end.
air = {
    1: ["Low", [0, 16], [0, 11]],
    2: ["Low", [17, 33], [12, 23]],
    3: ["Low", [34, 50], [24, 35]],
    4: ["Medium", [51, 58], [36, 41]],
    5: ["Medium", [59, 66], [42, 47]],
    6: ["Medium", [67, 75], [48, 53]],
    7: ["High", [76, 83], [54, 58]],
    8: ["High", [84, 91], [59, 64]],
    9: ["High", [92, 100], [65, 70]],
    10: ["Very High", [101, 10000000], [71, 10000000]],
}

In [None]:
# Calculate the average air quality index for each country, and compare it to the previous 24 hours:
# For both entries of `dfs` the mean P1 and mean P2 value of every location is
# mapped to an index band of the `air` table, the worse (higher) of the two
# bands is kept as `maxAQI`, and the two results are joined on `location`.


def _aqi_band(mean_value, bounds_index):
    """Return the `air` band whose bounds contain the rounded `mean_value`.

    `bounds_index` selects which bounds of an `air` entry are used
    (1 for P1, 2 for P2). NaN is returned when `mean_value` is missing or
    falls outside every band, so such locations are dropped later instead of
    aborting the whole computation.
    """
    if pd.isna(mean_value):
        return np.nan
    rounded = round(mean_value)
    for band, spec in air.items():
        low, high = spec[bounds_index]
        if low <= rounded <= high:
            return band
    return np.nan


def _mean_or_nan(values):
    """Return the mean of `values`, or NaN when there is nothing to average."""
    if values is None or len(values) == 0:
        return np.nan
    return np.mean(values)


def _location_max_aqi(sdf):
    """Return a pandas frame holding `maxAQI` per row of the Spark frame `sdf`.

    `sdf` must provide the list columns `P1` and `P2` plus `sensordatavalues`;
    those three are removed from the result, every other column is kept.
    """
    frame = sdf.toPandas()
    for pollutant, bounds_index in (('P1', 1), ('P2', 2)):
        means = frame[pollutant].apply(_mean_or_nan)
        frame['AQI_' + pollutant] = means.apply(_aqi_band, args=(bounds_index,))
    # Worse of the two bands; a missing band is ignored, and the result is
    # NaN only when both are missing.
    frame['maxAQI'] = frame[['AQI_P1', 'AQI_P2']].max(axis=1)
    return frame.drop(['sensordatavalues', 'P1', 'P2', 'AQI_P1', 'AQI_P2'], axis=1)


pdf1 = _location_max_aqi(dfs[0])
pdf2 = _location_max_aqi(dfs[1])

pdf = pdf1.merge(pdf2, on='location', how='outer', suffixes=('_1', '_2'))
# remove rows with NaN values: We consider that no new measure means that the air quality did not improve, nor worsen :
pdf = pdf.dropna()
# convert to int (the outer merge turns the columns into floats when rows are missing on one side)
pdf['maxAQI_1'] = pdf['maxAQI_1'].astype(int)
pdf['maxAQI_2'] = pdf['maxAQI_2'].astype(int)
# Band of the second frame minus band of the first frame.
pdf['AQI_diff'] = pdf['maxAQI_2'] - pdf['maxAQI_1']
# NOTE(review): if dfs[0] is the current data and dfs[1] the older snapshot,
# a positive AQI_diff is an improvement and this ascending sort lists the
# locations that worsened most first -- confirm the intended ranking.
pdf = pdf.sort_values(by=['AQI_diff'], ascending=True)
pdf.head(20)