# MSc in CSTE, CIDA option Machine learning & Big Data Assignment

### Analysis of data from an environmental sensor network using Hadoop/Spark

In [None]:
import numpy as np
from urllib.request import urlopen
from pyspark.sql.session import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import *
import urllib.request, json, datetime, time
from pyspark.mllib.clustering import KMeans
from itertools import groupby
from ipywidgets import interact
import ipywidgets as widgets
import matplotlib.pyplot as plt

In [None]:
# Memory budget applied to both the executors and the driver:
MAX_MEMORY = "8g"
# Build the Spark session with that budget (an already running session is reused):
spark = (
    SparkSession.builder
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)
sc = spark.sparkContext
# Turn Spark logging off to keep the notebook output clean:
sc.setLogLevel("OFF")
# Last expression of the cell: shows the address of the Spark web UI.
sc.uiWebUrl

In [None]:
# Locally save instances of the data:
def save_data(wait):
    """Download the 5-minute and 24-hour sensor snapshots and save them as JSON.

    Files are written to ``output/5min/`` and ``output/24h/`` and are named
    after the local time at which the download started, e.g.
    ``data24h_2022-11-3_17h00.json`` (month, day and hour are not zero-padded,
    the minute is).

    Args:
        wait: if truthy, sleep until the next 17:00 (local time) before
            downloading; if falsy, download immediately.
    """
    # Local import so this cell does not depend on an extra notebook-level import.
    import os

    # (url, target folder, file-name prefix) for each snapshot kind:
    sources = (
        ('https://data.sensor.community/static/v2/data.json', 'output/5min', 'data5min'),
        ('https://data.sensor.community/static/v2/data.24h.json', 'output/24h', 'data24h'),
    )

    if wait:
        now = datetime.datetime.now()
        target = datetime.datetime(now.year, now.month, now.day, 17, 0, 0)
        # After 17:00 the delay would be negative and time.sleep() would raise
        # ValueError, so aim for 17:00 of the following day instead.
        if target < now:
            target += datetime.timedelta(days=1)
        time.sleep((target - now).total_seconds())

    # Timestamp shared by both files of this run:
    today = datetime.datetime.now()
    stamp = '{}-{}-{}_{}h{}'.format(today.year, today.month, today.day, today.hour, str(today.minute).zfill(2))

    for source_url, folder, prefix in sources:
        # Create the output folder on first use instead of failing in open().
        os.makedirs(folder, exist_ok=True)
        with urllib.request.urlopen(source_url) as response:
            payload = json.load(response)
        with open('{}/{}_{}.json'.format(folder, prefix, stamp), 'w') as outfile:
            json.dump(payload, outfile)

In [None]:
# Read the locally saved 24h snapshots into Spark DataFrames:
def load_data(all):
    """Return one Spark DataFrame per saved 24h snapshot file.

    Args:
        all: if truthy, load every snapshot (2022-11-3 to 2022-11-13);
            otherwise load only the two most recent ones. The name shadows
            the builtin ``all``; it is kept so the signature stays unchanged.

    Returns:
        A list of DataFrames, oldest snapshot first.
    """
    path = 'output/24h/'
    # One file per day, all captured at 17h00:
    files = ['data24h_2022-11-{}_17h00.json'.format(day) for day in range(3, 14)]
    selected = files if all else files[len(files) - 2:]

    def read_snapshot(name):
        # Register the file with Spark, then read it back through SparkFiles.
        spark.sparkContext.addFile(path + name)
        return spark.read.json(SparkFiles.get(name))

    return [read_snapshot(name) for name in selected]

In [None]:
# AQI lookup table:
def air_quality_map():
    """Build the AQI lookup table.

    Returns:
        A dict keyed by AQI index (1 to 10). Each value is a list
        ``[band label, [low, high], [low, high]]`` holding two inclusive
        concentration ranges.
        NOTE(review): the two ranges are presumably the PM10 and PM2.5
        limits, in that order - confirm against the index definition used.
    """
    bands = (
        ("Low", (0, 16), (0, 11)),
        ("Low", (17, 33), (12, 23)),
        ("Low", (34, 50), (24, 35)),
        ("Medium", (51, 58), (36, 41)),
        ("Medium", (59, 66), (42, 47)),
        ("Medium", (67, 75), (48, 53)),
        ("High", (76, 83), (54, 58)),
        ("High", (84, 91), (59, 64)),
        ("High", (92, 100), (65, 70)),
        ("Very High", (101, 10000000), (71, 10000000)),
    )
    # Fresh lists are built on every call so callers never share state.
    return {
        index: [label, list(first_range), list(second_range)]
        for index, (label, first_range, second_range) in enumerate(bands, start=1)
    }

# Spark implementation & tasks:

### Task 0: Data pre-processing, filtering and cleaning:

In [None]:
# Pre-processing shared by all three tasks (flatten the records, keep valid P1/P2 readings):
def preprocessing(dfs):
    """Flatten every snapshot and keep only non-negative P1/P2 readings.

    Each DataFrame in ``dfs`` is replaced in place by a frame with the
    columns id, country, latitude, longitude, sensor_id and sensordatavalues,
    where sensordatavalues only holds P1/P2 entries whose value is >= 0.
    Records left without any such entry are dropped by the inner join.

    Args:
        dfs: list of raw snapshot DataFrames (modified in place).

    Returns:
        The same list object.
    """
    for position, raw in enumerate(dfs):
        # Drop unused columns and pull the nested fields up to top level:
        flat = (
            raw.drop('sampling_rate', 'timestamp')
            .withColumn('country', raw.location.country)
            .withColumn('latitude', raw.location.latitude)
            .withColumn('longitude', raw.location.longitude)
            .withColumn('sensor_id', raw.sensor.id)
            .drop('location', 'sensor')
        )
        # One row per individual reading:
        readings = flat.withColumn('sensordatavalues', explode('sensordatavalues'))
        # Keep particulate readings only:
        readings = readings.filter(readings.sensordatavalues.value_type.isin(['P1', 'P2']))
        # Discard negative values:
        readings = readings.filter(readings.sensordatavalues.value >= 0)
        # Rebuild the list of readings of each record:
        cleaned = readings.groupby('id').agg(collect_list('sensordatavalues').alias('sensordatavalues'))
        # Swap the unfiltered readings column for the cleaned one:
        dfs[position] = flat.drop('sensordatavalues').join(cleaned, on='id', how='inner')
    return dfs

### Task 1: Identify the top 10 countries in terms of average air quality improvement over the previous 24 hours as well as the current averaged air quality indices of each. As far as possible use the country field in the sensor data to identify the country.

In [None]:
def _task1_aqi_index(average, band_slot, air):
    """Return the AQI index whose range at ``band_slot`` contains ``average`` (rounded), or None."""
    if average is None:
        return None
    rounded = np.round(average)
    for index, entry in air.items():
        low, high = entry[band_slot]
        if low <= rounded <= high:
            return index
    # No band matched (e.g. NaN average):
    return None


def _task1_country_stats(row, air):
    """Turn one (country, readings) row into its P1/P2 values, averages and AQI indices."""
    stats = []
    # In Sensor.Community data P1 is PM10 and P2 is PM2.5, so P1 must be
    # matched against the first range of the map and P2 against the second.
    for pollutant, band_slot in (('P1', 1), ('P2', 2)):
        values = [float(reading['value']) for reading in row['sensordatavalues'] if reading['value_type'] == pollutant]
        # A country may report only one of the two pollutants: leave the
        # missing average/index empty instead of averaging an empty list.
        average = float(np.round(np.mean(values), 2)) if values else None
        stats.append((values, average, _task1_aqi_index(average, band_slot, air)))
    (p1, avg_p1, p1_aqi), (p2, avg_p2, p2_aqi) = stats
    known = [index for index in (p1_aqi, p2_aqi) if index is not None]
    max_aqi = max(known) if known else None
    return (row['country'], p1, p2, avg_p1, avg_p2, p1_aqi, p2_aqi, max_aqi)


def task1(dfs, air):
    """Rank countries by the change of their AQI between the last two snapshots.

    For every snapshot the P1 and P2 readings are averaged per country, each
    average is mapped to an AQI index through ``air`` and the worse (higher)
    of the two indices is kept as ``maxAQI``. Every element of ``dfs`` is
    replaced in place by its per-country frame (columns: country,
    sensordatavalues, P1, P2, avgP1, avgP2, P1_AQI, P2_AQI, maxAQI).

    Args:
        dfs: list of pre-processed snapshot DataFrames, oldest first
            (modified in place).
        air: AQI map as returned by ``air_quality_map()``.

    Returns:
        A DataFrame with the columns country and diffAQI (latest maxAQI minus
        previous maxAQI), sorted ascending, so the largest improvements
        (most negative values) come first.

    Raises:
        ValueError: if fewer than two snapshots are given.
    """
    if len(dfs) < 2:
        raise ValueError('task1 needs at least two snapshots to compare, got {}'.format(len(dfs)))

    for position, snapshot in enumerate(dfs):
        # One row per reading, then all readings of a country in one list:
        readings = snapshot.select('country', 'sensordatavalues').withColumn('sensordatavalues', explode('sensordatavalues'))
        grouped = readings.groupby('country').agg(collect_list('sensordatavalues').alias('sensordatavalues'))
        # Compute every derived column in a single pass over the grouped rows:
        stats = grouped.rdd.map(lambda row: _task1_country_stats(row, air)).toDF(
            ['country', 'P1', 'P2', 'avgP1', 'avgP2', 'P1_AQI', 'P2_AQI', 'maxAQI'])
        dfs[position] = grouped.join(stats, on='country', how='inner')

    # Compare the two most recent snapshots:
    previous = dfs[-2].select('country', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_1')
    latest = dfs[-1].select('country', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_2')
    df_diff = previous.join(latest, on='country', how='inner')
    df_diff = df_diff.withColumn('diffAQI', df_diff.maxAQI_2 - df_diff.maxAQI_1).select('country', 'diffAQI')
    return df_diff.sort('diffAQI', ascending=True)

In [None]:
# Task 1:
# Build the AQI lookup table:
air = air_quality_map()
# Load only the two most recent 24h snapshots (all=False):
dfs = load_data(False)
# Flatten the records and keep only the non-negative P1/P2 readings:
dfs = preprocessing(dfs)
# Per-country AQI difference between the two snapshots, sorted ascending:
df_diff = task1(dfs, air)
# Display the first 10 rows:
df_diff.show(10)

#

## Task 2: Using the geo-coordinates from the sensor data, group the data into smaller regions using an appropriate clustering algorithm. Then determine the top 50 regions in terms of air quality improvement over the previous 24 hours.

### Task 2-1: Pre-filter the pre-processed data by creating clusters and grouping data by cluster:

In [None]:
def _cluster_max_aqi(readings, air_bands):
    """Return the worse (higher) of the P1 and P2 AQI indices of ``readings``, or None."""
    worst = None
    # In Sensor.Community data P1 is PM10 and P2 is PM2.5, so P1 must be
    # matched against the first range of the map and P2 against the second.
    for pollutant, band_slot in (('P1', 1), ('P2', 2)):
        values = [float(reading['value']) for reading in readings if reading['value_type'] == pollutant]
        if not values:
            # The cluster reports only the other pollutant.
            continue
        rounded = np.round(float(np.round(np.mean(values), 2)))
        for index, entry in air_bands.items():
            low, high = entry[band_slot]
            if low <= rounded <= high:
                worst = index if worst is None else max(worst, index)
                break
    return worst


def filter_data_by_cluster(dfs):
    """Aggregate every snapshot by geographic cluster and compute each cluster's AQI.

    A 200-cluster KMeans model is trained on the (latitude, longitude) pairs
    of the oldest snapshot and then used to assign the sensors of every
    snapshot, so cluster ids are comparable across snapshots. Each element of
    ``dfs`` is replaced in place by a frame with the columns cluster_id,
    cluster_center ([lat, lon] rounded to 2 decimals), sensor_amount (number
    of records assigned to the cluster) and maxAQI (the worse of the P1 and
    P2 indices).

    The AQI bands are read from the notebook-level variable ``air``, which
    must be defined before this function is called.

    Args:
        dfs: list of pre-processed snapshot DataFrames, oldest first
            (modified in place).

    Returns:
        The same list object; an empty list is returned unchanged.
    """
    if not dfs:
        return dfs
    # Notebook-level AQI map, bound locally so the Spark closures capture it:
    air_bands = air

    def locate(row):
        # (sensor id, (latitude, longitude)) for one record.
        return (row['sensor_id'], (float(row['latitude']), float(row['longitude'])))

    # Train once, on the oldest snapshot:
    model = KMeans.train(dfs[0].rdd.map(lambda row: locate(row)[1]), 200, seed=23)
    # Cluster centres rounded to 2 decimals; identical for every snapshot, so built once:
    centers = {cluster: np.round(np.array(center), 2).tolist() for cluster, center in enumerate(model.clusterCenters)}

    for position, snapshot in enumerate(dfs):
        # (cluster id, sensor id) for every record of this snapshot:
        assignments = snapshot.rdd.map(locate).map(lambda item: (model.predict(item[1]), item[0]))
        # Number of records per cluster:
        sensor_counts = assignments.map(lambda item: (item[0], 1)).reduceByKey(lambda left, right: left + right)
        clusters = assignments.map(lambda item: (item[0], item[1], centers[item[0]]))

        frame = snapshot.join(clusters.toDF(['cluster_id', 'sensor_id', 'cluster_center']), on='sensor_id', how='inner')
        frame = frame.join(sensor_counts.toDF(['cluster_id', 'sensor_amount']), on='cluster_id', how='inner')
        # Explode to one row per reading first, so a single groupby yields a
        # flat list of readings per cluster (no list of lists to unpack):
        frame = frame.withColumn('sensordatavalues', explode('sensordatavalues'))
        frame = frame.groupby('cluster_id').agg(
            collect_list('cluster_center')[0].alias('cluster_center'),
            collect_list('sensor_amount')[0].alias('sensor_amount'),
            collect_list('sensordatavalues').alias('sensordatavalues'))

        # Worst of the two pollutant indices for each cluster:
        aqi = frame.rdd.map(
            lambda row: (row['cluster_id'], _cluster_max_aqi(row['sensordatavalues'], air_bands))
        ).toDF(['cluster_id', 'maxAQI'])
        dfs[position] = frame.join(aqi, on='cluster_id', how='inner').select(
            'cluster_id', 'cluster_center', 'sensor_amount', 'maxAQI')
    return dfs

### Task 2-2: Select last two dataframes and compare their AQI to sort clusters by the AQI difference over the last 24 hours:

In [None]:
def task2(dfs):
    """Rank the geographic clusters by the change of their AQI between the last two snapshots.

    Args:
        dfs: list of pre-processed snapshot DataFrames, oldest first. It is
            passed to ``filter_data_by_cluster``.

    Returns:
        A DataFrame with the columns cluster_id, cluster_center,
        sensor_amount (both taken from the previous snapshot) and diffAQI,
        the absolute difference latest maxAQI minus previous maxAQI. Rows are
        sorted ascending, so the largest improvements (most negative values)
        come first.

    Raises:
        ValueError: if fewer than two snapshots are given.
    """
    # With a single snapshot the comparison would silently be made against
    # itself, and with none the failure would surface deep inside the clustering.
    if len(dfs) < 2:
        raise ValueError('task2 needs at least two snapshots to compare, got {}'.format(len(dfs)))

    # Aggregate every snapshot by cluster:
    dfs = filter_data_by_cluster(dfs)
    # Keep the two most recent snapshots:
    previous = dfs[-2].select('cluster_id', 'cluster_center', 'sensor_amount', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_1')
    latest = dfs[-1].select('cluster_id', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_2')
    # Only clusters present in both snapshots can be compared:
    df_diff = previous.join(latest, on='cluster_id', how='inner')
    df_diff = df_diff.withColumn('diffAQI', df_diff.maxAQI_2 - df_diff.maxAQI_1).select('cluster_id', 'cluster_center', 'sensor_amount', 'diffAQI')
    # Lowest diffAQI first:
    return df_diff.sort('diffAQI', ascending=True)

In [None]:
# Task 2:
# Build the AQI lookup table (filter_data_by_cluster reads this notebook-level `air`):
air = air_quality_map()
# Load only the two most recent 24h snapshots (all=False):
dfs = load_data(False)
# Flatten the records and keep only the non-negative P1/P2 readings:
dfs = preprocessing(dfs)
# Per-cluster AQI difference between the two snapshots, sorted ascending:
df_diff = task2(dfs)
# Display the first 50 rows:
df_diff.show(50)

###

### Task 3: Calculate the longest streaks of good air quality (i.e. low index values) and display as a histogram.

In [None]:
def task3(dfs):
    """Track, per cluster, the runs of consecutive good-air-quality days.

    A day counts as good when the cluster's maxAQI is below 4 and is encoded
    as 0; any other day is encoded as 1. Only clusters present in every
    snapshot survive the inner joins.

    Args:
        dfs: list of pre-processed snapshot DataFrames, oldest first. It is
            passed to ``filter_data_by_cluster``.

    Returns:
        A DataFrame sorted by cluster_id with the columns cluster_id, streaks
        (one 0/1 flag per day), current_streak (length of the run of good
        days ending on the last day) and max_streak (longest run seen).
    """
    # Aggregate every snapshot by cluster:
    clustered = filter_data_by_cluster(dfs)

    def day_flag(row):
        # (cluster id, [flag]); row[3] is the maxAQI column.
        return (row[0], [0 if row[3] < 4 else 1])

    def start_streaks(entry):
        # First day: current and longest streak are both 1 for a good day, else 0.
        cluster_id, flags = entry
        good = 1 if flags[0] == 0 else 0
        return (cluster_id, flags, good, good)

    def mark_good_day(entry):
        # Later days: append 1 when the day is good, else 0.
        cluster_id, flags = entry
        return (cluster_id, flags, 1 if flags[-1] == 0 else 0)

    # Seed the running table with the oldest snapshot:
    df = clustered[0].rdd.map(day_flag).map(start_streaks).toDF(['cluster_id', 'streaks', 'current_streak', 'max_streak'])

    # Fold the following days in, one snapshot at a time:
    for snapshot in clustered[1:]:
        # 'previous_streak' holds 1 when this day is good, 0 otherwise:
        day = snapshot.rdd.map(day_flag).map(mark_good_day).toDF(['cluster_id', 'streak', 'previous_streak'])
        df = df.join(day, on='cluster_id', how='inner')
        # Append the day's flag to the history and drop the one-element column:
        df = df.withColumn('streaks', concat('streaks', 'streak')).drop('streak')
        # A good day extends the running streak, any other day resets it:
        running = when(df.previous_streak == 1, df.current_streak + 1).otherwise(0)
        df = df.withColumn('current_streak', running)
        # Keep the longest streak seen so far:
        longest = when(df.current_streak > df.max_streak, df.current_streak).otherwise(df.max_streak)
        df = df.withColumn('max_streak', longest)
        # The day's helper column is no longer needed:
        df = df.drop('previous_streak')

    # Order by cluster id to ease verification:
    return df.sort('cluster_id')

In [None]:
# Task 3:
# Build the AQI lookup table (filter_data_by_cluster reads this notebook-level `air`):
air = air_quality_map()
# Load every saved 24h snapshot (all=True):
dfs = load_data(True)
# Flatten the records and keep only the non-negative P1/P2 readings:
dfs = preprocessing(dfs)
# Per-cluster good-air-quality streaks over all loaded snapshots:
df = task3(dfs)
# Display the first 10 rows:
df.show(10)

### Task 3: Two histogram methods:
Method 1: A single histogram showing longest (maximum or average) streaks across all regions.

In [None]:
def method_one_histogram(df):
    """Print and plot how many clusters reach each longest-streak length.

    Args:
        df: Spark DataFrame with (at least) the columns cluster_id and
            max_streak, as returned by ``task3``.

    Returns:
        The plotted pandas DataFrame (columns max_streak, cluster_amount and
        cluster_ids, sorted by ascending max_streak). The function used to
        return None although callers store its result.
    """
    # Group by max_streak, keeping the ids and the number of clusters with that max_streak:
    df = df.groupBy('max_streak').agg(count('cluster_id').alias('cluster_amount'), collect_list('cluster_id').alias('cluster_ids'))
    df = df.sort('max_streak', ascending=False)
    print('The clusters ids with the longest streaks of good air quality (ie low index values), as well as the amount of clusters with that streak:')
    # Show every row, not just the default first 20:
    df.show(df.count())
    # Create histogram of the streaks (integer columns for clean axis labels):
    df = df.withColumn('max_streak', df.max_streak.cast('int'))
    df = df.withColumn('cluster_amount', df.cluster_amount.cast('int'))
    pdf = df.sort('max_streak', ascending=True).toPandas()
    pdf.plot.bar(x='max_streak', y='cluster_amount', rot=0)
    return pdf

In [None]:
# Print the table and draw the Method 1 bar chart; df1 receives the function's return value:
df1 = method_one_histogram(df)

Method 2: A histogram for each region/cluster, showing the distribution of continuous good AQI streaks.

In [None]:
def method_two_histogram(df):
    """Build, for every cluster, the distribution of its good-air-quality runs.

    Args:
        df: DataFrame with the columns cluster_id, streaks (list of 0/1 day
            flags, 0 = good day), current_streak and max_streak, as returned
            by ``task3``. Either a Spark DataFrame or an already converted
            pandas DataFrame (which is left unmodified).

    Returns:
        A pandas DataFrame without the streaks, current_streak and
        max_streak columns and with two new ones:
          * successive_0s: lengths of the runs of consecutive good days;
          * successive_0s_hist: list of ``n + 1`` integers, ``n`` being the
            number of recorded days. Entry 0 is the number of days that were
            not good; entry ``k`` is the number of runs of exactly ``k``
            good days.
    """
    pdf = df.toPandas() if hasattr(df, 'toPandas') else df.copy()
    # Number of recorded days, taken from the data itself rather than from a
    # notebook-level variable that may have been reloaded in the meantime:
    n_days = max((len(flags) for flags in pdf['streaks']), default=0)
    # Lengths of the runs of consecutive 0s (good days) of each cluster:
    pdf['successive_0s'] = pdf['streaks'].apply(
        lambda flags: [len(list(run)) for flag, run in groupby(flags) if flag == 0])
    # Bad-day count followed by the number of runs of each possible length:
    pdf['successive_0s_hist'] = pdf['successive_0s'].apply(
        lambda runs: [n_days - sum(runs)] + [runs.count(length) for length in range(1, n_days + 1)])
    # Remove the columns that are no longer needed:
    return pdf.drop(columns=['streaks', 'current_streak', 'max_streak'])

In [None]:
# Build the per-cluster run-length table from the Task 3 result:
df2 = method_two_histogram(df)
# Display its first 10 rows:
df2.head(10)

In [None]:
# # Create a histogram of the successive 0s with a slider to select the cluster id using matplotlib:
# def f(cluster_id):
#     # Fix it so that you can plot using the cluster_id from a spark dataframe:
#     plt.bar(range(0, len(dfs)+1), df2.iloc[cluster_id]['successive_0s_hist'])
#     plt.show()
# interact(f, cluster_id=widgets.IntSlider(min=0, max=len(df2)-1, step=1, value=0))
# Plot the run-length histogram of the row at position 3 of df2 (a row position,
# not necessarily cluster id 3). The x positions are derived from the plotted
# list itself, so they always match its length even if `dfs` has been reloaded.
selected_hist = df2.iloc[3]['successive_0s_hist']
plt.bar(range(len(selected_hist)), selected_hist)
plt.show()

In [None]:
# TODO: Folium map with three sets

In [None]:
# Shut the Spark session down now that the notebook is finished:
spark.stop()