# MSc in CSTE, CIDA option Machine learning & Big Data Assignment

### Analysis of data from an environmental sensor network using Hadoop/Spark

In [1]:
import numpy as np
from urllib.request import urlopen
from pyspark.sql.session import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import *
import urllib.request, json, datetime

In [2]:
# Spark session builder:
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sc.uiWebUrl

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/06 19:02:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'http://rn-dynamic-190-059.wless.cranfield.ac.uk:4040'

In [4]:
# Locally save instances of the data:
import time

today = datetime.datetime.now()
url5min = 'https://data.sensor.community/static/v2/data.json'
url24h = 'https://data.sensor.community/static/v2/data.24h.json'

# sleep until specific time (e.g. 5pm) before running the next line
today = datetime.datetime.now()
exactImportTime = datetime.datetime(today.year, today.month, today.day, 17, 0, 0)
awaitingTime = exactImportTime - today
time.sleep(awaitingTime.total_seconds())

today = datetime.datetime.now()
with urllib.request.urlopen(url5min) as url:
    data5min = json.load(url)
with open('output/5min/data5min_{}-{}-{}_{}h{}.json'.format(today.year, today.month, today.day, today.hour, str(today.minute).zfill(2)), 'w') as outfile:
    json.dump(data5min, outfile)
with urllib.request.urlopen(url24h) as url:
    data24h = json.load(url)
with open('output/24h/data24h_{}-{}-{}_{}h{}.json'.format(today.year, today.month, today.day, today.hour, str(today.minute).zfill(2)), 'w') as outfile:
    json.dump(data24h, outfile)

In [3]:
# Load data from local files and load them into Spark DataFrames:
path = 'output/24h/'
file1 = 'data24h_2022-11-3_17h00.json'
file2 = 'data24h_2022-11-4_17h00.json'
file3 = 'data24h_2022-11-5_17h00.json'
file4 = 'data24h_2022-11-6_17h00.json'

files = [file1, file2, file3, file4]
dfs = []

for file in files:
    spark.sparkContext.addFile(path + file)
    filename = SparkFiles.get(file)
    df = spark.read.json(filename)
    df.printSchema()
    dfs.append(df)

                                                                                

root
 |-- id: long (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- altitude: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- exact_location: long (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- indoor: long (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- longitude: string (nullable = true)
 |-- sampling_rate: long (nullable = true)
 |-- sensor: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- pin: string (nullable = true)
 |    |-- sensor_type: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- manufacturer: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- sensordatavalues: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- value: string (nullable = true)
 |    |    |-- value_type: string (nullable = true)
 |-- timestamp: string (nullable

In [4]:
# AQI Map:
air = {}
air[1] = ["Low", [0,16], [0,11]]
air[2] = ["Low", [17,33], [12,23]]
air[3] = ["Low", [34,50], [24,35]]
air[4] = ["Medium", [51,58], [36,41]]
air[5] = ["Medium", [59,66], [42,47]]
air[6] = ["Medium", [67,75], [48,53]]
air[7] = ["High", [76,83], [54,58]]
air[8] = ["High", [84,91], [59,64]]
air[9] = ["High", [92,100], [65,70]]
air[10] = ["Very High", [101,10000000], [71,10000000]]

# Spark implementation & tasks:

### Task 1: Identify the top 10 countries in terms of average air quality improvement over the previous 24 hours as well as the current averaged air quality indices of each. As far as possible use the country field in the sensor data to identify the country.

In [12]:
# Preprocessing (P1 and P2 filtering):
for i in range(len(dfs)):
    print("Raw dataset count: ", dfs[i].count())
    # Explode sensordatavalues using pyspark.sql.functions.explode
    df_ = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    print("Dataset count after exploding sensordatavalues: ", df_.count())
    # Remove rows that aren't P1 or P2:
    df_ = df_[df_.sensordatavalues.value_type.isin(['P1', 'P2'])]
    print("Dataset count after removing rows that aren't P1 or P2: ", df_.count())
    # Regroup sensordatavalues by record id:
    df_ = df_.groupby('id').agg(collect_list('sensordatavalues').alias('sensordatavalues'))
    # Remove the old sensordatavalues column still containing values different from P1 and P2:
    dfs[i] = dfs[i].drop('sensordatavalues')
    # Link the new sensordatavalues column to the old dataframe, on id:
    dfs[i] = dfs[i].join(df_, on='id', how='inner')
    print("Dataset count after joining the new sensordatavalues column to the old dataframe: ", dfs[i].count())

    dfs[i].show(5)
    # dfs[i].select('id','location.country','location.id', 'sensordatavalues.value_type','sensordatavalues.value').sort('location.country', 'location.id').show(5, False)

Raw dataset count:  25282
Dataset count after exploding sensordatavalues:  62869
Dataset count after removing rows that aren't P1 or P2:  26374
Dataset count after joining the new sensordatavalues column to the old dataframe:  13194
+-----------+--------------------+-------------+--------------------+-------------------+--------------------+
|         id|            location|sampling_rate|              sensor|          timestamp|    sensordatavalues|
+-----------+--------------------+-------------+--------------------+-------------------+--------------------+
|12819615587|{365.6, DE, 0, 16...|         null|{92, 1, {14, Nova...|2022-11-04 12:03:40|[{28631854905, 4....|
|12815343559|{373.1, DE, 0, 49...|         null|{107, 5, {1, Shin...|2022-11-04 04:27:31|[{28621987635, 18...|
|12819622473|{282.5, DE, 0, 65...|         null|{140, 1, {14, Nov...|2022-11-04 12:04:18|[{28631870936, 6....|
|12819625695|{113.2, DE, 0, 63...|         null|{142, 1, {14, Nov...|2022-11-04 12:04:44|[{2863187837

In [65]:
for i in range(len(dfs)):
    # replace location field with country field:
    df_ = dfs[i].withColumn('location', dfs[i]['location.country'])
    # sort by country:
    df_ = df_.sort('location')
    # explode sensordatavalues:
    df_ = df_.withColumn('sensordatavalues', explode('sensordatavalues'))
    # group by country:
    df_ = df_.groupby('location').agg(collect_list('sensordatavalues').alias('sensordatavalues'))
    # Create a RDD collection of tuples (country, (P1, P2)), so that each rdd element is a combo of a country and either its P1 or P2 values. Then convert RDDs to DataFrames and join them to the original dataframe:
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [float(y['value']) for y in x[1] if y['value_type'] == 'P1'])).toDF(['location', 'P1']), on='location', how='inner')
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [float(y['value']) for y in x[1] if y['value_type'] == 'P2'])).toDF(['location', 'P2']), on='location', how='inner')
    # Create a RDD collection to calculate the average P1 and P2 values for each country, and convert RDDs to DataFrames and join them to the original dataframe:
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], float(np.round(np.mean(x[2]), 2)))).toDF(['location', 'avgP1']), on='location', how='inner')
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], float(np.round(np.mean(x[3]), 2)))).toDF(['location', 'avgP1']), on='location', how='inner')
    # Associate P1 and P2 avg to their respective AQI:
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [y for y in air if air[y][2][0] <= np.round(x[4]) <= air[y][2][1]][0])).toDF(['location', 'P1_AQI']), on='location', how='inner')
    df_ = df_.join(df_.rdd.map(lambda x: (x[0], [y for y in air if air[y][2][0] <= np.round(x[5]) <= air[y][2][1]][0])).toDF(['location', 'P2_AQI']), on='location', how='inner')
    # Calculate the max AQI for each country:
    df_ = df_.withColumn('maxAQI', when(df_.P1_AQI > df_.P2_AQI, df_.P1_AQI).otherwise(df_.P2_AQI))

    dfs[i] = df_
    dfs[i].show(10)

+--------+--------------------+--------------------+--------------------+-----+-----+------+------+------+
|location|    sensordatavalues|                  P1|                  P2|avgP1|avgP1|P1_AQI|P2_AQI|maxAQI|
+--------+--------------------+--------------------+--------------------+-----+-----+------+------+------+
|      AD|[{28512148628, 14...|              [14.6]|               [5.5]| 14.6|  5.5|     2|     1|     2|
|      AL|[{28501540558, 18...|[18.78, 29.38, 54...|[9.62, 13.22, 24....|33.74|15.87|     3|     2|     3|
|      AR|[{28511988059, 3....|[3.38, 15.23, 17....|[2.38, 2.58, 2.77...|10.99| 2.68|     1|     1|     1|
|      AT|[{28483617772, 7....|[7.91, 7.32, 0.39...|[4.67, 3.27, 0.39...|40.52|19.83|     4|     2|     4|
|      AU|[{28496018035, 0....|[0.24, 3.94, 1.27...|[0.15, 0.59, 0.34...| 3.09|  1.1|     1|     1|     1|
|      AX|[{28512155019, 9....|        [9.36, 29.9]|        [5.66, 13.4]|19.63| 9.53|     2|     1|     2|
|      AZ|[{28512185217, 23...|      

In [66]:
# Task 1: Identify the top 10 countries in terms of average air quality improvement over the previous 24 hours as well as the current averaged air quality indices of each. As far as possible use the country field in the sensor data to identify the country.
df1 = dfs[0].select('location', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_1')
df2 = dfs[1].select('location', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_2')
df_diff = df1.join(df2, on='location', how='inner')
df_diff = df_diff.withColumn('diffAQI', df_diff.maxAQI_2 - df_diff.maxAQI_1).select('location', 'diffAQI')
df_diff = df_diff.sort('diffAQI', ascending=True)
df_diff.show(12)

                                                                                

+--------+-------+
|location|diffAQI|
+--------+-------+
|      LU|     -3|
|      ZA|     -2|
|      TH|     -2|
|      SE|     -2|
|      IS|     -1|
|      AD|     -1|
|      BE|     -1|
|      KH|     -1|
|      HK|     -1|
|      HR|      0|
|      PT|      0|
|      SA|      0|
+--------+-------+
only showing top 12 rows



#

### Task 2: Using the geo-coordinates from the sensor data, group the data into smaller regions using an appropriate clustering algorithm. Then determine the top 50 regions in terms of air quality improvement over the previous 24 hours.

In [5]:
from pyspark.mllib.clustering import KMeans


for i in range(len(dfs)):
    # Same process as task 1, we filter the dataframe to keep only P1 and P2 values
    df_ = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    df_ = df_[df_.sensordatavalues.value_type.isin(['P1', 'P2'])]
    dfs[i] = dfs[i].drop('sensordatavalues').join(df_.groupby('id').agg(collect_list('sensordatavalues').alias('sensordatavalues')), on='id', how='inner')
    dfs[i] = dfs[i].select('sensor.id','location.latitude','location.longitude', 'location.altitude', 'sensordatavalues').sort('sensor.id')

# Create a RDD with the sensor id, and a tuple of the latitude and longitude, using the oldest dataframe:
rdd = dfs[0].rdd.map(lambda x: (x[0], (float(x[1]), float(x[2]))))
# Create a KMeans model with 200 clusters using latitude and longitude values from the first dataframe:
model = KMeans.train(rdd.map(lambda x: x[1]), 200, maxIterations=10, initializationMode="k-means||", seed=23)
# (This model will be used to predict the cluster for each sensor id in the second dataframe.)

for i in range(len(dfs)):
    # Create a RDD with the sensor id, and a tuple of the latitude and longitude using the current dataframe:
    rdd = dfs[i].rdd.map(lambda x: (x[0], (float(x[1]), float(x[2]))))
    # Create a RDD collection with both the sensor id and the corresponding predicted cluster:
    rdd_clusters = rdd.map(lambda x: (x[0], model.predict(x[1])))
    # Create a RDD containing the amount of sensors in each cluster:
    rdd_sensorAmountByCluster = rdd_clusters.map(lambda x: (x[1], 1)).reduceByKey(lambda x, y: x + y)
    # Store each cluster center in a dictionary (Each cluster center is a tuple of latitude and longitude), rounded to 2 decimals:
    centers = {i: np.round(np.array(model.clusterCenters[i]), 2) for i in range(len(model.clusterCenters))}
    # Add the cluster center to the RDD collection:
    rdd_clusters = rdd_clusters.map(lambda x: (x[1], x[0], centers[x[1]].tolist()))
    # Convert RDDs to DataFrames and join them to the original dataframe:
    dfs[i] = dfs[i].join(rdd_clusters.toDF(['cluster_id', 'id', 'cluster_center']), on='id', how='inner')
    dfs[i] = dfs[i].join(rdd_sensorAmountByCluster.toDF(['cluster_id', 'sensor_amount']), on='cluster_id', how='inner')
    # Group by cluster, keeping the cluster center and the sensordatavalues:
    dfs[i] = dfs[i].groupby('cluster_id').agg(collect_list('cluster_center')[0].alias('cluster_center'), collect_list('sensor_amount')[0].alias('sensor_amount'), collect_list('sensordatavalues').alias('sensordatavalues'))
    # PROBLEM: sensordatavalues is a list of list of data values. To fix this, explode two times the sensordatavalues column:
    dfs[i] = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    dfs[i] = dfs[i].withColumn('sensordatavalues', explode('sensordatavalues'))
    # Then regroup by cluster, keeping the cluster center and the sensordatavalues for each cluster id:
    dfs[i] = dfs[i].groupby('cluster_id').agg(collect_list('cluster_center')[0].alias('cluster_center'), collect_list('sensor_amount')[0].alias('sensor_amount'), collect_list('sensordatavalues').alias('sensordatavalues'))
    # Once again, just like task 1, we compute the max AQI for each cluster:
    # Create a RDD collection of tuples (country, (P1, P2)), so that each rdd element is a combo of a cluster and either its P1 or P2 values. Then convert RDDs to DataFrames and join them to the original dataframe:
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [float(y['value']) for y in x[3] if y[2] == 'P1'])).toDF(['cluster_id', 'P1']), on='cluster_id', how='inner')
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [float(y['value']) for y in x[3] if y[2] == 'P2'])).toDF(['cluster_id', 'P2']), on='cluster_id', how='inner')
    # Create a RDD collection to calculate the average P1 and P2 values for each cluster, and convert RDDs to DataFrames and join them to the original dataframe:
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], float(np.round(np.mean(x[4]), 2)))).toDF(['cluster_id', 'avgP1']), on='cluster_id', how='inner')
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], float(np.round(np.mean(x[5]), 2)))).toDF(['cluster_id', 'avgP1']), on='cluster_id', how='inner')
    # Associate P1 and P2 avg to their respective AQI:
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [y for y in air if air[y][2][0] <= np.round(x[6]) <= air[y][2][1]][0])).toDF(['cluster_id', 'P1_AQI']), on='cluster_id', how='inner')
    dfs[i] = dfs[i].join(dfs[i].rdd.map(lambda x: (x[0], [y for y in air if air[y][2][0] <= np.round(x[7]) <= air[y][2][1]][0])).toDF(['cluster_id', 'P2_AQI']), on='cluster_id', how='inner')
    # Calculate the max AQI for each cluster:
    dfs[i] = dfs[i].withColumn('maxAQI', when(dfs[i].P1_AQI > dfs[i].P2_AQI, dfs[i].P1_AQI).otherwise(dfs[i].P2_AQI))
    dfs[i] = dfs[i].select('cluster_id', 'cluster_center', 'sensor_amount', 'maxAQI')
    dfs[i].show(10)

                                                                                

22/11/06 19:02:51 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          295|     3|
|         1|[-33.73, 151.09]|           42|     1|
|         2|[51.27, -115.24]|            5|     1|
|         3|  [43.12, 27.81]|           75|     3|
|         4| [39.96, -75.88]|           11|     1|
|         5|  [45.83, 11.96]|           88|     8|
|         6|  [55.18, 73.42]|           56|     2|
|         7|  [49.58, 11.05]|          172|     2|
|         8|  [51.08, 17.16]|          192|     3|
|         9|  [53.37, -6.32]|           39|     2|
+----------+----------------+-------------+------+
only showing top 10 rows



                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          299|     3|
|         1|[-33.73, 151.09]|           42|     1|
|         2|[51.27, -115.24]|            5|     1|
|         3|  [43.12, 27.81]|           73|     2|
|         4| [39.96, -75.88]|           11|     1|
|         5|  [45.83, 11.96]|           88|     4|
|         6|  [55.18, 73.42]|           56|     1|
|         7|  [49.58, 11.05]|          170|     2|
|         8|  [51.08, 17.16]|          198|     3|
|         9|  [53.37, -6.32]|           39|     2|
+----------+----------------+-------------+------+
only showing top 10 rows



                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          294|     3|
|         1|[-33.73, 151.09]|           41|     1|
|         2|[51.27, -115.24]|            5|     1|
|         3|  [43.12, 27.81]|           73|     4|
|         4| [39.96, -75.88]|           11|     1|
|         5|  [45.83, 11.96]|           87|     1|
|         6|  [55.18, 73.42]|           54|     1|
|         7|  [49.58, 11.05]|          177|     2|
|         8|  [51.08, 17.16]|          199|     3|
|         9|  [53.37, -6.32]|           40|     2|
+----------+----------------+-------------+------+
only showing top 10 rows



[Stage 1205:>                                                       (0 + 1) / 1]

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          300|     2|
|         1|[-33.73, 151.09]|           45|     2|
|         2|[51.27, -115.24]|            5|     1|
|         3|  [43.12, 27.81]|           73|     5|
|         4| [39.96, -75.88]|           11|     1|
|         5|  [45.83, 11.96]|           85|     1|
|         6|  [55.18, 73.42]|           54|     1|
|         7|  [49.58, 11.05]|          178|     3|
|         8|  [51.08, 17.16]|          198|     5|
|         9|  [53.37, -6.32]|           40|     2|
+----------+----------------+-------------+------+
only showing top 10 rows



                                                                                

In [7]:
df1 = dfs[len(dfs)-2].select('cluster_id', 'cluster_center', 'sensor_amount', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_1')
df2 = dfs[len(dfs)-1].select('cluster_id', 'maxAQI').withColumnRenamed('maxAQI', 'maxAQI_2')
df_diff = df1.join(df2, on='cluster_id', how='inner')
df_diff = df_diff.withColumn('diffAQI', df_diff.maxAQI_2 - df_diff.maxAQI_1).select('cluster_id', 'cluster_center', 'sensor_amount', 'diffAQI')
df_diff = df_diff.sort('diffAQI', ascending=True)
df_diff.show(50)

[Stage 1326:>               (0 + 1) / 1][Stage 1351:>               (0 + 1) / 1]

+----------+----------------+-------------+-------+
|cluster_id|  cluster_center|sensor_amount|diffAQI|
+----------+----------------+-------------+-------+
|        21|  [55.05, 85.56]|            5|     -7|
|        84| [52.01, 112.32]|            6|     -4|
|       132|   [53.9, 12.06]|           47|     -4|
|        88|  [6.38, -10.78]|            2|     -2|
|       165|  [39.18, 22.65]|           15|     -2|
|        37|   [47.4, 19.07]|          219|     -2|
|       196|  [45.13, 39.23]|           17|     -2|
|        31|   [50.92, 6.97]|          398|     -2|
|        36|   [50.11, 8.67]|          267|     -1|
|        11|[-36.94, -64.18]|           10|     -1|
|       103|  [27.67, 85.32]|            2|     -1|
|       133|   [50.08, 7.94]|          139|     -1|
|        42|   [32.7, 73.81]|            3|     -1|
|       179|   [43.48, 1.52]|           41|     -1|
|        46|   [52.2, 10.08]|          196|     -1|
|         0|   [51.97, 5.93]|          294|     -1|
|        25|

                                                                                

###

### Task 3: Calculate the longest streaks of good air quality (ie low index values) and display as a histogram.

Assume task 2 is already done. We will use the dataframes where AQI values are already calculated for each cluster.

In [8]:
# Verify task 2 is done:
for i in range(len(dfs)):
    dfs[i].show(2)

# Task 3: Calculate the longest streaks of good air quality (ie low index values) and display as a histogram.
rdd = dfs[0].rdd.map(lambda x: (x[0], [0 if x[3] < 4 else 1 ]))
rdd_streaks = rdd.map(lambda x: (x[0], x[1], (1 if x[1][0] == 0 else 0), (1 if x[1][0] == 0 else 0)))
df = rdd_streaks.toDF(['cluster_id', 'streaks', 'current_streak', 'max_streak'])
for i in range(1, len(dfs)):
    rdd = dfs[i].rdd.map(lambda x: (x[0], [0 if x[3] < 4 else 1 ]))
    rdd_streaks = rdd.map(lambda x: (x[0], x[1], (1 if x[1][len(x[1])-1] == 0 else 0)))
    df = df.join(rdd_streaks.toDF(['cluster_id', 'streak', 'previous_streak']), on='cluster_id', how='inner')
    df = df.withColumn('streaks', concat('streaks', 'streak'))
    df = df.drop('streak')
    df = df.withColumn('current_streak', when(df.previous_streak == 1, df.current_streak + 1).otherwise(0))
    df = df.withColumn('max_streak', when(df.current_streak > df.max_streak, df.current_streak).otherwise(df.max_streak))
    df = df.drop('previous_streak')
df.sort('cluster_id').show(5)
# Group by max_streak, show the cluster_id and the amount of clusters with that max_streak:
df = df.groupBy('max_streak').agg(count('cluster_id').alias('cluster_amount'), collect_list('cluster_id').alias('cluster_ids'))
df = df.sort('max_streak', ascending=False)
print('The clusters ids with the longest streaks of good air quality (ie low index values), as well as the amount of clusters with that streak:')
df.show(10)

                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          295|     3|
|         1|[-33.73, 151.09]|           42|     1|
+----------+----------------+-------------+------+
only showing top 2 rows



                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          299|     3|
|         1|[-33.73, 151.09]|           42|     1|
+----------+----------------+-------------+------+
only showing top 2 rows



                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          294|     3|
|         1|[-33.73, 151.09]|           41|     1|
+----------+----------------+-------------+------+
only showing top 2 rows



                                                                                

+----------+----------------+-------------+------+
|cluster_id|  cluster_center|sensor_amount|maxAQI|
+----------+----------------+-------------+------+
|         0|   [51.97, 5.93]|          300|     2|
|         1|[-33.73, 151.09]|           45|     2|
+----------+----------------+-------------+------+
only showing top 2 rows



                                                                                

+----------+------------+--------------+----------+
|cluster_id|     streaks|current_streak|max_streak|
+----------+------------+--------------+----------+
|         0|[0, 0, 0, 0]|             4|         4|
|         1|[0, 0, 0, 0]|             4|         4|
|         2|[0, 0, 0, 0]|             4|         4|
|         3|[0, 0, 1, 1]|             0|         2|
|         4|[0, 0, 0, 0]|             4|         4|
+----------+------------+--------------+----------+
only showing top 5 rows

The clusters ids with the longest streaks of good air quality (ie low index values), as well as the amount of clusters with that streak:
+----------+--------------+--------------------+
|max_streak|cluster_amount|         cluster_ids|
+----------+--------------+--------------------+
|         4|           119|[26, 65, 191, 19,...|
|         3|            14|[22, 181, 161, 56...|
|         2|            20|[77, 87, 79, 170,...|
|         1|             9|[198, 84, 103, 28...|
|         0|            38|

In [7]:
spark.stop()