In [1]:
#Import libraries as needed
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, first, last, lag, lead, when
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Spark configuration: local mode on all cores, 4 GB of driver memory.
conf = SparkConf().setAppName('yuck').setMaster("local[*]").set("spark.driver.memory", "4g")

# Use getOrCreate so that re-running this cell in a live kernel reuses the
# already-running context/session instead of failing because a SparkContext
# already exists in this process.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema used when reading the CSV files.
# The first two columns are a timestamp and an integer row counter; every
# remaining column is read as a double.
# NOTE(review): PXY is the only double column declared non-nullable — confirm
# that this is intentional.
schema = StructType(
    [
        StructField("unix_time", TimestampType(), True),
        StructField("row_num", IntegerType(), False),
    ]
    + [
        StructField(column_name, DoubleType(), column_name != "PXY")
        for column_name in (
            "BEN", "CO", "EBE", "MXY", "NMHC", "NO_2", "NOx", "OXY",
            "O_3", "PM10", "PM25", "PXY", "SO_2", "TCH", "TOL",
        )
    ]
)

In [23]:
# Read the CSV (first line is a header) into spark_df using the explicit schema
data_path = 'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year\\yuck\\clean_data.csv'
spark_df = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv(data_path)
)

In [24]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Per-pollutant breakpoint tables. Each list holds 7 ascending edges that
# delimit 6 consecutive bands; band k (1-based) covers [edge[k-1], edge[k]).
# NOTE(review): values look like the European Air Quality Index bands in
# µg/m³ — confirm against the source used for the analysis.
breakpoints = dict(
    PM25=[0, 10, 20, 25, 50, 75, 800],
    PM10=[0, 20, 40, 50, 100, 150, 1200],
    NO_2=[0, 40, 90, 120, 230, 340, 1000],
    O_3=[0, 50, 100, 130, 240, 380, 800],
    SO_2=[0, 100, 200, 350, 500, 750, 1250],
)

# Category label for each band, ordered from band 1 to band 6
categories = [
    'Good',
    'Fair',
    'Moderate',
    'Poor',
    'Very Poor',
    'Extremely Poor',
]



In [25]:
# Define a function to calculate the index level for each pollutant concentration
from pyspark.sql.functions import array, array_max
#import pyspark.sql.functions as F

def calculate_index_level(pollutant, concentration, table=None):
    """Map a pollutant concentration to its 1-based index level.

    Parameters
    ----------
    pollutant : str
        Key into the breakpoint table (e.g. ``'PM25'``).
    concentration : float or None
        Measured concentration. ``None`` (how Spark passes nulls to Python
        UDFs) and NaN are treated as "no reading".
    table : dict, optional
        Mapping ``pollutant -> ascending list of breakpoints``. Defaults to the
        notebook-level ``breakpoints`` dictionary.

    Returns
    -------
    int or None
        ``k`` when ``edges[k-1] <= concentration < edges[k]``; the highest
        level when the concentration is at or above the last breakpoint; the
        lowest level (1) when it is below the first breakpoint; ``None`` when
        there is no reading.

    Raises
    ------
    KeyError
        If ``pollutant`` is not present in the breakpoint table.
    """
    # A missing reading has no level. Comparing None with a number would raise
    # TypeError, and NaN compares False against every edge, which previously
    # made it fall through to the highest ("worst") level.
    if concentration is None or concentration != concentration:
        return None

    breakpoints_list = (breakpoints if table is None else table)[pollutant]
    highest_level = len(breakpoints_list) - 1

    # Readings below the first edge (e.g. slightly negative sensor values)
    # belong to the lowest band, not the highest one.
    if concentration < breakpoints_list[0]:
        return 1

    for level in range(1, highest_level + 1):
        if concentration < breakpoints_list[level]:
            return level

    # At or above the last breakpoint: clamp to the highest index level
    return highest_level


In [26]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def _worst_index_level(no2_conc, o3_conc, pm10_conc, pm25_conc, so2_conc):
    """Return the highest per-pollutant index level for one row.

    Each argument is the concentration of the pollutant named in its
    parameter. Spark passes SQL nulls to Python UDFs as ``None``; such
    readings are skipped so that a single missing value does not fail the
    whole job. Returns ``None`` (SQL null) when no pollutant has a usable
    reading.
    """
    readings = (
        ('NO_2', no2_conc),
        ('O_3', o3_conc),
        ('PM10', pm10_conc),
        ('PM25', pm25_conc),
        ('SO_2', so2_conc),
    )
    # Skip nulls before calling the helper, so it never compares None with a number.
    levels = [
        calculate_index_level(pollutant, value)
        for pollutant, value in readings
        if value is not None
    ]
    # Also tolerate a helper that reports "no level" as None.
    levels = [level for level in levels if level is not None]
    return int(max(levels)) if levels else None


# Spark UDF: overall index = worst (maximum) index level across the five pollutants
calculate_aqi_index_udf = udf(_worst_index_level, IntegerType())

In [12]:
# Debug: print the index level of one hand-picked reading per pollutant
debug_readings = [('PM25', 1.77), ('PM10', 23), ('NO_2', 54), ('O_3', 5), ('SO_2', 1.5)]
print(*(calculate_index_level(name, value) for name, value in debug_readings))

1 2 2 1 1


In [27]:
from pyspark.sql.functions import col

# Smoke-test the UDF on a single hand-written row
sample_columns = ['NO_2', 'O_3', 'PM10', 'PM25', 'SO_2']
sample_row = [(54.0, 5.0, 23.0, 1.77, 1.5)]

# Build a one-row Spark DataFrame and attach the computed index in one chain
sample_df = (
    spark.createDataFrame(sample_row, sample_columns)
    .withColumn('AQI_Index', calculate_aqi_index_udf(*sample_columns))
)

# Display the row together with its AQI_Index
sample_df.show()

+----+---+----+----+----+---------+
|NO_2|O_3|PM10|PM25|SO_2|AQI_Index|
+----+---+----+----+----+---------+
|54.0|5.0|23.0|1.77| 1.5|        2|
+----+---+----+----+----+---------+



In [28]:
# Attach the UDF result to every row as the 'AQI_Index' column
aqi_inputs = [col(name) for name in ('NO_2', 'O_3', 'PM10', 'PM25', 'SO_2')]
spark_df = spark_df.withColumn('AQI_Index', calculate_aqi_index_udf(*aqi_inputs))

In [30]:
def _category_for_level(index_level):
    """Return the category label for a 1-based index level.

    A null index (passed by Spark as ``None``) yields a null category instead
    of raising TypeError on ``None - 1``.
    """
    if index_level is None:
        return None
    return categories[index_level - 1]


# UDF that maps each row's index level to its category label
calculate_aqi_category_udf = udf(_category_for_level, StringType())

In [31]:
# Derive the 'AQI_Category' column from the existing 'AQI_Index' column
aqi_category_column = calculate_aqi_category_udf(col('AQI_Index'))
spark_df = spark_df.withColumn('AQI_Category', aqi_category_column)

In [None]:
# Print the column names and types of spark_df, including the two added AQI columns
spark_df.printSchema()

In [33]:
# Preview the first 10 rows; this is the first action, so it triggers the CSV read and both UDFs
spark_df.show(10)

+-------------------+-------+-----------------+-------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------+---------+------------+
|          unix_time|row_num|              BEN|                 CO|               EBE|              MXY|               NMHC|              NO_2|               NOx|               OXY|               O_3|              PM10|              PM25|              PXY|              SO_2|              TCH|        TOL|AQI_Index|AQI_Category|
+-------------------+-------+-----------------+-------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------+---------+------------+
|2001-01-01 0

In [35]:
# Write the DataFrame as CSV with a header row, overwriting any existing output
output_path = 'C:/Users/eleni/Documents/Diplw/Jupyter-Notebooks/diplw/csvs_per_year/yuck/clean_data_aqi_cat.csv'
spark_df.write.csv(output_path, mode='overwrite', header=True)