In [None]:
##################################
##      IT'S DANGEROUS TO GO    ##       
##       ALONE! TAKE THIS.      ##
##        🔥  🧙‍♂️  🔥          ## 
##            🗡️               ##
##                              ##  
############  🧝  ############### 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, first, last, lag, lead, when
from pyspark.sql.functions import row_number

# Spark configuration for this notebook: local master using all available
# cores, with explicit driver/executor memory and executor core settings.
conf = (
    SparkConf()
    .setAppName('yuck')
    .setMaster("local[*]")
    .set("spark.driver.memory", "5g")
    .set("spark.executor.memory", "5g")
    .set("spark.executor.cores", "6")
)

# getOrCreate() reuses a session/context that is already running in this
# kernel, so re-running the cell no longer fails with
# "Cannot run multiple SparkContexts at once" the way SparkContext(conf=conf) does.
# NOTE(review): when a session already exists, JVM-level settings such as
# spark.driver.memory are presumably not re-applied — restart the kernel to change them.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep `sc` available for later cells; it is the context backing `spark`.
sc = spark.sparkContext

In [2]:
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema for the cleaned CSV export, declared column by column in file
# order: a row counter, the timestamp, the AQI index/category fields, then one
# double-typed column per pollutant measurement.
# NOTE(review): PXY is the only pollutant column declared non-nullable — confirm
# this is intentional and not a typo.
schema = (
    StructType()
    .add("row_num", IntegerType(), nullable=False)
    .add("unix_time", TimestampType(), nullable=True)
    .add("AQI_Index", IntegerType(), nullable=True)
    .add("AQI_Category", StringType(), nullable=True)
    .add("AQI_GenPop_Category", StringType(), nullable=True)
    .add("AQI_GenPop_Index", IntegerType(), nullable=True)
    .add("BEN", DoubleType(), nullable=True)
    .add("CO", DoubleType(), nullable=True)
    .add("EBE", DoubleType(), nullable=True)
    .add("MXY", DoubleType(), nullable=True)
    .add("NMHC", DoubleType(), nullable=True)
    .add("NO_2", DoubleType(), nullable=True)
    .add("NOx", DoubleType(), nullable=True)
    .add("OXY", DoubleType(), nullable=True)
    .add("O_3", DoubleType(), nullable=True)
    .add("PM10", DoubleType(), nullable=True)
    .add("PM25", DoubleType(), nullable=True)
    .add("PXY", DoubleType(), nullable=False)
    .add("SO_2", DoubleType(), nullable=True)
    .add("TCH", DoubleType(), nullable=True)
    .add("TOL", DoubleType(), nullable=True)
)

# Full cleaned dataset, read lazily into Spark with the schema above; the first
# line of the file is treated as a header.
# NOTE(review): absolute, machine-specific path — the notebook only runs on this
# machine until it is made configurable.
data_path = 'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year\\clean_data_norm.csv'
spark_df = spark.read.option("header", True).schema(schema).csv(data_path)

# Training sample, loaded with pandas from the working directory.
data = pd.read_csv("balanced_sample.csv")

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Model inputs: the reduced five-pollutant feature set.
# A wider set was tried earlier and is kept here for reference:
#   ['BEN','EBE', 'CO', 'NMHC', 'NO_2', 'O_3', 'PM10', 'PM25', 'SO_2','TCH','TOL']
feature_columns = ["NO_2", "O_3", "PM10", "PM25", "SO_2"]

# Label column the classifier is trained on and evaluated against.
target_column = "AQI_GenPop_Index"

# Both train and test frames keep exactly the features plus the label.
selected_columns = feature_columns + [target_column]

# The pandas training sample becomes a Spark DataFrame so it can be fed to Spark ML.
data_spark = spark.createDataFrame(data)

# Redundant with the import at the top of the notebook; kept so this cell still
# runs when executed on its own.
from pyspark.sql.functions import col

# Training set: selected columns from the sample, with the label cast to double.
train_data = (
    data_spark
    .select(selected_columns)
    .withColumn("AQI_GenPop_Index", col("AQI_GenPop_Index").cast("double"))
)

# Test set: the same columns taken from the full cleaned dataset. The label is
# left in the type declared by the CSV schema (no cast is applied here).
test_data = spark_df.select(selected_columns)

In [4]:
# Pack the feature columns into the single vector column Spark ML expects.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# This cell rebinds train_data/test_data to their assembled versions. Dropping a
# "features" column left over from a previous run keeps the cell re-runnable:
# drop() is a no-op when the column is absent, whereas VectorAssembler raises
# if its output column already exists.
train_data = assembler.transform(train_data.drop("features"))
test_data = assembler.transform(test_data.drop("features"))

# Logistic regression estimator reading the assembled vector and the label column.
lr = LogisticRegression(featuresCol='features', labelCol='AQI_GenPop_Index')

# Fit the model to the training data.
model = lr.fit(train_data)

# Decision threshold applied when turning probabilities into 0/1 predictions.
# NOTE(review): 0.505 is a hand-picked value — document how it was chosen.
threshold=0.505

# Score the test set; the param map overrides the threshold for this transform only.
predictions = model.transform(test_data, {lr.threshold: threshold})

In [5]:
#Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve, computed from the model's raw prediction scores.
evaluator = BinaryClassificationEvaluator(labelCol="AQI_GenPop_Index", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

# Confusion-matrix cells, where positive = hazardous (label 1) and
# negative = safe / not hazardous (label 0).
# A single grouped aggregation replaces four separate filtered count() actions,
# each of which re-ran the model transform over the whole test set. The result
# has at most one row per (label, prediction) pair, so collect() is cheap.
confusion_counts = {
    (row["AQI_GenPop_Index"], row["prediction"]): row["count"]
    for row in predictions.groupBy("AQI_GenPop_Index", "prediction").count().collect()
}

# Missing combinations simply count as 0; rows with any other label value
# (including null) are ignored, exactly as the equality filters ignored them.
tp = confusion_counts.get((1, 1), 0)
fp = confusion_counts.get((0, 1), 0)
tn = confusion_counts.get((0, 0), 0)
fn = confusion_counts.get((1, 0), 0)

# Derived metrics; each falls back to 0.0 when its denominator is empty.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total > 0 else 0.0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
# Recall of the negative class, i.e. specificity (true-negative rate).
recall_spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0

In [9]:
# Report header.
print("🌈Frequentist Logistic Regression in Spark🌈\n same train and test sets as before(5 features).\n")

# Scalar metrics, printed one per line in a fixed order.
metric_rows = [
    ('Test Accuracy:', accuracy),
    ('Test Precision:', precision),
    ('Test Recall:', recall),
    ('Test Recall/Specificity:', recall_spec),
    ('Test AUC ROC:', auc),
]
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

# Confusion matrix laid out as two rows: actual positives, then actual negatives.
print('\n    ✨Confusion matrix✨')
print('TP:', tp, '\t', 'FN:', fn, '\n')
print('FP:', fp, '\t', 'TN:', tn)

🌈Frequentist Logistic Regression in Spark🌈
 same train and test sets as before(5 features).

Test Accuracy: 0.892364262186258
Test Precision: 0.9270058241129475
Test Recall/Specificity: 0.945249522794354
Test AUC ROC: 0.961498458128979

    ✨Confusion matrix✨
TP: 1440301 	 FN: 296489 

FP: 113412 	 TN: 1958022
