In [None]:
##################################
##      IT'S DANGEROUS TO GO    ##       
##       ALONE! TAKE THIS.      ##
##        🔥  🧙‍♂️  🔥          ## 
##            🗡️               ##
##                              ##  
############  🧝  ############### 

import numpy as np
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
import theano.tensor as tt

In [None]:
# Training data: the sample in "balanced_sample.csv" is what the model below is fitted on.
data = pd.read_csv("balanced_sample.csv")

# FEATURES
# Alternative 11-feature set, annotated "Best" by the author, kept for reference:
#feature_columns = ['BEN','EBE', 'CO', 'NMHC', 'NO_2', 'O_3', 'PM10', 'PM25', 'SO_2','TCH','TOL']
feature_columns = ["NO_2", "O_3", "PM10", "PM25", "SO_2"]

# TARGET
target_column = "AQI_GenPop_Index"

# Plain NumPy views of the design matrix and of the label vector.
X_train = data[feature_columns].values
y_train = data[target_column].values

# Number of predictor columns (second axis of the design matrix).
n_features = X_train.shape[1]
# Number of target classes (the label is compared against 0/1 later in the notebook).
n_classes = 2

In [None]:
def logistic(x, epsilon=1e-6):
    """Logistic (sigmoid) function for Theano tensors, kept away from 0 and 1.

    Parameters
    ----------
    x : tensor-like
        Linear predictor (log-odds).
    epsilon : float, optional
        Margin used to clip the result into ``[epsilon, 1 - epsilon]``.

    Returns
    -------
    Tensor with the same shape as ``x`` holding probabilities that are
    always valid Bernoulli parameters. Clipping is used instead of adding
    ``epsilon`` because ``sigmoid(x) + epsilon`` can exceed 1.
    """
    return tt.clip(pm.math.sigmoid(x), epsilon, 1 - epsilon)


with pm.Model() as AQI_model:
    # Priors for coefficients and bias, with explicit starting values for the coefficients
    coeffs = pm.Normal("coeffs", mu=0, sigma=1, shape=n_features, testval=np.zeros((n_features)))
    bias = pm.Normal("bias", mu=0, sigma=1)

    # Probability of the positive class for every training row.
    # This and the likelihood must be created inside the model context,
    # otherwise the observed data never becomes part of the model.
    p = logistic(pm.math.dot(X_train, coeffs) + bias)

    # Bernoulli likelihood tying the probabilities to the observed labels
    y_obs = pm.Bernoulli("y_obs", p=p, observed=y_train)
# MCMC
with AQI_model:
    # Alternative step method tried earlier: step=pm.Metropolis()
    # A fixed random_seed makes "Restart Kernel -> Run All" reproduce the same draws.
    trace = pm.sample(
        1000,               # posterior draws kept per chain
        tune=2000,          # tuning iterations per chain
        chains=4,
        cores=8,
        init='advi_map',    # initialisation strategy for the sampler
        n_init=100000,      # iteration budget for the initialiser
        progressbar=True,
        random_seed=42,
    )
    sns.set_palette("BuPu")
    pm.plot_trace(trace)

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, first, last, lag, lead, when
from pyspark.sql.functions import row_number

# Local Spark configuration: all local cores, 5g for driver and executor.
conf = (
    SparkConf()
    .setAppName('yuck')
    .setMaster("local[*]")
    .set("spark.driver.memory", "5g")
    .set("spark.executor.memory", "5g")
    .set("spark.executor.cores", "6")
)
# getOrCreate() reuses a context/session that is already running in this kernel,
# so re-executing the cell no longer fails because a SparkContext already exists.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
# Schema for the CSV files, written as a table of (column name, Spark type, nullable).
# NOTE(review): "row_num" and "PXY" are the only fields declared non-nullable --
# confirm that PXY is meant to differ from the other pollutant columns.
schema = StructType([
    StructField(field_name, field_type(), is_nullable)
    for field_name, field_type, is_nullable in [
        ("row_num", IntegerType, False),
        ("unix_time", TimestampType, True),
        ("AQI_Index", IntegerType, True),
        ("AQI_Category", StringType, True),
        ("AQI_GenPop_Category", StringType, True),
        ("AQI_GenPop_Index", IntegerType, True),
        ("BEN", DoubleType, True),
        ("CO", DoubleType, True),
        ("EBE", DoubleType, True),
        ("MXY", DoubleType, True),
        ("NMHC", DoubleType, True),
        ("NO_2", DoubleType, True),
        ("NOx", DoubleType, True),
        ("OXY", DoubleType, True),
        ("O_3", DoubleType, True),
        ("PM10", DoubleType, True),
        ("PM25", DoubleType, True),
        ("PXY", DoubleType, False),
        ("SO_2", DoubleType, True),
        ("TCH", DoubleType, True),
        ("TOL", DoubleType, True),
    ]
])

# Read the cleaned (and, per the original note, enormous) CSV into the spark_df DataFrame,
# using the explicit schema and treating the first line as a header.
data_path = 'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year\\clean_data_norm.csv'
spark_df = spark.read.csv(data_path, schema=schema, header=True)

# Pull the test features first and then the test labels back to the driver;
# each collected row is converted to a plain Python list.
# NOTE(review): collect() materialises every row in driver memory -- confirm this
# is affordable for the full dataset.
X_test, y_test = (
    spark_df.select(selected).rdd.map(list).collect()
    for selected in (feature_columns, target_column)
)

In [None]:
import math
from math import exp
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

@udf(returnType=ArrayType(DoubleType()))
def predict_proba_udf(coeffs_list, bias_value, *features):
    """Logistic-regression class probabilities for a single row.

    Parameters
    ----------
    coeffs_list : sequence of float
        Model coefficients, one per feature.
    bias_value : float
        Intercept added to the weighted sum.
    *features : float
        Feature values, positionally matched with ``coeffs_list``.

    Returns
    -------
    list of two floats ``[P(class 0), P(class 1)]``, or ``None`` (a SQL null)
    when the coefficients, the bias or any used feature value is null.

    Raises
    ------
    IndexError
        If fewer feature values than coefficients are supplied.
    """
    if coeffs_list is None or bias_value is None:
        return None

    weights = list(coeffs_list)
    # Index explicitly so that a missing feature still raises instead of being dropped silently.
    values = [features[i] for i in range(len(weights))]

    # Propagate nulls the way built-in SQL expressions do, rather than failing the job.
    if any(w is None for w in weights) or any(v is None for v in values):
        return None

    linear = sum(v * w for v, w in zip(values, weights)) + bias_value

    # Numerically stable sigmoid: never exponentiate a large positive number,
    # which would raise OverflowError for strongly negative scores.
    if linear >= 0:
        proba = 1 / (1 + math.exp(-linear))
    else:
        exp_linear = math.exp(linear)
        proba = exp_linear / (1 + exp_linear)

    return [1 - proba, proba]

from pyspark.sql.functions import col, array, lit

# Reduce the sampled trace to point estimates: the mean of the "coeffs" samples
# along axis 0 and the mean of the "bias" samples.
coeffs = trace["coeffs"].mean(axis=0)
bias = trace["bias"].mean()

#🧚‍♂️✨Hey, listen✨🧚‍♂️
# NumPy objects cannot be passed to a Spark UDF, so they are converted to
# plain Python values (a list and a float) first.
coeffs_list, bias_value = coeffs.tolist(), float(bias)

# One literal Column per coefficient ...
coeffs_columns = list(map(lit, coeffs_list))
# ... packed into a single array Column that the UDF receives as its first argument.
coeffs_array = array(*coeffs_columns)

# Add the class probabilities, then a 0.0/1.0 prediction obtained by thresholding
# the second array element (the probability returned last by the UDF).
spark_df = (
    spark_df
    .withColumn("proba", predict_proba_udf(coeffs_array, lit(bias_value), *map(col, feature_columns)))
    .withColumn("y_pred", (col("proba")[1] >= 0.50).cast(DoubleType()))  #try 0.5,0.499999, 0.49785 etc
)

In [None]:
from pyspark.sql.functions import count, when, col

# Confusion-matrix counts WHERE positive=hazardous, negative=safe(=not hazardous).
# All four counts come from ONE aggregation, so the DataFrame (and the Python UDF
# behind "y_pred") is evaluated once instead of four times. count() only counts
# non-null values and when() yields null where its condition is not true, so each
# expression counts exactly the rows the equivalent where(...).count() would.
_label = spark_df[target_column]
_pred = spark_df["y_pred"]
_confusion = spark_df.agg(
    count(when((_label == 1) & (_pred == 1), True)).alias("tp"),
    count(when((_label == 0) & (_pred == 1), True)).alias("fp"),
    count(when((_label == 0) & (_pred == 0), True)).alias("tn"),
    count(when((_label == 1) & (_pred == 0), True)).alias("fn"),
).first()
tp, fp, tn, fn = (_confusion[name] for name in ("tp", "fp", "tn", "fn"))

# Accuracy, precision, recall (sensitivity) and specificity; each falls back to 0.0
# when its denominator is zero.
accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0.0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
recall_spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The ROC curve must be built from a continuous score. Feeding the thresholded
# 0/1 "y_pred" column collapses it to a single operating point, so the
# positive-class probability (second element of "proba") is exposed as its own
# column and used as the raw prediction instead.
spark_df = spark_df.withColumn("proba_pos", col("proba")[1])

# Create evaluator object (its default metric is the area under the ROC curve)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="proba_pos", labelCol=target_column)

# Compute AUC ROC score
auc_roc = evaluator.evaluate(spark_df)

In [None]:
#✨Check if unbiased✨
#Logistic regression predictions should be unbiased. That is: "average of predictions" should ≈ "average of observations".

from pyspark.sql.functions import mean, format_number

# Means used for the bias check:
#   - y_pred:        share of rows predicted positive (thresholded 0/1 predictions)
#   - target column: share of rows observed positive
#   - proba_mean:    mean predicted probability of the positive class; this is the
#                    quantity that "average of predictions" refers to, since hard
#                    0/1 predictions depend on the chosen threshold
means = spark_df.agg(
    *[mean(c).alias(c) for c in ['y_pred', target_column]],
    mean(col("proba")[1]).alias("proba_mean"),
)

# Format the mean values to 4 decimal places (format_number returns strings)
formatted_means = means.select(*[format_number(c, 4).alias(c) for c in means.columns])

# Show the formatted mean values
formatted_means.show()