In [3]:
#Import libraries as needed
import os
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pymc3 as pm
import theano.tensor as tt

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, first, last, lag, lead, when
from pyspark.sql.functions import row_number

conf = SparkConf().setAppName('yuck').setMaster("local[*]").set("spark.driver.memory", "5g").set("spark.executor.memory", "5g").set("spark.executor.cores", "6")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


In [None]:
#FOR BIG DF

In [None]:
#from pyspark.sql.functions import unix_timestamp
#from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Define the schema for the CSV files
#schema = StructType([
#    StructField("row_num", IntegerType(), False),
#    StructField("unix_time", TimestampType(), True),
#     StructField("AQI_Index", IntegerType(), True),
#     StructField("AQI_Category", StringType(), True),
#     StructField("AQI_GenPop_Category", StringType(), True),
#     StructField("AQI_GenPop_Index", IntegerType(), True),
#     StructField("BEN", DoubleType(), True),
#     StructField("CO", DoubleType(), True),
#     StructField("EBE", DoubleType(), True),
#     StructField("MXY", DoubleType(), True),
#     StructField("NMHC", DoubleType(), True),
#     StructField("NO_2", DoubleType(), True),
#     StructField("NOx", DoubleType(), True),
#     StructField("OXY", DoubleType(), True),
#     StructField("O_3", DoubleType(), True),
#     StructField("PM10", DoubleType(), True),
#     StructField("PM25", DoubleType(), True),
#     StructField("PXY", DoubleType(), False),
#     StructField("SO_2", DoubleType(), True),
#     StructField("TCH", DoubleType(), True),
#     StructField("TOL", DoubleType(), True)])
# insert clean- enormous df csv files to spark_df dataframe
# data_path = 'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year\\clean_data_norm.csv'
# data = spark.read.csv(data_path, header=True, schema=schema)

In [4]:
#Play around with small df first
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Define the schema for the CSV files
schema = StructType([
    StructField("AQI_Index", IntegerType(), True),
    StructField("AQI_Category", StringType(), True),
    StructField("AQI_GenPop_Category", StringType(), True),
    StructField("AQI_GenPop_Index", IntegerType(), True),
    StructField("BEN", DoubleType(), True),
    StructField("CO", DoubleType(), True),
    StructField("EBE", DoubleType(), True),
    StructField("MXY", DoubleType(), True),
    StructField("NMHC", DoubleType(), True),
    StructField("NO_2", DoubleType(), True),
    StructField("NOx", DoubleType(), True),
    StructField("OXY", DoubleType(), True),
    StructField("O_3", DoubleType(), True),
    StructField("PM10", DoubleType(), True),
    StructField("PM25", DoubleType(), True),
    StructField("PXY", DoubleType(), False),
    StructField("SO_2", DoubleType(), True),
    StructField("TCH", DoubleType(), True),
    StructField("TOL", DoubleType(), True)])

In [5]:
# insert 1 year csv file to spark dataframe
data_path = 'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\balanced_sample.csv'
data = spark.read.csv(data_path, header=True, schema=schema)

In [None]:
#CHECK IF SPARK_DF GOOD TO GO

In [None]:
data.printSchema()

In [None]:
#Check if df ok
from pyspark.sql.functions import col, sum
from pyspark.sql.functions import isnan, when, count

null_counts = data.agg(*[sum(col(c).isNull().cast("int")).alias(c) for c in data.columns])

# Print out the null counts for each column
null_counts.show()

In [None]:
pollutants=data.columns[6:]

In [None]:
from pyspark.sql.functions import mean, format_number

# assume 'data' is your Spark DataFrame
means = data.agg(*[mean(c).alias(c) for c in data.columns[6:]])
# Format the mean values to 4 decimal places
formatted_means = means.select(*[format_number(c, 4).alias(c) for c in means.columns])

# Show the formatted mean values
formatted_means.show()

In [None]:
from pyspark.sql.functions import stddev, format_number

# assume 'data' is your Spark DataFrame
stds = data.agg(*[stddev(c).alias(c) for c in data.columns[6:]])
# Format the mean values to 4 decimal places
formatted_stvs = stds.select(*[format_number(c, 4).alias(c) for c in stds.columns])

# Show the formatted mean values
formatted_stvs.show()

In [None]:
#BAYESIAN LOGISTIC REGRESSION STARTS HERE

In [6]:
def bernoulli_likelihood(p, y):
    return tt.pow(p, y) * tt.pow(1 - p, 1 - y)


In [7]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType

In [8]:
#  Bayesian logistic regression model
def bayesian_logistic_regression(X_train, y_train, n_features):
    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    
    with pm.Model() as AQI_model:
        # Priors for coefficients and bias, with better starting values
        coeffs = pm.Normal("coeffs", mu=0, sigma=1, shape=n_features, testval=np.zeros((n_features)))
        bias = pm.Normal("bias", mu=0, sigma=1)

    def logistic(x, epsilon=1e-6):
        return 1 / (1 + tt.exp(-x)) + epsilon   
    
        # Calculate the probability of each observation using the logistic function
        p = logistic(pm.math.sigmoid(pm.math.dot(X_train, coeffs) + bias))

        # Define the Bernoulli likelihood
        y_obs = pm.Bernoulli("y_obs", p=p, observed=y_train)

    with AQI_model:
        print("Starting MCMC sampling...")
        # Fit the model using the MCMC No-U-Turn Sampler (NUTS)
        step=pm.NUTS(target_accept=0.8)
        trace = pm.sample(8000, tune=800, chains=4, cores=4, step=step, progressbar=True)

        # Return the posterior mean of the coefficients and bias
        return {"coeffs": np.mean(trace["coeffs"], axis=0), "bias": np.mean(trace["bias"])}

    
bayesian_logistic_regression_udf = udf(bayesian_logistic_regression, returnType=DoubleType())

In [9]:
@udf(returnType=ArrayType(DoubleType()))
def predict_proba_udf(X_test, coeffs, bias):
    linear = np.dot(X_test, coeffs) + bias
    proba = 1 / (1 + np.exp(-linear))
    return np.column_stack((1 - proba, proba))

@udf(returnType=IntegerType())
def predict_udf(X_test, coeffs, bias):
    y_test_pred_proba = predict_proba_udf(X_test, coeffs, bias)
    y_test_pred = np.argmax(y_test_pred_proba, axis=1)
    return y_test_pred

In [10]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler

In [11]:
# Extract the feature columns and target column from the spark dataframe
feature_columns = ['BEN', 'EBE', 'CO', 'NMHC', 'NO_2', 'O_3', 'PM10', 'PM25', 'SO_2', 'TCH', 'TOL']
n_features = len(feature_columns)
target_column = "AQI_GenPop_Index"
print(n_features)

11


In [12]:
# Create a VectorAssembler to combine the feature columns into a single feature vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# Apply the VectorAssembler to your data
data = assembler.transform(data.select(feature_columns + [target_column]))

In [13]:
# Prepare the target variable y
data = data.select("features", target_column).withColumnRenamed(target_column, "AQI_GenPop_Index")

In [14]:
# Split the data into train and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [None]:
#TRAIN AND TEST

In [None]:

print(bayesian_logistic_regression_udf("features", "AQI_GenPop_Index", lit(n_features)))
train_data.select("features", "AQI_GenPop_Index").show()
print("Input data to UDF: train_data")

In [17]:
# Train the model on the training data
from pyspark.sql.functions import col, lit

model_dict = train_data.select(bayesian_logistic_regression_udf(col("features"), col("AQI_GenPop_Index"), lit(n_features)).alias("model")).first()["model"]

coeffs = model_dict["coeffs"]
bias = model_dict["bias"]

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\__init__.py", line 83, in <module>
    from theano import scalar, tensor
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\tensor\__init__.py", line 20, in <module>
    from theano.tensor import nnet  # used for softmax, sigmoid, etc.
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\tensor\nnet\__init__.py", line 3, in <module>
    from . import opt
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\tensor\nnet\opt.py", line 32, in <module>
    from theano.tensor.nnet.conv import ConvOp, conv2d
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\tensor\nnet\conv.py", line 20, in <module>
    from theano.tensor import blas
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\tensor\blas.py", line 163, in <module>
    from theano.tensor.blas_headers import blas_header_text, blas_header_version
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\tensor\blas_headers.py", line 1016, in <module>
    if not config.blas__ldflags:
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\configparser.py", line 358, in __get__
    val_str = self.default()
  File "c:\Users\eleni\AppData\Local\Programs\Python\Python310\lib\site-packages\theano\link\c\cmodule.py", line 2826, in default_blas_ldflags
    except KeyError:
AttributeError: module 'numpy.distutils.__config__' has no attribute 'blas_opt_info'


In [None]:
# Train the model on the training data
coeffs, bias = train_data.select(bayesian_logistic_regression_udf("features", "AQI_GenPop_Index", lit(n_features)).alias("model")).first()["model"]

In [None]:
# Make predictions on the test data
test_data_pred = test_data.withColumn("prediction", predict_udf("features", lit(coeffs), lit(bias)))

In [None]:
# Evaluation of model // WE WANT BINARY
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="AQI_GenPop_Index", metricName="accuracy")
accuracy = evaluator.evaluate(test_data_pred)

print("Test accuracy:", accuracy)