In [None]:
#Import libraries as needed
import os
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
import theano.tensor as tt

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, first, last, lag, lead, when
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Start a local Spark session that uses every available core and 4 GB of driver memory.
# getOrCreate() reuses an already-running session, so re-executing this cell does not
# fail with "Cannot run multiple SparkContexts at once" the way SparkContext(conf=conf) does.
conf = SparkConf().setAppName('yuck').setMaster("local[*]").set("spark.driver.memory", "4g")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep the `sc` handle available for any code that uses the SparkContext directly
sc = spark.sparkContext

In [None]:
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema for the CSV input, written as (column name, Spark type, nullable)
# triples and expanded into StructFields below. Column order matters: it has to match
# the column order of the file being read.
# NOTE(review): PXY is the only pollutant column declared non-nullable — confirm that
# this is intentional and not a typo.
schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        # Row index, timestamp and AQI bookkeeping columns
        ("row_num", IntegerType(), False),
        ("unix_time", TimestampType(), True),
        ("AQI_Index", IntegerType(), True),
        ("AQI_Category", StringType(), True),
        ("AQI_GenPop_Category", StringType(), True),
        ("AQI_GenPop_Index", IntegerType(), True),
        # Pollutant measurement columns
        ("BEN", DoubleType(), True),
        ("CO", DoubleType(), True),
        ("EBE", DoubleType(), True),
        ("MXY", DoubleType(), True),
        ("NMHC", DoubleType(), True),
        ("NO_2", DoubleType(), True),
        ("NOx", DoubleType(), True),
        ("OXY", DoubleType(), True),
        ("O_3", DoubleType(), True),
        ("PM10", DoubleType(), True),
        ("PM25", DoubleType(), True),
        ("PXY", DoubleType(), False),
        ("SO_2", DoubleType(), True),
        ("TCH", DoubleType(), True),
        ("TOL", DoubleType(), True),
    ]
])

In [None]:
# Load the CSV file into the Spark DataFrame `data`, applying the explicit schema.
# The file location can be overridden through the AQI_DATA_PATH environment variable so
# the notebook is not tied to one machine; when the variable is unset the original
# path is used unchanged.
data_path = os.environ.get(
    "AQI_DATA_PATH",
    'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year\\clean_data_norm.csv',
)
# header=True: the first line of the file holds column names and is not read as data
data = spark.read.csv(data_path, header=True, schema=schema)

In [None]:
# Sanity checks: verify that the Spark DataFrame `data` was loaded correctly

In [None]:
# Print the column names, data types and nullability of the loaded DataFrame
data.printSchema()

In [None]:
#Check if df ok
from pyspark.sql.functions import col, sum
from pyspark.sql.functions import isnan, when, count

# Count the nulls in every column with a single aggregation: each isNull() flag is cast
# to 0/1 and the flags are added up per column. `sum` here is the Spark SQL aggregate
# imported above, not Python's builtin.
null_counts = data.agg(
    *(sum(col(name).isNull().cast("int")).alias(name) for name in data.columns)
)

# Display the one-row result (one null count per column)
null_counts.show()

In [None]:
# Names of the pollutant measurement columns: everything after the first six columns
# of `data` (with the schema used in this notebook those six are row_num, unix_time
# and the four AQI_* columns)
pollutants=data.columns[6:]

In [None]:
from pyspark.sql.functions import mean, format_number

# Mean of every pollutant column, computed in one aggregation over `data`
means = data.agg(*(mean(name).alias(name) for name in pollutants))

# Render each mean with four decimal places for display
formatted_means = means.select(
    *(format_number(name, 4).alias(name) for name in means.columns)
)

# Show the formatted means (a single row, one value per pollutant)
formatted_means.show()

In [None]:
from pyspark.sql.functions import stddev, format_number

# Standard deviation of every pollutant column, computed in one aggregation over `data`
stds = data.agg(*(stddev(name).alias(name) for name in pollutants))

# Render each standard deviation with four decimal places for display
formatted_stvs = stds.select(
    *(format_number(name, 4).alias(name) for name in stds.columns)
)

# Show the formatted standard deviations (a single row, one value per pollutant)
formatted_stvs.show()

In [None]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
import pyspark.sql.functions as F

In [None]:
# Feature assembly: pack the selected pollutant columns into a single vector column
# named "features" and keep only that vector plus the target column, renamed "label".
feature_columns = [
    'BEN', 'EBE', 'CO', 'NMHC', 'NO_2', 'O_3',
    'PM10', 'PM25', 'SO_2', 'TCH', 'TOL',
]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
# NOTE: `data` is rebound to the two-column frame here, so the raw columns are no
# longer reachable through `data` after this cell has run.
data = (
    assembler
    .transform(data)
    .select(col("AQI_GenPop_Index").alias("label"), col("features"))
)

In [None]:
# Split the data into train and test sets with the DataFrame's randomSplit method,
# using weights 0.8 / 0.2 and a fixed seed of 42
train, test = data.randomSplit([0.8, 0.2], seed=42)

# TODO (later): use the small DataFrame as train and the big DataFrame as test

In [None]:
# Build the Bayesian logistic regression model in PyMC3
n_features = len(feature_columns)


def logistic(x, epsilon=1e-6):
    """Numerically safe logistic (sigmoid) function for theano tensors.

    Parameters
    ----------
    x : theano tensor
        Linear predictor (any shape).
    epsilon : float, optional
        Margin that keeps the result away from exactly 0 and 1.

    Returns
    -------
    theano tensor
        Element-wise probabilities in the interval [epsilon, 1 - epsilon].
    """
    # Rescale into [epsilon, 1 - epsilon] instead of adding epsilon on top of the
    # sigmoid: sigmoid(x) + epsilon can exceed 1, which is not a valid probability.
    return (1 - 2 * epsilon) / (1 + tt.exp(-x)) + epsilon


# Collect features and labels in ONE pass so that row i of the feature matrix is
# guaranteed to belong to label i; two independent collect() calls give no such
# guarantee. toArray() turns each Spark ML vector into a plain NumPy row.
# NOTE(review): this pulls the whole DataFrame onto the driver and fits on `data`,
# not on the `train` split created earlier — confirm that this is intended.
observed_rows = data.select("features", "label").collect()
feature_matrix = np.array([row["features"].toArray() for row in observed_rows], dtype=float)
labels = np.array([row["label"] for row in observed_rows])

with pm.Model() as AQI_model_spark:
    # Priors for coefficients and bias, with explicit zero starting values
    coeffs = pm.Normal("coeffs", mu=0, sigma=1, shape=n_features, testval=np.zeros((n_features)))
    bias = pm.Normal("bias", mu=0, sigma=1)

    # Per-row success probability from the linear predictor
    p = logistic(pm.math.dot(feature_matrix, coeffs) + bias)

    # Bernoulli likelihood. It has to be created inside the `with` block: PyMC3
    # random variables can only be registered while a model context is active.
    # NOTE(review): assumes every label is 0 or 1 — TODO confirm for AQI_GenPop_Index
    y_obs = pm.Bernoulli("y_obs", p=p, observed=labels)
