In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
import theano.tensor as tt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load the sample from a CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer="balanced_sample.csv")

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
data.info()

In [None]:
# Histogram (with KDE overlay) of every column from the fifth one onwards.
pollutants = data.columns[4:]

# Lay the panels out three per row; the number of rows follows the number of
# columns so that more than 15 columns no longer raises an IndexError
# (15 columns still gives the original 5x3 grid at 16x16 inches).
n_cols = 3
n_rows = max(1, -(-len(pollutants) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(16, 3.2 * n_rows), squeeze=False
)

# At least 15 shades keeps the original colours when there are <= 15 columns.
palette = sns.color_palette("PuRd", max(len(pollutants), 15))

for i, pollutant in enumerate(pollutants):
    ax = axs[i // n_cols, i % n_cols]

    # Work on finite values only (drops NaN and +/-inf) and use the same
    # cleaned series for the bin rule, the plot and the summary lines.
    values = data[pollutant]
    dat = values[np.isfinite(values)]

    ax.set_title(pollutant)
    ax.set_xlabel('Value')
    # histplot's default statistic is a count per bin, not a density.
    ax.set_ylabel('Count')

    if dat.empty:
        # Nothing to draw for an all-missing column; leave an empty, titled panel.
        continue

    x_min = dat.min()
    x_max = dat.max()

    # Bin width from Scott's normal-reference rule: h = 3.5 * sigma / n^(1/3).
    n = len(dat)
    sigma = dat.std(ddof=0)
    k = 3.5 * sigma / (n ** (1 / 3))
    if np.isfinite(k) and k > 0:
        # Never ask for fewer than one bin (range / k can truncate to 0).
        num_bins = max(1, int((x_max - x_min) / k))
    else:
        # Constant column (sigma == 0): a single bin is all that makes sense.
        num_bins = 1

    # A KDE is undefined for zero-variance data, so only overlay it otherwise.
    sns.histplot(
        dat,
        kde=bool(sigma > 0),
        bins=num_bins,
        color=palette[i],
        ax=ax,
    )

    # Dashed black line at the mean, dashed violet lines at mean +/- one
    # sample standard deviation.
    mean = dat.mean()
    std = dat.std()
    ax.axvline(mean, color='k', linestyle='dashed', linewidth=1)
    ax.axvline(mean + std, color='#8C78F0', linestyle='dashed', linewidth=1)
    ax.axvline(mean - std, color='#8C78F0', linestyle='dashed', linewidth=1)

    # Clamp the x-axis to the observed range (skipped for constant columns,
    # where identical limits would only trigger a matplotlib warning).
    if x_min < x_max:
        ax.set_xlim([x_min, x_max])

# Hide any panels of the grid that have no column assigned to them.
for unused_ax in axs.ravel()[len(pollutants):]:
    unused_ax.set_visible(False)

# Adjust the spacing between subplots
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# Show the plot
plt.show()

In [None]:
# seaborn (sns) and matplotlib.pyplot (plt) are already imported in the
# notebook's first cell, so they are not re-imported here.

# Use a two-colour BuPu palette for the plots that follow
sns.set_palette('BuPu', 2)

# Bar chart of how many rows fall into each AQI_GenPop_Index category.
# (To inspect the 'AQI_Index' column instead, swap the column name below.)
ax = data['AQI_GenPop_Index'].value_counts().plot(kind='bar')

# Title and axis labels, set on the axes returned by the plot call
ax.set_title('Distribution of my categorical variable')
ax.set_xlabel('Categories')
ax.set_ylabel('Frequency')

# Display plot
plt.show()

In [None]:
# Columns used as model inputs: the 11-feature set the author marked "Best".
# Other sets that were tried, per the author's notes:
#   * 5 features  ["NO_2", "O_3", "PM10", "PM25", "SO_2"]        -> "not so good"
#   * 15 features (this set plus 'NOx', 'MXY', 'OXY', 'PXY')     -> "NOT BAD"
feature_columns = [
    'BEN', 'EBE', 'CO', 'NMHC', 'NO_2', 'O_3',
    'PM10', 'PM25', 'SO_2', 'TCH', 'TOL',
]

# Feature matrix as a plain NumPy array, one column per entry above.
X = data.loc[:, feature_columns].values

In [None]:
# Target vector y: the values of the AQI_GenPop_Index column as a NumPy array.
target_column = "AQI_GenPop_Index"
y = data.loc[:, target_column].values

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Number of input features = number of columns of the training matrix.
n_features = X_train.shape[1]
# Binary setup. For the multi-class variant use len(np.unique(y_train)) instead.
n_classes = 2

In [None]:
# Sanity check: report the dimensions of the training arrays.
print("Shape of X_train:", np.shape(X_train))
print("Shape of y_train:", np.shape(y_train))

In [None]:
# Per-feature (column-wise) mean and population standard deviation of the
# training matrix.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)

print(f"Mean of features: {mean}")
print(f"Std of features: {std}")

In [None]:
# GEN_POP - Binary Bayesian logistic regression model in PyMC3
#
# NOTE: the helper below used to be defined at column 0 in the middle of the
# `with` block. That ended the model context early and left the likelihood
# lines sitting after the helper's `return`, i.e. as unreachable code, so the
# model had priors but no likelihood. The helper now lives outside the model
# and the likelihood is back inside the `with` block.


def logistic(x, epsilon=1e-6):
    """Logistic (sigmoid) function of a Theano tensor plus a small offset.

    Kept so the name stays defined; the model below does not call it.

    Args:
        x: Theano tensor (or scalar) of logits.
        epsilon: constant added to the sigmoid output.

    Returns:
        ``1 / (1 + exp(-x)) + epsilon``. Because epsilon is added after the
        sigmoid, the result can exceed 1 by up to ``epsilon``.
    """
    return 1 / (1 + tt.exp(-x)) + epsilon


with pm.Model() as AQI_model:
    # Standard-normal priors on the coefficients and the intercept; the
    # coefficients start the sampler at zero.
    coeffs = pm.Normal("coeffs", mu=0, sigma=1, shape=n_features, testval=np.zeros((n_features)))
    bias = pm.Normal("bias", mu=0, sigma=1)

    # Linear predictor (log-odds of the positive class) for every training row.
    linear = pm.math.dot(X_train, coeffs) + bias

    # Bernoulli likelihood parameterised directly by the logit. This is the
    # same model as p = sigmoid(linear), but avoids p saturating to exactly
    # 0 or 1 when the linear predictor is large in magnitude.
    # Assumes y_train holds 0/1 labels -- TODO confirm.
    y_obs = pm.Bernoulli("y_obs", logit_p=linear, observed=y_train)

In [None]:
# MCMC: draw posterior samples for the binary model with NUTS.
with AQI_model:
    # pm.Metropolis() and pm.HamiltonianMC() were tried as alternative steppers.
    step = pm.NUTS(target_accept=0.8)
    # 8 chains x (1000 tuning + 10000 kept) draws. The fixed seed makes the
    # run repeatable on a fresh kernel.
    trace = pm.sample(
        10000,
        tune=1000,
        chains=8,
        cores=8,
        step=step,
        random_seed=42,
        progressbar=True,
    )
    sns.set_palette("BuPu")
    pm.plot_trace(trace)

In [None]:
# Posterior summary table for the binary model, printed as plain text.
with AQI_model:
    summary_table = pm.summary(trace)
    print(summary_table)

In [None]:
# BINARY Predicting on test data

def predict_proba(X, trace):
    """Class probabilities from the posterior-mean coefficients and bias.

    This is a plug-in prediction: the sampled "coeffs" and "bias" are averaged
    first and the logistic function is applied once to the resulting linear
    predictor.

    Args:
        X: array of shape (n_samples, n_features).
        trace: mapping-like object where ``trace["coeffs"]`` is an array of
            shape (n_draws, n_features) and ``trace["bias"]`` an array of
            draws of the intercept.

    Returns:
        Array of shape (n_samples, 2) with columns [P(class 0), P(class 1)].
    """
    linear = np.dot(X, trace["coeffs"].mean(axis=0)) + trace["bias"].mean()
    # sigmoid(z) = exp(-log(1 + exp(-z))). Going through logaddexp avoids the
    # overflow that 1 / (1 + np.exp(-z)) hits for large negative z.
    proba = np.exp(-np.logaddexp(0.0, -linear))
    return np.column_stack((1 - proba, proba))

# Probabilities for the held-out rows, then the most probable class per row.
y_test_pred_proba = predict_proba(X=X_test, trace=trace)
y_test_pred = y_test_pred_proba.argmax(axis=1)

In [None]:
# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion matrix:")
print(cm)

# Overall fraction of correctly classified test rows
accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy:", accuracy)

# Macro-averaged precision / recall / F1 (unweighted mean over classes);
# 'micro' and 'weighted' are the other averaging options to try.
# precision_recall_fscore_support is imported with the other sklearn metrics
# in the notebook's import cell.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_test_pred, average='macro'
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1_score)

In [None]:
# Multiclass variant (gave poor accuracy). These are the original cells -- do not modify them.

In [None]:
# Multiclass Bayesian (softmax) logistic regression model in PyMC3
#
# NOTE: the likelihood lines used to be indented inside `softmax`, after its
# `return`, so they never ran and the model had no likelihood. They are now
# part of the model context. In addition, only n_classes - 1 logit columns are
# sampled, so a zero-logit reference column is appended for the last class;
# without it the softmax covered only n_classes - 1 classes (and was
# constantly 1 for n_classes = 2).


def softmax(x, epsilon=1e-6):
    """Row-wise softmax of a 2-D Theano tensor with a small additive floor.

    Args:
        x: Theano matrix of logits, one row per sample.
        epsilon: constant added to every exponentiated entry before
            normalising, so no probability is exactly zero.

    Returns:
        Theano matrix of the same shape whose rows sum to 1.
    """
    # Subtract the row maximum before exponentiating to avoid overflow.
    e_x = tt.exp(x - tt.max(x, axis=1, keepdims=True))
    e_x = e_x + epsilon
    return e_x / tt.sum(e_x, axis=1, keepdims=True)


with pm.Model() as AQI_model:
    # Standard-normal priors, one coefficient column and one intercept per
    # non-reference class; all start the sampler at zero.
    coeffs = pm.Normal("coeffs", mu=0, sigma=1, shape=(n_features, n_classes - 1), testval=np.zeros((n_features, n_classes - 1)))
    bias = pm.Normal("bias", mu=0, sigma=1, shape=(n_classes - 1,), testval=np.zeros(n_classes - 1))

    # Logits of the first n_classes - 1 classes for every training row.
    linear = pm.math.dot(X_train, coeffs) + bias

    # The last class is the reference class with its logit fixed at 0.
    logits = tt.concatenate([linear, tt.zeros_like(linear[:, :1])], axis=1)
    softmax_probs = softmax(logits)

    # Categorical likelihood over n_classes outcomes.
    # Assumes y_train holds integer labels 0 .. n_classes - 1 -- TODO confirm.
    y_obs = pm.Categorical("y_obs", p=softmax_probs, observed=y_train)

In [None]:
# MCMC: draw posterior samples for the multiclass model with NUTS.
with AQI_model:
    # pm.Metropolis() was tried as an alternative stepper.
    step = pm.NUTS(target_accept=0.8)
    # 8 chains x (800 tuning + 8000 kept) draws. The fixed seed makes the run
    # repeatable on a fresh kernel.
    trace = pm.sample(
        8000,
        tune=800,
        chains=8,
        cores=8,
        step=step,
        random_seed=42,
        progressbar=True,
    )
    pm.plot_trace(trace)

In [None]:
# Posterior summary table for the multiclass model.
with AQI_model:
    # An expression inside a `with` block is not the cell's final expression,
    # so the notebook would not display it; print it explicitly (as the
    # binary-model summary cell does).
    print(pm.summary(trace))

In [None]:
# MULTI-CLASS: predicting on test data
def predict_proba(X, trace):
    """Class probabilities from the posterior-mean coefficients and biases.

    The sampled parameters cover only the first ``n_classes - 1`` classes, so
    a reference column with logit 0 is appended for the last class before the
    softmax. Without it the result had one column too few and the last class
    could never be predicted.

    Args:
        X: array of shape (n_samples, n_features).
        trace: mapping-like object where ``trace["coeffs"]`` has shape
            (n_draws, n_features, n_classes - 1) and ``trace["bias"]`` has
            shape (n_draws, n_classes - 1).

    Returns:
        Array of shape (n_samples, n_classes); every row sums to 1.
    """
    linear = np.dot(X, trace["coeffs"].mean(axis=0)) + trace["bias"].mean(axis=0)
    # Append the reference class (last column, logit fixed at 0).
    logits = np.column_stack((linear, np.zeros(linear.shape[0])))
    # Subtract the row maximum before exponentiating to avoid overflow.
    softmax = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return softmax / np.sum(softmax, axis=1, keepdims=True)

# Probabilities for the held-out rows, then the most probable class per row.
y_test_pred_proba = predict_proba(X=X_test, trace=trace)
y_test_pred = y_test_pred_proba.argmax(axis=1)

In [None]:
# Confusion matrix of true vs. predicted test labels, printed under a heading.
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print("Confusion matrix:", cm, sep="\n")

In [None]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Test accuracy:", accuracy)

In [None]:
# Macro-averaged precision / recall / F1 (unweighted mean over classes);
# 'micro' and 'weighted' are the other averaging options to try.
# precision_recall_fscore_support is imported with the other sklearn metrics
# in the notebook's import cell.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_test_pred, average='macro'
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1_score)

In [None]:
# Micro-averaged precision and recall: computed from the totals of true
# positives, false positives and false negatives pooled over all classes.
# precision_score and recall_score are already imported in the notebook's
# first cell, so they are not re-imported here.
precision_micro = precision_score(y_test, y_test_pred, average='micro')
recall_micro = recall_score(y_test, y_test_pred, average='micro')

print("Micro-averaged precision:", precision_micro)
print("Micro-averaged recall:", recall_micro)


In [None]:
# Macro-averaged precision and recall: the metric is computed per class and
# the per-class values are averaged with equal weight.
# precision_score and recall_score are already imported in the notebook's
# first cell, so they are not re-imported here.
precision_macro = precision_score(y_test, y_test_pred, average='macro')
recall_macro = recall_score(y_test, y_test_pred, average='macro')

print("Macro-averaged precision:", precision_macro)
print("Macro-averaged recall:", recall_macro)
