In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import itertools
tf.config.set_visible_devices([], 'GPU') #disables GPU



data = pd.read_csv("BC-Data-Set.csv")
data = data.set_index(pd.to_datetime(data['date']))
data.describe()
data = data.sort_index()
data = data.reset_index(drop=True)
dates = data["date"]
dataset = data.drop(["date"], axis=1)



In [6]:
set(dataset.columns) - set(['N_CPC', 'PM-2.5', 'PM-1.0', 'NO2', 'O3', 'CO', 'NO', 'TEMP', 'HUM'])

{'BC', 'NOX', 'PM-10', 'SO2'}

#TEDIOUS PHASE, YOU HAVE TO SEE WRT TO THE DATA IF THE OUTLIERS SHOULD BE REMOVED
outliers = dataset.loc[dataset['BC'] > 10]
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['PM-10'] > 70]], axis=0)
except:
    pass
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['N_CPC'] > 70]], axis=0)
except:
    pass
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['PM-2.5'] > 60]], axis=0)
except:
    pass
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['SO2'] > 7]], axis=0)
except:
    pass
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['CO'] > 1.75]], axis=0)
except:
    pass
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['NO'] > 225]], axis=0)
except:
    pass
try:
    outliers = pd.concat([outliers, dataset.loc[dataset['NOX'] > 410]], axis=0)
except:
    pass

outliers = outliers[~outliers.duplicated()]
print (outliers.shape)

data = data.drop(outliers.index)

In [2]:
# Function to evaluate model performance
def evaluate_model(features, target, model):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=42)
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)])
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2

In [3]:
# Function to find the combination of features with the best performance
def find_best_feature_combination(features, target):
    best_rmse = float('inf')
    best_r2 = -float('inf')
    best_features = None
    num_features = len(features.columns)
    
    for i in range(9, num_features):
        feature_combinations = itertools.combinations(features.columns, i)
        print(i)
        for combo in feature_combinations:
            selected_features = features[list(combo)]
            model = tf.keras.Sequential()
            model.add(tf.keras.layers.GaussianNoise(0.00, input_shape=(i,)))
            model.add(tf.keras.layers.Dense(64, activation='relu'))
            #model.add(tf.keras.layers.BatchNormalization())
            model.add(tf.keras.layers.Dropout(0.0))
            model.add(tf.keras.layers.GaussianNoise(0.1))
            model.add(tf.keras.layers.Dense(112, activation='relu'))
            model.add(tf.keras.layers.GaussianNoise(0.05))
            model.add(tf.keras.layers.Dense(96, activation='relu'))
            model.add(tf.keras.layers.Dropout(0.1))
            model.add(tf.keras.layers.Dense(1))
            model.compile(loss='mean_squared_error', optimizer='adam')
            rmse, r2 = evaluate_model(selected_features, target, model)

            if rmse < best_rmse and r2 > best_r2:
                best_rmse = rmse
                best_r2 = r2
                best_features = list(combo)
    
    return best_features, best_r2, best_rmse

threshold = 5# theshold a little high to retain some outliers
z_scores = np.abs((X_train - X_train.mean()) / X_train.std())
outliers = (z_scores > threshold).any(axis=1)
for column in X_train.columns:
    column_median = X_train[column].median()
    X_train.loc[outliers, column] = column_median

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

target = pd.DataFrame(dataset.BC)
data = dataset.drop(['BC'], axis=1)

scaler_x = StandardScaler()
x_scaled = scaler_x.fit_transform(data)
x_scaled = pd.DataFrame(x_scaled, columns=data.columns)

#do the same for y

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(target) 
y_scaled = pd.DataFrame(y_scaled, columns=target.columns)


print(y_scaled.describe())   
x_scaled.describe()

                 BC
count  4.223000e+03
mean   1.665729e-16
std    1.000118e+00
min   -1.068886e+00
25%   -5.800934e-01
50%   -3.016109e-01
75%    1.813890e-01
max    1.021032e+01


Unnamed: 0,N_CPC,PM-10,PM-2.5,PM-1.0,NO2,O3,SO2,CO,NO,NOX,TEMP,HUM
count,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0,4223.0
mean,-1.211439e-16,1.346044e-16,-1.884461e-16,-1.3460440000000002e-17,-1.3460440000000002e-17,1.144137e-16,-2.086367e-16,-3.903526e-16,3.365109e-18,-2.6920870000000002e-17,-2.15367e-16,-8.076261e-17
std,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118,1.000118
min,-1.490589,-1.019634,-1.445664,-1.264327,-1.287226,-1.805996,-1.086622,-1.00552,-0.3939257,-0.8945402,-2.246321,-3.394947
25%,-0.7196058,-0.4618384,-0.698399,-0.7410321,-0.7361954,-0.6949478,-0.4610184,-0.5332776,-0.3669326,-0.542082,-0.8304461,-0.6837334
50%,-0.2555834,-0.1578222,-0.2249122,-0.2633677,-0.3229224,0.03298058,-0.4610184,-0.5332776,-0.2589603,-0.299767,-0.114422,0.08143306
75%,0.4679226,0.2339678,0.4266043,0.4377968,0.4577044,0.6459729,0.7901887,-0.06103483,-0.09700172,0.1408058,0.8072774,0.8032273
max,7.265794,26.91344,11.09081,4.950602,4.682273,3.595999,14.55347,7.494849,13.34556,10.56035,2.801624,1.922991


In [5]:
# Find the best feature combination
best_features, best_r2, best_rmse = find_best_feature_combination(x_scaled, y_scaled)

print("Best feature combination:", best_features, best_r2, best_rmse)

9


2023-06-02 12:30:22.453098: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


10
11
Best feature combination: ['N_CPC', 'PM-2.5', 'NO2', 'O3', 'CO', 'NO', 'NOX', 'TEMP', 'HUM'] 0.8126650074140349 0.4442680232655792


Outliers removed:
Best feature combination: ['N_CPC', 'PM-2.5', 'PM-1.0', 'NO2', 'O3', 'CO', 'NO', 'TEMP', 'HUM'] 0.8122760511895382 0.44472899216608563

Outliers not removed:
Best feature combination: ['N_CPC', 'PM-2.5', 'PM-1.0', 'O3', 'CO', 'NOX', 'TEMP', 'HUM'] 0.7912537483185005 0.4689698735499292