# Model training

In [8]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
from scipy import stats

# Load the raw dataset from CSV (path is relative to the current working directory).
# Later cells in this notebook read its `date` column and use `BC` as the target.
data = pd.read_csv("BC-Data-Set.csv")

In [9]:
# Clean the raw frame in a single pipeline:
#   1. parse the `date` column into datetime values,
#   2. discard every row that still contains a missing value,
#   3. promote `date` to the DataFrame index.
data = (
    data
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna()
    .set_index('date')
)

In [10]:
# Random seed passed as `random_state` to the data splits and the tree-based
# models below, so repeated runs produce the same partitions and fits.
seed = 42

In [11]:
# Separate the predictors from the `BC` target up front, then carve out the
# hold-out sets: 15% of all rows for test, and 15% of the remainder for validation.
features = data.drop(columns=['BC'])
target = data['BC']

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.15, random_state=seed, shuffle=True
)

# Second split: validation rows are taken out of the training portion only.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.15, random_state=seed, shuffle=True
)

In [12]:
from sklearn.preprocessing import StandardScaler

## Random Forest

In [13]:
# TODO: tune max_depth, n_estimators?

In [14]:
# Baseline: Extra-Trees regressor with default hyper-parameters, scored on the
# validation split (R^2 and RMSE).
clf = ExtraTreesRegressor(random_state=seed)
scaler = StandardScaler()

# Fit the scaler on the training split only and reuse the fitted statistics for
# validation; each split is transformed exactly once.
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

clf.fit(x_train_scaled, y_train)
r2 = clf.score(x_val_scaled, y_val)
y_hat = clf.predict(x_val_scaled)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(y_val, y_hat))
r2, rmse

(0.7784590248066453, 0.5888828067406692)

In [15]:
# Baseline: Random-Forest regressor with default hyper-parameters, scored on the
# validation split (R^2 and RMSE).
clf = RandomForestRegressor(random_state=seed)
scaler = StandardScaler()

# Fit the scaler on the training split only and reuse the fitted statistics for
# validation; each split is transformed exactly once.
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

clf.fit(x_train_scaled, y_train)
r2 = clf.score(x_val_scaled, y_val)
y_hat = clf.predict(x_val_scaled)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(y_val, y_hat))
r2, rmse

(0.7807920328011395, 0.5857738909843954)

## SVM

In [16]:
from sklearn.svm import SVR

In [17]:
# TODO: tune gamma, C

In [18]:
# Baseline: support-vector regressor with default hyper-parameters, scored on
# the validation split (R^2 and RMSE).
clf = SVR()
scaler = StandardScaler()

# Fit the scaler on the training split only and reuse the fitted statistics for
# validation; each split is transformed exactly once.
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

clf.fit(x_train_scaled, y_train)
r2 = clf.score(x_val_scaled, y_val)
y_hat = clf.predict(x_val_scaled)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(y_val, y_hat))
r2, rmse

(0.7076857617858519, 0.6764361795167784)

# Subset selection / regularization

In [26]:
# TODO: try different column combinations, also tune parameters after dropping

In [25]:
# Feature-subset experiment: refit the default SVR after removing the `TEMP`
# and `SO2` columns and compare its validation scores with the full-feature run.
clf = SVR()
scaler = StandardScaler()

_x_train = x_train.drop(columns=['TEMP', 'SO2'])
_x_val = x_val.drop(columns=['TEMP', 'SO2'])

# Fit the scaler on the reduced training split only and reuse the fitted
# statistics for validation; each split is transformed exactly once.
_x_train_scaled = scaler.fit_transform(_x_train)
_x_val_scaled = scaler.transform(_x_val)

clf.fit(_x_train_scaled, y_train)
r2 = clf.score(_x_val_scaled, y_val)
y_hat = clf.predict(_x_val_scaled)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(y_val, y_hat))
# we MAXIMIZE R2 and MINIMIZE RMSE
r2, rmse

(0.6568496069050174, 0.7328989398700102)