In [108]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [109]:
df = pd.read_csv( "../ML-Project-CS361/cleaned_shifted_data.csv" ) 
drop_cols = [0,1,2,12,14,16]
drop_cols = df.columns[drop_cols]
drop_cols

Index(['Timestamp', 'Unnamed: 0', 'Station', 'Checks', 'AQI_bucket_calculated',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [110]:
df.drop(columns=drop_cols, inplace=True)
df.head()

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),Ozone (µg/m³),AQI_calculated,AQI_calculated_shifted
0,46.0,80.0,1.29,9.16,12.02,27.19,13.56,0.4,15.8,67.0,296.0
1,46.0,80.0,1.74,8.93,12.48,30.29,13.71,0.41,15.52,68.0,297.0
2,45.62,79.92,1.87,8.56,12.17,28.2,13.88,0.41,15.33,68.0,298.0
3,41.0,72.92,1.83,8.72,12.37,26.69,13.77,0.4,15.3,68.0,298.0
4,41.0,79.0,1.69,7.91,11.3,26.83,13.87,0.41,15.49,68.0,299.0


In [111]:
df.corr()['AQI_calculated_shifted']


PM2.5 (µg/m³)             0.502895
PM10 (µg/m³)              0.412791
NO (µg/m³)                0.281539
NO2 (µg/m³)               0.171903
NOx (ppb)                 0.245873
NH3 (µg/m³)               0.202202
SO2 (µg/m³)              -0.081608
CO (mg/m³)                0.391340
Ozone (µg/m³)            -0.013711
AQI_calculated            0.604224
AQI_calculated_shifted    1.000000
Name: AQI_calculated_shifted, dtype: float64

#### Removing The Ozone column since its correlation value with AQI_calculated_shifted is very low compare to others.

In [112]:
df=df.drop(['Ozone (µg/m³)'],axis=1)
df

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),AQI_calculated,AQI_calculated_shifted
0,46.00,80.00,1.29,9.16,12.02,27.19,13.56,0.40,67.0,296.0
1,46.00,80.00,1.74,8.93,12.48,30.29,13.71,0.41,68.0,297.0
2,45.62,79.92,1.87,8.56,12.17,28.20,13.88,0.41,68.0,298.0
3,41.00,72.92,1.83,8.72,12.37,26.69,13.77,0.40,68.0,298.0
4,41.00,79.00,1.69,7.91,11.30,26.83,13.87,0.41,68.0,299.0
...,...,...,...,...,...,...,...,...,...,...
174757,72.00,116.00,6.40,3.30,6.90,4.90,44.20,0.63,252.0,219.0
174758,71.00,114.00,6.40,3.40,7.00,4.90,41.30,0.68,249.0,219.0
174759,71.00,114.00,6.30,3.50,7.00,4.90,42.20,0.73,247.0,219.0
174760,73.00,114.00,6.30,5.00,7.80,5.50,0.40,0.76,238.0,220.0


In [113]:
df_np=df.to_numpy()
df_np.shape

(174762, 10)

In [114]:
X = df.drop(columns=[ 'AQI_calculated_shifted'])
X.columns

Index(['PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)', 'NO2 (µg/m³)',
       'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'AQI_calculated'],
      dtype='object')

In [115]:
Y=df['AQI_calculated_shifted']
Y

0         296.0
1         297.0
2         298.0
3         298.0
4         299.0
          ...  
174757    219.0
174758    219.0
174759    219.0
174760    220.0
174761    220.0
Name: AQI_calculated_shifted, Length: 174762, dtype: float64

In [116]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size =0.3,  random_state = 0 )

In [117]:

X_train_np=X_train.to_numpy()
Y_train_np=Y_train.to_numpy()


In [118]:
X_train_np

array([[2.400e+01, 1.630e+02, 2.100e-01, ..., 1.996e+01, 4.600e-01,
        1.220e+02],
       [8.900e+01, 1.950e+02, 5.400e+01, ..., 2.168e+01, 1.970e+00,
        1.930e+02],
       [3.100e+01, 8.000e+01, 2.070e+00, ..., 1.896e+01, 5.800e-01,
        2.610e+02],
       ...,
       [1.200e+01, 3.086e+01, 1.240e+00, ..., 2.823e+01, 3.700e-01,
        4.700e+01],
       [9.000e+00, 2.600e+01, 6.500e+00, ..., 7.700e+00, 2.340e+00,
        1.150e+02],
       [1.825e+01, 5.250e+01, 5.510e+00, ..., 1.895e+01, 5.500e-01,
        3.800e+01]])

In [119]:
Y_train_np.shape

(122333,)

In [120]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression().fit(X_train_np, Y_train_np)
Y_pred_train = lr.predict(X_train_np)
Y_pred_train

array([102.5065938 , 203.41985836, 191.50870193, ...,  75.9432746 ,
       133.95405761,  78.80723038])

In [121]:

import numpy as np
from numpy.linalg import inv

class Linear_Regression:
    def __init__(self):
        pass
# Assuming total_bedrooms[i] = alpha + (beta_1 * population[i]) + (beta_2 * households[i]) + (beta_3 * total_rooms[i]) + error
# Generally: y[i] = alpha + (beta_1 * x_1[i]) + (beta_2 * x_2[i]) + (beta_3 * x_3[i]) + error
# Model:     y_hat[i] = alpha_hat + (beta_1_hat * x_1[i]) + (beta_2_hat * x_2[i]) + (beta_3_hat * x_3[i])

    def get_predictions(self, model, X):
        '''
        Obtain the predictions for the given model and inputs.

        model: np.array of Floats with shape (p,) of parameters
        X: np.array of Floats with shape (n, p-1) of inputs

        Returns: np.array of Floats with shape (n,).
        '''
        (n, p_minus_one) = X.shape
        p = p_minus_one + 1

        new_X = np.ones(shape=(n, p))
        new_X[:, 1:] = X

        return np.dot(new_X, model)
    
    def get_best_model(self, X, y):
        '''
        Returns the model with the parameters that minimize the MSE.

        X: np.array of Floats with shape (n, p-1) of inputs
        y: np.array of Floats with shape (n,) of observed outputs

        Returns: np.array of shape (p,) representing the model.
        '''
        (n, p_minus_one) = X.shape
        p = p_minus_one + 1

        new_X = np.ones(shape=(n, p))
        new_X[:, 1:] = X

        return np.dot(np.dot(inv(np.dot(new_X.T, new_X)), new_X.T), y)


In [122]:
predictions_df_train = pd.DataFrame({
                               
                               'PM2.5 (µg/m³)': X_train['PM2.5 (µg/m³)'],
                               'PM10 (µg/m³)': X_train['PM10 (µg/m³)'],
                               'NO (µg/m³)': X_train['NO (µg/m³)'],
                               'NO2 (µg/m³)': X_train['NO2 (µg/m³)'],
                               'NOx (ppb)': X_train['NOx (ppb)'],
                               'NH3 (µg/m³)': X_train['NH3 (µg/m³)'],
                               'SO2 (µg/m³)': X_train['SO2 (µg/m³)'],
                               'CO (mg/m³)': X_train['CO (mg/m³)'],
                               'AQI_calculated': X_train['AQI_calculated'],
                               'AQI_calculated_shifted': Y_train,
    'Sklearn AQI Predictions':Y_pred_train
                               })
predictions_df_train

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),AQI_calculated,AQI_calculated_shifted,Sklearn AQI Predictions
49172,24.00,163.00,0.21,10.27,5.10,2.41,19.96,0.46,122.0,192.0,102.506594
50053,89.00,195.00,54.00,17.77,48.98,6.73,21.68,1.97,193.0,156.0,203.419858
139770,31.00,80.00,2.07,3.55,5.72,7.36,18.96,0.58,261.0,324.0,191.508702
149023,229.40,304.10,28.67,9.59,35.64,21.23,5.61,0.71,267.0,332.0,258.448641
5361,59.00,83.00,56.97,16.26,111.13,10.64,17.77,0.66,93.0,79.0,126.990786
...,...,...,...,...,...,...,...,...,...,...,...
95939,21.00,72.00,5.21,2.67,5.16,4.76,11.08,0.23,177.0,134.0,150.886811
152315,35.00,206.30,0.25,2.22,1.69,20.78,4.67,0.17,89.0,306.0,84.873857
117952,12.00,30.86,1.24,2.08,3.47,3.56,28.23,0.37,47.0,73.0,75.943275
173685,9.00,26.00,6.50,3.80,7.30,5.10,7.70,2.34,115.0,116.0,133.954058


In [123]:
linear_regression = Linear_Regression()

In [124]:
best_model = linear_regression.get_best_model(X_train_np, Y_train_np)
predictions_df_train['Best Predictions'] = linear_regression.get_predictions(best_model, X_train_np)
predictions_df_train

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),AQI_calculated,AQI_calculated_shifted,Sklearn AQI Predictions,Best Predictions
49172,24.00,163.00,0.21,10.27,5.10,2.41,19.96,0.46,122.0,192.0,102.506594,102.506594
50053,89.00,195.00,54.00,17.77,48.98,6.73,21.68,1.97,193.0,156.0,203.419858,203.419858
139770,31.00,80.00,2.07,3.55,5.72,7.36,18.96,0.58,261.0,324.0,191.508702,191.508702
149023,229.40,304.10,28.67,9.59,35.64,21.23,5.61,0.71,267.0,332.0,258.448641,258.448641
5361,59.00,83.00,56.97,16.26,111.13,10.64,17.77,0.66,93.0,79.0,126.990786,126.990786
...,...,...,...,...,...,...,...,...,...,...,...,...
95939,21.00,72.00,5.21,2.67,5.16,4.76,11.08,0.23,177.0,134.0,150.886811,150.886811
152315,35.00,206.30,0.25,2.22,1.69,20.78,4.67,0.17,89.0,306.0,84.873857,84.873857
117952,12.00,30.86,1.24,2.08,3.47,3.56,28.23,0.37,47.0,73.0,75.943275,75.943275
173685,9.00,26.00,6.50,3.80,7.30,5.10,7.70,2.34,115.0,116.0,133.954058,133.954058


In [125]:
X_test_np=X_test.to_numpy()
Y_test_np=Y_test.to_numpy()

In [126]:
X_test_np, Y_test_np

(array([[2.3311e+02, 4.2811e+02, 8.2720e+01, ..., 2.3710e+01, 3.6800e+00,
         3.2100e+02],
        [8.2000e+01, 1.1300e+02, 1.1560e+01, ..., 1.9270e+01, 8.2000e-01,
         1.2100e+02],
        [1.1500e+02, 2.6062e+02, 5.4900e+00, ..., 1.0260e+01, 1.0300e+00,
         2.0300e+02],
        ...,
        [2.8000e+01, 5.6000e+01, 5.3800e+00, ..., 1.2120e+01, 3.0000e-01,
         1.5000e+02],
        [2.1000e+01, 2.7000e+01, 5.4000e+00, ..., 8.6500e+00, 3.1000e-01,
         4.0000e+01],
        [3.2000e+01, 5.9000e+01, 1.0100e+00, ..., 1.4630e+01, 4.9000e-01,
         7.6000e+01]]),
 array([325., 149., 301., ..., 227., 132.,  32.]))

In [127]:
lr_test=LinearRegression().fit(X_test_np, Y_test_np)
Y_pred_test = lr.predict(X_test_np)
Y_pred_test

array([320.71814876, 138.8569213 , 175.93115792, ..., 142.08506363,
        88.57762889, 101.33273603])

In [128]:
predictions_df_test = pd.DataFrame({
                               
                               'PM2.5 (µg/m³)': X_test['PM2.5 (µg/m³)'],
                               'PM10 (µg/m³)': X_test['PM10 (µg/m³)'],
                               'NO (µg/m³)': X_test['NO (µg/m³)'],
                               'NO2 (µg/m³)': X_test['NO2 (µg/m³)'],
                               'NOx (ppb)': X_test['NOx (ppb)'],
                               'NH3 (µg/m³)': X_test['NH3 (µg/m³)'],
                               'SO2 (µg/m³)': X_test['SO2 (µg/m³)'],
                               'CO (mg/m³)': X_test['CO (mg/m³)'],
                               'AQI_calculated': X_test['AQI_calculated'],
                               'AQI_calculated_shifted': Y_test,
    'Sklearn AQI Predictions':Y_pred_test
                               })
predictions_df_test

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),AQI_calculated,AQI_calculated_shifted,Sklearn AQI Predictions
51427,233.11,428.11,82.72,14.19,68.66,4.17,23.71,3.68,321.0,325.0,320.718149
83982,82.00,113.00,11.56,2.93,17.70,11.87,19.27,0.82,121.0,149.0,138.856921
88698,115.00,260.62,5.49,2.57,5.34,11.81,10.26,1.03,203.0,301.0,175.931158
51363,123.00,221.00,14.03,14.72,17.64,5.03,22.09,1.51,324.0,326.0,251.552346
102949,81.00,121.00,5.30,2.85,5.36,6.17,12.50,1.35,107.0,180.0,136.097518
...,...,...,...,...,...,...,...,...,...,...,...
135147,18.57,62.43,5.38,1.91,5.03,4.79,9.23,0.29,127.0,47.0,126.757579
140757,31.00,83.00,1.41,0.67,2.46,9.38,20.52,0.46,181.0,267.0,148.120766
95688,28.00,56.00,5.38,2.62,5.30,5.44,12.12,0.30,150.0,227.0,142.085064
136279,21.00,27.00,5.40,2.31,5.23,4.60,8.65,0.31,40.0,132.0,88.577629


In [129]:
best_model = linear_regression.get_best_model(X_test_np, Y_test_np)
predictions_df_test['Best Predictions'] = linear_regression.get_predictions(best_model, X_test_np)
predictions_df_test

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),AQI_calculated,AQI_calculated_shifted,Sklearn AQI Predictions,Best Predictions
51427,233.11,428.11,82.72,14.19,68.66,4.17,23.71,3.68,321.0,325.0,320.718149,322.540560
83982,82.00,113.00,11.56,2.93,17.70,11.87,19.27,0.82,121.0,149.0,138.856921,138.859877
88698,115.00,260.62,5.49,2.57,5.34,11.81,10.26,1.03,203.0,301.0,175.931158,177.061788
51363,123.00,221.00,14.03,14.72,17.64,5.03,22.09,1.51,324.0,326.0,251.552346,254.423511
102949,81.00,121.00,5.30,2.85,5.36,6.17,12.50,1.35,107.0,180.0,136.097518,135.795602
...,...,...,...,...,...,...,...,...,...,...,...,...
135147,18.57,62.43,5.38,1.91,5.03,4.79,9.23,0.29,127.0,47.0,126.757579,128.186719
140757,31.00,83.00,1.41,0.67,2.46,9.38,20.52,0.46,181.0,267.0,148.120766,149.586436
95688,28.00,56.00,5.38,2.62,5.30,5.44,12.12,0.30,150.0,227.0,142.085064,143.779442
136279,21.00,27.00,5.40,2.31,5.23,4.60,8.65,0.31,40.0,132.0,88.577629,88.497519


In [131]:
mean_squared_error(Y_test,predictions_df_test['Best Predictions'])

6670.849580886624