# Lasso and Ridge Regression Library Implementation
In this notebook, we fit Lasso (L1 regularization) and Ridge (L2 regularization) regression models, using the pre-defined estimators from the scikit-learn library, to predict the shifted AQI value (`AQI_calculated_shifted`).

In [59]:
# Importing libraries 
import numpy as np
import pandas as pd
import error_define

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Dataset Operations 1: numeric features only

In [60]:
# Load the cleaned/shifted dataset from disk
df = pd.read_csv("../ML-Project-CS361/cleaned_shifted_data.csv")

# Positional indices of the columns that are excluded from the feature set
drop_positions = [0, 1, 2, 12, 14, 16]
drop_cols = df.columns[drop_positions]
drop_cols  # show the names of the columns that are about to be dropped

Index(['Timestamp', 'Unnamed: 0', 'Station', 'Checks', 'AQI_bucket_calculated',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [61]:
# Remove the unused columns; `df` is rebound to the reduced frame
df = df.drop(columns=drop_cols)

In [62]:
# After the drop the frame holds 174,762 records and 11 columns:
# 10 feature columns plus the target column
print(df.shape)

(174762, 11)


In [63]:
# Preview the first rows of the reduced frame to sanity-check the remaining columns
df.head()

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),Ozone (µg/m³),AQI_calculated,AQI_calculated_shifted
0,46.0,80.0,1.29,9.16,12.02,27.19,13.56,0.4,15.8,67.0,296.0
1,46.0,80.0,1.74,8.93,12.48,30.29,13.71,0.41,15.52,68.0,297.0
2,45.62,79.92,1.87,8.56,12.17,28.2,13.88,0.41,15.33,68.0,298.0
3,41.0,72.92,1.83,8.72,12.37,26.69,13.77,0.4,15.3,68.0,298.0
4,41.0,79.0,1.69,7.91,11.3,26.83,13.87,0.41,15.49,68.0,299.0


In [64]:
# No subsampling is performed here: df_subset is simply another name for the
# full dataframe (the name is kept because the following cells refer to it)
df_subset = df
print(df_subset.shape)

(174762, 11)


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df_subset.describe()

Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),Ozone (µg/m³),AQI_calculated,AQI_calculated_shifted
count,174762.0,174762.0,174762.0,174762.0,174762.0,174762.0,174762.0,174762.0,174762.0,174762.0,174762.0
mean,59.679503,111.238568,9.660644,8.324557,17.762988,9.984327,18.461032,0.689923,24.271172,140.573117,139.053112
std,59.876848,111.774626,20.843351,10.595687,33.777433,7.575767,13.741348,0.625532,22.874962,104.720841,105.081092
min,0.1,1.59,0.01,0.02,0.0,0.01,0.1,0.0,0.01,14.0,11.0
25%,20.0,37.67,1.21,2.52,4.59,4.0,10.08,0.31,12.67,56.0,56.0
50%,39.0,73.0,3.36,4.17,5.45,7.2,14.46,0.49,18.08,101.0,100.0
75%,81.0,148.0,5.5975,10.08,13.28,15.42,22.77,0.84,26.71,214.0,204.0
max,923.08,1000.0,472.55,122.0,488.62,113.3,195.01,9.71,193.57,1109.0,1109.0


In [66]:
# Split the frame into the response column and the remaining predictor columns
target_col = 'AQI_calculated_shifted'
Y = df_subset[target_col]                 # target variable
X = df_subset.drop(columns=[target_col])  # feature set
for part in (X, Y):
    print(part.shape)

(174762, 10)
(174762,)


In [67]:
# Convert the target to a 2-D numpy column array of shape (n_samples, 1).
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell keeps working; `Y.values` raises AttributeError once Y is already an array.
Y = np.asarray(Y).reshape(-1, 1)
Y

array([[296.],
       [297.],
       [298.],
       ...,
       [219.],
       [220.],
       [220.]])

In [68]:
# Hold out one third of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1 / 3, random_state=0)
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(116508, 10)
(58254, 10)
(116508, 1)
(58254, 1)


In [69]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

array([[ 3.67831173,  2.2237575 , -0.20599341, ...,  3.40249554,
         0.08160396,  1.92882892],
       [-0.29548835, -0.3588719 , -0.26667847, ..., -0.25689331,
        -0.23740736, -0.68081168],
       [ 1.56565852,  0.86541954, -0.35201684, ...,  0.5616542 ,
        -0.98845864,  1.75739267],
       ...,
       [-0.79850102, -0.71758036, -0.40464216, ..., -0.51369252,
        -0.34563555, -0.89034486],
       [-0.84880229, -0.76101143, -0.15526449, ...,  2.64814784,
        -0.94438184, -0.24269683],
       [-0.69370672, -0.52419593, -0.20220059, ..., -0.2247934 ,
        -0.31901491, -0.97606298]])

### L1 Regularization (Lasso)

In [70]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

# Hyper-parameters: alpha scales the L1 penalty, tol is the optimiser's stopping tolerance
parameters = dict(alpha=0.5, tol=0.1)
model = Lasso(**parameters).fit(X_train, Y_train)

# Evaluate on the held-out split
Y_pred = model.predict(X_test)
score = r2_score(Y_test, Y_pred)
print(f"R2 {score}")
mse = mean_squared_error(Y_test, Y_pred)
print("MSE", mse)

R2 0.4029755342896376
MSE 6677.476696000735


In [71]:
# Evaluate the project's custom error metric on column-shaped arrays.
# The helper is imported by name instead of via `import *`, so its origin is
# explicit and no unrelated names are pulled into the notebook namespace.
from error_define import downside_square_error

Y_pred1 = np.asarray(Y_pred).reshape(-1, 1)
Y_test1 = np.asarray(Y_test).reshape(-1, 1)
# NOTE(review): arguments are passed as (prediction, truth) — confirm this is the
# parameter order downside_square_error expects.
downside_square_error(Y_pred1, Y_test1)

array([4243.16112426])

### Ridge

In [72]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Hyper-parameters: alpha scales the L2 penalty
parameters = dict(alpha=0.5)
model = Ridge(**parameters).fit(X_train, Y_train)

# Evaluate on the held-out split
Y_pred = model.predict(X_test)
score = r2_score(Y_test, Y_pred)
print(f"R2 {score}")
mse = mean_squared_error(Y_test, Y_pred)
print("MSE", mse)

R2 0.4045024898684959
MSE 6660.398316672472


In [73]:
# Evaluate the project's custom error metric on column-shaped arrays.
# The helper is imported by name instead of via `import *`, so its origin is
# explicit and no unrelated names are pulled into the notebook namespace.
from error_define import downside_square_error

Y_pred1 = np.asarray(Y_pred).reshape(-1, 1)
Y_test1 = np.asarray(Y_test).reshape(-1, 1)
# NOTE(review): arguments are passed as (prediction, truth) — confirm this is the
# parameter order downside_square_error expects.
downside_square_error(Y_pred1, Y_test1)

array([4240.2167977])

## Dataset Operations 2: adding one-hot station and date features

In [74]:
from sklearn.preprocessing import OneHotEncoder

# Reload the raw file so that the 'Station' and 'Timestamp' columns are available again.
# NOTE(review): the first section reads "../ML-Project-CS361/cleaned_shifted_data.csv";
# confirm that both paths resolve to the same file.
df = pd.read_csv("cleaned_shifted_data.csv")

# Request a dense output array. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the new name first and fall back for older installations.
try:
    oe = OneHotEncoder(sparse_output=False)
except TypeError:
    oe = OneHotEncoder(sparse=False)

# One indicator column per distinct station name
encoded = oe.fit_transform(df[['Station']])

In [75]:
# Names of the generated one-hot columns. `get_feature_names` was superseded by
# `get_feature_names_out` in scikit-learn 1.0 and removed in 1.2, so prefer the
# new method and fall back only on older installations. With the new method the
# names are prefixed with the input column name rather than 'x0'.
oe.get_feature_names_out() if hasattr(oe, "get_feature_names_out") else oe.get_feature_names()

array(['x0_IITG ', 'x0_LGBI Airport ', 'x0_Pan Bazaar ',
       'x0_Railway Colony '], dtype=object)

In [76]:

# Column names for the one-hot block. `get_feature_names` was removed in
# scikit-learn 1.2; use `get_feature_names_out` when it is available.
if hasattr(oe, "get_feature_names_out"):
    one_hot_names = oe.get_feature_names_out()
else:
    one_hot_names = oe.get_feature_names()

# Append the station indicator columns to the right of the original columns
one_hot_df = pd.DataFrame(encoded, columns=one_hot_names)
df = pd.concat([df, one_hot_df], axis=1)

# Derive calendar features from the timestamp
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['year'] = df['Timestamp'].dt.year
df['month'] = df['Timestamp'].dt.month
df['dayofweek'] = df['Timestamp'].dt.day_of_week

# Every new column was appended at the end, so these positions still refer to
# the columns as read from the CSV (the same positions dropped in the first section).
drop_cols = [0,1,2,12,14,16]
drop_cols = df.columns[drop_cols]
df.drop(drop_cols,axis=1,inplace=True)

# Feature matrix and target vector
X = df.drop('AQI_calculated_shifted',axis = 1)
y = df['AQI_calculated_shifted']

In [77]:
# Check the shapes of the feature matrix (now including the station indicators
# and the year/month/dayofweek columns) and of the target vector
print(X.shape, y.shape)

(174762, 17) (174762,)


In [78]:
# Hold out one third of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=1 / 3, random_state=0)
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(116508, 17)
(58254, 17)
(116508,)
(58254,)


In [79]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

array([[ 3.67831173,  2.2237575 , -0.20599341, ...,  1.19263893,
        -1.54346737, -0.01566533],
       [-0.29548835, -0.3588719 , -0.26667847, ..., -0.95436545,
        -0.96118867, -0.01566533],
       [ 1.56565852,  0.86541954, -0.35201684, ..., -0.23869732,
        -1.25232802,  0.49471947],
       ...,
       [-0.79850102, -0.71758036, -0.40464216, ...,  0.47697081,
         0.49450809,  0.49471947],
       [-0.84880229, -0.76101143, -0.15526449, ...,  1.19263893,
         0.78564744,  1.00510426],
       [-0.69370672, -0.52419593, -0.20220059, ..., -0.95436545,
         0.49450809, -0.01566533]])

### Lasso

In [80]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

# Hyper-parameters: alpha scales the L1 penalty, tol is the optimiser's stopping tolerance
parameters = dict(alpha=0.5, tol=0.1)
model = Lasso(**parameters).fit(X_train, Y_train)

# Evaluate on the held-out split
Y_pred = model.predict(X_test)
score = r2_score(Y_test, Y_pred)
print(f"R2 {score}")
mse = mean_squared_error(Y_test, Y_pred)
print("MSE", mse)

R2 0.4139777006204265
MSE 6554.421924381027


In [81]:
# Evaluate the project's custom error metric on column-shaped arrays.
# The helper is imported by name instead of via `import *`, so its origin is
# explicit and no unrelated names are pulled into the notebook namespace.
from error_define import downside_square_error

# np.asarray handles the pandas Series target as well as a plain ndarray
Y_pred1 = np.asarray(Y_pred).reshape(-1, 1)
Y_test1 = np.asarray(Y_test).reshape(-1, 1)
# NOTE(review): arguments are passed as (prediction, truth) — confirm this is the
# parameter order downside_square_error expects.
downside_square_error(Y_pred1, Y_test1)

array([4181.79616743])

### Ridge

In [82]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Hyper-parameters: alpha scales the L2 penalty
parameters = dict(alpha=0.5)
model = Ridge(**parameters).fit(X_train, Y_train)

# Evaluate on the held-out split
Y_pred = model.predict(X_test)
score = r2_score(Y_test, Y_pred)
print(f"R2 {score}")
mse = mean_squared_error(Y_test, Y_pred)
print("MSE", mse)

R2 0.4162744542784833
MSE 6528.733665508954


In [83]:
# Evaluate the project's custom error metric on column-shaped arrays.
# The helper is imported by name instead of via `import *`, so its origin is
# explicit and no unrelated names are pulled into the notebook namespace.
from error_define import downside_square_error

# np.asarray handles the pandas Series target as well as a plain ndarray
Y_pred1 = np.asarray(Y_pred).reshape(-1, 1)
Y_test1 = np.asarray(Y_test).reshape(-1, 1)
# NOTE(review): arguments are passed as (prediction, truth) — confirm this is the
# parameter order downside_square_error expects.
downside_square_error(Y_pred1, Y_test1)

array([4168.87589923])