
# Predicting Air Pollution Levels using Linear Models 

This notebook demonstrates the steps to use Linear, Ridge and a Lasso Regression models to predict air pollution levels.

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score



Loading the data 

In [6]:
full_data = pd.read_csv("prepped_data/Full_data.csv") 
full_data.columns

Index(['Hour', 'date', 'NO_ugm3', 'NO2_ugm3', 'O3_ugm3', 'CO_mgm3', 'CO2_mgm3',
       'PM25_ugm3', 'SiteID', 'Lat', 'Long', 'day_of_week', 'avgtempC',
       'maxtempC', 'mintempC', 'sunHour', 'uvIndex', 'humidity',
       'winddirDegree', 'windspeedKmph', 'cloudcover', 'precipMM', 'pressure',
       'DCC-AQ1-co', 'DCC-AQ1-no', 'DCC-AQ10-no', 'DCC-AQ13-no', 'DCC-AQ5-no',
       'DCC-AQ6-no', 'DCC-AQ1-no2', 'DCC-AQ10-no2', 'DCC-AQ13-no2',
       'DCC-AQ22-no2', 'DCC-AQ5-no2', 'DCC-AQ6-no2', 'DCC-AQ69-no2',
       'DCC-AQ22-o3', 'DCC-AQ69-o3', 'DCC-AQ10-pm1', 'DCC-AQ13-pm1',
       'DCC-AQ2-pm1', 'DCC-AQ3-pm1', 'DCC-AQ4-pm1', 'DCC-AQ5-pm1',
       'DCC-AQ52-pm1', 'DCC-AQ6-pm1', 'TNO2161-pm1', 'TNO2162-pm1',
       'TNO4435-pm1', 'TNT1088-pm1', 'DCC-AQ10-pm10', 'DCC-AQ13-pm10',
       'DCC-AQ2-pm10', 'DCC-AQ22-pm10', 'DCC-AQ3-pm10', 'DCC-AQ4-pm10',
       'DCC-AQ5-pm10', 'DCC-AQ52-pm10', 'DCC-AQ6-pm10', 'TNO2161-pm10',
       'TNO2162-pm10', 'TNO4435-pm10', 'TNT1088-pm10', 'DCC-AQ10-pm2_

In [7]:
df_test = full_data[full_data['date'] >= '2022-05-01']
df_train = full_data[full_data['date'] < '2022-05-01']

In [8]:
# #Split train into X and Y
Xtrain = df_train.iloc[:, 8:].values
ytrain = df_train["PM25_ugm3"].values

# #Split test into X and Y
Xtest = df_test.iloc[:, 8:].values
ytest = df_test["PM25_ugm3"].values


Hot one encoding

In [9]:
# Assuming you know the names of the categorical columns
categorical_columns = ['SiteID', 'day_of_week']  # List of categorical column names

# Convert arrays back to DataFrame for easier manipulation
Xtrain_df = pd.DataFrame(Xtrain, columns=df_train.columns[8:])
Xtest_df = pd.DataFrame(Xtest, columns=df_test.columns[8:])

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder on the training data
encoder.fit(Xtrain_df[categorical_columns])

# Transform both training and test data
Xtrain_encoded = encoder.transform(Xtrain_df[categorical_columns])
Xtest_encoded = encoder.transform(Xtest_df[categorical_columns])

# Create DataFrames from the encoded arrays, include column names for easier merging
encoded_columns = encoder.get_feature_names_out(categorical_columns)
Xtrain_encoded_df = pd.DataFrame(Xtrain_encoded, columns=encoded_columns)
Xtest_encoded_df = pd.DataFrame(Xtest_encoded, columns=encoded_columns)

# Drop the original categorical columns and concat the new encoded columns
Xtrain_final = pd.concat([Xtrain_df.drop(categorical_columns, axis=1), Xtrain_encoded_df], axis=1)
Xtest_final = pd.concat([Xtest_df.drop(categorical_columns, axis=1), Xtest_encoded_df], axis=1)




In [6]:
# np.random.seed(0)   
# # imputing 
# imputer = SimpleImputer(strategy="mean")
# Xtrain_1 = imputer.fit_transform(Xtrain_final)  # Impute
# Xtest_1 = imputer.transform(Xtest_final)  # Impute

# # Convert back to DataFrame
# Xtrain_1 = pd.DataFrame(Xtrain_1, columns=Xtrain_final.columns)
# Xtest_1 = pd.DataFrame(Xtest_1, columns=Xtest_final.columns)

# # scaling 
# scaler = StandardScaler()
# Xtrain_1 = scaler.fit_transform(Xtrain_1)  # Scale
# Xtest_1 = scaler.transform(Xtest_1)  # Scale

# # Convert back to DataFrame
# Xtrain_1 = pd.DataFrame(Xtrain_1, columns=Xtrain_final.columns)
# Xtest_1 = pd.DataFrame(Xtest_1, columns=Xtest_final.columns)

In [32]:
# np.random.seed(0)   

# imputer = KNNImputer(n_neighbors=5)
# Xtrain_KNN = imputer.fit_transform(Xtrain_final)
# Xtest_KNN = imputer.transform(Xtest_final)

# # Convert back to DataFrame
# Xtrain_2 = pd.DataFrame(Xtrain_KNN, columns=Xtrain_final.columns)
# Xtest_2 = pd.DataFrame(Xtest_KNN, columns=Xtest_final.columns)

# Xtrain = scaler.fit_transform(Xtrain_2)
# Xtest = scaler.transform(Xtest_2)

# # Convert back to DataFrame
# Xtrain_KNN = pd.DataFrame(Xtrain, columns=Xtrain_final.columns)
# Xtest_KNN = pd.DataFrame(Xtest, columns=Xtest_final.columns)

In [10]:
# print(Xtrain_KNN.shape) 
# print(Xtest_KNN.shape)  

print(ytrain.shape)
print(ytest.shape)

# print(Xtrain_1.shape)
# print(Xtest_1.shape)

# print(ytrain.shape)
# print(ytest.shape)

(34923,)
(8272,)


In [11]:
# # mine

Xtrain_KNN_encoded = pd.read_csv("prepped_data/Xtrain_KNN_encoded.csv")
Xtest_KNN_encoded = pd.read_csv("prepped_data/Xtest_KNN_encoded.csv")   


Linear Regression

Linear Regression with KNN imputed data 

In [12]:
linear_model = LinearRegression()

# You can still use TimeSeriesSplit for cross-validation to evaluate model performance
tscv = TimeSeriesSplit(n_splits=5)

# Evaluate the model using cross-validation
scores = cross_val_score(linear_model, Xtrain_KNN_encoded, ytrain, cv=tscv, scoring='neg_mean_squared_error')

# Print the average of the scores (neg_mean_squared_error)
print("Average MSE:", np.mean(scores))

Average MSE: -9.31181087046276e+24


In [13]:
# Initialize the Linear Regression model
linear_model = LinearRegression()

# Fit the model on the training data
linear_model.fit(Xtrain_KNN_encoded, ytrain)

# Predict on the test data
ypred_linear = linear_model.predict(Xtest_KNN_encoded)

# Calculate the mean squared error
mse = mean_squared_error(ytest, ypred_linear)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 35.73540005659166


Ridge 

In [14]:
# Initialize Ridge Regression
ridge = Ridge()
tscv = TimeSeriesSplit(n_splits=5)
# Define parameter grid
param_grid = {
    'alpha': np.logspace(-4, 4, 100)  # This creates 20 logarithmically spaced values between 10^-4 and 10^4.
}


In [15]:
# Setup GridSearchCV
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [16]:
# Fit GridSearchCV to find the best model
grid_search.fit(Xtrain_KNN_encoded, ytrain)

# Output the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score (negative MSE):", grid_search.best_score_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'alpha': 8302.175681319752}
Best score (negative MSE): -52.22462135905255


In [17]:
best_alpha = grid_search.best_params_['alpha']
print("Best parameters: alpha =", best_alpha)
print("Best score (negative MSE):", grid_search.best_score_)

Best parameters: alpha = 8302.175681319752
Best score (negative MSE): -52.22462135905255


In [18]:
# Initialize the Ridge Regression model with the best alpha
ridge_model = Ridge(alpha= best_alpha)

# Fit the model on the training data
ridge_model.fit(Xtrain_KNN_encoded, ytrain) 

In [19]:
ypred_ridge = ridge_model.predict(Xtest_KNN_encoded)  # Predict on the test data
mse = mean_squared_error(ytest, ypred_ridge)  # Calculate the mean squared error
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 27.823239142819737


Lasso 

In [20]:
# Define parameter grid
lasso = Lasso()
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'alpha': np.logspace(-4, 4, 100)  # Creates 100 logarithmically spaced values between 10^-4 and 10^4.
}



In [21]:
# Setup GridSearchCV
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=tscv, 
                           scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [22]:
# Runtime 30 minutes
# Fit GridSearchCV to find the best model
grid_search.fit(Xtrain_KNN_encoded, ytrain)

# Output the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score (negative MSE):", grid_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'alpha': 0.08111308307896872}
Best score (negative MSE): -53.68917929499321


In [23]:
best_alpha = grid_search.best_params_['alpha']
print("Best parameters: alpha =", best_alpha)
print("Best score (negative MSE):", grid_search.best_score_)

Best parameters: alpha = 0.08111308307896872
Best score (negative MSE): -53.68917929499321


In [24]:
# Initialize the Ridge Regression model with the best alpha
lasso_model = Lasso(alpha = best_alpha)

# Fit the model on the training data
lasso_model.fit(Xtrain_KNN_encoded, ytrain) 

In [25]:
ypred_lasso = lasso_model.predict(Xtest_KNN_encoded)  # Predict on the test data
mse = mean_squared_error(ytest, ypred_lasso)  # Calculate the mean squared error
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 27.09031238631191


In [44]:

print(ypred_linear)
print(ypred_ridge)
print(ypred_lasso)

[ 8.95060329 13.97654323  8.62638454 ...  3.79949431  4.99381117
  5.77860121]
[ 9.27192981 13.00452593  8.58082855 ...  5.03594065  5.63670285
  6.54427142]
[ 9.25954061 10.51681352  7.84400238 ...  5.73270154  5.66078255
  7.39519242]


R squared 

In [47]:
r_squared_linear = r2_score(ytest, ypred_linear)
print("R-squared of Linear Model (using r2_score function):", r_squared_linear)

r_squared_ridge = r2_score(ytest, ypred_ridge)
print("R-squared of Ridge (using r2_score function):", r_squared_ridge)

r_squared_lasso = r2_score(ytest, ypred_lasso)
print("R-squared of Lasso (using r2_score function):", r_squared_lasso)

R-squared of Linear Model (using r2_score function): -0.19014792007349413
R-squared of Ridge (using r2_score function): 0.07336226422274894
R-squared of Lasso (using r2_score function): 0.09777198829027356


In [37]:
# Get feature names (assuming you have them stored in a list)
feature_names = linear_model.feature_names_in_ # Replace with your actual feature names

# Pair feature names with coefficients
feature_coef_pairs = zip(feature_names, linear_model.coef_)

# Sort features by coefficient magnitude
sorted_features = sorted(feature_coef_pairs, key=lambda x: abs(x[1]), reverse=True)

# Print the top N most important features
top_n = 3
for feature, coef in sorted_features[:top_n]:
    print(f"Feature: {feature}, Coefficient: {coef}")

Feature: SiteID_963.0, Coefficient: 129881424610.02441
Feature: SiteID_144.0, Coefficient: 126543674884.56956
Feature: SiteID_819.0, Coefficient: 115011470972.7988


In [38]:
# Get feature names (assuming you have them stored in a list)
feature_names = ridge_model.feature_names_in_ # Replace with your actual feature names

# Pair feature names with coefficients
feature_coef_pairs = zip(feature_names, ridge_model.coef_)

# Sort features by coefficient magnitude
sorted_features = sorted(feature_coef_pairs, key=lambda x: abs(x[1]), reverse=True)

# Print the top N most important features
top_n = 3
for feature, coef in sorted_features[:top_n]:
    print(f"Feature: {feature}, Coefficient: {coef}")

Feature: SiteID_835.0, Coefficient: 0.8307375072683622
Feature: SiteID_830.0, Coefficient: 0.5927670969506216
Feature: SiteID_80.0, Coefficient: 0.5147536938056233


In [43]:
# Get feature names (assuming you have them stored in a list)
feature_names = lasso_model.feature_names_in_ # Replace with your actual feature names

# Pair feature names with coefficients
feature_coef_pairs = zip(feature_names, lasso_model.coef_)

# Sort features by coefficient magnitude
sorted_features = sorted(feature_coef_pairs, key=lambda x: abs(x[1]), reverse=True)

# Print the top N most important features
top_n = 3
for feature, coef in sorted_features[:top_n]:
    print(f"Feature: {feature}, Coefficient: {coef}")

Feature: SiteID_835.0, Coefficient: 0.9776238444346154
Feature: DCC-AQ10-pm4, Coefficient: 0.9603606420838743
Feature: TNT1088-pm2_5, Coefficient: 0.8811241598391167
