
# Predicting Air Pollution Levels using Random Forest

This notebook demonstrates the steps to use a Random Forest model to predict air pollution levels.


In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

Mean Imputation
---

In [None]:
# Option to keep column names

#set random seed
np.random.seed(42)

df_test = pd.read_csv('prepped_data/test_data.csv')
df_train = pd.read_csv('prepped_data/train_data.csv')

# Save the column names
column_names = df_train.columns

# Split train into X and Y
Xtrain = df_train.iloc[:, 8:].copy()  # Retain the column names by using copy
ytrain = df_train["PM25_ugm3"].values

# Split test into X and Y
Xtest = df_test.iloc[:, 8:].copy()  # Retain the column names by using copy
ytest = df_test["PM25_ugm3"].values

# Running a simple imputer and scaler
imputer = SimpleImputer(strategy="mean")
Xtrain_imputed = pd.DataFrame(imputer.fit_transform(Xtrain), columns=Xtrain.columns)  # Assigning column names back
Xtest_imputed = pd.DataFrame(imputer.transform(Xtest), columns=Xtest.columns)  # Assigning column names back

scaler = StandardScaler()
Xtrain_scaled = pd.DataFrame(scaler.fit_transform(Xtrain_imputed), columns=Xtrain_imputed.columns)  # Assigning column names back
Xtest_scaled = pd.DataFrame(scaler.transform(Xtest_imputed), columns=Xtest_imputed.columns)  # Assigning column names back

KNN Imputation

In [10]:
# Option to keep column names
# set random seed
np.random.seed(42)

from sklearn.impute import KNNImputer

# Split train into X and Y
Xtrain_KNN = df_train.iloc[:, 8:].copy()  # Retain the column names by using copy

# Split test into X and Y
Xtest_KNN = df_test.iloc[:, 8:].copy()  # Retain the column names by using copy

# Perform KNN imputation on the train and test data
imputer = KNNImputer(n_neighbors=5)
Xtrain_KNN_imputed = pd.DataFrame(imputer.fit_transform(Xtrain_KNN), columns=Xtrain_KNN.columns)  # Assigning column names back
Xtest_KNN_imputed = pd.DataFrame(imputer.transform(Xtest_KNN), columns=Xtest_KNN.columns)  # Assigning column names back

# Scale the data (not needed for random forest)
# Xtrain_KNN_scaled = pd.DataFrame(scaler.fit_transform(Xtrain_KNN_imputed), columns=Xtrain_KNN_imputed.columns)  # Assigning column names back
# Xtest_KNN_scaled = pd.DataFrame(scaler.transform(Xtest_KNN_imputed), columns=Xtest_KNN_imputed.columns)  # Assigning column names back

In [14]:
Xtrain_KNN_imputed.columns

Index(['SiteID', 'Lat', 'Long', 'day_of_week', 'hour_x', 'avgtempC',
       'maxtempC', 'mintempC', 'sunHour', 'uvIndex', 'humidity',
       'winddirDegree', 'windspeedKmph', 'cloudcover', 'precipMM', 'pressure',
       'DCC-AQ1-co', 'DCC-AQ1-no', 'DCC-AQ10-no', 'DCC-AQ13-no', 'DCC-AQ5-no',
       'DCC-AQ6-no', 'DCC-AQ1-no2', 'DCC-AQ10-no2', 'DCC-AQ13-no2',
       'DCC-AQ22-no2', 'DCC-AQ5-no2', 'DCC-AQ6-no2', 'DCC-AQ69-no2',
       'DCC-AQ22-o3', 'DCC-AQ69-o3', 'DCC-AQ10-pm1', 'DCC-AQ13-pm1',
       'DCC-AQ2-pm1', 'DCC-AQ3-pm1', 'DCC-AQ4-pm1', 'DCC-AQ5-pm1',
       'DCC-AQ52-pm1', 'DCC-AQ6-pm1', 'TNO2161-pm1', 'TNO2162-pm1',
       'TNO4435-pm1', 'TNT1088-pm1', 'DCC-AQ10-pm10', 'DCC-AQ13-pm10',
       'DCC-AQ2-pm10', 'DCC-AQ22-pm10', 'DCC-AQ3-pm10', 'DCC-AQ4-pm10',
       'DCC-AQ5-pm10', 'DCC-AQ52-pm10', 'DCC-AQ6-pm10', 'TNO2161-pm10',
       'TNO2162-pm10', 'TNO4435-pm10', 'TNT1088-pm10', 'DCC-AQ10-pm2_5',
       'DCC-AQ13-pm2_5', 'DCC-AQ2-pm2_5', 'DCC-AQ22-pm2_5', 'DCC-AQ3-pm2_5',
  

In [None]:
# save with column names
Xtrain_KNN_imputed.to_csv('prepped_data/with_column_names/Xtrain_KNN.csv', index=False)
Xtrain_KNN_imputed.to_csv('prepped_data/with_column_names/Xtest_KNN.csv', index=False)
ytrain.to_csv('prepped_data/with_column_names/ytrain.csv', index=False)
ytest.to_csv('prepped_data/with_column_names/ytest.csv', index=False)


## Load Data

Load your training and testing data from CSV files.


In [16]:
df_test = pd.read_csv('prepped_data/test_data.csv')
df_train = pd.read_csv('prepped_data/train_data.csv')

In [18]:
df_test.columns

Index(['Hour', 'date', 'NO_ugm3', 'NO2_ugm3', 'O3_ugm3', 'CO_mgm3', 'CO2_mgm3',
       'PM25_ugm3', 'SiteID', 'Lat', 'Long', 'day_of_week', 'hour_x',
       'avgtempC', 'maxtempC', 'mintempC', 'sunHour', 'uvIndex', 'humidity',
       'winddirDegree', 'windspeedKmph', 'cloudcover', 'precipMM', 'pressure',
       'DCC-AQ1-co', 'DCC-AQ1-no', 'DCC-AQ10-no', 'DCC-AQ13-no', 'DCC-AQ5-no',
       'DCC-AQ6-no', 'DCC-AQ1-no2', 'DCC-AQ10-no2', 'DCC-AQ13-no2',
       'DCC-AQ22-no2', 'DCC-AQ5-no2', 'DCC-AQ6-no2', 'DCC-AQ69-no2',
       'DCC-AQ22-o3', 'DCC-AQ69-o3', 'DCC-AQ10-pm1', 'DCC-AQ13-pm1',
       'DCC-AQ2-pm1', 'DCC-AQ3-pm1', 'DCC-AQ4-pm1', 'DCC-AQ5-pm1',
       'DCC-AQ52-pm1', 'DCC-AQ6-pm1', 'TNO2161-pm1', 'TNO2162-pm1',
       'TNO4435-pm1', 'TNT1088-pm1', 'DCC-AQ10-pm10', 'DCC-AQ13-pm10',
       'DCC-AQ2-pm10', 'DCC-AQ22-pm10', 'DCC-AQ3-pm10', 'DCC-AQ4-pm10',
       'DCC-AQ5-pm10', 'DCC-AQ52-pm10', 'DCC-AQ6-pm10', 'TNO2161-pm10',
       'TNO2162-pm10', 'TNO4435-pm10', 'TNT1088-pm10', 'DCC

In [14]:
import pandas as pd

# Load the datasets
X_train = pd.read_csv('prepped_data/Xtrain_KNN.csv', header=None)
X_test = pd.read_csv('prepped_data/Xtest_KNN.csv', header=None)
y_train = pd.read_csv('prepped_data/ytrain.csv', header=None)
y_test = pd.read_csv('prepped_data/ytest.csv', header=None)

In [15]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,75,76,77,78,79,80,81,82,83,84
0,-0.399399,-1.128775,-1.358738,0.739992,0.668166,-1.330213,-1.137422,-1.953354,0.900165,-0.207077,...,-0.789852,-0.479818,-0.040655,-0.582915,-0.783813,-0.550041,-1.160901,-0.704121,0.549269,-1.570160e+00
1,1.082869,-0.759257,-1.584956,0.739992,0.251265,-1.330213,-1.137422,-1.953354,0.900165,-0.207077,...,-0.875627,-0.626709,-0.065416,-0.606722,-0.655904,-0.547688,-1.141229,-0.441354,-0.407852,3.092100e-16
2,0.579897,-1.188387,-1.714836,0.739992,0.251265,-1.330213,-1.137422,-1.953354,0.900165,-0.207077,...,-0.875627,-0.626709,-0.065416,-0.606722,-0.655904,-0.547688,-1.141229,-0.441354,-0.407852,3.092100e-16
3,0.140213,-1.048941,-2.050665,0.739992,0.251265,-1.330213,-1.137422,-1.953354,0.900165,-0.207077,...,-0.875627,-0.626709,-0.065416,-0.606722,-0.655904,-0.547688,-1.141229,-0.441354,-0.407852,1.027287e+00
4,0.340070,-2.137602,-1.135172,0.739992,0.668166,-1.330213,-1.137422,-1.953354,0.900165,-0.207077,...,-0.789852,-0.479818,-0.040655,-0.582915,-0.783813,-0.550041,-1.160901,-0.704121,0.549269,3.092100e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34918,-1.332061,-0.132409,-1.337824,1.448186,-0.999438,-0.546162,-0.400707,-0.684064,0.900165,-0.207077,...,1.603739,1.230110,-0.496255,-0.678141,1.233214,-0.318336,-0.008083,0.000000,0.062163,4.030834e-01
34919,0.906329,-0.435661,-1.567350,1.448186,-0.999438,-0.546162,-0.400707,-0.684064,0.900165,-0.207077,...,1.603739,1.230110,-0.496255,-0.678141,1.233214,-0.318336,-0.008083,0.000000,0.062163,3.092100e-16
34920,-0.109607,-0.421234,-2.373674,1.448186,-0.999438,-0.546162,-0.400707,-0.684064,0.900165,-0.207077,...,1.603739,1.230110,-0.496255,-0.678141,1.233214,-0.318336,-0.008083,0.000000,0.062163,-1.570160e+00
34921,0.686487,-0.425282,-1.900000,1.448186,-0.999438,-0.546162,-0.400707,-0.684064,0.900165,-0.207077,...,1.603739,1.230110,-0.496255,-0.678141,1.233214,-0.318336,-0.008083,0.000000,0.062163,-4.074289e-01



## Train the Random Forest Model

Here we will configure and train our Random Forest model.


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Train the model
rf.fit(X_train, y_train.values.ravel())  # Use ravel() to convert y_train from a column vector to a 1d array, if necessary

In [4]:
# Features and target variable ('PM25_ugm3')
X_train = train_data.drop('PM25_ugm3', axis=1)
y_train = train_data['PM25_ugm3']

In [5]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=3)
rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: '2021-06-28'


## Predict and Evaluate the Model

Now, let's make predictions on the test set and evaluate the model.


In [None]:

# Prepare test data
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Make predictions
predictions = rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

f'Mean Squared Error: {mse}, R² Score: {r2}'



## Feature Importance

Visualize the importance of each feature in making predictions.


In [None]:

# Get feature importances
importances = rf_model.feature_importances_

# Sort the feature importances in descending order and plot them
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.tight_layout()
plt.show()
