
# Predicting Air Pollution Levels using Random Forest

This notebook demonstrates the steps to use a Random Forest model to predict air pollution levels.


In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd



## Load Data

Load your training and testing data from CSV files.


In [6]:
# Load training data
train_data = pd.read_csv('prepped_data/train_data.csv')

# Load testing data
test_data = pd.read_csv('prepped_data/test_data.csv')

In [7]:
# Display the first few rows of the training data
train_data

Unnamed: 0,date,NO_ugm3,NO2_ugm3,O3_ugm3,CO_mgm3,CO2_mgm3,PM25_ugm3,SiteID,Lat,Long,...,DCC-AQ52-pm4,DCC-AQ6-pm4,DCC-AQ1-so2,DCC-AQ13-so2,DCC-AQ22-so2,TNO2161-tsp,TNO2162-tsp,TNO4435-tsp,TNT1088-tsp,hour_y
0,2021-05-06,-16.485096,4.267365,70.815263,0.275750,786.544417,3.815981,325,53.327618,-6.309232,...,3.89,6.13,2.22,-0.40,0.51,0.18,7.07,8.9,8.78,14
1,2021-05-06,42.372065,30.017355,54.062545,0.284226,812.418643,3.748469,325,53.327618,-6.309232,...,3.36,5.10,2.07,-0.49,0.64,0.20,7.27,11.2,4.30,13
2,2021-05-06,9.866286,22.328920,62.542684,0.289449,800.817056,3.922340,96,53.325333,-6.309004,...,3.89,6.13,2.22,-0.40,0.51,0.18,7.07,8.9,8.78,14
3,2021-05-06,12.882471,23.080419,60.164121,0.292954,802.091506,3.476367,629,53.326740,-6.313319,...,3.36,5.10,2.07,-0.49,0.64,0.20,7.27,11.2,4.30,13
4,2021-05-06,24.053745,26.399404,52.199647,0.318830,802.504068,4.964429,587,53.317764,-6.332609,...,3.36,5.10,2.07,-0.49,0.64,0.20,7.27,11.2,4.30,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35382,2022-04-29,-13.018833,20.744500,56.517333,0.614667,792.976200,16.048500,843,53.348227,-6.301325,...,18.68,18.12,-0.54,-0.76,2.56,2.15,18.79,,6.50,10
35383,2022-04-29,22.585729,24.308723,59.316882,0.355894,802.139021,18.535245,503,53.341829,-6.307583,...,18.68,18.12,-0.54,-0.76,2.56,2.15,18.79,,6.50,10
35384,2022-04-29,64.062625,39.713778,44.604667,0.361111,786.006375,18.134556,211,53.342251,-6.349336,...,18.68,18.12,-0.54,-0.76,2.56,2.15,18.79,,6.50,10
35385,2022-04-29,-4.928278,1.075500,70.017143,0.337611,776.157769,17.647368,553,53.342390,-6.338905,...,18.68,18.12,-0.54,-0.76,2.56,2.15,18.79,,6.50,10


In [8]:
train_data.columns

Index(['date', 'NO_ugm3', 'NO2_ugm3', 'O3_ugm3', 'CO_mgm3', 'CO2_mgm3',
       'PM25_ugm3', 'SiteID', 'Lat', 'Long', 'month', 'day_of_week', 'hour_x',
       'avgtempC', 'maxtempC', 'mintempC', 'sunHour', 'uvIndex', 'humidity',
       'winddirDegree', 'windspeedKmph', 'cloudcover', 'precipMM', 'pressure',
       'DCC-AQ1-co', 'DCC-AQ1-no', 'DCC-AQ10-no', 'DCC-AQ13-no', 'DCC-AQ5-no',
       'DCC-AQ6-no', 'DCC-AQ1-no2', 'DCC-AQ10-no2', 'DCC-AQ13-no2',
       'DCC-AQ22-no2', 'DCC-AQ5-no2', 'DCC-AQ6-no2', 'DCC-AQ69-no2',
       'DCC-AQ22-o3', 'DCC-AQ69-o3', 'DCC-AQ10-pm1', 'DCC-AQ13-pm1',
       'DCC-AQ2-pm1', 'DCC-AQ3-pm1', 'DCC-AQ4-pm1', 'DCC-AQ5-pm1',
       'DCC-AQ52-pm1', 'DCC-AQ6-pm1', 'TNO2161-pm1', 'TNO2162-pm1',
       'TNO4435-pm1', 'TNT1088-pm1', 'DCC-AQ10-pm10', 'DCC-AQ13-pm10',
       'DCC-AQ2-pm10', 'DCC-AQ22-pm10', 'DCC-AQ3-pm10', 'DCC-AQ4-pm10',
       'DCC-AQ5-pm10', 'DCC-AQ52-pm10', 'DCC-AQ6-pm10', 'TNO2161-pm10',
       'TNO2162-pm10', 'TNO4435-pm10', 'TNT1088-pm10', 'DC

In [9]:
train_data.dtypes

date            object
NO_ugm3        float64
NO2_ugm3       float64
O3_ugm3        float64
CO_mgm3        float64
                ...   
TNO2161-tsp    float64
TNO2162-tsp    float64
TNO4435-tsp    float64
TNT1088-tsp    float64
hour_y           int64
Length: 93, dtype: object


## Train the Random Forest Model

Here we will configure and train our Random Forest model.


In [4]:
# Features and target variable ('PM25_ugm3')
X_train = train_data.drop('PM25_ugm3', axis=1)
y_train = train_data['PM25_ugm3']

In [5]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=3)
rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: '2021-06-28'


## Predict and Evaluate the Model

Now, let's make predictions on the test set and evaluate the model.


In [ ]:

# Prepare test data
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Make predictions
predictions = rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

f'Mean Squared Error: {mse}, R² Score: {r2}'



## Feature Importance

Visualize the importance of each feature in making predictions.


In [ ]:

# Get feature importances
importances = rf_model.feature_importances_

# Sort the feature importances in descending order and plot them
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.tight_layout()
plt.show()
