
# Predicting Air Pollution Levels using Random Forest

This notebook demonstrates the steps to use a Random Forest model to predict air pollution levels.


In [ ]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


## Load Data

Load your training and testing data from CSV files.


In [3]:
# Load training data
train_data = pd.read_csv('prepped_data/train_data.csv')

# Load testing data
test_data = pd.read_csv('prepped_data/test_data.csv')

# Display the first few rows of the training data
train_data.head()

Unnamed: 0,date,NO_ugm3,NO2_ugm3,O3_ugm3,CO_mgm3,CO2_mgm3,PM25_ugm3,SiteID,Lat,Long,...,o3_TNO4323,o3_TNO4324,o3_TNO4325,o3_TNO4390,o3_TNO4435,o3_TNO4437,o3_TNO4438,o3_TNO4488,o3_TNT1088,o3_TNT1296
0,2021-06-28,12.044255,0.085176,39.303789,0.351863,878.897362,5.072396,1,53.348754,-6.257607,...,,,,,,,,,,
1,2021-06-28,5.263025,9.600681,29.41339,0.350409,842.139168,7.998587,1,53.348754,-6.257607,...,,,,,,,,,,
2,2021-08-16,10.318074,3.006013,61.380364,0.380456,809.073556,6.122852,1,53.348754,-6.257607,...,,,,,,,,,,
3,2021-10-22,456.213318,,,0.432339,850.479574,5.752877,1,53.348754,-6.257607,...,,,,,,,,,,
4,2021-05-17,-25.8959,12.993164,51.743435,0.317525,816.905789,3.402297,1,53.348754,-6.257607,...,,,,,,,,,,


In [6]:
train_data.dtypes

date           object
NO_ugm3       float64
NO2_ugm3      float64
O3_ugm3       float64
CO_mgm3       float64
               ...   
o3_TNO4437    float64
o3_TNO4438    float64
o3_TNO4488    float64
o3_TNT1088    float64
o3_TNT1296    float64
Length: 295, dtype: object


## Train the Random Forest Model

Here we will configure and train our Random Forest model.


In [4]:
# Features and target variable ('PM25_ugm3')
X_train = train_data.drop('PM25_ugm3', axis=1)
y_train = train_data['PM25_ugm3']

In [5]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=3)
rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: '2021-06-28'


## Predict and Evaluate the Model

Now, let's make predictions on the test set and evaluate the model.


In [ ]:

# Prepare test data
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Make predictions
predictions = rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

f'Mean Squared Error: {mse}, R² Score: {r2}'



## Feature Importance

Visualize the importance of each feature in making predictions.


In [ ]:

# Get feature importances
importances = rf_model.feature_importances_

# Sort the feature importances in descending order and plot them
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.tight_layout()
plt.show()
