In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the dataset from "dataframe.csv" (path is relative to the working directory) into `df`.
df = pd.read_csv("dataframe.csv")

In [3]:
# Preview the first 15 rows to sanity-check the columns and value ranges.
df.head(15)

Unnamed: 0,PM2.5,PM10,CO,DEWP,SO2,TEMP,NO2,PRES,O3,WSPM
0,4.0,4.0,300.0,-18.8,4.0,-0.7,7.0,1023.0,77.0,4.4
1,8.0,8.0,300.0,-18.2,4.0,-1.1,7.0,1023.2,77.0,4.7
2,7.0,7.0,300.0,-18.2,5.0,-1.1,10.0,1023.5,73.0,5.6
3,6.0,6.0,300.0,-19.4,11.0,-1.4,11.0,1024.5,72.0,3.1
4,3.0,3.0,300.0,-19.5,12.0,-2.0,12.0,1025.2,72.0,2.0
5,5.0,5.0,400.0,-19.6,18.0,-2.2,18.0,1025.6,66.0,3.7
6,3.0,3.0,500.0,-19.1,18.0,-2.6,32.0,1026.5,50.0,2.5
7,3.0,6.0,500.0,-19.1,19.0,-1.6,41.0,1027.4,43.0,3.8
8,3.0,6.0,500.0,-19.2,16.0,0.1,43.0,1028.3,45.0,4.1
9,3.0,8.0,400.0,-19.3,12.0,1.2,28.0,1028.5,59.0,2.6


In [4]:
# Dataset dimensions as (number of rows, number of columns).
df.shape

(210384, 10)

In [22]:
# Summarise column dtypes and non-null counts (quick check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210384 entries, 0 to 210383
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   PM2.5   210384 non-null  float64
 1   PM10    210384 non-null  float64
 2   CO      210384 non-null  float64
 3   DEWP    210384 non-null  float64
 4   SO2     210384 non-null  float64
 5   TEMP    210384 non-null  float64
 6   NO2     210384 non-null  float64
 7   PRES    210384 non-null  float64
 8   O3      210384 non-null  float64
 9   WSPM    210384 non-null  float64
dtypes: float64(10)
memory usage: 16.1 MB


## RandomForestRegressor

In [25]:
# Random forest baseline for PM2.5: randomized hyperparameter search with
# 5-fold cross-validation, followed by evaluation on a held-out 20% test split.
#
# Every name this cell uses is imported here so it runs on a fresh kernel;
# previously train_test_split and the metric functions were only imported by
# later cells, so Restart & Run All failed with NameError.
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Split the data into training and testing sets
X = df.drop("PM2.5", axis=1)  # Features: every column except the target
y = df["PM2.5"]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Search space for the randomized search. List entries are candidate values;
# max_features is sampled as a float from uniform(0, 1).
param_grid = {
    'n_estimators': [30, 70],
    'max_depth': [5, 7],
    'min_samples_split': [5, 7],
    'min_samples_leaf': [1, 2],
    'max_features': uniform(0, 1),
    'bootstrap': [True, False],
}

# Create a Random Forest Regressor model and train it on the training set with hyperparameter tuning
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

# Use the best model found by the search to make predictions on the test set
best_rf = rf_random.best_estimator_
y_pred = best_rf.predict(X_test)

r2 = r2_score(y_test, y_pred)
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows and p features.
adj_r2 = 1 - (1 - r2) * ((X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1))
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated in recent
# scikit-learn releases, and this form works on old and new versions alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("The accuracy metrics for PM2.5 is:")
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)
print("RMSE:", rmse)
print("MAE:", mae)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
The accuracy metrics for PM2.5 is:
R-squared: 0.9075381450940917
Adjusted R-squared: 0.9075183633959873
RMSE: 24.478969946724163
MAE: 15.721167307682345


## KNeighborsRegressor

In [10]:
# K-nearest-neighbours regressor for PM2.5: exhaustive grid search with 5-fold
# cross-validation, followed by evaluation on a held-out 20% test split.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor


X_train, X_test, y_train, y_test = train_test_split(df.drop('PM2.5', axis=1), df['PM2.5'], test_size=0.2, random_state=42)

# creating a KNeighborsRegressor model
# NOTE(review): the features are passed in unscaled. KNN is distance-based, so
# columns with large magnitudes may dominate the metric; consider wrapping the
# model in a Pipeline with StandardScaler and re-running the evaluation.
model = KNeighborsRegressor()

# defining the hyperparameter grid to search over
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# performing the grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# evaluating the model on the test set
y_pred = best_model.predict(X_test)
y_true = y_test  # alias kept in case other cells reference `y_true`

r2 = r2_score(y_test, y_pred)
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows and p features.
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated in recent
# scikit-learn releases, and this form works on old and new versions alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("The evaluation metrics for PM2.5 is:")
print(f'R-Squared: {r2:.4f}')
print(f'Adjusted R-Squared: {adj_r2:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')

The evaluation metrics for PM2.5 is:
R-Squared: 0.9176
Adjusted R-Squared: 0.9176
RMSE: 23.1052
MAE: 13.9700


## XGBRegressor

In [13]:
# XGBoost regressor for PM2.5 with fixed hyperparameters, evaluated on a
# held-out 20% test split.
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# separating the features and target variable
X = df[['PM10', 'CO', 'DEWP', 'SO2', 'TEMP', 'NO2', 'PRES', 'O3', 'WSPM']]
y = df['PM2.5']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create an XGBRegressor model with a lower number of estimators
model = xgb.XGBRegressor(n_estimators=50, max_depth=3, learning_rate=0.1, random_state=42)

# train the model on the training data
model.fit(X_train, y_train)

# make predictions on the testing data
y_pred = model.predict(X_test)

# calculating the accuracy metrics for PM2.5 predictions
# R^2 is computed once and reused for the adjusted value below.
r2 = r2_score(y_test, y_pred)
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows and p features.
adj_r2 = 1 - (1 - r2) * ((X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1))
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated in recent
# scikit-learn releases, and this form works on old and new versions alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("The accuracy metrics for XGBRegressor (PM2.5 only) are:")
print(f"Adjusted R-squared: {adj_r2:.4f}")
print(f"R-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

The accuracy metrics for XGBRegressor (PM2.5 only) are:
Adjusted R-squared: 0.9007
R-squared: 0.9007
RMSE: 25.3634
MAE: 16.2878


## GradientBoostingRegressor

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

In [24]:
# Gradient boosting baseline for PM2.5 with default hyperparameters,
# evaluated on a held-out 20% test split.

# the nine predictor columns; PM2.5 is the regression target
feature_cols = ['PM10', 'CO', 'DEWP', 'SO2', 'TEMP', 'NO2', 'PRES', 'O3', 'WSPM']
X = df[feature_cols]
y = df['PM2.5']

# hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# build the regressor and fit it on the training rows
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# predict the held-out rows
y_pred = model.predict(X_test)

# test-set metrics; n_obs test rows and n_feats predictors feed the adjusted R^2
n_obs, n_feats = X_test.shape[0], X_test.shape[1]
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * ((n_obs - 1) / (n_obs - n_feats - 1))
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("The accuracy metrics for GradientBoostingRegressor is:")
for template, value in (
    ('R-squared: {:.4f}', r2),
    ('Adjusted R-squared: {:.4f}', adj_r2),
    ('Root Mean Squared Error: {:.4f}', rmse),
    ('Mean Absolute Error: {:.4f}', mae),
):
    print(template.format(value))

The accuracy metrics for GradientBoostingRegressor is:
R-squared: 0.9090
Adjusted R-squared: 0.9089
Root Mean Squared Error: 24.2900
Mean Absolute Error: 15.4774
