In [1]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
df = pd.read_csv("dataframe.csv")

In [3]:
df

Unnamed: 0,PM2.5,PM10,CO,DEWP,SO2,TEMP,NO2,PRES,O3,WSPM
0,4.0,4.0,300.0,-18.8,4.0,-0.7,7.0,1023.0,77.0,4.4
1,8.0,8.0,300.0,-18.2,4.0,-1.1,7.0,1023.2,77.0,4.7
2,7.0,7.0,300.0,-18.2,5.0,-1.1,10.0,1023.5,73.0,5.6
3,6.0,6.0,300.0,-19.4,11.0,-1.4,11.0,1024.5,72.0,3.1
4,3.0,3.0,300.0,-19.5,12.0,-2.0,12.0,1025.2,72.0,2.0
...,...,...,...,...,...,...,...,...,...,...
210379,14.0,58.0,500.0,-17.0,4.0,12.8,19.0,1009.9,56.0,3.1
210380,27.0,83.0,700.0,-15.5,6.0,11.1,60.0,1010.4,26.0,1.9
210381,22.0,37.0,600.0,-15.9,7.0,10.5,52.0,1010.8,27.0,2.3
210382,9.0,23.0,400.0,-14.9,3.0,8.9,13.0,1010.9,57.0,1.6


In [6]:
X = df.drop('PM2.5', axis=1)
y = df['PM2.5']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Extra Tree

In [7]:
model = ExtraTreesRegressor()

params_grid = {'n_estimators': [50, 100],
               'max_depth': [5, 10, 15],
               'min_samples_split': [2, 5],
               'min_samples_leaf': [1, 2],
               'max_features': ['auto', 'sqrt']}

# Perform grid search
grid_search = GridSearchCV(model, param_grid=params_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=ExtraTreesRegressor(), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [50, 100]})

In [11]:
# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)

# Print results
print("Best model:", best_model)
print("R2 score:", r2)
print("Adjusted R2 score:", adj_r2)
print("Root mean squared error:", rmse)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)

Best model: ExtraTreesRegressor(max_depth=15)
R2 score: 0.9271281800431508
Adjusted R2 score: 0.9271125895237505
Root mean squared error: 21.731597181027514
Mean squared error: 472.2623160384429
Mean absolute error: 13.942114223072032


In [13]:
import pickle

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

## LGBRegressor

In [14]:
from lightgbm import LGBMRegressor

In [19]:
model = LGBMRegressor()

# Define the hyperparameters grid to search over
params_grid = {'n_estimators': [100, 150],
               'max_depth': [ 10, 15],
               'num_leaves': [31, 63],
               'learning_rate': [0.05, 0.1]}
             #  'min_child_samples': [1, 5, 10, 20],
             #  'subsample': [0.5, 0.7, 0.9, 1],
             #  'colsample_bytree': [0.5, 0.7, 0.9, 1]}

# Perform grid search
grid_search = GridSearchCV(model, param_grid=params_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.05, 0.1], 'max_depth': [10, 15],
                         'n_estimators': [100, 150], 'num_leaves': [31, 63]})

In [21]:
# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Save the best model into a pickle file
with open('best_model_lgbm.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Print results
print("Best model:", best_model)
print("R2 score:", r2)
print("Adjusted R2 score:", adj_r2)
print("Root mean squared error:", rmse)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)

Best model: LGBMRegressor(max_depth=15, n_estimators=150, num_leaves=63)
R2 score: 0.9359300590474346
Adjusted R2 score: 0.9359163516409504
Root mean squared error: 20.376941570788915
Mean squared error: 415.2197477793455
Mean absolute error: 12.822572341806428
