In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", message=".*does not have valid feature names.*")

In [9]:
dataset = pd.read_csv('data/data.csv')
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

X_train, y_train = train_df.drop('Value', axis=1), train_df['Value']
X_test, y_test = test_df.drop('Value', axis=1), test_df['Value']

In [10]:
X_train

Unnamed: 0,PM2.5,PM10,NO2,SO2,CO,O3,hour,is_weekend,lag_1,lag_2,lag_3,rolling_mean_3,rolling_std_3
0,33.552161,61.841399,47.732852,16.364456,1.321475,71.472801,4,0,79.0,64.0,62.0,74.000000,8.660254
1,40.621110,58.295819,51.040441,17.135703,1.624703,54.608085,5,0,79.0,79.0,64.0,79.000000,0.000000
2,41.081425,72.370991,68.991335,14.828186,1.156327,75.530553,6,0,79.0,79.0,79.0,76.666667,4.041452
3,33.468814,55.214149,43.639874,17.818128,1.082760,85.158645,7,0,72.0,79.0,79.0,73.000000,5.567764
4,28.734178,65.622890,35.378090,17.341548,1.078360,76.015710,8,0,68.0,72.0,79.0,69.000000,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,9.040753,28.520307,22.565208,8.268744,0.716847,43.455600,15,1,36.0,38.0,36.0,36.666667,1.154701
588,17.011932,23.926224,17.824597,2.481554,0.238976,40.244416,16,1,36.0,36.0,38.0,36.333333,0.577350
589,14.306488,36.663122,12.549052,5.967363,0.593935,43.104099,17,1,37.0,36.0,36.0,36.333333,0.577350
590,15.081149,18.477731,25.770557,3.577461,0.731687,20.381691,18,1,36.0,37.0,36.0,36.333333,0.577350


In [5]:
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(kernel='rbf')
}

In [4]:
import pickle

results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    with open(f'artifacts/{model_name}.pkl', "wb") as file:
        pickle.dump(model, file)
    
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results[model_name] = {'mae': mae, 'rmse': rmse, 'y_pred': y_pred}
   
    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

Random Forest - MAE: 1.46, RMSE: 2.10
Gradient Boosting - MAE: 1.63, RMSE: 2.21
XGBoost - MAE: 1.40, RMSE: 2.14
SVR - MAE: 3.86, RMSE: 5.12
