In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from prepare_yerevan_data import prepare_yerevan_data_pm_10
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [None]:
# Load the modelling table from the project helper. Later cells expect it to be
# a DataFrame containing the target column 'pm10_delta'.
# NOTE(review): the helper's cleaning / feature-engineering steps are not
# visible from this notebook — confirm the rows are in chronological order,
# because the train/test split below is unshuffled.
air_data = prepare_yerevan_data_pm_10()

In [None]:
# Let pandas render up to 30 columns so the wide feature table is not elided,
# then preview the first five rows.
pd.options.display.max_columns = 30
air_data.head(n=5)

In [None]:
# Histogram of the target column. Drawn on an explicit Axes and given a title
# and axis labels so the figure is interpretable on its own when skimming.
fig, ax = plt.subplots()
ax.hist(air_data['pm10_delta'], bins=30, edgecolor='k')
ax.set(
    title='Distribution of pm10_delta',
    xlabel='pm10_delta',
    ylabel='Count',
)
plt.show()

In [None]:
# Pairwise correlations between all columns; then list every column's
# correlation with the target, strongest positive first (the target itself
# appears in the listing).
corr_matrix = air_data.corr()
target_correlations = corr_matrix["pm10_delta"]
print(target_correlations.sort_values(ascending=False))

In [None]:
# Separate the target from the features, then hold out the tail of the table.
target_column = 'pm10_delta'
X = air_data.drop(columns=[target_column])
y = air_data[target_column]

# shuffle=False keeps the original row order: the first 80% of rows become the
# training set and the remaining rows the test set. random_state only governs
# shuffling, so it is inert here and kept purely for parity with other runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=False,
    random_state=42,
)

In [None]:
# Hyperparameters for the gradient-boosted regressor, collected in one place so
# they are easy to compare against the tuning grid kept further down.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.005,
    'max_depth': 4,
    'gamma': 0.15,
    'random_state': 42,
}
model = XGBRegressor(**xgb_params)

In [None]:
# Disabled hyperparameter grid for the grid search below, kept for reference.
# The trailing notes record the value reported as best for each parameter;
# they match the settings hard-coded on the XGBRegressor above.
# param_grid = {
#     'gamma' : [0.15, 0.2, 0.25], # 0.15
#     'learning_rate' : [0.003, 0.004, 0.005], # 0.005 is best
#     'n_estimators' : [250, 500, 750], # 500 is best
#     'max_depth' : [3, 4] # 4 is best
# }

In [None]:
# Disabled: 5-split time-series cross-validator, passed as `cv` to the grid
# search below when it is re-enabled.
# tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Disabled: exhaustive search over `param_grid` for `model`, cross-validated
# with `tscv` and scored by negative RMSE; refit=True asks for the best
# configuration to be refitted once the search finishes.
# NOTE(review): n_jobs=15 is specific to the machine this was tuned on —
# revisit before re-enabling elsewhere.
# GS = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     cv=tscv,
#     scoring='neg_root_mean_squared_error',
#     refit=True,
#     n_jobs=15
# )

In [None]:
# Disabled: run the grid search on the training split only and display the
# best estimator it found.
# GS.fit(X_train, y_train)
# GS.best_estimator_

In [29]:
# Train on the training split, then score the held-out rows. `fit` returns the
# fitted estimator itself, so `model` is trained in place and can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [30]:
# Hold-out metrics: mean absolute error, root mean squared error and the
# coefficient of determination, each reported to three decimals.
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R^2: {r2:.3f}")

MAE: 1.138
RMSE: 1.690
R^2: 0.318


In [31]:
# Pair every training column with the importance score the fitted model
# assigned to it, and print the table from most to least important.
feature_names = X_train.columns
importances = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
ranked_importances = feature_importance_df.sort_values(
    by='Importance', ascending=False
)
print(ranked_importances)

                         Feature  Importance
24          temperature_pressure    0.072713
5                        is_busy    0.066853
11                      hour_cos    0.063753
8                    pm10_lag_3h    0.061346
19  temperature_rolling_24h_mean    0.055664
23              temp_humidity_2m    0.053626
20          pm10_rolling_6h_mean    0.052938
0                 temperature_2m    0.048301
16   temperature_rolling_3h_mean    0.045076
10                      hour_sin    0.042406
6                    pm10_lag_1h    0.040531
3                  precipitation    0.035660
18  temperature_rolling_12h_mean    0.033690
7                    pm10_lag_2h    0.030290
25      temp_humidity_rolling_3h    0.029506
2           relative_humidity_2m    0.029280
9                    pm10_lag_1d    0.029089
14             wind_speed_lag_2h    0.028801
21           pm10_rolling_6h_std    0.027524
17   temperature_rolling_6h_mean    0.025761
12                    stagnation    0.024169
15        