In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from prepare_yerevan_data import prepare_yerevan_data_pm_2_5
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler


In [None]:
# This notebook trains a model for PM2.5 only.
# NOTE(review): the cells below use the `pm2_5_delta` column as the target and
# call `.corr()` / `.drop(columns=...)` on the result, so the helper presumably
# returns a pandas DataFrame containing that column — confirm against
# prepare_yerevan_data_pm_2_5.
air_data = prepare_yerevan_data_pm_2_5()

In [None]:
# Pairwise correlations between all columns (pandas default method), then the
# target's column ranked from most positively to most negatively correlated.
corr_matrix = air_data.corr()
target_corr = corr_matrix.loc[:, "pm2_5_delta"].sort_values(ascending=False)
print(target_corr)

In [None]:
# Splitting into training and testing sets.
target_col = 'pm2_5_delta'

y = air_data[target_col]
X = air_data.drop(columns=[target_col])

# shuffle=False keeps the original row order: the first 80% of rows become the
# training set and the remaining rows the test set. random_state has no effect
# while shuffling is disabled; it is kept so the call stays as it was.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    shuffle=False,
)


In [None]:

# One histogram per feature, laid out on a 2x3 grid. Each entry pairs the
# column to plot with the title of its panel; panels are filled row by row.
hist_panels = [
    ('surface_pressure', "Surface pressure dist"),
    ('temperature_2m', "Temp. at 2 meters dist"),
    ('stagnation', "Stagnation dist"),
    ('precipitation', "Precipitation dist"),
    ('relative_humidity_2m', "Humidity at 2 meters dist"),
    ('wind_speed_10m', "Wind speed dist"),
]

fig, axs = plt.subplots(2, 3)

# axs.flat walks the grid in row-major order, matching the list above.
for panel, (column, panel_title) in zip(axs.flat, hist_panels):
    panel.hist(air_data[column], bins=30, edgecolor='k')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the target `y` (the `pm2_5_delta` column).
# Title and axis labels let the figure stand on its own, and plt.show() keeps
# the cell from echoing the raw (counts, bins, patches) tuple returned by hist.
plt.hist(y, bins=20, edgecolor='k')
plt.title("pm2_5_delta dist")
plt.xlabel("pm2_5_delta")
plt.ylabel("Count")
plt.show()

In [None]:
# Model selection: an XGBoost regressor with fixed hyperparameters.
# NOTE(review): these values are said to come from a grid search; the search
# itself is not part of the cells shown here — confirm where it lives.
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "gamma": 0.1,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)

In [None]:
# Fit the model on the training split only (the first 80% of rows, taken in
# their original order by the unshuffled train/test split).
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test split. `y_pred` is reused by the
# evaluation and residual-plot cells.
y_pred = model.predict(X_test)

In [None]:
# Evaluate the test-set predictions with three regression metrics,
# each reported to three decimal places.
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R^2: {r2:.3f}")


In [None]:
# Pair each training column with the importance score reported by the fitted
# model, then list the features from most to least important.
feature_names = X_train.columns
importances = model.feature_importances_

feature_importance_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': importances,
    }
)
ranked_importances = feature_importance_df.sort_values(by='Importance', ascending=False)
print(ranked_importances)

In [None]:
# Residual plot for the test split.
# Residuals follow the usual convention actual - predicted, so points above the
# zero line are under-predictions and points below it are over-predictions.
# np.asarray drops the pandas index of y_test so the subtraction is purely
# positional against the array returned by model.predict.
residuals = np.asarray(y_test) - y_pred

plt.title("Residual plot")
# Black reference line at zero spanning the full axes width, so it stays
# distinguishable from the blue points.
plt.axhline(y=0, color='k', linewidth=1)
plt.scatter(np.arange(len(residuals)), residuals, color='blue')
plt.xlabel("Test sample index")
plt.ylabel("Residual (actual - predicted)")
plt.show()