# Forecasting Ugandan Air Quality with XGBoost

# Import Packages and Dataset

In [None]:
# import packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import math
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Shared random seed; passed as random_state to the train/test split so the split is reproducible
rseed = 42

In [None]:
# Load the prepared dataset; later cells read its 'target' and 'location' columns
df=pd.read_csv("./data/air_quality_final.csv")

# XGBoost Model

In [None]:
# Separate the regression target from the predictor columns
y = df['target']
X = df.drop(columns='target')

# Quick sanity check on the resulting shapes
print(f"We have {X.shape[0]} observations in our dataset and {X.shape[1]} features")
print(f"Our target vector has also {y.shape[0]} values")

In [None]:
# One-hot encode the categorical 'location' column; drop_first=True omits the
# first level so it serves as the reference category
location = pd.get_dummies(X['location'], prefix='location',drop_first=True)
# NOTE(review): not the last expression of the cell, so this preview is not displayed
location.head()
# Append the dummy columns to the feature frame; the raw 'location' column is
# still present at this point (it is needed for stratification and dropped later)
X = pd.concat([X, location],axis = 1)

In [None]:
# Train/test split, stratified on 'location' so each site keeps its share of
# rows in both parts; no test_size is given, so scikit-learn's default applies
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['location'], random_state=rseed)

In [None]:
# Keep the raw location labels (used later to colour the residual plots), then
# remove the multiclass 'location' column from the feature matrices; its
# information is already carried by the dummy columns.
X_train_loc = X_train['location']
X_test_loc = X_test['location']

# Reassign instead of drop(..., inplace=True): the frames returned by
# train_test_split are derived from X, and mutating them in place can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns='location')
X_test = X_test.drop(columns='location')

## Intermission: simple base model

In [None]:
# Train a baseline model: XGBRegressor with no arguments, i.e. library defaults
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

# Baseline predictions on the held-out test set
y_pred_base = xgb.predict(X_test)

In [None]:
# Evaluate baseline accuracy on the held-out test set.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_base = mean_squared_error(y_test, y_pred_base) ** 0.5
r2_base = r2_score(y_test, y_pred_base)

print(f'RMSE on testset: {round(rmse_base,2)}')
print(f'Coefficient of determination on testset: {round(r2_base,2)}')

In [None]:
# Residuals of the baseline model: measured minus predicted
residual_base = y_test - y_pred_base

# Mean residual, shown as the cell output; a value far from zero means the
# model over- or under-predicts on average
np.mean(residual_base)

In [None]:
# Baseline residuals against predictions, coloured by measurement location
sns.scatterplot(x=y_pred_base, y=residual_base, hue=X_test_loc)
plt.xlabel('y_pred')
plt.ylabel('residual')
# trailing semicolon suppresses the text repr of the returned object in the notebook
plt.title('Residual plot from XGBoost Regressor');

## Back to serious business: hyperparameter tuning

In [None]:
# Hyperparameter grid: 5 x 6 x 6 = 180 combinations, each fitted on 5 CV folds.
# NOTE(review): whether max_leaves has any effect depends on the tree method /
# grow policy of the installed XGBoost version — confirm before relying on it.
xgb_params = {
    'n_estimators': [150, 175, 200, 225, 250],
    'max_depth': [5, 7, 10, 12, 15, 20],
    'max_leaves': [20, 40, 60, 80, 100, 120],
}

# Exhaustive grid search scored by negative RMSE (scikit-learn maximises the
# score, so error metrics are negated). `random_state=rseed` replaces the
# hard-coded `seed=42`: it is the parameter name of XGBoost's scikit-learn API
# and keeps the notebook on its single shared seed.
grid_xgb = GridSearchCV(
    XGBRegressor(random_state=rseed),
    xgb_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=0,
    n_jobs=-1,
)

grid_xgb.fit(X_train, y_train)

> **This model's results:**
> - RMSE: 24.47
> - R²: 0.66
> - Best parameters: `{'max_depth': 7, 'max_leaves': 20, 'n_estimators': 225}`

In [None]:
# Best score: cross-validated score of the winning parameter combination.
# The search uses scoring='neg_root_mean_squared_error', so this value is the
# NEGATIVE RMSE (closer to zero is better).
print('Best score:', round(grid_xgb.best_score_, 3))

# Best parameters: the grid point that achieved that score
print('Best parameters:', grid_xgb.best_params_)

#### Retrain on whole training set

In [None]:
# Keep a handle on the estimator selected by the grid search
xgb_best = grid_xgb.best_estimator_
# last expression of the cell: displays the estimator and its parameters
xgb_best

In [None]:
# Fit the selected model on the full training split.
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training data
# by default (refit=True), so this call looks redundant — confirm before removing.
xgb_best.fit(X_train, y_train)

In [None]:
# Predictions of the tuned model on the held-out test set
y_pred_xgb = xgb_best.predict(X_test)

In [None]:
# Evaluate the tuned model on the held-out test set.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_xgb = mean_squared_error(y_test, y_pred_xgb) ** 0.5
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'RMSE on testset: {round(rmse_xgb,2)}')
print(f'Coefficient of determination on testset: {round(r2_xgb,2)}')

In [None]:
# Residuals of the tuned model: measured minus predicted
residual_xgb = y_test - y_pred_xgb

# Mean residual, shown as the cell output; a value far from zero means the
# model over- or under-predicts on average
np.mean(residual_xgb)

In [None]:
# Tuned-model residuals against predictions, coloured by measurement location
sns.scatterplot(x=y_pred_xgb, y=residual_xgb, hue=X_test_loc)
plt.xlabel('y_pred')
plt.ylabel('residual')
# trailing semicolon suppresses the text repr of the returned object in the notebook
plt.title('Residual plot from XGBoost Regressor');

In [None]:
# Measured vs. predicted PM2.5 for the tuned model, drawn in a single colour
# (colouring by location is intentionally switched off: hue=X_test_loc)
sns.scatterplot(x=y_pred_xgb, y=y_test, color='b')
plt.xlabel('Predicted from Weather Data (in µg / m$^3$)')
plt.ylabel('Measured (in µg / m$^3$)')
plt.title('Measured and Predicted PM$_{2.5}$ Concentration');

### Convert actual and predicted PM2.5 levels from best models into air quality categories

In [None]:
# plotting function for color-coded confusion matrix
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a short header and plot a confusion matrix as an annotated heat map.

    Adapted from:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Rows are drawn along the y-axis
        ("Measured Category"), columns along the x-axis ("Predicted Category").
    classes : sequence of str
        Tick labels, given in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its row sum so each cell shows the
        fraction of that measured class. Rows without any samples stay all
        zero instead of turning into NaN.
    title : str
        Title of the figure.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The plot is drawn on a new 6x6 inch matplotlib figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A class with no measured samples has a zero row sum; divide by 1
        # there so the row stays 0.0 rather than becoming NaN (which would
        # also turn the colour threshold below into NaN).
        row_totals[row_totals == 0] = 1.0
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size=16)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size=10)
    plt.yticks(tick_marks, classes, size=10)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast
    thresh = cm.max() / 2.

    # Labeling the plot: write each cell's value at its centre
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize=12,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Measured Category', size=14)
    plt.xlabel('Predicted Category', size=14)
    # Run tight_layout last so the axis labels are included in the layout
    plt.tight_layout()

In [None]:
# Exclusive upper bounds (PM2.5, in µg/m³) of the air-quality categories,
# ordered from cleanest to most polluted. Any value that is not below the last
# bound falls into 'Hazardous'.
_pm25_bounds = (
    (13, 'Good'),
    (36, 'Moderate'),
    (56, 'Unhealthy (Sensitive)'),
    (151, 'Unhealthy'),
    (251, 'Very Unhealthy'),
)


def _pm25_category(value):
    """Return the air-quality category label for a single PM2.5 value."""
    for upper_bound, label in _pm25_bounds:
        if value < upper_bound:
            return label
    return 'Hazardous'


# actual y labels
y_test_labels = [_pm25_category(x) for x in y_test]

# predicted y labels by XGBoost Regressor
y_pred_labels = [_pm25_category(x) for x in y_pred_xgb]

### Compute confusion matrix and plot

In [None]:
# Fix the row/column order explicitly. Without `labels`, confusion_matrix sorts
# the class names alphabetically ('Good', 'Hazardous', 'Moderate', ...), which
# does not match the severity-ordered tick labels used when the matrix is
# plotted. Listing all six categories also keeps the matrix 6x6 when a
# category is absent from both the measured and the predicted labels.
cm = confusion_matrix(
    y_test_labels,
    y_pred_labels,
    labels=['Good', 'Moderate', 'Unhealthy (Sensitive)', 'Unhealthy',
            'Very Unhealthy', 'Hazardous'],
)

In [None]:
# Draw the confusion matrix. `classes` only supplies the tick labels, so its
# order must be the same as the row/column order of `cm`.
# ('\n' breaks the long 'Unhealthy (Sensitive)' label over two lines.)
plot_confusion_matrix(cm, classes=['Good', 'Moderate', 'Unhealthy\n(Sensitive)', 'Unhealthy', 'Very Unhealthy', 'Hazardous'],
                title='Confusion Matrix - XGBoost');

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Built-in feature importances of the tuned model, one value per training column
xgb_best.feature_importances_

In [None]:
# Indices that sort the importances in ascending order, so the most important
# feature ends up at the top of the horizontal bar chart
sorted_features = xgb_best.feature_importances_.argsort()

plt.barh(X_train.columns[sorted_features], xgb_best.feature_importances_[sorted_features])

In [None]:
# Permutation importance on the held-out test set: how much the model's score
# drops when the values of a single feature are shuffled.
# random_state=rseed makes the shuffles, and therefore the resulting ranking,
# reproducible between runs.
perm_importance = permutation_importance(xgb_best, X_test, y_test, random_state=rseed)

# Sort ascending so the most important feature ends up at the top of the chart
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X_train.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

## Save model

In [None]:
# Create the output directory via a notebook shell escape; -p also creates
# missing parents and does not fail if the directory already exists
!mkdir -p models/xgb

# Persist the tuned model to disk.
# NOTE(review): recent XGBoost releases recommend a .json or .ubj file
# extension for save_model — confirm that the '.model' file written here is
# what downstream loaders expect.
xgb_best.save_model('models/xgb/xgb.model')