# Notebook 3 (XGBoost): training each model on previously prepared data and calculating biases
The training data originates from stations located in: Warszawa, Wrocław, Szczecin, Rzeszów.

## Importing all necessary libraries

In [1]:
import pandas as pd
from matplotlib import pyplot
import seaborn as sn
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
from sklearn.metrics import mean_absolute_error
import pickle
import xgboost as xgb

## Files to load

In [5]:
# Load the three prepared CSV splits in one pass:
#   train_data - the models are trained on data from 2015 to 2020
#   valid_data - validation data from 2021
#   test_data  - test data from 2022
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "all_data/data_for_XGB/data_ready_for_training.csv",
        "all_data/data_for_XGB/data_ready_for_validation.csv",
        "all_data/data_for_XGB/data_ready_for_testing.csv",
    )
)

## Functions for model training

Defining the function that, for each station, generates a dataframe shifted by the given timedelta (in hours). Depending on the value of `is_x`, the result is either the input data for the model (`is_x=True`) or the target variable (`is_x=False`).

In [8]:
def generate_dfs(raw: pd.DataFrame, is_x: bool, timedelta: int,
                 stations=("EPRZ", "EPSC", "EPWA", "EPWR")) -> pd.DataFrame:
    """Build the input or target frame for a forecast `timedelta` rows ahead.

    For every station only the rows with ``minutes == 0`` (full hours) are kept.
    The input frame drops the last `timedelta` rows of each station and the
    target frame drops the first `timedelta` rows, so that row ``i`` of the
    input frame pairs with row ``i`` of the target frame.

    NOTE(review): the shift is positional. It equals a shift of `timedelta`
    hours only if each station's rows are in chronological order with no
    missing hours - confirm against the data preparation step.

    Parameters
    ----------
    raw : pd.DataFrame
        Source data; must contain the columns ``station`` and ``minutes``.
    is_x : bool
        True -> return the model input, False -> return the target.
    timedelta : int
        Non-negative number of rows (hours) between input and target.
        With 0 both variants return the unshifted station rows.
    stations : iterable of str, optional
        Station codes to include, in output order. Defaults to
        Rzeszów, Szczecin, Warszawa and Wrocław.

    Returns
    -------
    pd.DataFrame
        The per-station frames concatenated in `stations` order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `timedelta` is negative.
    """
    if timedelta < 0:
        raise ValueError("timedelta must be a non-negative number of hours")

    dfs = []
    for station in stations:
        # each station and only full hours
        wdf = raw[(raw['station'] == station) & (raw['minutes'] == 0)]
        # a zero shift must not slice at all: `[:-0]` would return an empty frame
        if timedelta > 0:
            wdf = wdf.iloc[:-timedelta] if is_x else wdf.iloc[timedelta:]
        dfs.append(wdf.copy())
    # as an output value return concatenated data
    return pd.concat(dfs).reset_index(drop=True)

## Training three models for predictions one, two and three hours ahead
Each model is configured with the following parameters:
- tree_method="hist", which selects the histogram-based tree construction algorithm,
- n_estimators=200, which is the number of gradient-boosted trees,
- n_jobs=16, which is the number of parallel threads used for training,
- max_depth=12, which is the maximum tree depth,
- subsample=0.6, which is the subsample ratio of the training instances.

In [16]:
# last_pred_hour == 3 means that predictions are made for +1 hour up to +3 hours
last_pred_hour = 3

# columns removed from both the inputs and the targets before training
columns_to_drop = ["station", "minutes", "Unnamed: 0"]

# per-horizon MAE and bias, one entry per trained model;
# re-created on every run of this cell so re-running does not accumulate entries
MAE_humid = []
MAE_wind = []
MAE_temp = []
bias_relh = []
bias_sped = []
bias_temp = []

for hour in range(1, last_pred_hour + 1):

    # division into explanatory X and response y variables (y is X shifted by `hour` rows)
    x_train_xgb = generate_dfs(train_data, is_x=True, timedelta=hour).drop(columns=columns_to_drop)
    y_train_xgb = generate_dfs(train_data, is_x=False, timedelta=hour).drop(columns=columns_to_drop)

    x_valid_xgb = generate_dfs(valid_data, is_x=True, timedelta=hour).drop(columns=columns_to_drop)
    y_valid_xgb = generate_dfs(valid_data, is_x=False, timedelta=hour).drop(columns=columns_to_drop)

    x_test_xgb = generate_dfs(test_data, is_x=True, timedelta=hour).drop(columns=columns_to_drop)
    y_test_xgb = generate_dfs(test_data, is_x=False, timedelta=hour).drop(columns=columns_to_drop)

    # create XGBoost regression model
    reg = xgb.XGBRegressor(
        tree_method="hist",
        n_estimators=200,
        n_jobs=16,
        max_depth=12,
        subsample=0.6,
    )

    # training the model; the evaluation metric is tracked on the held-out validation
    # split instead of the training data. No early stopping is configured, so the
    # evaluation set only changes the recorded metrics, not the fitted trees.
    reg.fit(x_train_xgb, y_train_xgb, eval_set=[(x_valid_xgb, y_valid_xgb)], verbose=False)

    # saving each model to a pickle file; "wb" (instead of exclusive "xb") lets the
    # cell be re-run and always starts from a fresh file containing only the model
    file_name = 'generated_models/xgb_models/' + "xgb_" + str(hour) + ".pkl"
    with open(file_name, "wb") as f_1:
        pickle.dump(reg, f_1, -1)

    # predictions; the model outputs one column per target column, so label them
    # with the target frame's own columns rather than a hard-coded order
    y_pred_xgb = pd.DataFrame(reg.predict(x_test_xgb), columns=y_test_xgb.columns)

    # MAE and bias for each evaluated weather variable
    for column, mae_scores, bias_scores in (
        ("relh", MAE_humid, bias_relh),
        ("sped", MAE_wind, bias_sped),
        ("tmpc", MAE_temp, bias_temp),
    ):
        observed = y_test_xgb[column].to_numpy()
        predicted = y_pred_xgb[column].to_numpy()

        # MAE error calculation (arguments in scikit-learn's (y_true, y_pred) order)
        mae_scores.append(mean_absolute_error(observed, predicted))

        # bias = mean(prediction - observation); positive means over-prediction
        bias_scores.append(np.mean(predicted - observed))


## Results
MAE is evaluated for each weather component separately.

Error is given:

- For temperature in °C.
- For relative humidity in %.
- For wind speed in km/h.

In [20]:
# One row per forecast horizon. The hour labels are derived from the number of
# collected scores, so the table stays consistent if the number of trained
# models changes (a hard-coded [1, 2, 3] would then raise a length mismatch).
MAE = pd.DataFrame({
    'hour': list(range(1, len(MAE_humid) + 1)),
    'relh': MAE_humid,
    'skph': MAE_wind,
    'temp': MAE_temp,
})

MAE

Unnamed: 0,hour,relh,skph,temp
0,1,4.881282,4.127334,0.84881
1,2,6.394932,4.918111,1.173693
2,3,7.546078,5.458958,1.470576


## Biases for each weather component

In [24]:
# one row per trained model (row 0 is the first forecast hour), one column per weather variable
biases = pd.DataFrame({
    'relh': bias_relh,
    'skph': bias_sped,
    'temp': bias_temp,
})

# save the table in CSV format to the file generated_models/xgb_models/biases_xgboost
biases.to_csv("generated_models/xgb_models/biases_xgboost")

# display
biases

Unnamed: 0,relh,skph,temp
0,0.199654,-0.274403,-0.023632
1,0.224826,-0.206573,-0.031782
2,0.475803,-0.201343,-0.004216


Add each model's biases to its pickle file.

In [25]:
# Store each model's bias row in the same pickle file as the model.
# File layout after this cell: first pickled object = model, second = one-row
# DataFrame with that model's biases.
for i in range(len(biases)):
    file_name = f"generated_models/xgb_models/xgb_{i + 1}.pkl"
    print(file_name)

    # Read back the model (the first object in the file). These files are
    # produced by this notebook; never unpickle files from untrusted sources.
    with open(file_name, "rb") as f_1:
        model = pickle.load(f_1)

    # Rewrite the file instead of appending, so re-running this cell does not
    # stack additional bias objects behind the model.
    with open(file_name, "wb") as f_1:
        pickle.dump(model, f_1, -1)
        pickle.dump(biases.iloc[[i]], f_1, -1)

generated_models/xgb_models/xgb_1.pkl
generated_models/xgb_models/xgb_2.pkl
generated_models/xgb_models/xgb_3.pkl
