# Training and forecasting with ML models

In [2]:
import pandas as pd

In [21]:
# Read the 60-minute series; the first CSV column is used as the index and
# pandas is asked to parse it as dates.
ts60 = pd.read_csv(
    '../VEOLIA/artifacts/timeseries_60min.csv',
    index_col=0,
    parse_dates=True,
)

# Work only with the load and outdoor-temperature columns, dropping every row
# in which either of the two is missing.
columns_of_interest = [
    'Diff Load Activa Total (60 minuto)',
    'TEMPERATURA EXTERIOR (60 minuto)',
]
df = ts60[columns_of_interest].dropna()
df.head()

Unnamed: 0_level_0,Diff Load Activa Total (60 minuto),TEMPERATURA EXTERIOR (60 minuto)
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-05-15 00:00:00,2600.0,13.7725
2021-05-15 01:00:00,2620.0,13.31
2021-05-15 02:00:00,2610.0,12.385
2021-05-15 03:00:00,2620.0,11.9225
2021-05-15 04:00:00,2570.0,11.46


In [22]:
# Derive one calendar column per component of the datetime index.
timestamps = df.index
df = df.assign(
    year=timestamps.year,
    month=timestamps.month,
    month_day=timestamps.day,
    week_day=timestamps.weekday,  # pandas numbering: Monday=0 ... Sunday=6
    hour=timestamps.hour,
)


In [23]:
# Preview the first 150 rows, including the calendar columns derived from the index.
df.head(150)

Unnamed: 0_level_0,Diff Load Activa Total (60 minuto),TEMPERATURA EXTERIOR (60 minuto),year,month,month_day,week_day,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-05-15 00:00:00,2600.0,13.7725,2021,5,15,5,0
2021-05-15 01:00:00,2620.0,13.3100,2021,5,15,5,1
2021-05-15 02:00:00,2610.0,12.3850,2021,5,15,5,2
2021-05-15 03:00:00,2620.0,11.9225,2021,5,15,5,3
2021-05-15 04:00:00,2570.0,11.4600,2021,5,15,5,4
...,...,...,...,...,...,...,...
2021-05-21 01:00:00,2110.0,13.6850,2021,5,21,4,1
2021-05-21 02:00:00,2140.0,11.9950,2021,5,21,4,2
2021-05-21 03:00:00,2400.0,11.2600,2021,5,21,4,3
2021-05-21 04:00:00,2700.0,9.9925,2021,5,21,4,4


In [24]:
# (rows, columns) of the frame after the calendar columns were added.
df.shape

(1128, 7)

## Regression models for load forecasting

In [25]:
# Predictors: three calendar fields plus the outdoor temperature column.
# 'year' and 'month_day' are deliberately not part of the feature set here.
feature_columns = ['month', 'week_day', 'hour', 'TEMPERATURA EXTERIOR (60 minuto)']
# Target: the load column of the 60-minute series.
target_column = 'Diff Load Activa Total (60 minuto)'

X = df[feature_columns]
y = df[target_column]

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing and train on the remaining 75%.
# The seed is fixed (random_state=23) so the partition is the same on every run.
# NOTE(review): train_test_split shuffles rows by default, so train and test
# are interleaved in time -- confirm this is intended for a forecasting task.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=23,
)

# Dump each partition in the order: train features, train target,
# test features, test target.
for partition in (X_train, y_train, X_test, y_test):
    print(partition)

                     month  week_day  hour  TEMPERATURA EXTERIOR (60 minuto)
datetime                                                                    
2021-06-03 23:00:00      6         3    23                            10.210
2021-05-15 19:00:00      5         5    19                            17.575
2021-06-06 19:00:00      6         6    19                            19.810
2021-05-17 23:00:00      5         0    23                             9.880
2021-06-25 17:00:00      6         4    17                            26.735
...                    ...       ...   ...                               ...
2021-05-16 07:00:00      5         6     7                            13.310
2021-06-23 14:00:00      6         2    14                            18.795
2021-06-28 08:00:00      6         0     8                            30.955
2021-06-14 22:00:00      6         0    22                            18.565
2021-06-08 19:00:00      6         1    19                            29.545

### Decision Tree Regression

In [27]:
from sklearn import tree

# Fix the estimator's seed: scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree may pick a different split
# among equally good ones and yield different scores from run to run.
tree_reg = tree.DecisionTreeRegressor(random_state=0)
tree_reg.fit(X_train, y_train)

# Predictions on the training rows (in-sample) and on the held-out rows.
y_train_pred_dec_tree = tree_reg.predict(X_train)
y_test_pred_dec_tree = tree_reg.predict(X_test)


In [29]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error



In [36]:
# Decision-tree errors, computed for the training and the held-out split.
MSE_train3, MSE_test3 = (
    mean_squared_error(y_train, y_train_pred_dec_tree),
    mean_squared_error(y_test, y_test_pred_dec_tree),
)
MAPE_train3, MAPE_test3 = (
    mean_absolute_percentage_error(y_train, y_train_pred_dec_tree),
    mean_absolute_percentage_error(y_test, y_test_pred_dec_tree),
)

# Report the held-out MSE and the held-out MAPE as a percentage (2 decimals).
print(MSE_test3, round(MAPE_test3 * 100, 2), '%')

151873.75886538663 13.85 %


### Random Forest Regression

In [37]:
from sklearn.ensemble import RandomForestRegressor

# Fix the estimator's seed: the forest draws bootstrap samples and candidate
# features at random, so without a seed the reported scores change on every run.
regr = RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)

# Predictions on the training rows (in-sample) and on the held-out rows.
y_train_pred_rf = regr.predict(X_train)
y_test_pred_rf = regr.predict(X_test)

In [38]:
# Random-forest errors on the training split ...
MSE_train_rf = mean_squared_error(y_train, y_train_pred_rf)
MAPE_train_rf = mean_absolute_percentage_error(y_train, y_train_pred_rf)

# ... and on the held-out split.
MSE_test_rf = mean_squared_error(y_test, y_test_pred_rf)
MAPE_test_rf = mean_absolute_percentage_error(y_test, y_test_pred_rf)

# Report the held-out MSE and the held-out MAPE as a percentage (2 decimals).
print(MSE_test_rf, round(MAPE_test_rf * 100, 2), '%')

93079.2890780943 11.15 %


### Gradient Boosting Regression

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

# Build and train in one expression; fit() returns the fitted estimator itself.
# The seed is fixed at 0 so repeated runs give the same model.
reg = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Predictions on the training rows (in-sample) and on the held-out rows.
y_train_pred_gb, y_test_pred_gb = reg.predict(X_train), reg.predict(X_test)

In [42]:
# Gradient-boosting errors: mean squared error first, for both splits ...
MSE_train_gb, MSE_test_gb = (
    mean_squared_error(y_train, y_train_pred_gb),
    mean_squared_error(y_test, y_test_pred_gb),
)
# ... then mean absolute percentage error, again for both splits.
MAPE_train_gb, MAPE_test_gb = (
    mean_absolute_percentage_error(y_train, y_train_pred_gb),
    mean_absolute_percentage_error(y_test, y_test_pred_gb),
)

# Report the held-out MSE and the held-out MAPE as a percentage (2 decimals).
print(MSE_test_gb, round(MAPE_test_gb * 100, 2), '%')

88107.48844671606 10.74 %


### LightGBM

In [44]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1


In [45]:
import lightgbm as lgb

#### Todo:
1) LightGBM models
2) XGBoost model
3) Try with more complete weather data
4) Hyperparameter tuning
5) Cross-validation