# Notebook 3/3 - Model training & development

### General objective of the project
- Predict the next day's daily solar energy for the city of Kraainem, Belgium

### Notebook objective
- Train ML algorithms to predict the next day's 'daily solar energy'
- Select the best-performing algorithm

### Import  modules

In [None]:
# import modules
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

# import modules for models training and evaluation 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

### Load data

In [2]:
# load the cleaned and formatted dataset from disk
load_path = '../data/formated/'
sol_fin_small_load_name = 'sol_fin_small.csv'

# the first CSV column is used as the row index of the frame
sol_s = pd.read_csv(f'{load_path}{sol_fin_small_load_name}', index_col=0)

In [3]:
# preview the first three rows to confirm the data loaded as expected
sol_s.head(n=3)

Unnamed: 0_level_0,Year_month_t,Temp_t,CloudOp_t,CloudOp_t_mean_wk,Humid_t,Humid_t_min_wk,Humid_t_max_wk,Humid_t_mean_wk,Press_t,Press_t_min_wk,...,DHI_t,DHI_t_min_wk,DHI_t_max_wk,DHI_t_mean_wk,DHI_t_std_wk,GHI_t,GHI_t_max_wk,GHI_t_mean_wk,GHI_t_std_wk,GHI_t+1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-07,2019-01,5.3875,72.0,63.066468,94.2875,73.922222,94.7125,87.234722,1021.95,1020.6875,...,42.25,32.5,52.5,42.571429,7.78073,42.25,53.25,43.051587,7.883205,62.0
2019-01-08,2019-01,5.8125,56.0875,61.939683,82.775,73.922222,94.2875,85.529365,1010.2875,1010.2875,...,57.5,32.5,57.5,45.625,8.937508,62.0,62.0,46.623016,10.049912,115.25
2019-01-09,2019-01,3.975,25.5,59.109524,81.2875,80.833333,94.2875,86.581548,1011.2375,1010.2875,...,85.0,32.5,85.0,50.958333,17.446993,115.25,115.25,56.150794,27.917983,65.666667


In [4]:
# Drop 'Year_month_t' (the preview above shows it holds strings such as '2019-01').
# A new frame is assigned instead of mutating in place, and errors='ignore' keeps the
# cell re-runnable: with inplace=True a second run raised KeyError because the column
# had already been removed.
sol_s = sol_s.drop(columns='Year_month_t', errors='ignore')

In [5]:
# check that all features are in proper format for ML algo:
# info() lists each column's non-null count and dtype, so any non-numeric
# or incomplete column stands out before training
sol_s.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1075 entries, 2019-01-07 to 2021-12-16
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Temp_t             1075 non-null   float64
 1   CloudOp_t          1075 non-null   float64
 2   CloudOp_t_mean_wk  1075 non-null   float64
 3   Humid_t            1075 non-null   float64
 4   Humid_t_min_wk     1075 non-null   float64
 5   Humid_t_max_wk     1075 non-null   float64
 6   Humid_t_mean_wk    1075 non-null   float64
 7   Press_t            1075 non-null   float64
 8   Press_t_min_wk     1075 non-null   float64
 9   Press_t_mean_wk    1075 non-null   float64
 10  Press_t_std_wk     1075 non-null   float64
 11  WindSpd_t          1075 non-null   float64
 12  WindSpd_t_mean_wk  1075 non-null   float64
 13  WindSpd_t_std_wk   1075 non-null   float64
 14  DNI_t              1075 non-null   float64
 15  DNI_t_min_wk       1075 non-null   float64
 16  DHI_t         

### Split training, cross-validation and test data

In [6]:
# Shuffle the rows with a fixed seed so the train / cross-validation / test split,
# and every score derived from it, is reproducible after a kernel restart.
# NOTE(review): several features are `*_wk` aggregates (presumably rolling weekly
# statistics); if so, randomly shuffled neighbouring days share most of their window
# across subsets. Consider a chronological split - TODO confirm.
sol_s = shuffle(sol_s, random_state=42)

In [7]:
# split dataset into train, cross-validation and test subsets with the following proportions: 60/20/20
# Positional slicing is used instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which recent pandas releases deprecate. The cut points are the same.
n_rows = len(sol_s)
train_end = int(n_rows * .6)
cval_end = int(n_rows * .8)

train_sol = sol_s.iloc[:train_end]
cval_sol = sol_s.iloc[train_end:cval_end]
test_sol = sol_s.iloc[cval_end:]

# every row must land in exactly one subset
assert len(train_sol) + len(cval_sol) + len(test_sol) == n_rows

In [8]:
# write the three subsets to disk as CSV files
save_path = '../data/ml_ready/'
train_sol_save_name = 'train_sol.csv'
cval_sol_save_name = 'cval_sol.csv'
test_sol_save_name = 'test_sol.csv'

# pair each subset with its file name and save them in train / cval / test order
for subset, file_name in (
    (train_sol, train_sol_save_name),
    (cval_sol, cval_sol_save_name),
    (test_sol, test_sol_save_name),
):
    subset.to_csv(save_path + file_name)

### Split dependent/independent features

In [9]:
# helper that separates the target feature from the independent features
def target_and_inde_feat_sets(df, feat):
    """Split a dataframe into independent features and the target feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the independent features and the target column.
    feat : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(ind_feats, dep_feat)``: ``df`` without the ``feat`` column, and the
        ``feat`` column on its own. ``df`` itself is left unmodified.
    """
    return df.drop(columns=[feat]), df[feat]

In [10]:
# build the independent-feature frame (X) and the target series (y) for each subset
target_feat = 'GHI_t+1'

X_train, y_train = target_and_inde_feat_sets(train_sol, target_feat)
X_cval, y_cval = target_and_inde_feat_sets(cval_sol, target_feat)
X_test, y_test = target_and_inde_feat_sets(test_sol, target_feat)

### Train models

In [11]:
# create simple pipelines for the 3 models to test
# Steps include:
# 1. Preprocessing with StandardScaler (applied in all three pipelines below)
# 2. Training the model on the train set

In [12]:
# Each pipeline standardises the features and then fits one regressor.
# The tree-based estimators get a fixed random_state: both are randomised
# algorithms, so without a seed their scores change from run to run.

# Linear Regression pipeline
lr_pipeline = Pipeline(
    [
        ('Scaler', StandardScaler()),
        ('Training', LinearRegression()),
    ]
)

# Decision Tree Regressor pipeline
dtr_pipeline = Pipeline(
    [
        ('Scaler', StandardScaler()),
        ('Training', DecisionTreeRegressor(random_state=42))
    ]
)

# Random Forest Regressor pipeline
rfr_pipeline = Pipeline(
    [
        ('Scaler', StandardScaler()),
        ('Training', RandomForestRegressor(random_state=42))
    ]
)

In [13]:
# create a list with the pipelines
# NOTE: the order matters - positions 0, 1, 2 are used as keys into
# reg_pipelines_dict to look up each model's display name
reg_pipelines = [lr_pipeline, dtr_pipeline, rfr_pipeline]

In [14]:
# trackers used to select the best performing model
r2_score = 0
# The running minimum must start at +infinity: the selection loop only accepts a
# candidate when `cost < mse`, so an initial value of 0 can never be beaten and
# the first model would always be reported as best.
mse = float('inf')
regressor = 0   # index of the best pipeline in reg_pipelines
pipeline = ''   # replaced by the best fitted pipeline once selection has run

In [15]:
# display names of the pipelines, keyed by their position in reg_pipelines
reg_pipelines_dict = {
    0: 'Linear Regression',
    1: 'Decision Tree Regressor',
    2: 'Random Forest Regessor'
}

# fit every pipeline on the training subset
for pipe in reg_pipelines:
    pipe.fit(X_train, y_train)

# report the cross-validation MSE and score of each fitted pipeline
for i, model in enumerate(reg_pipelines):
    name = reg_pipelines_dict[i]
    # scikit-learn metrics take (y_true, y_pred); the value is unchanged for MSE,
    # but the documented order avoids surprises if the metric is ever swapped
    cval_mse = mean_squared_error(y_cval, model.predict(X_cval))
    print('\n{} mse: {}'.format(name, cval_mse))
    print('{} r2 score: {}'.format(name, model.score(X_cval, y_cval)))


Linear Regression mse: 5543.806800314607
Linear Regression r2 score: 0.6727863856209046

Decision Tree Regressor mse: 12009.005642330716
Decision Tree Regressor r2 score: 0.29118919852997305

Random Forest Regessor mse: 5636.345007024059
Random Forest Regessor r2 score: 0.6673244779144816


### Select best performing model

In [16]:
# select best performing model: the pipeline with the lowest MSE on the cross-validation set
# The running minimum is reset to +infinity here so the first candidate is always
# accepted and the cell gives the same answer when re-run. Starting from 0 made
# `cost < mse` never true, so index 0 was reported whatever the scores were.
mse = float('inf')
for i, model in enumerate(reg_pipelines):
    cost = mean_squared_error(y_cval, model.predict(X_cval))
    if cost < mse:
        mse = cost
        pipeline = model
        regressor = i
print('Best performing Regressor: {}'.format(reg_pipelines_dict[regressor]))
        

Best performing Regressor: Linear Regression
