# 2. Modelling SVR (RBF kernel)

---

In [16]:
## Load modules; the modelling, MLflow-logging and save/load helpers are imported from modeling.functions
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVR, SVR
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime
import pickle
import os


from sklearn.model_selection import GridSearchCV

from modeling.functions import modelling, log_to_mlflow, get_features, save_models, load_models, save_results

In [3]:
# Seed for reproducible experiments (in this notebook it is only referenced by
# the commented-out random train/test split).
RSEED = 42

# Load the wind data with engineered features, parse TIMESTAMP as datetime and
# drop every row that contains a missing value. The original index labels are
# kept, so the index has gaps where rows were removed.
data = (
    pd.read_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', parse_dates=['TIMESTAMP'])
    .dropna()
)

# Column overview of the cleaned frame (dtypes and non-null counts).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

In [4]:
# One-hot encode the two categorical wind-direction columns; each category
# (e.g. NNW, SW) becomes its own indicator column and the source columns are
# consumed. Re-running this cell without reloading `data` therefore fails,
# because the source columns no longer exist.
wind_direction_columns = ['WD100CARD', 'WD10CARD']
data = pd.get_dummies(data, columns=wind_direction_columns)
data.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,...,WD10CARD_NNW,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,...,0,1,0,0,0,0,0,0,0,0
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,...,0,1,0,0,0,0,0,0,0,0
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,...,0,0,0,0,0,0,0,1,0,0
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,...,0,0,0,0,0,0,0,1,0,0


In [5]:
## train-test-split
# Chronological hold-out: rows up to and including the cutoff form the training
# set, rows strictly after it form the test set.
# Alternative (disabled): random split stratified by zone.
#data_train, data_test = train_test_split(data, test_size=0.25, random_state=RSEED, stratify=data.ZONEID)
split_cutoff = '2013-07-01 00:00:00'
data_train = data.loc[data['TIMESTAMP'] <= split_cutoff]
data_test = data.loc[data['TIMESTAMP'] > split_cutoff]

In [6]:
# Build the feature dictionary from the prepared frame.
# The training cell further down loops over its keys and passes each value to
# `modelling` as the feature set of one experiment (the recorded results show
# keys such as 'all', 'no_deg', 'no_deg_norm', ...).
# NOTE(review): the exact structure returned by get_features is defined in
# modeling.functions and is not visible here -- presumably a dict mapping a
# feature-set name to a list of column names; confirm.
feature_dict = get_features(data)

# Disabled single-feature-set selection from earlier experiments:
# features = feature_dict['all']

In [7]:
# # define zone
# zone = 1

In [8]:
# # split train and test data in feature and TARGETVAR parts and cut data to desired zones
# X_train = data_train[data_train.ZONEID == zone][features]
# y_train = data_train[data_train.ZONEID == zone].TARGETVAR

# X_test = data_test[data_test.ZONEID == zone][features]
# y_test = data_test[data_test.ZONEID == zone].TARGETVAR

In [9]:
# # Scale data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [10]:
# model = SVR()
# for ii in range(5, 6):
#     model = SVR(kernel='poly', degree=ii, C=1, cache_size=100)
#     model.fit(X_train, y_train)
#     y_pred_train = model.predict(X_train)
#     y_pred_train = [1 if value >= 1 else 0 if value <= 0 else value for value in y_pred_train]
#     y_pred = model.predict(X_test)
#     y_pred = [1 if value >= 1 else 0 if value <= 0 else value for value in y_pred]
#     print(ii,': train',mean_squared_error(y_train, y_pred_train, squared=False),
#             ', test:', mean_squared_error(y_test, y_pred, squared=False))

In [11]:

# model = SVR(kernel='rbf', gamma='auto', C=10)
# model.fit(X_train, y_train)
# y_pred_train = model.predict(X_train)
# y_pred_train = [1 if value >= 1 else 0 if value <= 0 else value for value in y_pred_train]
# y_pred = model.predict(X_test)
# y_pred = [1 if value >= 1 else 0 if value <= 0 else value for value in y_pred]
# print('train',mean_squared_error(y_train, y_pred_train, squared=False),
#         ', test:', mean_squared_error(y_test, y_pred, squared=False))

In [14]:
# Estimator and scaler that are handed to `modelling` for every feature set.
model = SVR()
scaler = MinMaxScaler()

# Hyper-parameter candidates for the grid search: RBF kernel only.
# A polynomial-kernel grid is kept here, disabled, for reference:
#     {'C': [0.1, 1, 5, 10], 'degree': [5, 6, 7], 'kernel': ['poly']}
param_grid = [
    {'C': [0.1, 1, 10, 100], 'kernel': ['rbf']},
]

# Keyword options that are identical for every `modelling` call.
modelling_options = dict(
    model=model,
    scaler=scaler,
    print_scores=True,
    log=True,
    infotext_mlflow=None,
    save_model=True,
    perform_gridCV=True,
    param_grid=param_grid,
    n_jobs=3,
)

# Outputs of `modelling`, stored per feature-set name.
model_dict = {}
results = {}
results_train = {}

for key, feature_set in feature_dict.items():
    print(f'Features: {key}')
    train_scores, test_scores, fitted_models = modelling(
        data_train, data_test, feature_set, **modelling_options
    )
    results_train[key] = train_scores
    model_dict[key] = fitted_models
    # Only the test scores are rounded (5 decimals); the train scores are
    # stored exactly as returned.
    results[key] = {zone: np.round(score, 5) for zone, score in test_scores.items()}

Features: all
Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.0

ZONEID 1
Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV] END ..................................C=0.1, kernel=rbf; total time=   3.5s
[CV] END ..................................C=0.1, kernel=rbf; total time=   3.7s
[CV] END ..................................C=0.1, kernel=rbf; total time=   3.7s
[CV] END ..................................C=0.1, kernel=rbf; total time=   3.0s
[CV] END ....................................C=1, kernel=rbf; total time=   3.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=   3.2s
[CV] END ....................................C=1, kernel=rbf; total time=   3.2s
[CV] END ....................................C=1, kernel=rbf; total time=   3.2s
[CV] END ....................................C=1, kernel=rbf; total time=   3.2s
[CV] END ....................................C=1, kernel=rbf; total time=   2.6s
Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.0

ZONEID 2
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END .......................

In [15]:
# Hand the collected models to the project helper for saving. Its return value
# is bound to a name and displayed as the cell output.
# NOTE(review): the returned string looks like the directory the models were
# written to -- confirm in modeling.functions.
saved_models_dir = save_models(model_dict)
saved_models_dir

'../saved_models/211203_1300_SVR'

In [17]:
# Store the train and test scores next to the saved models.
# NOTE(review): this path is a hand-copied literal matching the output of the
# save_models cell above; it will point at the old directory if that cell is
# re-run and returns a different path -- confirm before re-running.
results_dir = '../saved_models/211203_1300_SVR'
save_results(results_train, results, results_dir)

In [18]:
results

{'all': {'ZONE1': 0.18379,
  'ZONE2': 0.17131,
  'ZONE3': 0.15374,
  'ZONE4': 0.17472,
  'ZONE5': 0.17509,
  'ZONE6': 0.19027,
  'ZONE7': 0.1453,
  'ZONE8': 0.19496,
  'ZONE9': 0.16548,
  'ZONE10': 0.20816,
  'TOTAL': 0.17724},
 'no_deg': {'ZONE1': 0.18327,
  'ZONE2': 0.17138,
  'ZONE3': 0.15346,
  'ZONE4': 0.17465,
  'ZONE5': 0.17519,
  'ZONE6': 0.18982,
  'ZONE7': 0.14497,
  'ZONE8': 0.19516,
  'ZONE9': 0.16502,
  'ZONE10': 0.20796,
  'TOTAL': 0.17705},
 'no_deg_norm': {'ZONE1': 0.18302,
  'ZONE2': 0.17136,
  'ZONE3': 0.15347,
  'ZONE4': 0.17486,
  'ZONE5': 0.17527,
  'ZONE6': 0.18985,
  'ZONE7': 0.1445,
  'ZONE8': 0.19519,
  'ZONE9': 0.16464,
  'ZONE10': 0.20778,
  'TOTAL': 0.17697},
 'no_deg_norm_U10V10': {'ZONE1': 0.18371,
  'ZONE2': 0.17187,
  'ZONE3': 0.15391,
  'ZONE4': 0.17505,
  'ZONE5': 0.17594,
  'ZONE6': 0.19059,
  'ZONE7': 0.14549,
  'ZONE8': 0.19546,
  'ZONE9': 0.16494,
  'ZONE10': 0.20789,
  'TOTAL': 0.17744},
 'no_deg_norm_WS10': {'ZONE1': 0.18434,
  'ZONE2': 0.17218,


In [None]:
# Flatten the nested {feature set: {zone: score}} dictionaries into one row per
# (feature set, zone) pair, with the train and test score side by side.
# `results_train` drives the iteration; `results` must contain the same keys.
score_records = [
    {
        'features': key,
        'zone': zone,
        'train_score': train_value,
        'test_score': results[key][zone],
    }
    for key, zone_scores in results_train.items()
    for zone, train_value in zone_scores.items()
]
# Explicit column list keeps the CSV header stable even when there are no rows.
df = pd.DataFrame(score_records, columns=['features', 'zone', 'train_score', 'test_score'])

# NOTE(review): this directory ('211201_1220_SVR') differs from the one passed
# to save_results earlier in the notebook ('211203_1300_SVR'), and this export
# presumably overlaps with what save_results already writes -- confirm which
# run these scores belong to before executing.
path = '../saved_models/211201_1220_SVR'
file_name = os.path.basename(path) + '.csv'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)
df.to_csv(os.path.join(path, file_name), index=False)