# 2. Modelling SVR (polynomial and RBF kernels)

---

In [12]:
## Load modules; the project helpers (modelling, MLflow logging, feature selection, model saving/loading) are imported from modeling.functions
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVR, SVR
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler


from sklearn.model_selection import GridSearchCV

from modeling.functions import modelling, log_to_mlflow, get_features, save_models, load_models

In [2]:
# Seed reused by the randomised steps further down (e.g. the train/test split).
RSEED = 42

# Read the feature-augmented GEFCom2014 wind data, parsing TIMESTAMP as a
# datetime column, and drop every row containing a missing value. Chaining
# `.dropna()` avoids mutating the frame in place, so re-running this cell
# always starts from the file contents.
data = pd.read_csv(
    '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv',
    parse_dates=['TIMESTAMP'],
).dropna()

# Print column names, dtypes and non-null counts of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

In [3]:
# One-hot encode the two *CARD columns (compass-direction labels such as NW or
# WSW, judging by the generated column names); all other columns pass through.
cardinal_columns = ['WD100CARD', 'WD10CARD']
data = pd.get_dummies(data=data, columns=cardinal_columns)
data.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,...,WD10CARD_NNW,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,...,0,1,0,0,0,0,0,0,0,0
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,...,0,1,0,0,0,0,0,0,0,0
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,...,0,0,0,0,0,0,0,1,0,0
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,...,0,0,0,0,0,0,0,1,0,0


In [4]:
## train-test-split
# Hold out 25 % of the rows. Stratifying on ZONEID keeps every zone's share of
# rows equal in both partitions; RSEED makes the shuffle repeatable.
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    stratify=data['ZONEID'],
    random_state=RSEED,
)

In [5]:
# define features and feature dict
# get_features is a project helper from modeling.functions; from the way the
# result is indexed here it presumably returns a dict mapping feature-set names
# to lists of column names — TODO confirm against the helper.
feature_dict = get_features(data)

# Feature set stored under the 'all' key, used for the single-zone experiments.
features = feature_dict['all']

In [6]:
# define zone
# ZONEID value used to subset the train and test partitions for the
# single-zone experiments that follow.
zone = 1

In [7]:
# Restrict both partitions to the selected zone once, then separate the
# feature columns (X) from the TARGETVAR column (y).
zone_train = data_train.loc[data_train['ZONEID'] == zone]
zone_test = data_test.loc[data_test['ZONEID'] == zone]

X_train, y_train = zone_train[features], zone_train['TARGETVAR']
X_test, y_test = zone_test[features], zone_test['TARGETVAR']

In [8]:
# Scale data
# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Fit a polynomial-kernel SVR on the selected zone and print train/test RMSE.
# The range currently covers degree 5 only; widen it to sweep more degrees.
for degree in range(5, 6):
    model = SVR(kernel='poly', degree=degree, C=1, cache_size=100)
    model.fit(X_train, y_train)

    # Clamp predictions into [0, 1] before scoring (presumably because
    # TARGETVAR is a normalised quantity — TODO confirm). np.clip does this
    # in one vectorised call instead of a Python-level loop.
    y_pred_train = np.clip(model.predict(X_train), 0, 1)
    y_pred = np.clip(model.predict(X_test), 0, 1)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases,
    # whereas this form works on every version.
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    print(degree, ': train', rmse_train, ', test:', rmse_test)

5 : train 0.15264726030869183 , test: 0.1649081581158808


In [72]:

# Fit an RBF-kernel SVR on the selected zone and print train/test RMSE.
model = SVR(kernel='rbf', gamma='auto', C=10)
model.fit(X_train, y_train)

# Clamp predictions into [0, 1] before scoring (presumably because TARGETVAR
# is a normalised quantity — TODO confirm); np.clip replaces the Python loop.
y_pred_train = np.clip(model.predict(X_train), 0, 1)
y_pred = np.clip(model.predict(X_test), 0, 1)

# RMSE as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn
# releases, while this form works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print('train', rmse_train, ', test:', rmse_test)

train 0.14915516924468272 , test: 0.16193507617770822


In [74]:
# Base estimator and scaler handed to the project's `modelling` helper.
model = SVR()
scaler = MinMaxScaler()

# Containers keyed by feature-set name; they receive the three values returned
# by `modelling` (presumably train scores, test scores and fitted models per
# zone — TODO confirm against the helper).
results_train, results, model_dict = {}, {}, {}

# Two sub-grids sharing the same C values: polynomial kernels of degree 5-7
# (12 candidates) and the RBF kernel (4 candidates).
c_values = [0.1, 1, 5, 10]
param_grid = [
    {'C': c_values, 'degree': [5, 6, 7], 'kernel': ['poly']},
    {'C': c_values, 'kernel': ['rbf']},
]

key = 'all'
print(f'Features: {key}')

train_scores, test_scores, fitted_models = modelling(
    data_train,
    data_test,
    feature_dict[key],
    model=model,
    scaler=scaler,
    print_scores=True,
    log=None,
    infotext_mlflow=None,
    save_models=True,
    perform_gridCV=True,
    param_grid=param_grid,
)

results_train[key] = train_scores
model_dict[key] = fitted_models
# Keep five decimals of the second returned score dict for a compact display.
results[key] = {name: np.round(score, 5) for name, score in test_scores.items()}

Features: all
Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.02

ZONEID 1
Fitting 5 folds for each of 16 candidates, totalling 80 fits




[CV] END .......................C=0.1, degree=5, kernel=poly; total time=   5.1s
[CV] END .......................C=0.1, degree=5, kernel=poly; total time=   5.1s
[CV] END .......................C=0.1, degree=5, kernel=poly; total time=   5.2s
[CV] END .......................C=0.1, degree=6, kernel=poly; total time=   5.3s
[CV] END .......................C=0.1, degree=5, kernel=poly; total time=   5.3s
[CV] END .......................C=0.1, degree=5, kernel=poly; total time=   5.4s
[CV] END .......................C=0.1, degree=6, kernel=poly; total time=   5.6s
[CV] END .......................C=0.1, degree=6, kernel=poly; total time=   6.0s
[CV] END .......................C=0.1, degree=6, kernel=poly; total time=   4.8s
[CV] END .......................C=0.1, degree=6, kernel=poly; total time=   5.0s
[CV] END .......................C=0.1, degree=7, kernel=poly; total time=   5.3s
[CV] END .......................C=0.1, degree=7, kernel=poly; total time=   5.5s
[CV] END ...................

In [75]:
# Display the rounded scores per zone (plus TOTAL) for each feature set.
results

{'all': {'ZONE1': 0.16469,
  'ZONE2': 0.1368,
  'ZONE3': 0.14366,
  'ZONE4': 0.1656,
  'ZONE5': 0.1691,
  'ZONE6': 0.16831,
  'ZONE7': 0.12504,
  'ZONE8': 0.15403,
  'ZONE9': 0.15276,
  'ZONE10': 0.19288,
  'TOTAL': 0.15835}}

KNN regressor scores per zone, for comparison with the SVR results above:\
'all': {'ZONE1': 0.15672,\
  'ZONE2': 0.13847,\
  'ZONE3': 0.14915,\
  'ZONE4': 0.17287,\
  'ZONE5': 0.17664,\
  'ZONE6': 0.1737,\
  'ZONE7': 0.13239,\
  'ZONE8': 0.15114,\
  'ZONE9': 0.15061,\
  'ZONE10': 0.19886,\
  'TOTAL': 0.1612},

In [76]:
# Display the fitted search objects stored per zone for each feature set.
model_dict

{'all': {1: GridSearchCV(estimator=SVR(), n_jobs=-1,
               param_grid=[{'C': [0.1, 1, 5, 10], 'degree': [5, 6, 7],
                            'kernel': ['poly']},
                           {'C': [0.1, 1, 5, 10], 'kernel': ['rbf']}],
               scoring='neg_root_mean_squared_error', verbose=2),
  2: GridSearchCV(estimator=SVR(), n_jobs=-1,
               param_grid=[{'C': [0.1, 1, 5, 10], 'degree': [5, 6, 7],
                            'kernel': ['poly']},
                           {'C': [0.1, 1, 5, 10], 'kernel': ['rbf']}],
               scoring='neg_root_mean_squared_error', verbose=2),
  3: GridSearchCV(estimator=SVR(), n_jobs=-1,
               param_grid=[{'C': [0.1, 1, 5, 10], 'degree': [5, 6, 7],
                            'kernel': ['poly']},
                           {'C': [0.1, 1, 5, 10], 'kernel': ['rbf']}],
               scoring='neg_root_mean_squared_error', verbose=2),
  4: GridSearchCV(estimator=SVR(), n_jobs=-1,
               param_grid=[{'C': [0.1,

In [84]:
# Print the refit best estimator selected by the grid search for every zone.
# Dedicated loop names are used so that the notebook-level `key` (the
# feature-set name set in the grid-search cell) is not overwritten here.
print('Best parameters')
for zone_id, search in model_dict['all'].items():
    print(f'Zone: {zone_id}: {search.best_estimator_}')

Best parameters
Zone: 1: SVR(C=10)
Zone: 2: SVR(C=10)
Zone: 3: SVR(C=10)
Zone: 4: SVR(C=10)
Zone: 5: SVR(C=10)
Zone: 6: SVR(C=10)
Zone: 7: SVR(C=10)
Zone: 8: SVR(C=5, degree=6, kernel='poly')
Zone: 9: SVR(C=10)
Zone: 10: SVR(C=10)


In [103]:
# List the names of the available feature sets.
feature_dict.keys()

dict_keys(['all', 'no_deg', 'no_deg_norm', 'no_deg_norm_U10V10', 'no_deg_norm_WS10', 'no_comp', 'no_comp_plus_100Norm', 'no_deg_comp', 'no_ten', 'no_card', 'no_deg_comp_ten'])

In [125]:
# Copy the outer dict AND each inner {zone: model} dict. A plain
# `model_dict.copy()` is shallow: the inner dicts would be shared, so
# reassigning entries of the backup would also overwrite `model_dict`.
# The model objects themselves are not duplicated.
model_dict_bak = {
    feature_key: dict(zone_models)
    for feature_key, zone_models in model_dict.items()
}

In [131]:
# Replace every fitted search object in the backup by its refit best estimator.
# A new nested dict is built instead of assigning into the existing inner
# dicts, so `model_dict` is left untouched even if the backup shares its inner
# dicts with it. The comprehension also keeps the notebook globals `key` and
# `zone` from being overwritten by loop variables.
# Entries without a `best_estimator_` attribute (e.g. already reduced by an
# earlier run of this cell) are kept as they are, which makes the cell safe
# to re-run.
model_dict_bak = {
    feature_key: {
        zone_id: getattr(fitted, 'best_estimator_', fitted)
        for zone_id, fitted in zone_models.items()
    }
    for feature_key, zone_models in model_dict_bak.items()
}

In [150]:
# Pick the first feature set and its first zone, then show the class name of
# the model stored there (used to check what the backup dict now contains).
k1 = [*model_dict_bak][0]
k2 = [*model_dict_bak[k1]][0]
m = model_dict_bak[k1][k2]
type(m).__name__

'SVR'

In [165]:
import os

# Show the working directory, because the path below is relative to it.
print(os.getcwd())

# makedirs creates any missing parent directories, and exist_ok=True turns a
# re-run into a no-op. os.mkdir would raise if '../saved_models/test' is
# missing or if 'test1' already exists.
os.makedirs('../saved_models/test/test1', exist_ok=True)

/Users/fklein/neuefische/capstone/Capstone_WindPowerPredicting/notebooks


In [169]:
# Persist the reduced per-zone models with the project helper `save_models`.
# NOTE(review): the output location is decided inside the helper — presumably
# a new folder under ../saved_models; confirm against modeling.functions.
save_models(model_dict_bak)

In [173]:
# Check what was written to the saved-models folder.
# NOTE(review): the folder name is hard-coded and looks like a timestamp plus
# model name from one particular run — it will likely not exist after a re-run.
os.listdir('../saved_models/211130_1547_SVR')

['all']

In [174]:
# List the contents of the current working directory.
os.listdir()

['2_modelling_svr_rbf_ferdinand.ipynb',
 'aemo_download.ipynb',
 '2_Modelling_Jerome.ipynb',
 '1_EDA_Moritz_2.ipynb',
 'eda_ferdinand.ipynb',
 '1_EDA_Jerome.ipynb',
 '2_Modelling_Random_Forest.ipynb',
 '1_EDA_Moritz.ipynb',
 '__pycache__',
 '1_Modelling_Jerome.ipynb',
 '2_Baseline_model.ipynb',
 'mlruns',
 '2_modelling_svr_linear_msp.ipynb',
 '3_Modelling_LightGBM_Jerome.ipynb',
 '2_MODELING_Moritz.ipynb',
 '.ipynb_checkpoints',
 '0_concatenate_train_test.ipynb',
 '1_EDA_c.ipynb',
 '2_modelling_knn_regressor.ipynb',
 '2_modelling_ferdinand.ipynb',
 'EDA-and-modeling.ipynb']