In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced

import numpy as np
import pandas as pd
import os
from collections import Counter



In [0]:
os.chdir('/content/gdrive/My Drive/thesis')

In [0]:
data = pd.read_pickle('master_dataset_24112019.pkl')

In [0]:
data.columns

Index(['de', 'dk1', 'dk2', 'no2', 'se3', 'se4', 'system', 'CentralProd_dk1',
       'ElectricBoilerCon_dk1', 'ExchangeContinent_dk1',
       'ExchangeGreatBelt_dk1', 'ExchangeNordicCountries_dk1', 'GrossCon_dk1',
       'LocalPowerProd_dk1', 'NetCon_dk1', 'OnshoreWindPower_dk1',
       'SolarPowerProd_dk1', 'CentralProd_dk2', 'ElectricBoilerCon_dk2',
       'ExchangeContinent_dk2', 'ExchangeNordicCountries_dk2', 'GrossCon_dk2',
       'LocalPowerProd_dk2', 'NetCon_dk2', 'OnshoreWindPower_dk2',
       'SolarPowerProd_dk2', 'sun', 'temp', 'wind', 'hour', 'weekday',
       'Offshore_dk1', 'Offshore_dk2'],
      dtype='object')

In [0]:
#data = data.drop(['NetCon_dk2', 'NetCon_dk1'], axis=1)

In [0]:
target = pd.DataFrame()
target['1_h'] = data['dk2'].shift(-1)

In [0]:
features = data.iloc[:-1, :]
target = target[:-1]


In [0]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean.fit(target)


SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [0]:
target_imp = imp_mean.transform(target)

In [0]:
print('low_price_cutoff:', np.percentile(target_imp, 5), 'high_price_cutoff:', np.percentile(target_imp, 95))

low_price_cutoff: 97.44 high_price_cutoff: 444.3925


In [0]:
def convert_to_bin(x):
  if x < 97.44:
    return -1
  elif x > 444.39:
    return 0
  else:
    return 1
  

In [0]:
target = pd.DataFrame(target_imp, index=target.index, columns=target.columns)

In [0]:
target['1_h'] = target['1_h'].apply(lambda x: convert_to_bin(x))

In [0]:
imp_mean2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean2.fit(features)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [0]:
features = imp_mean2.transform(features)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [0]:
X_smote, y_smote = SMOTE().fit_sample(features, target)

  y = column_or_1d(y, warn=True)


In [0]:
smote_pipeline = make_pipeline_imb(SMOTE(), RandomForestClassifier())
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [0]:
def print_results(headline, true_value, pred):
  print(headline)
  print(f"accuracy: {accuracy_score(true_value, pred)}")
  #print(f"precision: {precision_score(true_value, pred)}")
  #print(f"recall: {recall_score(true_value, pred)}")
  #print(f"f1: {f1_score(true_value, pred)}")

In [0]:
print(classification_report_imbalanced(y_test, smote_prediction))

                   pre       rec       spe        f1       geo       iba       sup

         -1       0.72      0.92      0.98      0.81      0.95      0.90       844
          0       0.67      0.88      0.98      0.76      0.93      0.85       860
          1       0.99      0.96      0.90      0.97      0.93      0.86     15315

avg / total       0.96      0.95      0.91      0.95      0.93      0.86     17019



In [0]:
print(f"SMOTE score {smote_pipeline.score(X_test, y_test)}")
print_results("SMOTE CLASSIFICATION", y_test, smote_prediction)

SMOTE score 0.9499970621070568
SMOTE CLASSIFICATION
accuracy: 0.9499970621070568


In [0]:
confusion_matrix(y_test, smote_prediction)

array([[  777,     0,    67],
       [    0,   755,   105],
       [  304,   375, 14636]])

In [0]:
print(Counter(y_smote))

Counter({1: 61268, 0: 61268, -1: 61268})


In [0]:
smote_model2 = RandomForestClassifier()
x_smote_train, x_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote)
smote_model2.fit(x_smote_train, y_smote_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred = smote_model2.predict(x_smote_test)
confusion_matrix(y_smote_test, y_pred)

array([[15205,     0,    51],
       [    0, 15231,    66],
       [  311,   429, 14658]])

In [0]:
real_pred = smote_model2.predict(X_test)

In [0]:
confusion_matrix(real_pred, y_test)

array([[  841,     0,   117],
       [    0,   857,   129],
       [    3,     3, 15069]])

In [0]:
print(f"Precision, low prices {882/(882+97)}")
print(f"Precision, high prices {859/(859+149)}")
print(f"Precision, normal prices {15028/(15028+4)}")
      

Precision, low prices 0.9009193054136875
Precision, high prices 0.8521825396825397
Precision, normal prices 0.9997339010111762


+24 hours

In [0]:
target = pd.DataFrame()
target['24_h'] = data['dk2'].shift(-24)

In [0]:
features = data.iloc[:-24, :]
target = target[:-24]

In [0]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean.fit(target)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [0]:
target_imp = imp_mean.transform(target)

In [0]:
target = pd.DataFrame(target_imp, index=target.index, columns=target.columns)

In [0]:
target['24_h'] = target['24_h'].apply(lambda x: convert_to_bin(x))

In [0]:
imp_mean2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean2.fit(features)
features = imp_mean2.transform(features)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [0]:
X_smote, y_smote = SMOTE().fit_sample(features, target)

  y = column_or_1d(y, warn=True)


In [0]:
smote_model3 = RandomForestClassifier()
x_smote_train, x_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote)
smote_model3.fit(x_smote_train, y_smote_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred = smote_model3.predict(x_smote_test)
confusion_matrix(y_smote_test, y_pred)

array([[15187,     3,   113],
       [    5, 15043,   138],
       [  483,   597, 14365]])

In [0]:
real_pred = smote_model3.predict(X_test)

In [0]:
confusion_matrix(real_pred, y_test)

array([[  874,     0,   144],
       [    0,   828,   190],
       [   11,     6, 14961]])

In [0]:
print(f"Precision, low prices {811/(811+141)}")
print(f"Precision, high prices {809/(809+167)}")
print(f"Precision, normal prices {15079/(15079+6+9)}")

Precision, low prices 0.851890756302521
Precision, high prices 0.8288934426229508
Precision, normal prices 0.9990062276401219


+48 Hours

In [0]:
target = pd.DataFrame()
target['48_h'] = data['dk2'].shift(-48)

In [0]:
features = data.iloc[:-48, :]
target = target[:-48]

In [0]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean.fit(target)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [0]:
target_imp = imp_mean.transform(target)

In [0]:
target = pd.DataFrame(target_imp, index=target.index, columns=target.columns)

In [0]:
target['48_h'] = target['48_h'].apply(lambda x: convert_to_bin(x))

In [0]:
imp_mean2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean2.fit(features)
features = imp_mean2.transform(features)

In [0]:
X_smote, y_smote = SMOTE().fit_sample(features, target)

  y = column_or_1d(y, warn=True)


In [0]:
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote)
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [0]:
smote_model4 = RandomForestClassifier()
x_smote_train, x_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote)
smote_model4.fit(x_smote_train, y_smote_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred = smote_model3.predict(x_smote_test)
confusion_matrix(y_smote_test, y_pred)

array([[10041,    74,  5208],
       [   61, 10189,  5009],
       [  514,   539, 14286]])

In [0]:
real_pred = smote_model4.predict(X_test)

In [0]:
confusion_matrix(real_pred, y_test)

array([[  850,     0,   111],
       [    0,   822,   157],
       [   11,     5, 15052]])

In [0]:
print(f"Precision, low prices {794/(794+116)}")
print(f"Precision, high prices {799/(799+179)}")
print(f"Precision, normal prices {15101/(15101+6+13)}")

Precision, low prices 0.8725274725274725
Precision, high prices 0.8169734151329243
Precision, normal prices 0.9987433862433862


In [0]:
import pickle 
filehandler = open('/content/gdrive/My Drive/thesis/rf+1.pickle', 'wb') 
pickle.dump(smote_model2, filehandler)

filehandler = open('/content/gdrive/My Drive/thesis/rf+24.pickle', 'wb') 
pickle.dump(smote_model3, filehandler)

filehandler = open('/content/gdrive/My Drive/thesis/rf+48.pickle', 'wb') 
pickle.dump(smote_model4, filehandler)

In [0]:

filehandler = open('/content/gdrive/My Drive/thesis/rf+48.pickle', 'wb') 
pickle.dump(smote_model4, filehandler)