In [134]:
from gplearn.genetic import SymbolicTransformer
from gplearn.fitness import make_fitness
from gplearn.functions import make_function

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path

In [135]:
import cufflinks as cf
cf.go_offline()

In [136]:
%run ../../SVR_library/SVR_MAPE_Library.ipynb

### Functions

In [137]:
# independent variables
def independent():
    # date dependent variables
    fecha = pd.DataFrame(consumo.index.year.values, index = consumo.index, columns = ["year"])
    fecha.year = fecha.year/fecha.year.max()
    fecha["week"] = consumo.index.week/53
    fecha["day"] = consumo.index.day/31
    fecha["dayofyear"] = consumo.index.dayofyear/366
    
    fecha["fin"]=consumo.index.weekday
    fecha["entre"]=consumo.index.weekday
    fecha["fin"]=fecha["fin"].replace([0,1,2,3,4,5,6],[.5,0,0,0,.5,1,1])
    fecha["entre"]=fecha["entre"].replace([0,1,2,3,4,5,6],[.5,1,1,1,.5,0,0])
    return fecha

In [138]:
# custom metric
def _mape(y, y_pred, w):
    diffs = np.abs(np.divide((np.maximum(0.001, y) - np.maximum(0.001, y_pred)),np.maximum(0.001, y)))
    
    return 100. * np.average(diffs, weights=w)

mape = make_fitness(_mape, greater_is_better=False)

### Load

In [139]:
path_consumo = Path().resolve().parents[1] / "Data" / "Data1.xlsx"
consumo = pd.read_excel(path_consumo)
consumo = consumo.set_index("fecha").loc["2007-01-01":"2020-03-30"]

In [140]:
# X1 =  independent()

In [223]:
t = np.arange(1, consumo.size+1).reshape(-1, 1)
t_norm = t/t.max()

In [142]:
X_train, X_test, y_train1, y_test = train_test_split(t, consumo, test_size = 365, shuffle = False)

scaler = MaxAbsScaler(); scaler.fit(X_train); X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

scaler1 = MaxAbsScaler(); scaler1.fit(y_train1)
y_train = scaler1.transform(y_train1).reshape(-1)

### model

In [171]:
function_set = ['add', 'sub', 'mul', 'div', 'sin', 'cos', 'abs']
model = SymbolicTransformer(generations=10, population_size=10000,
                         hall_of_fame=100, n_components=10, 
                         init_method = "half and half", p_hoist_mutation = 0.05,
                         p_crossover = 0.7, p_subtree_mutation = 0.2,
                         function_set=function_set, metric = mape,
                         parsimony_coefficient=0.0005,
                         max_samples=0.95, verbose=1,
                         random_state=0)
model.fit(X_train, y_train)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    18.85          14055.4        7          6.76783          6.71235      3.96m
   1     9.75          43.6029        6          6.73647          6.23201      2.84m
   2    10.67          55.5528        8          6.69382          6.17705      2.61m
   3    12.95          38.8569       10           6.5244           7.7643      2.36m
   4    14.72          28.5939       11          6.56778          6.94152      2.18m
   5    19.09          25.5569       11          6.52726          7.71013      1.89m
   6    22.48          21.4031       53          6.52099          7.51407      1.57m
   7    25.34          19.6536       72          6.51842          7.96595      1.16m
   8    32.67          16.4934       57          6.48652          6.58738  

SymbolicTransformer(const_range=(-1.0, 1.0), feature_names=None,
                    function_set=['add', 'sub', 'mul', 'div', 'sin', 'cos',
                                  'abs'],
                    generations=10, hall_of_fame=100, init_depth=(2, 6),
                    init_method='half and half', low_memory=False,
                    max_samples=0.95,
                    metric=<gplearn.fitness._Fitness object at 0x0000022F0C344E48>,
                    n_components=10, n_jobs=1, p_crossover=0.7,
                    p_hoist_mutation=0.05, p_point_mutation=0.01,
                    p_point_replace=0.05, p_subtree_mutation=0.2,
                    parsimony_coefficient=0.0005, population_size=10000,
                    random_state=0, stopping_criteria=1.0, tournament_size=20,
                    verbose=1, warm_start=False)

In [176]:
X_new = model.transform(X_train)
X_new = pd.DataFrame(X_new, index = X1.index)

In [221]:
X_new

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2007-01-01,0.988546,0.992925,0.985501,0.988980,0.975885,0.973842,0.974732,0.988227,0.977949,0.978139,0.989170,0.965701,0.977440,0.976941,0.987143,0.981177,0.989263,0.968596,0.980092,0.957758
2007-01-02,0.999532,0.998790,0.998018,0.999816,0.997069,0.997832,0.997769,0.999881,0.999682,0.999807,0.999644,0.998153,0.999802,1.000000,0.999912,0.998240,0.997053,0.997691,0.999943,0.998643
2007-01-03,0.983019,0.999863,0.969189,0.977655,0.997876,0.988297,0.964658,0.985321,0.989985,0.980968,0.982786,0.988040,0.978977,0.996769,0.995918,0.994891,0.984994,0.998579,0.992919,0.996523
2007-01-04,0.957519,0.989892,0.954972,0.961622,0.960064,0.988133,0.959887,0.984093,0.966346,0.963668,0.953216,0.963940,0.986161,0.989051,0.972056,0.974652,0.970970,0.964298,0.977398,0.953006
2007-01-05,0.960579,0.976661,0.964067,0.955074,0.954988,0.958679,0.980765,0.962252,0.955145,0.958790,0.954952,0.958775,0.963521,0.980521,0.962808,0.962572,0.956063,0.978446,0.981783,0.980372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-26,0.999993,1.000000,0.999983,0.999947,0.999980,0.999993,0.999963,0.999981,0.999998,0.999949,0.999947,0.999955,0.999975,0.999951,0.999946,0.999970,0.999947,0.999946,0.999951,0.999961
2020-03-27,0.999947,0.999947,0.999981,0.999998,0.999952,0.999977,0.999955,0.999990,0.999951,0.999962,0.999946,0.999999,0.999979,1.000000,0.999953,0.999997,0.999950,0.999981,0.999999,0.999994
2020-03-28,0.999957,0.999999,0.999985,0.999960,1.000000,0.999998,0.999994,0.999953,0.999993,0.999994,0.999980,0.999972,0.999993,0.999979,0.999958,0.999952,0.999990,0.999956,0.999995,0.999983
2020-03-29,0.999952,0.999948,0.999995,0.999948,0.999982,0.999962,0.999976,0.999999,0.999956,0.999986,0.999998,0.999999,0.999947,0.999965,0.999970,0.999958,0.999954,0.999964,0.999993,0.999955


In [177]:
df = pd.concat([X_new, X1], axis = 1)

In [178]:
df = df/df.max()

In [232]:
train_, test_ = train_test_split(X_norm, shuffle = False, test_size = 365)

### fit

In [233]:
hyp = {'kernel': 'rbf',
 'C': 1.5622596668986724,
 'epsilon': 0.01,
 'mu': 0.8894683237212756,
 'lmbda': 0.7147423174236999,
 'gamma': 0.009632563464115184
      }

run = SVR_general_cvxopt(**hyp)
run.fit(train_.to_numpy(), y_train)

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [181]:
pred = run.predict(train_)

In [182]:
pred = scaler1.inverse_transform(pred.reshape(-1, 1))

In [183]:
# plotting = pd.DataFrame(pred, index = y_test.index, columns = ["prediction"])
# plotting["real"] = y_test.values

In [184]:
plotting = pd.DataFrame(pred, index = y_train1.index, columns = ["prediction"])
plotting["real"] = scaler1.inverse_transform(y_train.reshape(-1, 1))

In [185]:
plotting[["real", "prediction"]].iplot()

In [156]:
plotting.iplot()

In [96]:
np.mean(np.abs((y_test - pred)/y_test))*100

MWh    5.878022
dtype: float64

In [117]:
np.mean()

array([[129416.53293563],
       [133105.64092507],
       [133699.5933646 ],
       ...,
       [190850.81401896],
       [173167.50688543],
       [172937.12312732]])

In [120]:
bob = scaler1.inverse_transform(y_train.reshape(-1, 1))

In [121]:
np.mean(np.abs((bob - pred)/bob))*100

4.779871727657652

# Load model

In [186]:
from pathlib import Path

In [195]:
path_to_model = str(Path().resolve().parent / "Models" / "GTransformer" / "GT_just_time_overfit.pkl")

In [196]:
with open(path_to_model, 'rb') as f:
    older = pickle.load(f)

In [224]:
X_new = older.transform(t_norm)
X_new = pd.DataFrame(X_new, index = X1.index)
X_new = X_new/X_new.max()

In [225]:
train_, test_ = train_test_split(X_new, shuffle = False, test_size = 365)

In [226]:
hyp = {'kernel': 'rbf',
 'C': 1.5622596668986724,
 'epsilon': 0.01,
 'mu': 0.8894683237212756,
 'lmbda': 0.7147423174236999,
 'gamma': 0.009632563464115184
      }

run = SVR_general_cvxopt(**hyp)
run.fit(train_.to_numpy(), y_train)

<__main__.SVR_general_cvxopt at 0x22f3349cc88>

In [227]:
pred = run.predict(train_)

In [228]:
pred = scaler1.inverse_transform(pred.reshape(-1, 1))

In [229]:
plotting = pd.DataFrame(pred, index = y_train1.index, columns = ["prediction"])
plotting["real"] = scaler1.inverse_transform(y_train.reshape(-1, 1))

In [230]:
plotting[["real", "prediction"]].iplot()