In [309]:
from EAExperiment import EAExperiment
import pandas as pd
import functools
import numpy as np
from metrics import evaluate
import math

from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [310]:
uniform(loc=2, scale=15)

<scipy.stats._distn_infrastructure.rv_frozen at 0x7fa4082b7860>

In [311]:
# Decission tree
#distributions_dt = dict(max_depth=list(range(2,16)))
regrt_model = DecisionTreeRegressor(max_depth=5)

# AdaBoost with Decission tree
#distributions_ada = {'estimator__base_estimator__max_depth':list(range(2,16))}
ada_regrt_model = MultiOutputRegressor(AdaBoostRegressor(DecisionTreeRegressor(),
                          n_estimators=300), n_jobs=-1)

ada_regrt_chain_model = RegressorChain(AdaBoostRegressor(DecisionTreeRegressor(max_depth=5),
                          n_estimators=300))

# Gradient boosting
grboost_model = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=300,loss='ls', learning_rate=0.1,
                                                               max_depth=5), n_jobs=-1)
grboost_chain_model = RegressorChain(GradientBoostingRegressor(n_estimators=300,loss='ls', learning_rate=0.1,
                                                               max_depth=5))
# Gaussian process
kernel = DotProduct() + WhiteKernel()
gausspr_model = MultiOutputRegressor(GaussianProcessRegressor(kernel=kernel), n_jobs=-1)
gausspr_chain_model = RegressorChain(GaussianProcessRegressor(kernel=kernel))

#Random forest
rand_forest_model = RandomForestRegressor(n_estimators=300,max_depth=5)

models = {
    "decision_tree_regressor": regrt_model,
    "adaboost_tree_regressor": ada_regrt_model,
    "adaboost_tree_regressor_chain":ada_regrt_chain_model,
    "gaussian_process_regressor": gausspr_model,
    "gaussian_process_regressor_chain":gausspr_chain_model,
    "gradient_boost": grboost_model,
    "gaussian_process_regressor_chain":grboost_chain_model,
    "random_forest": rand_forest_model
}

In [312]:
def create_propositional_table_dataframe(df,w, target, h):
    columns =[]
    for i in range(w,0,-1):
        columns.extend([s + "_lag"+str(i-1) for s in df.columns])
    for t in target:
        for j in range(h):
            columns.append(t+"_ahead"+str(j+1))
    dataframe = pd.DataFrame(columns=columns)
    return dataframe
    
def create_propositional_table(df, w, h, target):
    columns =[]
    for i in range(w,0,-1):
        columns.extend([s + "_lag"+str(i-1) for s in df.columns])
    for t in target:
        for j in range(h):
            columns.append(t+"_ahead"+str(j+1))
    dataframe = pd.DataFrame(columns=columns)
    
    indexes = []
    for i in range((len(df)-w-h+1)):
        window = df.iloc[i:(i+w)]
        row = window.values.reshape(1, len(window.columns)*len(window))
        targets = {}
        for t in target:
            row = np.append(row, df[t].iloc[(i+w):(i+w+h)])
            
        dataframe.loc[i]=row.reshape(1, len(row))[0]
        indexes.append(window.index[-1])
    
    dataframe = dataframe.set_index(pd.Series(indexes))
    
    return dataframe

In [313]:
def calculate_metrics(y_test, y_pred, target, model_name, metrics = ('mae', 'mape', 'rmse','mse')):
    horizons = y_test.columns.values
    index_horizons = np.append(horizons,target+"_mean")
    index = [np.array([model_name for i in range(7)]), index_horizons]
    metrics = pd.DataFrame(evaluate(y_test, y_pred, metrics=metrics))
    metrics.loc[6] = metrics.values.mean(axis=0)
    metrics = metrics.set_index(index)
    
    return metrics

In [314]:
def execute_baseline(df_propositional_table, models, target, train_split=0.75):
    tran_split = int(df_propositional_table.shape[0]*0.75)

    X_train = df_propositional_table.iloc[:tran_split, :-6]
    y_train = df_propositional_table.iloc[:tran_split, -6:]

    X_test = df_propositional_table.iloc[tran_split:, :-6]
    y_test = df_propositional_table.iloc[tran_split:, -6:]
    
    test_metrics_global = None
    train_metrics_global = None
    
    results_global = None
    for name, model in models.items():
        
        best_model = None
        last_mean_mae = float('inf')
        print("Training "+name+"....")
        rkf = RepeatedKFold(n_splits=2, n_repeats=5)
        for train_index, test_index in rkf.split(X_train):
            X_train_fold, X_test_fold = X_train.iloc[train_index,:], X_train.iloc[test_index,:]
            y_train_fold, y_test_fold = y_train.iloc[train_index,:], y_train.iloc[test_index,:]
            model.fit(X_train_fold,y_train_fold)
            mean_mae = calculate_metrics(y_test_fold, model.predict(X_test_fold), target, name)["mae"][6]
            if mean_mae < last_mean_mae:
                best_model = model
        
        test_pred = best_model.predict(X_test)
        train_pred = best_model.predict(X_train)
        
        test_metrics = calculate_metrics(y_test, test_pred, target, name)
        train_metrics = calculate_metrics(y_train, train_pred, target, name)
        
        if test_metrics_global is None and train_metrics_global is None:
            test_metrics_global = test_metrics
            train_metrics_global = train_metrics
        else:
            test_metrics_global = test_metrics_global.append(test_metrics)
            train_metrics_global = train_metrics_global.append(train_metrics)
        
        test_pred = pd.DataFrame(test_pred, columns=y_test.columns, index=y_test.index)
        results_model = pd.concat({"Real": y_test, "Pred": test_pred}, axis=1, names=["Type", "Horizon"])
        results = pd.concat({name: results_model}, axis=1, names=["Model", "Type", "Horizon"])
        if results_global is None:
            results_global = results
        else:
            results_global = pd.concat([results_global, results], axis=1, join='inner')
            
        
    return train_metrics_global, test_metrics_global, results_global
            

In [315]:
def execute(df, window, horizon, targets, models, train_split=0.75):
    for target in targets:
        print("============== "+target+" ==============")
        df_propositional = create_propositional_table(df, 12, 6, [target])
        train_metrics, test_metrics, results = execute_baseline(df_propositional, models, target, train_split)
        train_metrics.to_pickle("metrics/train_metrics_"+target)
        test_metrics.to_pickle("metrics/test_metrics_"+target)
        results.to_pickle("results/results_"+target)

In [351]:
df = pd.read_csv('event_table_selected_PESO20.csv')
df_no_selected = pd.read_csv('data.csv')
df = df.set_index("PERIODO")
df_no_selected = df_no_selected.set_index("PERIODO")
df["PESO_20"] = df_no_selected["PESO_20"]

In [292]:
%%time
execute(df, 12, 6, ["PESO_7", "PESO_8", "PESO_20"], models)

Training decision_tree_regressor....
Training adaboost_tree_regressor....
Training adaboost_tree_regressor_chain....
Training gaussian_process_regressor....
Training gaussian_process_regressor_chain....
Training gradient_boost....
Training random_forest....
Training decision_tree_regressor....
Training adaboost_tree_regressor....
Training adaboost_tree_regressor_chain....
Training gaussian_process_regressor....
Training gaussian_process_regressor_chain....
Training gradient_boost....
Training random_forest....
Training decision_tree_regressor....
Training adaboost_tree_regressor....
Training adaboost_tree_regressor_chain....
Training gaussian_process_regressor....
Training gaussian_process_regressor_chain....
Training gradient_boost....
Training random_forest....
CPU times: user 3min 48s, sys: 777 ms, total: 3min 48s
Wall time: 4min 25s


In [347]:
results_p7 = pd.read_pickle("results/results_PESO_7")

In [359]:
metrics_p7 = pd.read_pickle("metrics3/test_metrics_PESO_7")

In [360]:
metrics_p7#.loc["gaussian_process_regressor_chain"]

Unnamed: 0,Unnamed: 1,mae,mape,rmse,mse
decision_tree_regressor,PESO_7_ahead1,2519328.0,0.871162,3441350.0,11842890000000.0
decision_tree_regressor,PESO_7_ahead2,3173852.0,0.769528,3901300.0,15220140000000.0
decision_tree_regressor,PESO_7_ahead3,3926146.0,0.58382,5154895.0,26572940000000.0
decision_tree_regressor,PESO_7_ahead4,5025092.0,0.739654,6345557.0,40266090000000.0
decision_tree_regressor,PESO_7_ahead5,4551259.0,0.664463,5995425.0,35945120000000.0
decision_tree_regressor,PESO_7_ahead6,4131102.0,1.882354,4584213.0,21015010000000.0
decision_tree_regressor,PESO_7_mean,3887796.0,0.918497,4903790.0,25143700000000.0
adaboost_tree_regressor,PESO_7_ahead1,2603512.0,0.476844,3833479.0,14695560000000.0
adaboost_tree_regressor,PESO_7_ahead2,3381675.0,0.55299,4522002.0,20448500000000.0
adaboost_tree_regressor,PESO_7_ahead3,4233586.0,0.579497,5608208.0,31452000000000.0


In [352]:
%%time
execute(df, 12, 6, ["PESO_20"], models)

Training decision_tree_regressor....
Training adaboost_tree_regressor....
Training adaboost_tree_regressor_chain....
Training gaussian_process_regressor....
Training gaussian_process_regressor_chain....
Training gradient_boost....
Training random_forest....
CPU times: user 50.7 s, sys: 309 ms, total: 51 s
Wall time: 1min


In [354]:
metrics = pd.read_pickle("metrics/test_metrics_PESO_20")
metrics.to_excel('metrics/test_metrics_PESO_20.xlsx')

In [301]:
results_p7

Model,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,decision_tree_regressor,...,random_forest,random_forest,random_forest,random_forest,random_forest,random_forest,random_forest,random_forest,random_forest,random_forest
Type,Pred,Pred,Pred,Pred,Pred,Pred,Real,Real,Real,Real,...,Pred,Pred,Pred,Pred,Real,Real,Real,Real,Real,Real
Horizon,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,...,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6
Ago/17,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,866668.0,1182016.0,1300326.0,3265438.0,...,2974239.0,2165487.0,2426601.0,2318915.0,866668.0,1182016.0,1300326.0,3265438.0,10891638.0,4426497.0
Sep/17,9780713.0,2429763.0,1459708.0,1421997.0,676069.0,1061285.0,1182016.0,1300326.0,3265438.0,10891638.0,...,2883173.0,2496621.0,2568816.0,2345645.0,1182016.0,1300326.0,3265438.0,10891638.0,4426497.0,3188769.0
Oct/17,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,1300326.0,3265438.0,10891638.0,4426497.0,...,2845402.0,2539683.0,2349065.0,2005625.0,1300326.0,3265438.0,10891638.0,4426497.0,3188769.0,2974767.0
Nov/17,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,3265438.0,10891638.0,4426497.0,3188769.0,...,2977158.0,2487952.0,2481128.0,2094912.0,3265438.0,10891638.0,4426497.0,3188769.0,2974767.0,4204833.0
Dic/17,563552.0,934037.0,8356420.0,866668.0,1182016.0,1300326.0,10891638.0,4426497.0,3188769.0,2974767.0,...,3086795.0,2601858.0,2469998.0,2101953.0,10891638.0,4426497.0,3188769.0,2974767.0,4204833.0,4927018.0
Ene/18,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,4426497.0,3188769.0,2974767.0,4204833.0,...,3058066.0,2711476.0,2338837.0,1857146.0,4426497.0,3188769.0,2974767.0,4204833.0,4927018.0,9058866.0
Feb/18,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,3188769.0,2974767.0,4204833.0,4927018.0,...,3464389.0,2054809.0,2193141.0,1841496.0,3188769.0,2974767.0,4204833.0,4927018.0,9058866.0,8036733.0
Mar/18,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,2974767.0,4204833.0,4927018.0,9058866.0,...,3013885.0,1961403.0,1761365.0,2005701.0,2974767.0,4204833.0,4927018.0,9058866.0,8036733.0,13186744.0
Abr/18,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,4204833.0,4927018.0,9058866.0,8036733.0,...,3164837.0,1897863.0,1730889.0,1826873.0,4204833.0,4927018.0,9058866.0,8036733.0,13186744.0,739023.0
May/18,1724137.0,1449678.0,1448556.0,1284669.0,1572633.0,1461600.0,4927018.0,9058866.0,8036733.0,13186744.0,...,3144177.0,1898616.0,1565735.0,1843624.0,4927018.0,9058866.0,8036733.0,13186744.0,739023.0,2108616.0


In [212]:
results_p7["adaboost_tree_regressor"]

Type,Pred,Pred,Pred,Pred,Pred,Pred,Real,Real,Real,Real,Real,Real
Horizon,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6
Ago/17,1318255.0,1495317.0,1569340.0,3047421.0,2767615.0,3663372.0,866668.0,1182016.0,1300326.0,3265438.0,10891638.0,4426497.0
Sep/17,1495317.0,1569340.0,3047421.0,2487279.0,2487279.0,3265438.0,1182016.0,1300326.0,3265438.0,10891638.0,4426497.0,3188769.0
Oct/17,2370005.0,2715935.0,2715935.0,2487279.0,2767615.0,3265438.0,1300326.0,3265438.0,10891638.0,4426497.0,3188769.0,2974767.0
Nov/17,2767615.0,2556898.0,2487279.0,2715935.0,2487279.0,2370005.0,3265438.0,10891638.0,4426497.0,3188769.0,2974767.0,4204833.0
Dic/17,2556898.0,2556898.0,2767615.0,2450521.0,2450521.0,1310932.0,10891638.0,4426497.0,3188769.0,2974767.0,4204833.0,4927018.0
Ene/18,2487279.0,2767615.0,2487279.0,2429763.0,1569340.0,1182016.0,4426497.0,3188769.0,2974767.0,4204833.0,4927018.0,9058866.0
Feb/18,2767615.0,2767615.0,2450521.0,1495317.0,1300326.0,1303861.0,3188769.0,2974767.0,4204833.0,4927018.0,9058866.0,8036733.0
Mar/18,3047421.0,2450521.0,2240413.0,1300326.0,1300326.0,1300326.0,2974767.0,4204833.0,4927018.0,9058866.0,8036733.0,13186744.0
Abr/18,2458521.0,2240413.0,1459708.0,1300326.0,1325578.0,1310932.0,4204833.0,4927018.0,9058866.0,8036733.0,13186744.0,739023.0
May/18,2370005.0,1569340.0,1310932.0,2240413.0,1293670.0,1325578.0,4927018.0,9058866.0,8036733.0,13186744.0,739023.0,2108616.0


In [210]:
results_p7["random_forest"]

Type,Pred,Pred,Pred,Pred,Pred,Pred,Real,Real,Real,Real,Real,Real
Horizon,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6,PESO_7_ahead1,PESO_7_ahead2,PESO_7_ahead3,PESO_7_ahead4,PESO_7_ahead5,PESO_7_ahead6
Ago/17,2871721.0,2323600.0,2154212.0,2887837.0,2390239.0,4467761.0,866668.0,1182016.0,1300326.0,3265438.0,10891638.0,4426497.0
Sep/17,2731094.0,2363147.0,2224438.0,2283120.0,2709601.0,4216478.0,1182016.0,1300326.0,3265438.0,10891638.0,4426497.0,3188769.0
Oct/17,3369098.0,2379092.0,2121660.0,2696978.0,2552853.0,4209863.0,1300326.0,3265438.0,10891638.0,4426497.0,3188769.0,2974767.0
Nov/17,2571847.0,2537592.0,2685582.0,2947895.0,2566605.0,2428614.0,3265438.0,10891638.0,4426497.0,3188769.0,2974767.0,4204833.0
Dic/17,2439312.0,2480874.0,3248451.0,2870912.0,2249967.0,2518275.0,10891638.0,4426497.0,3188769.0,2974767.0,4204833.0,4927018.0
Ene/18,2853320.0,3219084.0,3067899.0,2207741.0,2575393.0,2315695.0,4426497.0,3188769.0,2974767.0,4204833.0,4927018.0,9058866.0
Feb/18,2689336.0,3514933.0,2766087.0,2357693.0,1979541.0,2422002.0,3188769.0,2974767.0,4204833.0,4927018.0,9058866.0,8036733.0
Mar/18,2900830.0,2805964.0,2800471.0,1872804.0,2026133.0,2384861.0,2974767.0,4204833.0,4927018.0,9058866.0,8036733.0,13186744.0
Abr/18,2786272.0,3220011.0,2772291.0,2051086.0,2263457.0,2284025.0,4204833.0,4927018.0,9058866.0,8036733.0,13186744.0,739023.0
May/18,3063699.0,2946198.0,2591194.0,2748274.0,1769189.0,2036073.0,4927018.0,9058866.0,8036733.0,13186744.0,739023.0,2108616.0
