# Importações

In [11]:
import glob
import re
from pathlib import Path

import joblib as jb
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Leitura e separação das bases em X e Y

In [8]:
def _split_features_target(frame):
    """Split a frame into (X, y).

    X is `frame` without the 'Date' and 'Sales' columns; y is the 'Sales'
    column. The input frame is left untouched.
    """
    features = frame.drop(columns=['Date', 'Sales'])
    target = frame['Sales']
    return features, target


# The three CSV files are read relative to the notebook's working directory.
train_lvl0 = pd.read_csv('train_lvl0.csv')
train_lvl1 = pd.read_csv('train_lvl1.csv')
val = pd.read_csv("valid.csv")

# Same feature/target split applied to every frame.
X_train0, Y_train0 = _split_features_target(train_lvl0)
X_train1, Y_train1 = _split_features_target(train_lvl1)
X_val, Y_val = _split_features_target(val)

# Carregamento de todas as previsões feitas 
Conforme dito no notebook anterior, para gerar o stacking precisamos de diversidade entre os modelos. Então, aqui iremos carregar todas as previsões feitas pelos modelos para o train1 e o val e transformá-las em um DataFrame de previsões.

In [9]:
# Extension shared by every saved prediction file.
PRED_SUFFIX = ".pkl.z"


def load_prediction_frame(paths):
    """Load saved prediction files into one DataFrame, one column per file.

    Parameters
    ----------
    paths : list of str
        Paths to prediction files ending in ``.pkl.z``.

    Returns
    -------
    pandas.DataFrame
        The loaded predictions concatenated side by side. Each column is
        named after its file, with the directory and the ``.pkl.z`` suffix
        removed.

    Raises
    ------
    FileNotFoundError
        If ``paths`` is empty (otherwise ``pd.concat`` would fail with a
        much less helpful message).
    """
    if not paths:
        raise FileNotFoundError("No prediction files (*.pkl.z) were found.")

    frames = []
    for path in paths:
        # Path(...).name drops the directory part whatever the OS separator
        # is. Only the known suffix is stripped because the model names
        # themselves contain dots (e.g. "svm_rbf_6.516901533306854").
        model_name = Path(path).name[:-len(PRED_SUFFIX)]
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; only load prediction files you produced yourself.
        predictions = jb.load(path)
        frames.append(pd.DataFrame(predictions, columns=[model_name]))

    return pd.concat(frames, axis=1)


# glob gives no ordering guarantee, so sort to make the column order
# reproducible across machines and runs.
preds_train1 = sorted(glob.glob("./preds_train1/*.pkl.z"))
preds_val = sorted(glob.glob("./preds_val/*.pkl.z"))

df_train1 = load_prediction_frame(preds_train1)
df_val = load_prediction_frame(preds_val)

Por mais que pareça um código complexo, apenas pense que as previsões feitas anteriormente foram carregadas e reunidas em um DataFrame, com uma coluna para cada modelo.

In [4]:
# Display the combined train1 prediction frame (one column per loaded prediction file).
df_train1

Unnamed: 0,knn_10_minkowski,knn_11_chebyshev,knn_14_euclidean,knn_16_chebyshev,knn_17_minkowski,knn_18_chebyshev,knn_18_manhattan,knn_19_minkowski,knn_1_minkowski,knn_20_minkowski,...,svm_rbf_6.516901533306854,svm_rbf_8.37717975902004,svm_rbf_8.713863857748523,svm_rbf_9.57583607363516,svm_rbf_9.820982105641626,svm_sigmoid_0.1,svm_sigmoid_3.9053789021977297,svm_sigmoid_4.851774006513068,svm_sigmoid_8.458230910952073,svm_sigmoid_8.487792213962845
0,0.0,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,...,5351.740118,5282.484058,5268.629669,5231.898183,5221.974874,5611.761214,5503.203275,5535.891156,5172.707286,5170.519624
1,0.0,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,...,5348.593133,5276.470186,5261.804533,5223.642213,5213.280032,5612.894864,5713.721651,6023.320924,6717.741809,6721.479542
2,0.0,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,...,5723.016116,5697.708680,5692.501522,5684.970012,5681.777608,5617.463435,5725.137941,5933.551046,6383.014555,6385.593516
3,0.0,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,...,5348.922222,5277.034199,5262.430511,5224.380275,5214.051340,5612.800796,5697.470085,5986.079942,6600.530271,6603.819594
4,0.0,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,...,5072.731894,4991.480763,4974.728283,4956.430076,4944.337477,5623.346193,6639.796804,8134.680750,13675.317320,13707.664735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,9737.6,9696.272727,9688.928571,9646.4375,9695.117647,9671.277778,9623.000000,9658.736842,10067.0,9622.55,...,5762.115686,5814.087850,5823.247699,5843.351591,5849.775143,5614.881901,5310.534001,4860.335876,2597.852273,2585.384829
1196,9488.0,9614.363636,9560.428571,9498.7500,9462.705882,9535.888889,9420.166667,9420.368421,8580.0,9488.35,...,5770.280539,5816.296464,5822.839181,5839.122828,5844.155303,5618.180077,6053.045616,6619.824223,8259.655016,8269.076663
1197,10127.3,10125.727273,10079.642857,9999.4375,10037.529412,10079.611111,10046.000000,10077.631579,9935.0,10153.85,...,5847.854801,5861.149900,5863.349090,5873.589321,5875.713961,5620.105938,5815.046895,6020.653910,6440.250892,6442.889495
1198,9881.3,9892.272727,10009.928571,9985.6250,9962.882353,9960.222222,9959.055556,10016.052632,9740.0,10009.95,...,5690.473158,5722.068871,5727.522009,5738.195801,5741.948797,5614.222567,5287.324645,4835.643156,2572.397087,2559.869280


# Seleção das melhores combinações de modelos
Uma vez carregados, precisamos avaliar as previsões e encontrar quais delas combinadas geram o melhor resultado. Começamos sem selecionar nenhuma, pois não sabemos qual a melhor. 

Para esse projeto, utilizarei uma Ridge regression por ser um modelo relativamente rápido de treinar. Para cada previsão gerada no passo 2, a Ridge tentará utilizá-la como feature adicional e, naturalmente, as que conseguirem os melhores resultados serão exibidas.

Em suma: estamos procurando as colunas, ou seja, as previsões feitas no notebook anterior, que nos deem o melhor resultado.

In [29]:
# Prediction columns already chosen for the stack.
# NOTE(review): presumably accumulated by hand from earlier runs of this
# cell (the last name printed by each run) - confirm.
selected = [
'lgbm_110_14_0.03248168667854209_215',
'lgbm_361_59_0.05378358562195618_276',
'rf_176_3_48',
'lgbm_24_1_0.03199227519485914_200',
'lgbm_185_96_0.014121042963223253_287',
'svm_rbf_0.6614584754426877',
'svm_rbf_8.713863857748523',
'lgbm_238_80_0.05209570020716538_268',
'lgbm_427_1_0.0001_236',
'rf_139_17_34',
'lgbm_55_48_0.018714601098343324_274']

# One greedy forward-selection pass: each column not yet selected is added,
# alone, to `selected`; a Ridge model is fitted on the train1 predictions and
# scored by RMSE on the validation predictions. A line is printed every time
# a candidate beats the best score seen so far, so the last printed line is
# the best candidate of this pass.
#
# Start from +inf rather than a hand-picked threshold, so the pass still
# reports a best candidate when every score is large.
best_score = np.inf
best_col = None  # best candidate of this pass, kept for programmatic use
already_selected = set(selected)
for col in df_train1.columns:
    if col in already_selected:
        continue

    features = selected + [col]
    # Column-list indexing already yields new frames; no extra copy needed.
    Xtr = df_train1[features]
    Xval = df_val[features]

    mdl = Ridge(alpha=1.)
    mdl.fit(Xtr, Y_train1)

    p = mdl.predict(Xval)
    c = np.sqrt(mean_squared_error(Y_val, p))  # RMSE on the validation set

    if c < best_score:
        print(col, c)
        best_score = c
        best_col = col
 

knn_10_minkowski 390.129382996949
knn_11_chebyshev 388.93511643775764
knn_1_minkowski 387.42093013859164
lgbm_113_39_0.09026958770538755_245 385.7641282711954
lgbm_2_1_0.1_200 385.76223579443257
lgbm_313_39_0.029823707193792785_206 385.76125275958765
svm_rbf_4.828884661481364 385.70183145688515
svm_rbf_6.516901533306854 385.59638907040056
svm_rbf_8.37717975902004 385.56147059827964


Perceba que ainda não foi otimizado 100%; porém, como o ganho estava baixo, decidi parar. Dependendo do contexto, pode-se extrair até o limite.