## Stacking

In [1]:
import pandas as pd, numpy as np, os
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Directory holding the saved prediction arrays.
PATH = '../preds/'
FILES = os.listdir(PATH)

# Keep only the out-of-fold dumps; sort by name so the column order of the
# stacked matrix does not depend on the order os.listdir happens to return.
oof_files = np.sort([name for name in FILES if 'oof' in name])
oof_preds = [np.load(f'{PATH}{name}') for name in oof_files]

print(f'We have {len(oof_preds)} OOF files...')
print(oof_files)

We have 7 OOF files...
['model_four_extra_oof.npy' 'model_four_oof.npy' 'model_one_extra_oof.npy'
 'model_one_oof.npy' 'model_three_oof.npy' 'model_two_extra_oof.npy'
 'model_two_oof.npy']


In [3]:
# Stack the per-file prediction arrays as rows, then flip so each file
# becomes one column of the matrix.
x = np.transpose(np.vstack(oof_preds))
x.shape

(9875, 7)

In [4]:
# Inspect the stacked prediction matrix (one column per OOF file).
x

array([[46.484375  , 34.5703125 , 48.6328125 , ..., 44.140625  ,
        42.96875   , 47.0703125 ],
       [28.90625   , 26.953125  , 27.27050781, ..., 25.1953125 ,
        29.1015625 , 26.171875  ],
       [63.28125   , 66.796875  , 65.234375  , ..., 59.765625  ,
        64.0625    , 60.9375    ],
       ...,
       [31.25      , 33.203125  , 31.71386719, ..., 33.3984375 ,
        30.859375  , 29.8828125 ],
       [31.25      , 33.984375  , 29.00390625, ..., 28.7109375 ,
        30.2734375 , 29.8828125 ],
       [46.6796875 , 53.515625  , 42.91992188, ..., 38.4765625 ,
        44.7265625 , 35.9375    ]])

In [5]:
# Rebuild the target vector and the metadata feature matrix in fold order:
# all rows of fold 0 first, then fold 1, and so on up to fold 4.
# NOTE(review): presumably this matches the row order of the OOF arrays loaded
# earlier — confirm against the code that wrote them.
df = pd.read_csv('../data/train_5folds.csv')
meta_cols = [
    "Subject Focus", "Eyes", "Face", "Near", "Action", "Accessory",
    "Group", "Collage", "Human", "Occlusion", "Info", "Blur",
]
y_true = []
other_feats = []
for i in range(5):
    # Select the fold once and reuse it for both target and features.
    fold_rows = df[df.kfold == i]
    y_true.extend(fold_rows.Pawpularity.values.tolist())
    other_feats.extend(fold_rows[meta_cols].values)
y_true = np.array(y_true)
x_feats = np.array(other_feats)

In [6]:
# Sanity check: rows should match the OOF matrix, columns the 12 metadata fields selected above.
x_feats.shape

(9875, 12)

In [7]:
# Peek at the raw metadata feature matrix.
x_feats

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [8]:
# Append the metadata columns to the right of the OOF prediction columns.
# The prediction matrix is rebuilt from `oof_preds` instead of being read back
# from `x`, so re-running this cell yields the same result rather than
# appending another copy of the metadata columns each time.
x = np.concatenate([np.vstack(oof_preds).T, x_feats], axis=1)

In [9]:
from sklearn.linear_model import LassoCV

In [10]:
# Meta-learner for the stack: L1-regularized linear model, alpha chosen by 5-fold CV.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# line raises TypeError on current releases. The recorded output implies an older version;
# either pin that version or migrate to explicit feature scaling (which will change the
# fitted alpha and coefficients).
lr = LassoCV(fit_intercept=True, normalize=True, cv=5, random_state=1010)

In [11]:
# Fit the stacker on every row of x (OOF predictions plus metadata columns).
lr.fit(X=x, y=y_true)

LassoCV(cv=5, normalize=True, random_state=1010)

In [12]:
# Selected regularization strength and one weight per column of x
# (a zero weight means Lasso dropped that column).
lr.alpha_, lr.coef_

(0.003197738558156792,
 array([ 0.24601791,  0.        ,  0.77225827,  0.03290889,  0.01687018,
         0.00907935,  0.        , -0.        ,  0.22275635,  0.        ,
         0.        , -0.        , -0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , -0.85592442]))

In [13]:
# NOTE: in-sample predictions — `lr` was fit on this same `x`, so any score
# computed from `y_pred` is likely optimistic compared with a held-out estimate.
y_pred = lr.predict(X=x)

In [14]:
# RMSE of the stacked model on the rows it was fit on.
# sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
# releases deprecate and then remove; the value is the same.
np.sqrt(mean_squared_error(y_true, y_pred))

17.29643758677153

In [15]:
# Standalone RMSE of every column of x against the target.
# NOTE: x holds the model OOF predictions followed by the appended metadata
# columns, so the "Model k" label is only meaningful for the prediction columns.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because a later cell reads it.
all = []
for k in range(x.shape[1]):
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # deprecate and then remove; the value is the same.
    rmse = np.sqrt(mean_squared_error(y_true, x[:, k]))
    all.append(rmse)
    print('Model %i has OOF RMSE = %.4f'%(k, rmse))

# `m` starts with the index of the lowest-RMSE column; `w` starts empty.
m = [np.argmin(all)]
w = []

Model 0 has OOF RMSE = 17.6440
Model 1 has OOF RMSE = 17.8464
Model 2 has OOF RMSE = 17.3966
Model 3 has OOF RMSE = 17.5400
Model 4 has OOF RMSE = 17.7900
Model 5 has OOF RMSE = 17.5590
Model 6 has OOF RMSE = 17.6844
Model 7 has OOF RMSE = 43.2540
Model 8 has OOF RMSE = 42.6031
Model 9 has OOF RMSE = 42.4844
Model 10 has OOF RMSE = 42.5227
Model 11 has OOF RMSE = 43.2686
Model 12 has OOF RMSE = 43.2167
Model 13 has OOF RMSE = 43.1621
Model 14 has OOF RMSE = 43.2344
Model 15 has OOF RMSE = 43.1322
Model 16 has OOF RMSE = 43.1276
Model 17 has OOF RMSE = 43.2248
Model 18 has OOF RMSE = 43.2187


In [16]:
# Index of the column with the lowest standalone RMSE.
np.argmin(all)

2

In [17]:
# Serialize the fitted stacker to an in-memory bytes object.
import pickle
s = pickle.dumps(lr)

In [18]:
# Round-trip check: restore the estimator from the bytes produced above and
# display its coefficients for comparison with `lr.coef_`.
clf2 = pickle.loads(s)
clf2.coef_

array([ 0.24601791,  0.        ,  0.77225827,  0.03290889,  0.01687018,
        0.00907935,  0.        , -0.        ,  0.22275635,  0.        ,
        0.        , -0.        , -0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.85592442])