## Stacking

In [1]:
import pandas as pd, numpy as np, os
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [24]:
# Directory that holds the saved prediction arrays.
PATH = '../preds/'
FILES = os.listdir(PATH)

# Keep only files whose name contains both 'oof' and 'extra'. Sorting the names
# fixes the order in which the arrays are loaded (and later stacked).
oof_files = np.sort([name for name in FILES if 'extra' in name and 'oof' in name])
oof_preds = [np.load(PATH + name) for name in oof_files]

print(f'We have {len(oof_preds)} OOF files...')
print(oof_files)

We have 4 OOF files...
['model_four_extra_oof.npy' 'model_one_extra_oof.npy'
 'model_three_extra_oof.npy' 'model_two_extra_oof.npy']


## OOF predictions

In [25]:
# Stack the loaded arrays row-wise, then transpose so that each loaded array
# becomes one column of `x` (column order follows `oof_files`).
stacked = np.vstack(oof_preds)
x = stacked.T
x.shape

(9875, 4)

In [26]:
# Quick visual check of the stacked OOF matrix (NumPy truncates the repr).
x

array([[46.2890625 , 49.73144531, 45.5078125 , 42.578125  ],
       [27.734375  , 26.80664062, 27.34375   , 27.734375  ],
       [62.5       , 66.06445312, 61.71875   , 64.0625    ],
       ...,
       [30.46875   , 31.68945312, 30.2734375 , 30.078125  ],
       [30.6640625 , 27.95410156, 28.7109375 , 29.4921875 ],
       [45.1171875 , 41.91894531, 45.8984375 , 43.5546875 ]])

## Statistical properties of OOF predictions

In [27]:
from scipy.stats import skew

# Always derive the statistics from the raw OOF matrix instead of from `x`.
# The previous version read and overwrote `x` in the same cell, so running the
# cell a second time computed statistics of statistics and appended six more
# columns. Rebuilding from `oof_preds` makes the cell safe to re-run.
base_preds = np.vstack(oof_preds).T

# Per-row summaries across the base-model columns, each kept as a column vector.
preds_min = base_preds.min(axis=1).reshape(-1, 1)
preds_max = base_preds.max(axis=1).reshape(-1, 1)
preds_mean = base_preds.mean(axis=1).reshape(-1, 1)
preds_median = np.median(base_preds, axis=1).reshape(-1, 1)
preds_std = base_preds.std(axis=1).reshape(-1, 1)  # NumPy default ddof=0
preds_skew = skew(base_preds, axis=1).reshape(-1, 1)

# Feature matrix = base predictions followed by the six row statistics.
x = np.concatenate(
    [base_preds, preds_min, preds_max, preds_mean, preds_median, preds_std, preds_skew],
    axis=1,
)

In [28]:
# Sanity check: six statistic columns should have been added to the OOF columns.
x.shape

(9875, 10)

## Meta-data provided by organizers

In [29]:
# Metadata columns taken from the training CSV.
META_COLUMNS = [
    "Subject Focus", "Eyes", "Face", "Near", "Action", "Accessory",
    "Group", "Collage", "Human", "Occlusion", "Info", "Blur",
]

df = pd.read_csv('../data/train_5folds.csv')

# Collect targets and metadata fold by fold (0..4), keeping the CSV row order
# inside each fold.
# NOTE(review): presumably this matches the row order of the saved OOF arrays —
# verify against the code that wrote them.
y_true, other_feats = [], []
for fold in range(5):
    fold_rows = df[df.kfold == fold]
    y_true.extend(fold_rows.Pawpularity.values.tolist())
    other_feats.extend(fold_rows[META_COLUMNS].values)

y_true = np.array(y_true)
x_feats = np.array(other_feats)

In [30]:
# Sanity check: one row per training sample, one column per metadata field.
x_feats.shape

(9875, 12)

In [31]:
# Quick visual check of the metadata matrix (NumPy truncates the repr).
x_feats

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [32]:
# Append the metadata columns to the right of the current feature matrix.
# NOTE: this cell overwrites `x` with a wider copy of itself, so running it
# twice appends the metadata twice — re-run from the top instead.
x = np.concatenate((x, x_feats), axis=1)
x.shape

(9875, 22)

## More image meta-data

In [33]:
# Disabled experiment: per-image features (the two values of `Image.size`,
# their ratio, and the mean pixel value), gathered fold by fold.
# NOTE(review): enabling this (plus the concatenate cell below) would add 4
# columns to `x`; the 26-entry outputs saved further down presumably come from
# an earlier run with it enabled — confirm, then Restart & Run All.
# from PIL import Image


# img_feats = [] 
# for i in range(5):
#     image_fpaths = [f'../data/train/{i}.jpg' for i in df[df.kfold == i].Id.tolist()]
#     img_shape = np.array([np.array(Image.open(img).size) for img in image_fpaths])
#     img_aspect_ratio = (img_shape[:, 0] / img_shape[:, 1]).reshape(-1, 1)
#     img_mean = np.array([np.array(Image.open(img)).mean() for img in image_fpaths]).reshape(-1, 1)
#     img_feats.extend(np.hstack([img_shape, img_aspect_ratio, img_mean]))

# img_feats = np.array(img_feats)

In [34]:
# img_feats

In [35]:
# x = np.concatenate([x, img_feats], axis=1)
# x.shape

## Train meta-model

In [36]:
from sklearn.linear_model import LassoCV

In [37]:
# Lasso meta-model; the regularisation strength is selected by 5-fold CV.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this line raises TypeError on current releases. Replacing it (e.g.
# with a scaling pipeline) changes the fitted coefficients and the type of
# `lr`, so pin the scikit-learn version or migrate deliberately.
lr = LassoCV(fit_intercept=True, normalize=True, cv=5, random_state=1010)

In [38]:
# Fit the meta-model on the full feature matrix against the target values.
lr.fit(x, y_true)

LassoCV(cv=5, normalize=True, random_state=1010)

In [39]:
# Selected regularisation strength and the per-column coefficients; a zero
# coefficient means the Lasso dropped that column.
lr.alpha_, lr.coef_

(0.003196425199405967,
 array([ 0.07028167,  0.6474719 ,  0.16458434,  0.        ,  0.        ,
         0.        ,  0.        ,  0.17509676,  0.        , -0.        ,
        -0.        ,  0.21162696,  0.        ,  0.        , -0.        ,
        -0.        , -0.        ,  0.        ,  0.        ,  0.        ,
        -0.        , -0.90937878]))

In [40]:
# Predict on the same matrix the model was fitted on, so these are in-sample
# predictions of the meta-model, not out-of-fold ones.
y_pred = lr.predict(x)

In [19]:
# RMSE of the meta-model predictions.
# The square root is taken explicitly instead of passing `squared=False`,
# which newer scikit-learn releases deprecated and then removed; for a 1-D
# target both forms give the same value.
# NOTE: this is an in-sample score when `y_pred` comes from predicting on the
# matrix `lr` was fitted on, so it is optimistic next to the base models' OOF
# RMSEs.
np.sqrt(mean_squared_error(y_true, y_pred))

17.29402381421835

In [20]:
# Score every column of `x` on its own against the target.
# The square root is taken explicitly instead of passing `squared=False`,
# which newer scikit-learn releases deprecated and then removed; for a 1-D
# target both forms give the same value.
# NOTE: the loop covers all columns of `x`, so the row statistics and metadata
# columns are also printed under the "Model" label.
# NOTE(review): the saved output of this cell lists 26 columns while `x` has 22
# after the metadata cell, so the output is stale — Restart & Run All.
all = []  # NOTE(review): shadows the builtin `all`; kept because the next cell reads it
for col in range(x.shape[1]):
    rmse = np.sqrt(mean_squared_error(y_true, x[:, col]))
    all.append(rmse)
    print('Model %i has OOF RMSE = %.4f'%(col, rmse))

# `m` starts with the index of the lowest-RMSE column and `w` starts empty.
# NOTE(review): presumably seeds for a later selection step not shown here.
m = [np.argmin(all)]
w = []

Model 0 has OOF RMSE = 17.5856
Model 1 has OOF RMSE = 17.8464
Model 2 has OOF RMSE = 17.3695
Model 3 has OOF RMSE = 17.5400
Model 4 has OOF RMSE = 17.5203
Model 5 has OOF RMSE = 17.7900
Model 6 has OOF RMSE = 17.5156
Model 7 has OOF RMSE = 17.6844
Model 8 has OOF RMSE = 17.8071
Model 9 has OOF RMSE = 18.0992
Model 10 has OOF RMSE = 17.4113
Model 11 has OOF RMSE = 17.4124
Model 12 has OOF RMSE = 41.1028
Model 13 has OOF RMSE = 43.3820
Model 14 has OOF RMSE = 43.2540
Model 15 has OOF RMSE = 42.6031
Model 16 has OOF RMSE = 42.4844
Model 17 has OOF RMSE = 42.5227
Model 18 has OOF RMSE = 43.2686
Model 19 has OOF RMSE = 43.2167
Model 20 has OOF RMSE = 43.1621
Model 21 has OOF RMSE = 43.2344
Model 22 has OOF RMSE = 43.1322
Model 23 has OOF RMSE = 43.1276
Model 24 has OOF RMSE = 43.2248
Model 25 has OOF RMSE = 43.2187


In [21]:
# Index of the column with the lowest individual RMSE (`all` is the RMSE list
# built in the previous cell, not the builtin).
np.argmin(all)

2

In [22]:
import pickle
# Serialize the fitted meta-model to a bytes object in memory; nothing is
# written to disk here.
s = pickle.dumps(lr)

In [23]:
# Round-trip check: rebuild the model from the bytes produced above and show
# its coefficients. Only unpickle data you created yourself — `pickle.loads`
# can execute arbitrary code.
clf2 = pickle.loads(s)
clf2.coef_

array([ 0.11437708,  0.01340851,  0.70717551,  0.        ,  0.21885051,
        0.        ,  0.        ,  0.        ,  0.00258076,  0.        ,
        0.        ,  0.        ,  0.        , -0.        , -0.        ,
        0.21417893,  0.        ,  0.        , -0.        , -0.        ,
       -0.        ,  0.        ,  0.        ,  0.        , -0.        ,
       -0.89852717])