In [1]:
import numpy as np
import pandas as pd

import gc
from sklearn.model_selection import KFold
from cuml.metrics import mean_absolute_error, mean_squared_error

import seaborn as sns

In [2]:
OUTPUT_DICT = ''

ID = 'Id'
TARGET_COLS = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
SEED = 2020

N_FOLD = 5

In [3]:
base_path = '/media/hiroki/share/kaggle_data/trends-assessment-prediction/'
train = pd.read_csv(base_path+'train_scores.csv', dtype={'Id':str})\
            .dropna().reset_index(drop=True) # to make things easy
reveal_ID = pd.read_csv(base_path+'reveal_ID_site2.csv', dtype={'Id':str})
ICN_numbers = pd.read_csv(base_path+'ICN_numbers.csv')
loading = pd.read_csv(base_path+'loading.csv', dtype={'Id':str})
fnc = pd.read_csv(base_path+'fnc.csv', dtype={'Id':str})
sample_submission = pd.read_csv(base_path+'sample_submission.csv', dtype={'Id':str})

In [4]:
fnc_features, loading_features = list(fnc.columns[1:]), list(loading.columns[1:])

In [5]:
sample_submission['ID_num'] = sample_submission[ID].apply(lambda x: int(x.split('_')[0]))
test = pd.DataFrame({ID: sample_submission['ID_num'].unique().astype(str)})
del sample_submission['ID_num']; gc.collect()
test.head()

Unnamed: 0,Id
0,10003
1,10006
2,10010
3,10011
4,10012


In [6]:
# merge
train = train.merge(loading, on=ID, how='left')
train = train.merge(fnc, on=ID, how='left')
train

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.325130,51.427998,0.006070,0.014466,0.004136,0.000658,...,-0.149279,0.552841,0.131046,0.335446,0.394867,-0.042853,0.124627,-0.060712,0.515964,0.290488
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.214216,-0.039792,0.143014,-0.189962,0.498373,0.444231,0.592438,0.028649,0.705524,0.248327
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.130339,0.309540,0.141469,0.030853,0.344394,0.214097,0.317556,0.012435,0.665937,0.081358
3,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.012160,-0.000920,...,-0.150218,0.408926,0.072004,0.157582,0.532046,0.355448,0.462675,0.161005,0.703679,0.293607
4,10008,35.326582,15.769168,65.782269,44.643805,50.448485,0.007745,0.009748,0.009356,-0.004219,...,-0.080562,0.005339,-0.386757,0.020546,0.518383,0.408071,0.465851,0.112785,0.574596,0.178531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5429,21746,14.257265,21.358872,61.165998,51.778483,54.640179,-0.001115,0.007108,0.008652,0.003596,...,-0.249481,0.205351,0.012067,0.310750,0.426335,0.193812,0.158720,0.055471,0.568766,0.160516
5430,21747,55.456978,68.169675,29.907995,55.349257,54.019517,0.007263,0.016489,0.012704,0.004357,...,-0.119170,0.201846,-0.008290,0.119828,0.551936,0.598931,0.511816,0.303312,0.704483,0.461588
5431,21750,48.948756,55.114811,60.878271,38.617246,50.679885,0.005996,0.003873,0.012353,0.000242,...,-0.103786,0.375065,0.104857,0.262614,0.502715,0.322353,0.458041,0.343754,0.705207,0.341224
5432,21752,66.532630,59.844808,72.303110,55.458281,46.870235,0.000627,0.011407,0.010957,0.000534,...,0.229712,0.431489,0.039062,0.119474,0.523894,0.445209,0.332011,0.228977,0.560968,0.263504


In [7]:
# merge
test = test.merge(loading, on=ID, how='left')
test = test.merge(fnc, on=ID, how='left')
test

Unnamed: 0,Id,IC_01,IC_07,IC_05,IC_16,IC_26,IC_06,IC_10,IC_09,IC_18,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10003,0.008151,0.014684,0.010444,-0.005293,-0.002913,0.015042,0.017745,0.003930,-0.008021,...,-0.154941,0.136850,-0.022361,0.137625,0.677972,0.409412,0.563892,0.438684,0.618204,0.284474
1,10006,0.000334,0.005311,0.010053,0.006920,-0.000065,0.015310,0.016543,0.004794,0.003982,...,-0.053606,0.240957,0.270419,0.367692,0.354501,0.486364,0.416908,0.285274,0.693490,0.448526
2,10010,0.007103,0.006144,0.009770,-0.002884,-0.001346,0.015651,0.011613,-0.003291,0.013423,...,-0.244332,0.272077,0.193523,0.192254,0.563982,0.124482,0.488926,0.083368,0.774299,0.129327
3,10011,0.004362,0.010240,0.010167,0.004492,-0.001623,0.017381,0.014680,0.007453,0.008786,...,-0.099726,0.557121,0.042626,0.179456,0.416546,0.445402,0.436909,0.165182,0.591561,0.306678
4,10012,-0.007521,-0.003918,0.008434,-0.001145,0.002017,0.015065,0.019616,0.004140,-0.003744,...,-0.025230,0.203298,0.173427,0.046047,0.561599,0.418268,0.609517,0.218285,0.790285,0.301010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872,21745,0.005406,0.006275,0.012252,0.003518,0.001400,0.015054,0.015373,0.001532,0.003546,...,0.179080,0.580813,0.202241,0.254870,0.458581,0.434638,0.587167,0.009854,0.472956,0.342085
5873,21748,0.004240,0.009213,0.010981,0.000443,-0.003072,0.010702,0.014673,0.005523,0.005780,...,-0.106345,0.234340,0.138985,0.343382,0.708744,0.312812,0.536501,0.214803,0.849512,0.204741
5874,21749,0.004783,0.017910,0.012128,-0.005683,-0.011613,0.017000,0.007230,0.001315,0.008788,...,-0.165575,0.170154,-0.029638,0.383761,0.398305,0.578621,0.357127,0.009479,0.609545,0.317230
5875,21751,0.003835,0.015067,0.015428,-0.002030,0.001205,0.012396,0.011026,-0.001491,0.005310,...,-0.087604,0.131902,-0.047932,0.022317,0.583869,0.596734,0.515209,0.379589,0.568422,0.439016


In [8]:
def metric(y_true, y_pred):
    return np.mean(np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0))

In [9]:
pd.concat([pd.DataFrame(pred_train_targets_svr['age']), pd.DataFrame(pred_train_targets_svr['domain1_var1'])])# Giving less importance to FNC features since they are easier to overfit due to high dimensionality.
FNC_SCALE = 1/600

train[fnc_features] *= FNC_SCALE
test[fnc_features] *= FNC_SCALE

In [11]:
# 学習データに対する「目的変数を知らない」予測値と、テストデータに対する予測値を返す関数
def predict_cv(train_x, train_y, test_x, model, target_name):
    preds = []
    preds_test = []
    va_idxes = []
    
    score = []
    mae = []
    rmse = []

    # shuffleしなくても良い
    kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx].values, train_x.iloc[va_idx].values
        tr_y, va_y = train_y.iloc[tr_idx].values, train_y.iloc[va_idx].values
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
        
        score.append(metric(va_y, pred))
        mae.append(mean_absolute_error(va_y, pred))
        rmse.append(np.sqrt(mean_squared_error(va_y, pred)))
        
    score_cv = np.array(score).mean()
    mae_cv = np.array(mae).mean()
    rmse_cv = np.array(rmse).mean()
    print("{0}_score:{1}".format(target_name, np.round(score_cv, 8)))
    print("{0}_mae:{1}".format(target_name, np.array(mae_cv).mean()))
    print("{0}_rmse:{1}".format(target_name, np.array(rmse_cv).mean()))
    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test, score_cv

# 1st layer

## SVR

In [13]:
import cudf
import cupy as cp
from cuml import SVR

In [14]:
pred_train_targets_svr = {}
pred_test_targets_svr = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, c, w in [("age", 60, 0.3),
                     ("domain1_var1", 12, 0.175),
                     ("domain1_var2", 8, 0.175),
                     ("domain2_var1", 9, 0.175),
                     ("domain2_var2", 12, 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    test_x = test.drop(ID, axis=1)
    svr = SVR(C=c, cache_size=3000.0)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, svr, target)
    overal_score += w*score_cv
    pred_train_targets_svr[target] = pred_train
    pred_test_targets_svr[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:5
-----age-----
age_score:0.14590207
age_mae:7.2390394318143
age_rmse:9.175141152229084
-----domain1_var1-----
domain1_var1_score:0.15164345
domain1_var1_mae:7.805857038680881
domain1_var1_rmse:9.727022448525
-----domain1_var2-----
domain1_var2_score:0.15156137
domain1_var2_mae:8.979138175832578
domain1_var2_rmse:11.444830122148222
-----domain2_var1-----
domain2_var1_score:0.18084712
domain2_var1_mae:8.543474387970898
domain2_var1_rmse:10.867463713297898
-----domain2_var2-----
domain2_var2_score:0.17560892
domain2_var2_mae:9.11649856732418
domain2_var2_rmse:11.709338197245064
--------------------------------------------
Overal score: 0.15921127


In [15]:
pred_train_targets_df = pd.DataFrame(pred_train_targets_svr)

In [16]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.756591,51.684566,60.034987,47.794895,52.699956
std,9.616155,3.150787,1.45069,2.715417,2.264172
min,19.59714,41.813037,54.773875,39.559544,44.934667
25%,42.853693,49.502985,59.066519,45.913357,51.205777
50%,49.421178,51.622424,60.050923,47.787443,52.624013
75%,56.5505,53.825189,60.991181,49.680695,54.187764
max,86.278869,65.535914,65.248721,56.192658,61.137384


In [17]:
pred_test_targets_df = pd.DataFrame(pred_test_targets_svr)

In [18]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.918755,51.879521,60.014677,47.550219,52.863188
std,9.856628,3.145006,1.322402,2.71733,2.170802
min,18.687302,41.814341,55.278092,37.703003,44.988479
25%,41.720555,49.630928,59.108011,45.636272,51.400837
50%,48.539336,51.847639,60.014374,47.581931,52.820171
75%,55.92218,53.995373,60.897448,49.483745,54.325552
max,80.51812,63.114481,65.785272,56.910917,62.418266


In [19]:
pred_test_targets_df

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,50.411353,50.348111,61.207718,47.959141,57.738043
1,62.459979,54.689190,59.781110,50.131558,51.801375
2,40.445822,50.390250,60.405882,45.916636,53.264350
3,49.077023,52.706590,60.247071,47.914782,51.232910
4,52.209707,55.269716,57.576883,45.619566,57.853458
...,...,...,...,...,...
5872,40.410701,50.862752,58.190021,46.411171,54.031084
5873,56.466351,57.728526,60.282598,50.195867,53.129349
5874,41.175712,47.540070,60.155436,44.928243,51.782127
5875,40.757747,51.188919,58.664907,45.871041,57.121445


## baysian ridge

In [20]:
from sklearn.linear_model import BayesianRidge

In [21]:
pred_train_targets_bayes_ridge = {}
pred_test_targets_bayes_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    bayes_ridge = BayesianRidge(n_iter = 3000)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, bayes_ridge, target)
    overal_score += w*score_cv
    pred_train_targets_bayes_ridge[target] = pred_train
    pred_test_targets_bayes_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:5
-----age-----
age_score:0.14526516
age_mae:7.207417841669853
age_rmse:9.126005818504765
-----domain1_var1-----
domain1_var1_score:0.15116258
domain1_var1_mae:7.781071142202071
domain1_var1_rmse:9.690126804955433
-----domain1_var2-----
domain1_var2_score:0.15119996
domain1_var2_mae:8.957814390743689
domain1_var2_rmse:11.371331159837206
-----domain2_var1-----
domain2_var1_score:0.18118111
domain2_var1_mae:8.559304565826103
domain2_var1_rmse:10.83695071377573
-----domain2_var2-----
domain2_var2_score:0.17586259
domain2_var2_mae:9.129777893423746
domain2_var2_rmse:11.671634200924942
--------------------------------------------
Overal score: 0.15897564


In [22]:
pred_train_targets_df = pd.DataFrame(pred_train_targets_bayes_ridge)

In [23]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.602764,51.478672,59.24572,47.260059,51.920395
std,9.884848,3.191612,0.66447,2.298371,1.753767
min,20.137104,39.739576,56.38701,40.158413,42.086996
25%,42.519328,49.339751,58.815656,45.651432,50.75878
50%,49.287075,51.514309,59.267038,47.176543,51.888849
75%,56.335995,53.645717,59.686343,48.793224,53.07566
max,81.873726,62.403192,61.481104,57.66163,58.766078


In [24]:
pred_test_targets_df = pd.DataFrame(pred_test_targets_bayes_ridge)

In [25]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.932814,51.576092,59.262143,47.000256,52.043891
std,10.186062,3.250484,0.628288,2.322438,1.72459
min,16.206375,40.558533,56.604829,38.692719,44.712542
25%,41.559869,49.40801,58.847564,45.39486,50.892149
50%,48.595813,51.631866,59.277452,46.957778,52.013774
75%,56.16743,53.812505,59.692671,48.615446,53.187733
max,84.178173,63.706069,61.64414,55.679888,58.784127


## BaggingRegressor

In [26]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge

In [27]:
pred_train_targets_bagging_ridge = {}
pred_test_targets_bagging_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    
    bagging_ridge = BaggingRegressor(Ridge(alpha = 0.0001), n_estimators=30, random_state=42, max_samples=0.3, max_features=0.3)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, bagging_ridge, target)
    overal_score += w*score_cv
    pred_train_targets_bagging_ridge[target] = pred_train
    pred_test_targets_bagging_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:5
-----age-----
age_score:0.15799659
age_mae:7.838922114175061
age_rmse:9.932694176452122
-----domain1_var1-----
domain1_var1_score:0.15228472
domain1_var1_mae:7.83879033835089
domain1_var1_rmse:9.757681943988004
-----domain1_var2-----
domain1_var2_score:0.15133189
domain1_var2_mae:8.965705670033726
domain1_var2_rmse:11.389041157223858
-----domain2_var1-----
domain2_var1_score:0.18158228
domain2_var1_mae:8.578347769915688
domain2_var1_rmse:10.864066488143113
-----domain2_var2-----
domain2_var2_score:0.17591359
domain2_var2_mae:9.132456695822324
domain2_var2_rmse:11.671964149504078
--------------------------------------------
Overal score: 0.16309366


In [28]:
pred_train_targets_df = pd.DataFrame(pred_train_targets_bayes_ridge)

In [29]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.602764,51.478672,59.24572,47.260059,51.920395
std,9.884848,3.191612,0.66447,2.298371,1.753767
min,20.137104,39.739576,56.38701,40.158413,42.086996
25%,42.519328,49.339751,58.815656,45.651432,50.75878
50%,49.287075,51.514309,59.267038,47.176543,51.888849
75%,56.335995,53.645717,59.686343,48.793224,53.07566
max,81.873726,62.403192,61.481104,57.66163,58.766078


In [30]:
pred_test_targets_df = pd.DataFrame(pred_test_targets_bayes_ridge)

In [31]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.932814,51.576092,59.262143,47.000256,52.043891
std,10.186062,3.250484,0.628288,2.322438,1.72459
min,16.206375,40.558533,56.604829,38.692719,44.712542
25%,41.559869,49.40801,58.847564,45.39486,50.892149
50%,48.595813,51.631866,59.277452,46.957778,52.013774
75%,56.16743,53.812505,59.692671,48.615446,53.187733
max,84.178173,63.706069,61.64414,55.679888,58.784127


# 2nd layer

In [32]:
train_x_2 = pd.DataFrame({'pred_svr_age':pred_train_targets_svr['age'],
                         'pred_svr_d1v1':pred_train_targets_svr['domain1_var1'],
                         'pred_svr_d1v2':pred_train_targets_svr['domain1_var2'],
                         'pred_svr_d2v1':pred_train_targets_svr['domain2_var1'],
                         'pred_svr_d2v2':pred_train_targets_svr['domain2_var2'],
                         
                         'pred_bayesRidge_age':pred_train_targets_bayes_ridge['age'],
                         'pred_bayesRidge_d1v1':pred_train_targets_bayes_ridge['domain1_var1'],
                         'pred_bayesRidge_d1v2':pred_train_targets_bayes_ridge['domain1_var2'],
                         'pred_bayesRidge_d2v1':pred_train_targets_bayes_ridge['domain2_var1'],
                         'pred_bayesRidge_d2v2':pred_train_targets_bayes_ridge['domain2_var2'],
                          
                         'pred_baggingRidge_age':pred_train_targets_bagging_ridge['age'],
                         'pred_baggingRidge_d1v1':pred_train_targets_bagging_ridge['domain1_var1'],
                         'pred_baggingRidge_d1v2':pred_train_targets_bagging_ridge['domain1_var2'],
                         'pred_baggingRidge_d2v1':pred_train_targets_bagging_ridge['domain2_var1'],
                         'pred_baggingRidge_d2v2':pred_train_targets_bagging_ridge['domain2_var2']
                        })

In [33]:
test_x_2 = pd.DataFrame({'pred_svr_age':pred_test_targets_svr['age'],
                         'pred_svr_d1v1':pred_test_targets_svr['domain1_var1'],
                         'pred_svr_d1v2':pred_test_targets_svr['domain1_var2'],
                         'pred_svr_d2v1':pred_test_targets_svr['domain2_var1'],
                         'pred_svr_d2v2':pred_test_targets_svr['domain2_var2'],
                         
                         'pred_bayesRidge_age':pred_test_targets_bayes_ridge['age'],
                         'pred_bayesRidge_d1v1':pred_test_targets_bayes_ridge['domain1_var1'],
                         'pred_bayesRidge_d1v2':pred_test_targets_bayes_ridge['domain1_var2'],
                         'pred_bayesRidge_d2v1':pred_test_targets_bayes_ridge['domain2_var1'],
                         'pred_bayesRidge_d2v2':pred_test_targets_bayes_ridge['domain2_var2'],
                         
                         'pred_baggingRidge_age':pred_test_targets_bagging_ridge['age'],
                         'pred_baggingRidge_d1v1':pred_test_targets_bagging_ridge['domain1_var1'],
                         'pred_baggingRidge_d1v2':pred_test_targets_bagging_ridge['domain1_var2'],
                         'pred_baggingRidge_d2v1':pred_test_targets_bagging_ridge['domain2_var1'],
                         'pred_baggingRidge_d2v2':pred_test_targets_bagging_ridge['domain2_var2']
                        })

In [34]:
layer2_pred_train_targets_bayes_ridge = {}
layer2_pred_test_targets_bayes_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    bayes_ridge = BayesianRidge(n_iter = 3000)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x_2, train_y, test_x_2, bayes_ridge, target)
    overal_score += w*score_cv
    layer2_pred_train_targets_bayes_ridge[target] = pred_train
    layer2_pred_test_targets_bayes_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:5
-----age-----
age_score:0.14510632
age_mae:7.199566962064249
age_rmse:9.1160619555286
-----domain1_var1-----
domain1_var1_score:0.151129
domain1_var1_mae:7.779299505199782
domain1_var1_rmse:9.691431646154399
-----domain1_var2-----
domain1_var2_score:0.15121437
domain1_var2_mae:8.958656702784129
domain1_var2_rmse:11.369383538529679
-----domain2_var1-----
domain2_var1_score:0.18057214
domain2_var1_mae:8.530607215385938
domain2_var1_rmse:10.815959374550518
-----domain2_var2-----
domain2_var2_score:0.17556317
domain2_var2_mae:9.114211764701796
domain2_var2_rmse:11.661130446762595
--------------------------------------------
Overal score: 0.15876566


In [35]:
pred_train_targets_df = pd.DataFrame(layer2_pred_train_targets_bayes_ridge)
pred_test_targets_df = pd.DataFrame(layer2_pred_test_targets_bayes_ridge)

In [36]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.612167,51.465311,59.244271,47.24532,51.912713
std,9.894449,3.174988,0.691857,2.410307,1.776745
min,19.58953,39.6973,55.611006,39.458787,45.569581
25%,42.485356,49.283749,58.806034,45.568056,50.707155
50%,49.292493,51.428648,59.241847,47.197768,51.871526
75%,56.437534,53.650631,59.687501,48.922274,53.069279
max,84.043153,62.740266,62.213361,55.509986,58.466418


In [37]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.849216,51.435882,59.25254,46.996384,51.975751
std,10.158112,3.221579,0.647048,2.438233,1.719815
min,17.037788,40.959759,56.207286,38.349536,45.996452
25%,41.482585,49.191905,58.825313,45.303527,50.761723
50%,48.573732,51.447586,59.260455,46.964382,51.92939
75%,56.035993,53.660597,59.694892,48.72937,53.148141
max,81.888253,63.043876,61.476976,55.462391,58.508493


# Submit

In [38]:
test_df = pd.concat([test['Id'], pred_test_targets_df], axis=1)

In [39]:
test_df

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10003,54.379942,50.805333,58.642268,48.588047,54.355870
1,10006,62.070426,54.788949,58.719483,49.343097,51.817882
2,10010,38.303167,48.857278,59.744070,44.429201,52.580145
3,10011,49.931340,51.740729,59.654704,48.077404,50.749302
4,10012,52.571061,54.887477,56.910900,45.601957,55.877979
...,...,...,...,...,...,...
5872,21745,40.251730,50.564149,58.835991,44.809557,53.447342
5873,21748,54.532708,55.896991,59.038907,46.567235,51.824052
5874,21749,39.958163,47.137177,59.790544,45.549204,52.674112
5875,21751,40.228197,50.260018,58.743895,44.212546,54.906710


In [40]:
test_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.849216,51.435882,59.25254,46.996384,51.975751
std,10.158112,3.221579,0.647048,2.438233,1.719815
min,17.037788,40.959759,56.207286,38.349536,45.996452
25%,41.482585,49.191905,58.825313,45.303527,50.761723
50%,48.573732,51.447586,59.260455,46.964382,51.92939
75%,56.035993,53.660597,59.694892,48.72937,53.148141
max,81.888253,63.043876,61.476976,55.462391,58.508493


In [41]:
sub_df = pd.melt(test_df[["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]], id_vars=["Id"], value_name="Predicted")
sub_df["Id"] = sub_df["Id"].astype("str") + "_" +  sub_df["variable"].astype("str")

sub_df = sub_df.drop("variable", axis=1).sort_values("Id")
assert sub_df.shape[0] == test_df.shape[0]*5
sub_df.head(10)

Unnamed: 0,Id,Predicted
0,10003_age,54.379942
5877,10003_domain1_var1,50.805333
11754,10003_domain1_var2,58.642268
17631,10003_domain2_var1,48.588047
23508,10003_domain2_var2,54.35587
1,10006_age,62.070426
5878,10006_domain1_var1,54.788949
11755,10006_domain1_var2,58.719483
17632,10006_domain2_var1,49.343097
23509,10006_domain2_var2,51.817882


In [None]:
sub_df.to_csv("submission_15fold_stacking.csv", index=False)