In [1]:
import numpy as np
import pandas as pd

import gc
from sklearn.model_selection import KFold
from cuml.metrics import mean_absolute_error, mean_squared_error

import seaborn as sns

In [2]:
OUTPUT_DICT = ''

ID = 'Id'
TARGET_COLS = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
SEED = 2020

N_FOLD = 5

In [3]:
base_path = '/media/hiroki/share/kaggle_data/trends-assessment-prediction/'
train = pd.read_csv(base_path+'train_scores.csv', dtype={'Id':str})\
            .dropna().reset_index(drop=True) # to make things easy
reveal_ID = pd.read_csv(base_path+'reveal_ID_site2.csv', dtype={'Id':str})
ICN_numbers = pd.read_csv(base_path+'ICN_numbers.csv')
loading = pd.read_csv(base_path+'loading.csv', dtype={'Id':str})
fnc = pd.read_csv(base_path+'fnc.csv', dtype={'Id':str})
sample_submission = pd.read_csv(base_path+'sample_submission.csv', dtype={'Id':str})

In [4]:
fnc_features, loading_features = list(fnc.columns[1:]), list(loading.columns[1:])

In [5]:
sample_submission['ID_num'] = sample_submission[ID].apply(lambda x: int(x.split('_')[0]))
test = pd.DataFrame({ID: sample_submission['ID_num'].unique().astype(str)})
del sample_submission['ID_num']; gc.collect()
test.head()

Unnamed: 0,Id
0,10003
1,10006
2,10010
3,10011
4,10012


In [6]:
# merge
train = train.merge(loading, on=ID, how='left')
train = train.merge(fnc, on=ID, how='left')
train

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.325130,51.427998,0.006070,0.014466,0.004136,0.000658,...,-0.149279,0.552841,0.131046,0.335446,0.394867,-0.042853,0.124627,-0.060712,0.515964,0.290488
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.214216,-0.039792,0.143014,-0.189962,0.498373,0.444231,0.592438,0.028649,0.705524,0.248327
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.130339,0.309540,0.141469,0.030853,0.344394,0.214097,0.317556,0.012435,0.665937,0.081358
3,10005,66.532630,,,52.108977,69.993075,-0.000398,0.006878,0.009051,0.000369,...,-0.139525,0.394932,0.040443,0.428334,0.498837,0.266755,0.227379,0.028984,0.752343,0.087898
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.012160,-0.000920,...,-0.150218,0.408926,0.072004,0.157582,0.532046,0.355448,0.462675,0.161005,0.703679,0.293607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872,21746,14.257265,21.358872,61.165998,51.778483,54.640179,-0.001115,0.007108,0.008652,0.003596,...,-0.249481,0.205351,0.012067,0.310750,0.426335,0.193812,0.158720,0.055471,0.568766,0.160516
5873,21747,55.456978,68.169675,29.907995,55.349257,54.019517,0.007263,0.016489,0.012704,0.004357,...,-0.119170,0.201846,-0.008290,0.119828,0.551936,0.598931,0.511816,0.303312,0.704483,0.461588
5874,21750,48.948756,55.114811,60.878271,38.617246,50.679885,0.005996,0.003873,0.012353,0.000242,...,-0.103786,0.375065,0.104857,0.262614,0.502715,0.322353,0.458041,0.343754,0.705207,0.341224
5875,21752,66.532630,59.844808,72.303110,55.458281,46.870235,0.000627,0.011407,0.010957,0.000534,...,0.229712,0.431489,0.039062,0.119474,0.523894,0.445209,0.332011,0.228977,0.560968,0.263504


In [7]:
# merge
test = test.merge(loading, on=ID, how='left')
test = test.merge(fnc, on=ID, how='left')
test

Unnamed: 0,Id,IC_01,IC_07,IC_05,IC_16,IC_26,IC_06,IC_10,IC_09,IC_18,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10003,0.008151,0.014684,0.010444,-0.005293,-0.002913,0.015042,0.017745,0.003930,-0.008021,...,-0.154941,0.136850,-0.022361,0.137625,0.677972,0.409412,0.563892,0.438684,0.618204,0.284474
1,10006,0.000334,0.005311,0.010053,0.006920,-0.000065,0.015310,0.016543,0.004794,0.003982,...,-0.053606,0.240957,0.270419,0.367692,0.354501,0.486364,0.416908,0.285274,0.693490,0.448526
2,10010,0.007103,0.006144,0.009770,-0.002884,-0.001346,0.015651,0.011613,-0.003291,0.013423,...,-0.244332,0.272077,0.193523,0.192254,0.563982,0.124482,0.488926,0.083368,0.774299,0.129327
3,10011,0.004362,0.010240,0.010167,0.004492,-0.001623,0.017381,0.014680,0.007453,0.008786,...,-0.099726,0.557121,0.042626,0.179456,0.416546,0.445402,0.436909,0.165182,0.591561,0.306678
4,10012,-0.007521,-0.003918,0.008434,-0.001145,0.002017,0.015065,0.019616,0.004140,-0.003744,...,-0.025230,0.203298,0.173427,0.046047,0.561599,0.418268,0.609517,0.218285,0.790285,0.301010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872,21745,0.005406,0.006275,0.012252,0.003518,0.001400,0.015054,0.015373,0.001532,0.003546,...,0.179080,0.580813,0.202241,0.254870,0.458581,0.434638,0.587167,0.009854,0.472956,0.342085
5873,21748,0.004240,0.009213,0.010981,0.000443,-0.003072,0.010702,0.014673,0.005523,0.005780,...,-0.106345,0.234340,0.138985,0.343382,0.708744,0.312812,0.536501,0.214803,0.849512,0.204741
5874,21749,0.004783,0.017910,0.012128,-0.005683,-0.011613,0.017000,0.007230,0.001315,0.008788,...,-0.165575,0.170154,-0.029638,0.383761,0.398305,0.578621,0.357127,0.009479,0.609545,0.317230
5875,21751,0.003835,0.015067,0.015428,-0.002030,0.001205,0.012396,0.011026,-0.001491,0.005310,...,-0.087604,0.131902,-0.047932,0.022317,0.583869,0.596734,0.515209,0.379589,0.568422,0.439016


In [8]:
def metric(y_true, y_pred):
    return np.mean(np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0))

In [9]:
# Giving less importance to FNC features since they are easier to overfit due to high dimensionality.
FNC_SCALE = 1/600

train[fnc_features] *= FNC_SCALE
test[fnc_features] *= FNC_SCALE

In [11]:
# 学習データに対する「目的変数を知らない」予測値と、テストデータに対する予測値を返す関数
def predict_cv(train_x, train_y, test_x, model, target_name):
    preds = []
    preds_test = []
    va_idxes = []
    
    score = []
    mae = []
    rmse = []

    # shuffleしなくても良い
    kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx].values, train_x.iloc[va_idx].values
        tr_y, va_y = train_y.iloc[tr_idx].values, train_y.iloc[va_idx].values
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
        
        score.append(metric(va_y, pred))
        mae.append(mean_absolute_error(va_y, pred))
        rmse.append(np.sqrt(mean_squared_error(va_y, pred)))
        
    score_cv = np.array(score).mean()
    mae_cv = np.array(mae).mean()
    rmse_cv = np.array(rmse).mean()pred_test_targets_bayes_ridge
    print("{0}_score:{1}".format(target_name, np.round(score_cv, 8)))
    print("{0}_mae:{1}".format(target_name, np.array(mae_cv).mean()))
    print("{0}_rmse:{1}".format(target_name, np.array(rmse_cv).mean()))
    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test, score_cv

# 1st layer

## SVR

In [12]:
import cudf
import cupy as cp
from cuml import SVRpred_test_targets_bayes_ridge

In [13]:
pred_train_targets_svr = {}
pred_test_targets_svr = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, c, w in [("age", 60, 0.3),
                     ("domain1_var1", 12, 0.175),
                     ("domain1_var2", 8, 0.175),
                     ("domain2_var1", 9, 0.175),
                     ("domain2_var2", 12, 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    test_x = test.drop(ID, axis=1)
    svr = SVR(C=c, cache_size=3000.0)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, svr, target)
    overal_score += w*score_cv
    pred_train_targets_svr[target] = pred_train
    pred_test_targets_svr[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age_score:0.1452783
age_mae:7.207349133355973
age_rmse:9.136950732922411
-----domain1_var1-----
domain1_var1_score:0.15140409
domain1_var1_mae:7.7929600960279535
domain1_var1_rmse:9.704187432461952
-----domain1_var2-----
domain1_var2_score:0.1513435
domain1_var2_mae:8.965799935216493
domain1_var2_rmse:11.425679942749944
-----domain2_var1-----
domain2_var1_score:0.18062684
domain2_var1_mae:8.530591385957175
domain2_var1_rmse:10.850669022322423
-----domain2_var2-----
domain2_var2_score:0.17535461
domain2_var2_mae:9.101265334912783
domain2_var2_rmse:11.683568465155526
--------------------------------------------
Overal score: 0.15886107


In [14]:
pred_train_targets_df = pd.DataFrame(pred_train_targets_svr)

In [15]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.757576,51.682374,60.018243,47.788974,52.64677
std,9.637329,3.176761,1.41574,2.740586,2.271021
min,19.869896,41.659028,55.034532,39.569122,44.0632
25%,42.790369,49.499707,59.049677,45.829975,51.115216
50%,49.468533,51.574268,60.021174,47.783556,52.579807
75%,56.597715,53.770375,60.95738,49.733627,54.143079
max,85.81163,65.909269,65.236427,56.161751,61.374605


In [16]:
pred_test_targets_df = pd.DataFrame(pred_test_targets_svr)

In [17]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.940178,51.873588,60.024772,47.555929,52.829573
std,9.887389,3.20758,1.393332,2.787892,2.235273
min,18.954403,41.650065,54.998182,37.374629,44.763223
25%,41.726084,49.589592,59.077491,45.594611,51.321817
50%,48.580191,51.846779,60.01996,47.614666,52.782163
75%,55.901392,54.046297,60.959252,49.555422,54.337012
max,80.469767,63.505815,66.21593,57.074454,62.631554


In [18]:
pred_test_targets_df

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,50.465272,50.271700,61.373086,47.946877,57.586411
1,63.160814,54.870469,59.782276,50.287676,51.857249
2,40.357364,50.439308,60.373848,45.884510,53.246553
3,49.500477,52.800544,60.265182,48.058581,51.293763
4,52.355369,55.447235,57.535187,45.488950,57.945911
...,...,...,...,...,...
5872,40.165435,50.780032,57.977438,46.455590,53.900746
5873,56.289352,57.741846,60.263662,50.209970,53.551774
5874,41.055600,47.478358,59.911345,44.774934,51.690605
5875,40.378236,51.049574,58.559863,45.786377,57.479335


## baysian ridge

In [19]:
from sklearn.linear_model import BayesianRidge

In [20]:
pred_train_targets_bayes_ridge = {}
pred_test_targets_bayes_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    bayes_ridge = BayesianRidge(n_iter = 3000)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, bayes_ridge, target)
    overal_score += w*score_cv
    pred_train_targets_bayes_ridge[target] = pred_train
    pred_test_targets_bayes_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age_score:0.14444292
age_mae:7.1662966873320215
age_rmse:9.07777317740673
-----domain1_var1-----
domain1_var1_score:0.15093666
domain1_var1_mae:7.769059160281727
domain1_var1_rmse:9.66923543640214
-----domain1_var2-----
domain1_var2_score:0.15115506
domain1_var2_mae:8.95463820100667
domain1_var2_rmse:11.35930370626475
-----domain2_var1-----
domain2_var1_score:0.18116492
domain2_var1_mae:8.556072922977268
domain2_var1_rmse:10.82871058050255
-----domain2_var2-----
domain2_var2_score:0.17561443
domain2_var2_mae:9.115200791444975
domain2_var2_rmse:11.6477185932534
--------------------------------------------
Overal score: 0.15863531


In [21]:
pred_train_targets_df = pd.DataFrame(pred_train_targets_bayes_ridge)

In [22]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.614598,51.475157,59.248583,47.248574,51.914203
std,9.929603,3.220202,0.690998,2.341179,1.798656
min,19.853281,40.355812,56.105136,39.670987,41.142573
25%,42.542922,49.294161,58.802876,45.585239,50.716012
50%,49.300944,51.491135,59.271348,47.17423,51.90142
75%,56.43077,53.664411,59.7147,48.822739,53.09158
max,82.438412,62.160007,62.35296,57.322385,58.928327


In [23]:
pred_test_targets_df = pd.DataFrame(pred_test_targets_bayes_ridge)

In [24]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.928243,51.594471,59.261868,47.002884,52.051816
std,10.248659,3.291547,0.677247,2.388787,1.80602
min,16.32973,40.394305,56.39023,38.356014,44.443069
25%,41.486712,49.377338,58.815535,45.353836,50.848985
50%,48.553426,51.633218,59.280266,46.943469,52.024232
75%,56.15107,53.860653,59.727911,48.669089,53.24859
max,84.251802,63.97069,61.871802,55.96833,59.220321


## BaggingRegressor

In [25]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge

In [26]:
pred_train_targets_bagging_ridge = {}
pred_test_targets_bagging_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    
    bagging_ridge = BaggingRegressor(Ridge(alpha = 0.0001), n_estimators=30, random_state=42, max_samples=0.3, max_features=0.3)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, bagging_ridge, target)
    overal_score += w*score_cv
    pred_train_targets_bagging_ridge[target] = pred_train
    pred_test_targets_bagging_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age_score:0.15748811
age_mae:7.81282443848854
age_rmse:9.890274363630633
-----domain1_var1-----
domain1_var1_score:0.15200004
domain1_var1_mae:7.823531572497332
domain1_var1_rmse:9.735462024432662
-----domain1_var2-----
domain1_var2_score:0.15138495
domain1_var2_mae:8.96839834198824
domain1_var2_rmse:11.386551849010033
-----domain2_var1-----
domain2_var1_score:0.18134983
domain2_var1_mae:8.564986276622077
domain2_var1_rmse:10.849709520219077
-----domain2_var2-----
domain2_var2_score:0.17508709
domain2_var2_mae:9.08791185825324
domain2_var2_rmse:11.632783374499606
--------------------------------------------
Overal score: 0.16271527


In [27]:
pred_train_targets_df = pd.DataFrame(pred_train_targets_bayes_ridge)

In [28]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.614598,51.475157,59.248583,47.248574,51.914203
std,9.929603,3.220202,0.690998,2.341179,1.798656
min,19.853281,40.355812,56.105136,39.670987,41.142573
25%,42.542922,49.294161,58.802876,45.585239,50.716012
50%,49.300944,51.491135,59.271348,47.17423,51.90142
75%,56.43077,53.664411,59.7147,48.822739,53.09158
max,82.438412,62.160007,62.35296,57.322385,58.928327


In [29]:
pred_test_targets_df = pd.DataFrame(pred_test_targets_bayes_ridge)

In [30]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.928243,51.594471,59.261868,47.002884,52.051816
std,10.248659,3.291547,0.677247,2.388787,1.80602
min,16.32973,40.394305,56.39023,38.356014,44.443069
25%,41.486712,49.377338,58.815535,45.353836,50.848985
50%,48.553426,51.633218,59.280266,46.943469,52.024232
75%,56.15107,53.860653,59.727911,48.669089,53.24859
max,84.251802,63.97069,61.871802,55.96833,59.220321


# 2nd layer

In [31]:
train_x_2 = pd.DataFrame({'pred_svr_age':pred_train_targets_svr['age'],
                         'pred_svr_d1v1':pred_train_targets_svr['domain1_var1'],
                         'pred_svr_d1v2':pred_train_targets_svr['domain1_var2'],
                         'pred_svr_d2v1':pred_train_targets_svr['domain2_var1'],
                         'pred_svr_d2v2':pred_train_targets_svr['domain2_var2'],
                         
                         'pred_bayesRidge_age':pred_train_targets_bayes_ridge['age'],
                         'pred_bayesRidge_d1v1':pred_train_targets_bayes_ridge['domain1_var1'],
                         'pred_bayesRidge_d1v2':pred_train_targets_bayes_ridge['domain1_var2'],
                         'pred_bayesRidge_d2v1':pred_train_targets_bayes_ridge['domain2_var1'],
                         'pred_bayesRidge_d2v2':pred_train_targets_bayes_ridge['domain2_var2'],
                          
                         'pred_baggingRidge_age':pred_train_targets_bagging_ridge['age'],
                         'pred_baggingRidge_d1v1':pred_train_targets_bagging_ridge['domain1_var1'],
                         'pred_baggingRidge_d1v2':pred_train_targets_bagging_ridge['domain1_var2'],
                         'pred_baggingRidge_d2v1':pred_train_targets_bagging_ridge['domain2_var1'],
                         'pred_baggingRidge_d2v2':pred_train_targets_bagging_ridge['domain2_var2']
                        })

In [32]:
test_x_2 = pd.DataFrame({'pred_svr_age':pred_test_targets_svr['age'],
                         'pred_svr_d1v1':pred_test_targets_svr['domain1_var1'],
                         'pred_svr_d1v2':pred_test_targets_svr['domain1_var2'],
                         'pred_svr_d2v1':pred_test_targets_svr['domain2_var1'],
                         'pred_svr_d2v2':pred_test_targets_svr['domain2_var2'],
                         
                         'pred_bayesRidge_age':pred_test_targets_bayes_ridge['age'],
                         'pred_bayesRidge_d1v1':pred_test_targets_bayes_ridge['domain1_var1'],
                         'pred_bayesRidge_d1v2':pred_test_targets_bayes_ridge['domain1_var2'],
                         'pred_bayesRidge_d2v1':pred_test_targets_bayes_ridge['domain2_var1'],
                         'pred_bayesRidge_d2v2':pred_test_targets_bayes_ridge['domain2_var2'],
                         
                         'pred_baggingRidge_age':pred_test_targets_bagging_ridge['age'],
                         'pred_baggingRidge_d1v1':pred_test_targets_bagging_ridge['domain1_var1'],
                         'pred_baggingRidge_d1v2':pred_test_targets_bagging_ridge['domain1_var2'],
                         'pred_baggingRidge_d2v1':pred_test_targets_bagging_ridge['domain2_var1'],
                         'pred_baggingRidge_d2v2':pred_test_targets_bagging_ridge['domain2_var2']
                        })

In [33]:
layer2_pred_train_targets_bayes_ridge = {}
layer2_pred_test_targets_bayes_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_df = train[train[target].notnull()]
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    bayes_ridge = BayesianRidge(n_iter = 3000)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x_2, train_y, test_x_2, bayes_ridge, target)
    overal_score += w*score_cv
    layer2_pred_train_targets_bayes_ridge[target] = pred_train
    layer2_pred_test_targets_bayes_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age_score:0.14436692
age_mae:7.162386233973766
age_rmse:9.070831544017189
-----domain1_var1-----
domain1_var1_score:0.15085477
domain1_var1_mae:7.764708827810199
domain1_var1_rmse:9.667816403530178
-----domain1_var2-----
domain1_var2_score:0.15123285
domain1_var2_mae:8.959308676761703
domain1_var2_rmse:11.359819297406542
-----domain2_var1-----
domain2_var1_score:0.18056064
domain2_var1_mae:8.527606135432368
domain2_var1_rmse:10.81033960683184
-----domain2_var2-----
domain2_var2_score:0.17514303
domain2_var2_mae:9.090826388087931
domain2_var2_rmse:11.631561141777802
--------------------------------------------
Overal score: 0.15842356


In [34]:
pred_train_targets_df = pd.DataFrame(layer2_pred_train_targets_bayes_ridge)
pred_test_targets_df = pd.DataFrame(layer2_pred_test_targets_bayes_ridge)

In [35]:
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.614909,51.473952,59.24552,47.246684,51.917022
std,9.920607,3.224507,0.641713,2.421027,1.901658
min,19.371037,39.451778,56.662905,39.661278,44.903618
25%,42.42566,49.257242,58.816162,45.55981,50.629099
50%,49.294403,51.452519,59.251223,47.188466,51.889227
75%,56.51743,53.609663,59.684802,48.915361,53.167029
max,83.756768,62.069873,61.741517,55.560223,59.158189


In [36]:
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.880266,51.454791,59.253973,46.995053,51.957473
std,10.208021,3.287495,0.62691,2.463754,1.872589
min,17.182181,40.760322,56.427685,38.057465,45.556673
25%,41.501683,49.17451,58.847166,45.26737,50.653414
50%,48.561556,51.469798,59.257911,46.965653,51.920279
75%,56.089638,53.703641,59.680422,48.772514,53.226737
max,82.710837,63.323718,61.462603,55.510647,59.224266


# Submit

In [37]:
test_df = pd.concat([test['Id'], pred_test_targets_df], axis=1)

In [38]:
test_df

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10003,54.434478,51.164690,58.688492,48.323962,54.679407
1,10006,62.480433,54.825916,58.715565,49.159854,51.724898
2,10010,38.340146,48.784783,59.635651,44.603552,52.932808
3,10011,50.265743,51.879203,59.678916,48.290809,50.392835
4,10012,52.404064,55.207574,56.888129,45.577441,56.695392
...,...,...,...,...,...,...
5872,21745,39.896207,50.648413,58.764710,45.120953,53.732506
5873,21748,54.308338,55.922368,58.914446,46.545891,52.202539
5874,21749,40.029145,46.923193,59.811284,45.578248,52.520730
5875,21751,39.867341,50.288743,58.682123,44.190439,55.491221


In [39]:
test_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.880266,51.454791,59.253973,46.995053,51.957473
std,10.208021,3.287495,0.62691,2.463754,1.872589
min,17.182181,40.760322,56.427685,38.057465,45.556673
25%,41.501683,49.17451,58.847166,45.26737,50.653414
50%,48.561556,51.469798,59.257911,46.965653,51.920279
75%,56.089638,53.703641,59.680422,48.772514,53.226737
max,82.710837,63.323718,61.462603,55.510647,59.224266


In [40]:
sub_df = pd.melt(test_df[["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]], id_vars=["Id"], value_name="Predicted")
sub_df["Id"] = sub_df["Id"].astype("str") + "_" +  sub_df["variable"].astype("str")

sub_df = sub_df.drop("variable", axis=1).sort_values("Id")
assert sub_df.shape[0] == test_df.shape[0]*5
sub_df.head(10)

Unnamed: 0,Id,Predicted
0,10003_age,54.434478
5877,10003_domain1_var1,51.16469
11754,10003_domain1_var2,58.688492
17631,10003_domain2_var1,48.323962
23508,10003_domain2_var2,54.679407
1,10006_age,62.480433
5878,10006_domain1_var1,54.825916
11755,10006_domain1_var2,58.715565
17632,10006_domain2_var1,49.159854
23509,10006_domain2_var2,51.724898


In [41]:
sub_df.to_csv("submission_15fold_stacking.csv", index=False)