In [1]:
import numpy as np
import pandas as pd

import gc
from sklearn.model_selection import KFold
from cuml.metrics import mean_absolute_error, mean_squared_error

import seaborn as sns

In [2]:
# Placeholder output location (empty; not referenced by the cells shown in this notebook).
OUTPUT_DICT = ''

# Column holding the subject identifier; used as the merge key for every table.
ID = 'Id'
# Regression targets; one model is fitted per column.
TARGET_COLS = [
    'age',
    'domain1_var1',
    'domain1_var2',
    'domain2_var1',
    'domain2_var2',
]
# Seed for the shuffled K-fold split, so folds are reproducible.
SEED = 2020

# Number of cross-validation folds.
N_FOLD = 15

In [3]:
# NOTE: absolute, machine-specific data directory; adjust before running elsewhere.
base_path = '/media/hiroki/share/kaggle_data/trends-assessment-prediction/'

# Read the Id column as text so it can be matched against the string ids built later.
id_as_str = {'Id': str}

# Rows with any missing target are dropped, to make things easy.
train = pd.read_csv(base_path + 'train_scores.csv', dtype=id_as_str)
train = train.dropna().reset_index(drop=True)

reveal_ID = pd.read_csv(base_path + 'reveal_ID_site2.csv', dtype=id_as_str)
ICN_numbers = pd.read_csv(base_path + 'ICN_numbers.csv')
loading = pd.read_csv(base_path + 'loading.csv', dtype=id_as_str)
fnc = pd.read_csv(base_path + 'fnc.csv', dtype=id_as_str)
sample_submission = pd.read_csv(base_path + 'sample_submission.csv', dtype=id_as_str)

In [4]:
# Feature names are every column except the first one of each table.
loading_features = list(loading.columns[1:])
fnc_features = list(fnc.columns[1:])

In [5]:
# Submission ids are split on '_' and the leading part is the numeric subject id;
# keep each subject once, in order of first appearance, as a string Id.
subject_numbers = sample_submission[ID].map(lambda full_id: int(full_id.split('_')[0]))
test = pd.DataFrame({ID: subject_numbers.unique().astype(str)})
del subject_numbers; gc.collect()
test.head()

Unnamed: 0,Id
0,10003
1,10006
2,10010
3,10011
4,10012


In [6]:
# Attach the loading and FNC feature tables to the labelled rows (left joins on Id).
# NOTE: `train` is rebound here, so re-running this cell merges the features a second time.
train = (
    train
    .merge(loading, on=ID, how='left')
    .merge(fnc, on=ID, how='left')
)
train.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.00607,0.014466,0.004136,0.000658,...,-0.149279,0.552841,0.131046,0.335446,0.394867,-0.042853,0.124627,-0.060712,0.515964,0.290488
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.214216,-0.039792,0.143014,-0.189962,0.498373,0.444231,0.592438,0.028649,0.705524,0.248327
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.130339,0.30954,0.141469,0.030853,0.344394,0.214097,0.317556,0.012435,0.665937,0.081358
3,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.01216,-0.00092,...,-0.150218,0.408926,0.072004,0.157582,0.532046,0.355448,0.462675,0.161005,0.703679,0.293607
4,10008,35.326582,15.769168,65.782269,44.643805,50.448485,0.007745,0.009748,0.009356,-0.004219,...,-0.080562,0.005339,-0.386757,0.020546,0.518383,0.408071,0.465851,0.112785,0.574596,0.178531


In [7]:
# Attach the loading and FNC feature tables to the test ids (left joins on Id).
# NOTE: `test` is rebound here, so re-running this cell merges the features a second time.
test = (
    test
    .merge(loading, on=ID, how='left')
    .merge(fnc, on=ID, how='left')
)
test.head()

Unnamed: 0,Id,IC_01,IC_07,IC_05,IC_16,IC_26,IC_06,IC_10,IC_09,IC_18,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10003,0.008151,0.014684,0.010444,-0.005293,-0.002913,0.015042,0.017745,0.00393,-0.008021,...,-0.154941,0.13685,-0.022361,0.137625,0.677972,0.409412,0.563892,0.438684,0.618204,0.284474
1,10006,0.000334,0.005311,0.010053,0.00692,-6.5e-05,0.01531,0.016543,0.004794,0.003982,...,-0.053606,0.240957,0.270419,0.367692,0.354501,0.486364,0.416908,0.285274,0.69349,0.448526
2,10010,0.007103,0.006144,0.00977,-0.002884,-0.001346,0.015651,0.011613,-0.003291,0.013423,...,-0.244332,0.272077,0.193523,0.192254,0.563982,0.124482,0.488926,0.083368,0.774299,0.129327
3,10011,0.004362,0.01024,0.010167,0.004492,-0.001623,0.017381,0.01468,0.007453,0.008786,...,-0.099726,0.557121,0.042626,0.179456,0.416546,0.445402,0.436909,0.165182,0.591561,0.306678
4,10012,-0.007521,-0.003918,0.008434,-0.001145,0.002017,0.015065,0.019616,0.00414,-0.003744,...,-0.02523,0.203298,0.173427,0.046047,0.561599,0.418268,0.609517,0.218285,0.790285,0.30101


In [8]:
def metric(y_true, y_pred):
    """Normalised absolute error.

    The absolute errors are summed along axis 0 and divided by the sum of the
    true values along the same axis; the mean of the resulting ratio(s) is
    returned. For 1-D inputs this is a single ratio; for 2-D inputs it is the
    mean of one ratio per column.
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred), axis=0)
    total_true = np.sum(y_true, axis=0)
    ratio = total_abs_error / total_true
    return np.mean(ratio)

In [9]:
# Down-weight the FNC features: being high-dimensional they are easier to overfit,
# so they are shrunk relative to the loading features.
FNC_SCALE = 1/600

# The scaling is applied in place to both frames, so re-running this cell
# shrinks the columns again.
for frame in (train, test):
    frame[fnc_features] *= FNC_SCALE

In [10]:
# Features: everything except the id and the target columns.
train_x = train.drop(columns=[ID] + TARGET_COLS)
# Targets: all five regression columns.
train_y = train[TARGET_COLS]
# Test features: everything except the id.
test_x = test.drop(columns=ID)

In [11]:
# Returns predictions for the training data made without seeing each row's
# target (out-of-fold predictions) together with predictions for the test data.
def predict_cv(train_x, train_y, test_x, model, target_name):
    """Cross-validate `model` and return out-of-fold and test predictions.

    The training rows are split into N_FOLD shuffled folds (seeded with SEED).
    For every fold the model is refitted on the remaining rows, then used to
    predict both the held-out rows and the whole test set.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    train_y : pandas.Series or pandas.DataFrame
        Training targets, positionally aligned with ``train_x``.
    test_x : pandas.DataFrame or array-like
        Test features. pandas objects are converted to an ndarray once so that
        prediction uses the same input type as fitting.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    target_name : str
        Label used as a prefix in the printed summary lines.

    Returns
    -------
    pred_train : numpy.ndarray
        Held-out predictions re-ordered to the original row order of ``train_x``.
    preds_test : numpy.ndarray
        Test predictions averaged over the folds.
    score_cv : float
        Mean of the per-fold ``metric`` values.
    """
    preds = []
    preds_test = []
    va_idxes = []

    score = []
    mae = []
    rmse = []

    # Fit uses plain arrays (``.values``); give predict the same kind of input
    # and do the conversion once instead of handing the raw frame to every fold.
    if isinstance(test_x, (pd.DataFrame, pd.Series)):
        test_values = test_x.values
    else:
        test_values = test_x

    # Shuffling is not strictly required, but it is enabled with a fixed seed
    # so that the folds are reproducible.
    kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    # Train and predict fold by fold, keeping the predictions and the
    # positions of the held-out rows.
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx].values, train_x.iloc[va_idx].values
        tr_y, va_y = train_y.iloc[tr_idx].values, train_y.iloc[va_idx].values
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_values)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

        score.append(metric(va_y, pred))
        mae.append(mean_absolute_error(va_y, pred))
        rmse.append(np.sqrt(mean_squared_error(va_y, pred)))

    # Each summary value is the mean of the per-fold values.
    score_cv = np.array(score).mean()
    mae_cv = np.array(mae).mean()
    rmse_cv = np.array(rmse).mean()
    print("{0}_score:{1}".format(target_name, np.round(score_cv, 8)))
    print("{0}_mae:{1}".format(target_name, mae_cv))
    print("{0}_rmse:{1}".format(target_name, rmse_cv))

    # Concatenate the held-out predictions, then restore the original row order.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # Average the test predictions over the folds.
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test, score_cv

# 1st layer

## SVR

In [12]:
import cudf
import cupy as cp
from cuml import SVR

In [13]:
# First-layer SVR: per-target out-of-fold and test predictions.
pred_train_targets_svr = {}
pred_test_targets_svr = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

# The feature matrices are the same for every target, so build them once
# instead of copying the full frames on each loop iteration.
train_x = train.drop([ID]+TARGET_COLS, axis=1)
test_x = test.drop(ID, axis=1)

# Each entry is (target column, SVR C value, weight of the target in the overall score).
for target, c, w in [("age", 60, 0.3),
                     ("domain1_var1", 12, 0.175),
                     ("domain1_var2", 8, 0.175),
                     ("domain2_var1", 9, 0.175),
                     ("domain2_var2", 12, 0.175)]:
    train_y = train[target]
    svr = SVR(C=c, cache_size=3000.0)

    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, svr, target)
    # Accumulate the weighted sum of the per-target CV scores.
    overal_score += w*score_cv
    pred_train_targets_svr[target] = pred_train
    pred_test_targets_svr[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age:0.1452783
age:7.207349133355973
age:9.136950732922411
-----domain1_var1-----
domain1_var1:0.15140409
domain1_var1:7.7929600960279535
domain1_var1:9.704187432461952
-----domain1_var2-----
domain1_var2:0.1513435
domain1_var2:8.965799935216493
domain1_var2:11.425679942749944
-----domain2_var1-----
domain2_var1:0.18062684
domain2_var1:8.530591385957175
domain2_var1:10.850669022322423
-----domain2_var2-----
domain2_var2:0.17535461
domain2_var2:9.101265334912783
domain2_var2:11.683568465155526
--------------------------------------------
Overal score: 0.15886107


In [14]:
# Collect the out-of-fold SVR predictions into one frame, one column per target.
pred_train_targets_df = pd.DataFrame(pred_train_targets_svr)

In [15]:
# Summary statistics of the out-of-fold SVR predictions.
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.757576,51.682374,60.018243,47.788974,52.64677
std,9.637329,3.176761,1.41574,2.740586,2.271021
min,19.869896,41.659028,55.034532,39.569122,44.0632
25%,42.790369,49.499707,59.049677,45.829975,51.115216
50%,49.468533,51.574268,60.021174,47.783556,52.579807
75%,56.597715,53.770375,60.95738,49.733627,54.143079
max,85.81163,65.909269,65.236427,56.161751,61.374605


In [16]:
# Collect the fold-averaged SVR test predictions into one frame, one column per target.
pred_test_targets_df = pd.DataFrame(pred_test_targets_svr)

In [17]:
# Summary statistics of the SVR test predictions.
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.940178,51.873588,60.024772,47.555929,52.829573
std,9.887389,3.20758,1.393332,2.787892,2.235273
min,18.954403,41.650065,54.998182,37.374629,44.763223
25%,41.726084,49.589592,59.077491,45.594611,51.321817
50%,48.580191,51.846779,60.01996,47.614666,52.782163
75%,55.901392,54.046297,60.959252,49.555422,54.337012
max,80.469767,63.505815,66.21593,57.074454,62.631554


In [18]:
# Display the SVR test predictions (rendered as a truncated view of the frame).
pred_test_targets_df

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,50.465272,50.271700,61.373086,47.946877,57.586411
1,63.160814,54.870469,59.782276,50.287676,51.857249
2,40.357364,50.439308,60.373848,45.884510,53.246553
3,49.500477,52.800544,60.265182,48.058581,51.293763
4,52.355369,55.447235,57.535187,45.488950,57.945911
...,...,...,...,...,...
5872,40.165435,50.780032,57.977438,46.455590,53.900746
5873,56.289352,57.741846,60.263662,50.209970,53.551774
5874,41.055600,47.478358,59.911345,44.774934,51.690605
5875,40.378236,51.049574,58.559863,45.786377,57.479335


## Bayesian ridge

In [19]:
from sklearn.linear_model import BayesianRidge

In [20]:
# First-layer Bayesian ridge: per-target out-of-fold and test predictions.
pred_train_targets_bayes_ridge = {}
pred_test_targets_bayes_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

# The feature matrices are the same for every target, so build them once
# instead of copying the full frames on each loop iteration.
train_x = train.drop([ID]+TARGET_COLS, axis=1)
test_x = test.drop(ID, axis=1)

# Each entry is (target column, weight of the target in the overall score).
for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    train_y = train[target]
    # NOTE(review): recent scikit-learn releases appear to rename `n_iter` to
    # `max_iter` - confirm against the installed version before upgrading.
    bayes_ridge = BayesianRidge(n_iter = 3000)

    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, bayes_ridge, target)
    # Accumulate the weighted sum of the per-target CV scores.
    overal_score += w*score_cv
    pred_train_targets_bayes_ridge[target] = pred_train
    pred_test_targets_bayes_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age:0.14444292
age:7.1662966873320215
age:9.07777317740673
-----domain1_var1-----
domain1_var1:0.15093666
domain1_var1:7.769059160281727
domain1_var1:9.66923543640214
-----domain1_var2-----
domain1_var2:0.15115506
domain1_var2:8.95463820100667
domain1_var2:11.35930370626475
-----domain2_var1-----
domain2_var1:0.18116492
domain2_var1:8.556072922977268
domain2_var1:10.82871058050255
-----domain2_var2-----
domain2_var2:0.17561443
domain2_var2:9.115200791444975
domain2_var2:11.6477185932534
--------------------------------------------
Overal score: 0.15863531


In [21]:
# Collect the out-of-fold Bayesian ridge predictions into one frame.
# NOTE: this rebinds the name used earlier for the SVR predictions.
pred_train_targets_df = pd.DataFrame(pred_train_targets_bayes_ridge)

In [22]:
# Summary statistics of the out-of-fold Bayesian ridge predictions.
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.614598,51.475157,59.248583,47.248574,51.914203
std,9.929603,3.220202,0.690998,2.341179,1.798656
min,19.853281,40.355812,56.105136,39.670987,41.142573
25%,42.542922,49.294161,58.802876,45.585239,50.716012
50%,49.300944,51.491135,59.271348,47.17423,51.90142
75%,56.43077,53.664411,59.7147,48.822739,53.09158
max,82.438412,62.160007,62.35296,57.322385,58.928327


In [23]:
# Collect the fold-averaged Bayesian ridge test predictions into one frame.
# NOTE: this rebinds the name used earlier for the SVR predictions.
pred_test_targets_df = pd.DataFrame(pred_test_targets_bayes_ridge)

In [24]:
# Summary statistics of the Bayesian ridge test predictions.
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.928243,51.594471,59.261868,47.002884,52.051816
std,10.248659,3.291547,0.677247,2.388787,1.80602
min,16.32973,40.394305,56.39023,38.356014,44.443069
25%,41.486712,49.377338,58.815535,45.353836,50.848985
50%,48.553426,51.633218,59.280266,46.943469,52.024232
75%,56.15107,53.860653,59.727911,48.669089,53.24859
max,84.251802,63.97069,61.871802,55.96833,59.220321


# 2nd layer

In [25]:
# Second-layer training features: the first-layer out-of-fold predictions of
# every target from both models, one column per (model, target) pair.
train_x_2 = pd.DataFrame(dict([
    ('pred_svr_age', pred_train_targets_svr['age']),
    ('pred_svr_d1v1', pred_train_targets_svr['domain1_var1']),
    ('pred_svr_d1v2', pred_train_targets_svr['domain1_var2']),
    ('pred_svr_d2v1', pred_train_targets_svr['domain2_var1']),
    ('pred_svr_d2v2', pred_train_targets_svr['domain2_var2']),

    ('pred_bayesRidge_age', pred_train_targets_bayes_ridge['age']),
    ('pred_bayesRidge_d1v1', pred_train_targets_bayes_ridge['domain1_var1']),
    ('pred_bayesRidge_d1v2', pred_train_targets_bayes_ridge['domain1_var2']),
    ('pred_bayesRidge_d2v1', pred_train_targets_bayes_ridge['domain2_var1']),
    ('pred_bayesRidge_d2v2', pred_train_targets_bayes_ridge['domain2_var2']),
]))

In [26]:
# Second-layer test features: the fold-averaged first-layer test predictions of
# every target from both models, with the same column names and order as the
# second-layer training features.
test_x_2 = pd.DataFrame(dict([
    ('pred_svr_age', pred_test_targets_svr['age']),
    ('pred_svr_d1v1', pred_test_targets_svr['domain1_var1']),
    ('pred_svr_d1v2', pred_test_targets_svr['domain1_var2']),
    ('pred_svr_d2v1', pred_test_targets_svr['domain2_var1']),
    ('pred_svr_d2v2', pred_test_targets_svr['domain2_var2']),

    ('pred_bayesRidge_age', pred_test_targets_bayes_ridge['age']),
    ('pred_bayesRidge_d1v1', pred_test_targets_bayes_ridge['domain1_var1']),
    ('pred_bayesRidge_d1v2', pred_test_targets_bayes_ridge['domain1_var2']),
    ('pred_bayesRidge_d2v1', pred_test_targets_bayes_ridge['domain2_var1']),
    ('pred_bayesRidge_d2v2', pred_test_targets_bayes_ridge['domain2_var2']),
]))

In [27]:
# Second-layer Bayesian ridge, trained on the first-layer predictions.
layer2_pred_train_targets_bayes_ridge = {}
layer2_pred_test_targets_bayes_ridge = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

# Each entry is (target column, weight of the target in the overall score).
for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    # Only the target comes from the raw training table; the features are the
    # first-layer predictions in train_x_2 / test_x_2, so the raw feature
    # matrices are not rebuilt here.
    train_y = train[target]
    # NOTE(review): recent scikit-learn releases appear to rename `n_iter` to
    # `max_iter` - confirm against the installed version before upgrading.
    bayes_ridge = BayesianRidge(n_iter = 3000)

    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x_2, train_y, test_x_2, bayes_ridge, target)
    # Accumulate the weighted sum of the per-target CV scores.
    overal_score += w*score_cv
    layer2_pred_train_targets_bayes_ridge[target] = pred_train
    layer2_pred_test_targets_bayes_ridge[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:15
-----age-----
age:0.14421887
age:7.155015942348272
age:9.064742909876987
-----domain1_var1-----
domain1_var1:0.15080455
domain1_var1:7.762141554201105
domain1_var1:9.668409257245811
-----domain1_var2-----
domain1_var2:0.15126951
domain1_var2:8.961450494872961
domain1_var2:11.364854759181254
-----domain2_var1-----
domain2_var1:0.18070826
domain2_var1:8.534527011060739
domain2_var1:10.815193490433728
-----domain2_var2-----
domain2_var2:0.17561923
domain2_var2:9.115560522299303
domain2_var2:11.650648425123958
--------------------------------------------
Overal score: 0.15848593


In [28]:
# Collect the second-layer out-of-fold and test predictions into frames.
# NOTE: both names are rebound; they previously held first-layer predictions.
pred_train_targets_df = pd.DataFrame(layer2_pred_train_targets_bayes_ridge)
pred_test_targets_df = pd.DataFrame(layer2_pred_test_targets_bayes_ridge)

In [29]:
# Summary statistics of the second-layer out-of-fold predictions.
pred_train_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0
mean,49.615159,51.474933,59.245438,47.248782,51.917891
std,9.925184,3.221965,0.543476,2.388493,1.771284
min,19.426835,39.718287,57.053921,40.043053,45.566302
25%,42.413943,49.238777,58.896185,45.536387,50.744259
50%,49.2658,51.46485,59.260056,47.215022,51.871432
75%,56.521083,53.627797,59.61094,48.910424,53.072124
max,83.597503,62.240539,61.139086,55.394176,59.225478


In [30]:
# Summary statistics of the second-layer test predictions.
pred_test_targets_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.875286,51.551139,59.222403,46.938124,52.058489
std,10.216365,3.296548,0.530204,2.448786,1.77136
min,16.88321,40.763876,57.03497,37.677408,45.948034
25%,41.476548,49.271271,58.869231,45.209673,50.836267
50%,48.545942,51.5666,59.238078,46.907557,52.03695
75%,56.065606,53.780373,59.5836,48.701423,53.249794
max,82.888426,63.374503,61.18855,55.099619,58.924541


# Submit

In [31]:
# Put each test Id next to its second-layer predictions. The column-wise concat
# aligns on the row index, so both frames are expected to share the same index.
test_df = pd.concat([test['Id'], pred_test_targets_df], axis=1)

In [32]:
# Display the per-subject predictions (rendered as a truncated view of the frame).
test_df

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10003,54.494662,51.020373,58.881175,48.374469,54.402872
1,10006,62.730412,54.638970,58.695994,49.744513,51.386962
2,10010,38.363157,48.919643,59.522635,44.621070,52.836628
3,10011,50.028493,52.269108,59.438688,47.578689,50.738504
4,10012,52.219921,54.808195,57.735318,44.871965,56.243564
...,...,...,...,...,...,...
5872,21745,39.888245,50.188739,59.176107,45.065572,53.126850
5873,21748,54.694734,56.059177,58.697501,47.245982,51.827699
5874,21749,40.143117,46.995715,59.768408,45.732938,52.361853
5875,21751,40.067328,50.282080,58.889330,44.332487,54.838161


In [35]:
# Summary statistics of the predictions that go into the submission.
test_df.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.875286,51.551139,59.222403,46.938124,52.058489
std,10.216365,3.296548,0.530204,2.448786,1.77136
min,16.88321,40.763876,57.03497,37.677408,45.948034
25%,41.476548,49.271271,58.869231,45.209673,50.836267
50%,48.545942,51.5666,59.238078,46.907557,52.03695
75%,56.065606,53.780373,59.5836,48.701423,53.249794
max,82.888426,63.374503,61.18855,55.099619,58.924541


In [33]:
# Reshape to long format: one row per (Id, target) pair holding its prediction.
sub_df = test_df[
    ["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]
].melt(id_vars=["Id"], value_name="Predicted")

# Build the submission key as "<Id>_<target name>".
sub_df["Id"] = sub_df["Id"].astype("str") + "_" + sub_df["variable"].astype("str")

sub_df = sub_df.drop(columns="variable").sort_values("Id")

# Sanity check: every subject contributes exactly five rows.
assert len(sub_df) == test_df.shape[0]*5
sub_df.head(10)

Unnamed: 0,Id,Predicted
0,10003_age,54.494662
5877,10003_domain1_var1,51.020373
11754,10003_domain1_var2,58.881175
17631,10003_domain2_var1,48.374469
23508,10003_domain2_var2,54.402872
1,10006_age,62.730412
5878,10006_domain1_var1,54.63897
11755,10006_domain1_var2,58.695994
17632,10006_domain2_var1,49.744513
23509,10006_domain2_var2,51.386962


In [36]:
# Write the long-format submission; index=False keeps the row index out of the CSV.
sub_df.to_csv("submission_15fold_stacking.csv", index=False)