In [36]:
import gc
import os

import numpy as np
import pandas as pd
from cuml.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

In [72]:
# Not referenced in the visible cells; presumably an output directory/prefix — TODO confirm.
OUTPUT_DICT = ""

# Cross-validation settings: number of folds and the shuffle seed.
SEED = 2020
N_FOLD = 5

# Name of the row-identifier column and of the regression target columns.
ID = "Id"
TARGET_COLS = [
    "age",
    "domain1_var1",
    "domain1_var2",
    "domain2_var1",
    "domain2_var2",
]

In [12]:
# Directory containing the competition CSV files. It can be overridden with the
# TRENDS_DATA_DIR environment variable; the fallback is the original hard-coded
# location. Joining with '' guarantees a trailing separator, so the string
# concatenations below work whether or not the override ends with one.
base_path = os.path.join(
    os.environ.get('TRENDS_DATA_DIR',
                   '/media/hiroki/share/kaggle_data/trends-assessment-prediction/'),
    '')

# The Id column is read as text in every table that is later joined on it.
train = (
    pd.read_csv(base_path + 'train_scores.csv', dtype={ID: str})
    .dropna()                  # drop rows with any missing value, to make things easy
    .reset_index(drop=True)    # renumber rows 0..n-1 after the drop
)
reveal_ID = pd.read_csv(base_path + 'reveal_ID_site2.csv', dtype={ID: str})
ICN_numbers = pd.read_csv(base_path + 'ICN_numbers.csv')
loading = pd.read_csv(base_path + 'loading.csv', dtype={ID: str})
fnc = pd.read_csv(base_path + 'fnc.csv', dtype={ID: str})
sample_submission = pd.read_csv(base_path + 'sample_submission.csv', dtype={ID: str})

In [16]:
# Feature names are all columns except the first one of each table
# (the first column is presumably Id — confirm against the CSV headers).
fnc_features = fnc.columns[1:].tolist()
loading_features = loading.columns[1:].tolist()

In [13]:
# Each submission Id is split on '_' and its leading part is parsed as an integer.
# The distinct integers, converted back to strings, become the Id column of `test`.
subject_numbers = sample_submission[ID].map(lambda raw_id: int(raw_id.split('_')[0]))
test = pd.DataFrame({ID: subject_numbers.unique().astype(str)})
del subject_numbers
gc.collect()
test.head()

Unnamed: 0,Id
0,10003
1,10006
2,10010
3,10011
4,10012


In [14]:
# Attach the loading and FNC tables to the training rows with left joins on the Id column.
# Note: `train` is rebuilt from itself, so running this cell twice merges the tables twice.
train = (
    train
    .merge(loading, how='left', on=ID)
    .merge(fnc, how='left', on=ID)
)
train.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.00607,0.014466,0.004136,0.000658,...,-0.149279,0.552841,0.131046,0.335446,0.394867,-0.042853,0.124627,-0.060712,0.515964,0.290488
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.214216,-0.039792,0.143014,-0.189962,0.498373,0.444231,0.592438,0.028649,0.705524,0.248327
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.130339,0.30954,0.141469,0.030853,0.344394,0.214097,0.317556,0.012435,0.665937,0.081358
3,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.01216,-0.00092,...,-0.150218,0.408926,0.072004,0.157582,0.532046,0.355448,0.462675,0.161005,0.703679,0.293607
4,10008,35.326582,15.769168,65.782269,44.643805,50.448485,0.007745,0.009748,0.009356,-0.004219,...,-0.080562,0.005339,-0.386757,0.020546,0.518383,0.408071,0.465851,0.112785,0.574596,0.178531


In [15]:
# Attach the loading and FNC tables to the test rows with left joins on the Id column.
# Note: `test` is rebuilt from itself, so running this cell twice merges the tables twice.
test = (
    test
    .merge(loading, how='left', on=ID)
    .merge(fnc, how='left', on=ID)
)
test.head()

Unnamed: 0,Id,IC_01,IC_07,IC_05,IC_16,IC_26,IC_06,IC_10,IC_09,IC_18,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10003,0.008151,0.014684,0.010444,-0.005293,-0.002913,0.015042,0.017745,0.00393,-0.008021,...,-0.154941,0.13685,-0.022361,0.137625,0.677972,0.409412,0.563892,0.438684,0.618204,0.284474
1,10006,0.000334,0.005311,0.010053,0.00692,-6.5e-05,0.01531,0.016543,0.004794,0.003982,...,-0.053606,0.240957,0.270419,0.367692,0.354501,0.486364,0.416908,0.285274,0.69349,0.448526
2,10010,0.007103,0.006144,0.00977,-0.002884,-0.001346,0.015651,0.011613,-0.003291,0.013423,...,-0.244332,0.272077,0.193523,0.192254,0.563982,0.124482,0.488926,0.083368,0.774299,0.129327
3,10011,0.004362,0.01024,0.010167,0.004492,-0.001623,0.017381,0.01468,0.007453,0.008786,...,-0.099726,0.557121,0.042626,0.179456,0.416546,0.445402,0.436909,0.165182,0.591561,0.306678
4,10012,-0.007521,-0.003918,0.008434,-0.001145,0.002017,0.015065,0.019616,0.00414,-0.003744,...,-0.02523,0.203298,0.173427,0.046047,0.561599,0.418268,0.609517,0.218285,0.790285,0.30101


In [32]:
def metric(y_true, y_pred):
    """Return the mean, over columns, of sum(|y_true - y_pred|) / sum(y_true).

    Both sums run along axis 0. For 1-D inputs this reduces to a single ratio.
    """
    abs_error_total = np.sum(np.abs(y_true - y_pred), axis=0)
    true_total = np.sum(y_true, axis=0)
    return np.mean(abs_error_total / true_total)

In [18]:
# Shrink the FNC features so they carry less weight: with so many of them they are
# easier to overfit.
# Note: the scaling is applied to `train`/`test` themselves, so running this cell
# twice applies the factor twice.
FNC_SCALE = 1 / 600

train[fnc_features] = train[fnc_features] * FNC_SCALE
test[fnc_features] = test[fnc_features] * FNC_SCALE

In [7]:
# Features are every column except the Id and the targets; targets keep TARGET_COLS order.
train_y = train[TARGET_COLS]
train_x = train.drop(columns=[ID] + TARGET_COLS)
test_x = test.drop(columns=ID)

In [68]:
# Returns out-of-fold predictions for the training data (every row is predicted by a
# model fitted without that row's target) together with predictions for the test data.
def predict_cv(train_x, train_y, test_x, model, target_name):
    """Run K-fold cross-validation with ``model`` and collect its predictions.

    The same ``model`` object is re-fitted on every fold. The number of folds and
    the shuffle seed come from the module-level ``N_FOLD`` and ``SEED``.

    Parameters
    ----------
    train_x : object supporting ``.iloc[...].values`` (e.g. a pandas DataFrame)
        Training features.
    train_y : object supporting ``.iloc[...].values``
        Training targets, row-aligned with ``train_x``.
    test_x : passed unchanged to ``model.predict`` once per fold.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    target_name : label printed in front of each cross-validation metric.

    Returns
    -------
    pred_train : validation-fold predictions, reordered to the row order of ``train_x``.
    preds_test : element-wise mean over the folds of the test predictions.
    score_cv : mean over the folds of ``metric(validation targets, validation predictions)``.

    Prints three lines, each prefixed by ``target_name``: the mean fold score rounded
    to 8 decimals, the mean fold MAE, and the mean fold RMSE.
    """
    oof_parts, test_parts, val_positions = [], [], []
    fold_scores, fold_maes, fold_rmses = [], [], []

    # Shuffling is not strictly required here.
    splitter = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    # Fit and predict fold by fold, remembering each fold's predictions and row positions.
    for fit_pos, val_pos in splitter.split(train_x):
        x_fit = train_x.iloc[fit_pos].values
        y_fit = train_y.iloc[fit_pos].values
        x_val = train_x.iloc[val_pos].values
        y_val = train_y.iloc[val_pos].values

        model.fit(x_fit, y_fit)
        val_pred = model.predict(x_val)
        oof_parts.append(val_pred)
        test_parts.append(model.predict(test_x))
        val_positions.append(val_pos)

        fold_scores.append(metric(y_val, val_pred))
        fold_maes.append(mean_absolute_error(y_val, val_pred))
        fold_rmses.append(np.sqrt(mean_squared_error(y_val, val_pred)))

    score_cv = np.array(fold_scores).mean()
    mae_cv = np.array(fold_maes).mean()
    rmse_cv = np.array(fold_rmses).mean()
    # Report order: score (rounded), MAE, RMSE.
    for shown in (np.round(score_cv, 8), mae_cv, rmse_cv):
        print("{0}:{1}".format(target_name, shown))

    # Stitch the validation predictions together, then restore the original row order.
    val_positions = np.concatenate(val_positions)
    stacked = np.concatenate(oof_parts, axis=0)
    pred_train = stacked[np.argsort(val_positions)]

    # Average the per-fold predictions for the test data.
    preds_test = np.mean(test_parts, axis=0)

    return pred_train, preds_test, score_cv

# SVR

In [69]:
import cudf
import cupy as cp
from cuml import SVR

In [74]:
# Cross-validate one SVR per target and accumulate the weighted overall score.
pred_train_targets = {}   # target name -> out-of-fold predictions on the training rows
pred_test_targets = {}    # target name -> fold-averaged predictions on the test rows
score_cv_targets = []     # per-target CV scores, in the order the targets are processed

overal_score = 0

# The feature matrices do not depend on the target, so they are built once here
# instead of being re-created on every iteration.
train_x = train.drop([ID]+TARGET_COLS, axis=1)
test_x = test.drop(ID, axis=1)

# Each tuple is (target column, SVR `C`, weight of the target in the overall score, ff).
# NOTE(review): `ff` is unpacked but never used in this cell — confirm whether it is
# still needed; it is kept so the loop's names stay as they were.
for target, c, w, ff in [("age", 60, 0.3, 0.55),
                         ("domain1_var1", 12, 0.175, 0.2),
                         ("domain1_var2", 8, 0.175, 0.2),
                         ("domain2_var1", 9, 0.175, 0.22),
                         ("domain2_var2", 12, 0.175, 0.22)]:
    train_y = train[target]
    svr = SVR(C=c, cache_size=3000.0)

    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, svr, target)
    # Previously this list was declared but never filled.
    score_cv_targets.append(score_cv)
    overal_score += w*score_cv
    pred_train_targets[target] = pred_train
    pred_test_targets[target] = preds_test
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

-----age-----
age:0.14590207
age:7.2390394318143
age:9.175141152229084
-----domain1_var1-----
domain1_var1:0.15164345
domain1_var1:7.805857038680881
domain1_var1:9.727022448525
-----domain1_var2-----
domain1_var2:0.15156137
domain1_var2:8.979138175832578
domain1_var2:11.444830122148222
-----domain2_var1-----
domain2_var1:0.18084712
domain2_var1:8.543474387970898
domain2_var1:10.867463713297898
-----domain2_var2-----
domain2_var2:0.17560892
domain2_var2:9.11649856732418
domain2_var2:11.709338197245064
--------------------------------------------
Overal score: 0.15921127


In [80]:
# Summary statistics of the fold-averaged test predictions, one column per target.
pd.DataFrame(data=pred_test_targets).describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,48.918755,51.879521,60.014677,47.550219,52.863188
std,9.856628,3.145006,1.322402,2.71733,2.170802
min,18.687302,41.814341,55.278092,37.703003,44.988479
25%,41.720555,49.630928,59.108011,45.636272,51.400837
50%,48.539336,51.847639,60.014374,47.581931,52.820171
75%,55.92218,53.995373,60.897448,49.483745,54.325552
max,80.51812,63.114481,65.785272,56.910917,62.418266
