# Library

In [1]:
#===========================================================
# Library
#===========================================================
import os
import gc
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
from contextlib import contextmanager
import time

import numpy as np
import pandas as pd
import scipy as sp
import random

import matplotlib.pyplot as plt
import seaborn as sns

from functools import partial

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn import preprocessing
import category_encoders as ce
from sklearn.metrics import mean_squared_error

import torch

import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

# Utils

In [2]:
#===========================================================
# Utils
#===========================================================
def get_logger(filename='log'):
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

logger = get_logger()


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    logger.info(f'[{name}] done in {time.time() - t0:.0f} s')


def seed_everything(seed=2020):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    
def load_df(path, df_name, debug=False):
    if path.split('.')[-1]=='csv':
        df = pd.read_csv(path)
        if debug:
            df = pd.read_csv(path, nrows=1000)
    elif path.split('.')[-1]=='pkl':
        df = pd.read_pickle(path)
    if logger==None:
        print(f"{df_name} shape / {df.shape} ")
    else:
        logger.info(f"{df_name} shape / {df.shape} ")
    return df

# Config

In [3]:
#===========================================================
# Config
#===========================================================
OUTPUT_DICT = ''

ID = 'Id'
TARGET_COLS = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
SEED = 2020
seed_everything(seed=SEED)

N_FOLD = 5

# Data Loading

In [4]:
base_path = '/media/hiroki/share/kaggle_data/trends-assessment-prediction/'
train = pd.read_csv(base_path+'train_scores.csv', dtype={'Id':str})\
            #.dropna().reset_index(drop=True) # to make things easy
reveal_ID = pd.read_csv(base_path+'reveal_ID_site2.csv', dtype={'Id':str})
ICN_numbers = pd.read_csv(base_path+'ICN_numbers.csv')
#loading = pd.read_csv(base_path+'loading.csv', dtype={'Id':str})
#fnc = pd.read_csv(base_path+'fnc.csv', dtype={'Id':str})
sample_submission = pd.read_csv(base_path+'sample_submission.csv', dtype={'Id':str})

In [5]:
train.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641
3,10005,66.53263,,,52.108977,69.993075
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421


In [6]:
reveal_ID.head()

Unnamed: 0,Id
0,10012
1,10014
2,10020
3,10034
4,10059


In [7]:
ICN_numbers.head()

Unnamed: 0,ICN_number
0,69
1,53
2,98
3,99
4,45


In [8]:
sample_submission.head()

Unnamed: 0,Id,Predicted
0,10003_age,50.0
1,10003_domain1_var1,50.0
2,10003_domain1_var2,50.0
3,10003_domain2_var1,50.0
4,10003_domain2_var2,50.0


In [9]:
sample_submission['ID_num'] = sample_submission[ID].apply(lambda x: int(x.split('_')[0]))
test = pd.DataFrame({ID: sample_submission['ID_num'].unique().astype(str)})
del sample_submission['ID_num']; gc.collect()
test.head()

Unnamed: 0,Id
0,10003
1,10006
2,10010
3,10011
4,10012


In [10]:
ic_path = "/media/hiroki/working/kaggle_data/trends-neuroimaging/split_IC/svd"
ic = np.load(ic_path+'/ic_1.npz.npy')

ic_train = ic[:5877, :]
ic_test = ic[5877:, :]
del ic

ic_train = pd.concat([train["Id"], pd.DataFrame(ic_train)], axis=1)
ic_test = pd.concat([test["Id"], pd.DataFrame(ic_test)], axis=1)

In [11]:
ic_test

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,10003,185.354600,-1.894216,-25.597632,17.524616,9.609703,-7.494714,-3.667134,9.676563,-5.867997,...,1.771262,-6.314309,0.683588,-1.568563,3.687723,6.017120,-1.112476,-1.071275,2.081220,2.977473
1,10006,181.626422,-23.852257,-22.662911,15.507212,-8.213980,4.796110,-0.467497,-5.817393,4.268109,...,-0.947327,-1.697961,-2.390517,-2.846977,4.552740,0.258638,-5.720619,6.610954,-1.139527,-1.578379
2,10010,184.281211,-5.234343,-16.483871,-15.019387,16.283226,4.644053,3.618274,4.822289,4.863458,...,-1.591953,0.392222,1.947491,-3.871654,2.339150,0.767052,-0.526380,-3.309229,-1.375089,1.376860
3,10011,181.040656,-53.347298,-25.376116,-8.029581,1.242265,-3.693744,-1.155550,-7.127769,5.179252,...,1.522331,1.658697,-2.309193,1.122153,-2.494400,0.427132,7.019281,2.360380,1.455302,4.803931
4,10012,184.773520,-8.031468,-16.310459,4.061960,10.600912,-2.745191,-3.448653,14.787611,5.821477,...,1.646200,1.072309,-5.038330,-1.207843,-2.642157,-0.307642,2.899685,-3.172944,6.516042,1.515899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872,21745,180.950685,-40.158524,-2.054585,8.770301,2.903658,8.315104,9.492417,6.233579,-3.984710,...,5.330599,-3.902928,0.621628,4.796071,1.879420,-9.133247,6.894612,-5.824453,-2.276085,3.988381
5873,21748,176.745390,15.437111,-14.255579,3.347258,2.724352,-1.152077,17.655595,-9.695320,-28.093154,...,3.156461,-0.820893,2.458082,-8.328220,-4.674589,1.802967,2.514840,-4.819525,-3.692336,-3.716252
5874,21749,169.971154,12.118584,25.889464,-0.313058,16.567745,8.901744,0.056223,-14.393599,-9.489391,...,-3.668765,2.042376,0.303606,0.029159,-1.034329,-0.823731,3.468392,2.695276,-2.704517,2.678318
5875,21751,160.253248,11.686251,15.355045,-13.791450,-0.786260,-5.345471,14.768048,-6.323734,2.488926,...,-4.354361,-1.371137,-8.095671,2.211263,4.912829,-2.889812,-2.000491,-5.600463,-2.210095,8.769010


# FE

In [12]:
train

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10001,57.436077,30.571975,62.553736,53.325130,51.427998
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641
3,10005,66.532630,,,52.108977,69.993075
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421
...,...,...,...,...,...,...
5872,21746,14.257265,21.358872,61.165998,51.778483,54.640179
5873,21747,55.456978,68.169675,29.907995,55.349257,54.019517
5874,21750,48.948756,55.114811,60.878271,38.617246,50.679885
5875,21752,66.532630,59.844808,72.303110,55.458281,46.870235


In [13]:
#train.to_csv('add_ica_rep2_train.csv')
#test.to_csv('add_ica_rep2_test.csv')
train

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10001,57.436077,30.571975,62.553736,53.325130,51.427998
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641
3,10005,66.532630,,,52.108977,69.993075
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421
...,...,...,...,...,...,...
5872,21746,14.257265,21.358872,61.165998,51.778483,54.640179
5873,21747,55.456978,68.169675,29.907995,55.349257,54.019517
5874,21750,48.948756,55.114811,60.878271,38.617246,50.679885
5875,21752,66.532630,59.844808,72.303110,55.458281,46.870235


In [14]:
# merge
train = train.merge(ic_train, on=ID, how='left')
train.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,0,1,2,3,...,490,491,492,493,494,495,496,497,498,499
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,171.218424,-6.42776,-9.927892,10.366246,...,-1.406378,7.561013,-0.628929,-6.537751,4.709295,-0.934201,-3.013736,-3.901477,3.500502,-0.550326
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,183.24478,-50.385174,7.0914,16.347872,...,2.732647,-0.841337,1.525409,-1.180793,-0.306342,-1.310111,1.811784,-2.784705,2.317481,-1.611979
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,176.161957,-7.524483,-10.002413,-13.571608,...,0.365878,3.83537,-2.190664,-2.143797,7.305573,4.748013,-6.533647,-0.257726,1.946055,3.426639
3,10005,66.53263,,,52.108977,69.993075,181.3488,-14.650744,-17.041429,5.972692,...,1.791167,-1.603553,1.828442,-4.826036,-1.267576,1.868431,2.179348,-0.794626,4.521646,-4.960226
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,180.13825,-5.16286,-9.385625,-1.272054,...,3.503601,2.157977,-1.644566,0.966167,4.606762,-0.374768,3.415235,3.731394,-1.176937,1.075447


In [15]:
# merge
#train = train.merge(pca_loading_pd, on=ID, how='left')
#train = train.merge(pca_fnc_pd, on=ID, how='left')
#train = train.merge(svd, on=ID, how='left')
#train.head()

In [16]:
test

Unnamed: 0,Id
0,10003
1,10006
2,10010
3,10011
4,10012
...,...
5872,21745
5873,21748
5874,21749
5875,21751


In [17]:
# merge
# merge
test = test.merge(ic_test, on=ID, how='left')
test.head()

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,10003,185.3546,-1.894216,-25.597632,17.524616,9.609703,-7.494714,-3.667134,9.676563,-5.867997,...,1.771262,-6.314309,0.683588,-1.568563,3.687723,6.01712,-1.112476,-1.071275,2.08122,2.977473
1,10006,181.626422,-23.852257,-22.662911,15.507212,-8.21398,4.79611,-0.467497,-5.817393,4.268109,...,-0.947327,-1.697961,-2.390517,-2.846977,4.55274,0.258638,-5.720619,6.610954,-1.139527,-1.578379
2,10010,184.281211,-5.234343,-16.483871,-15.019387,16.283226,4.644053,3.618274,4.822289,4.863458,...,-1.591953,0.392222,1.947491,-3.871654,2.33915,0.767052,-0.52638,-3.309229,-1.375089,1.37686
3,10011,181.040656,-53.347298,-25.376116,-8.029581,1.242265,-3.693744,-1.15555,-7.127769,5.179252,...,1.522331,1.658697,-2.309193,1.122153,-2.4944,0.427132,7.019281,2.36038,1.455302,4.803931
4,10012,184.77352,-8.031468,-16.310459,4.06196,10.600912,-2.745191,-3.448653,14.787611,5.821477,...,1.6462,1.072309,-5.03833,-1.207843,-2.642157,-0.307642,2.899685,-3.172944,6.516042,1.515899


# Model

In [52]:
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import StandardScaler
from scipy.preprocessing import

In [53]:
ss = StandardScaler()

In [69]:
# 学習データに対する「目的変数を知らない」予測値と、テストデータに対する予測値を返す関数
def predict_cv(train_x, train_y, test_x, model, target_name):
    preds = []
    preds_test = []
    va_idxes = []
    
    score = []
    mae = []
    rmse = []

    # shuffleしなくても良い
    kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx].values, train_x.iloc[va_idx].values
        tr_y, va_y = train_y.iloc[tr_idx].values, train_y.iloc[va_idx].values
        # z-scaling X
        tr_x = ss.fit_transform(tr_x)
        va_x = ss.transform(va_x)
        test_x = ss.transform(test_x)
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
        
        score.append(metric(va_y, pred))
        mae.append(mean_absolute_error(va_y, pred))
        rmse.append(np.sqrt(mean_squared_error(va_y, pred)))
        
    score_cv = np.array(score).mean()
    mae_cv = np.array(mae).mean()
    rmse_cv = np.array(rmse).mean()
    print("{0}_score:{1}".format(target_name, np.round(score_cv, 8)))
    print("{0}_mae:{1}".format(target_name, np.array(mae_cv).mean()))
    print("{0}_rmse:{1}".format(target_name, np.array(rmse_cv).mean()))
    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test, score_cv

In [70]:
def metric(y_true, y_pred):
    return np.mean(np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0))

In [71]:
pred_train_targets_bayes_ridge = {}
pred_test_targets_bayes_ridge = {}

pred_train_targets_bayes_ridge_dfs = {}
pred_test_targets_bayes_ridge_dfs = {}

overal_score = 0

print("N_FOLD:{}".format(N_FOLD))

for target, w in [("age", 0.3),
                  ("domain1_var1", 0.175),
                  ("domain1_var2", 0.175),
                  ("domain2_var1", 0.175),
                  ("domain2_var2", 0.175)]:
    
    train_df = train[train[target].notnull()]
    test_df = test
    
    use_idx = train_df.index
    train_x = train_df.drop([ID]+TARGET_COLS, axis=1)
    train_y = train_df[target]
    test_x = test_df.drop(ID, axis=1)
    bayes_ridge = BayesianRidge(n_iter = 3000)
    
    print("-----{}-----".format(target))
    pred_train, preds_test, score_cv = predict_cv(train_x, train_y, test_x, bayes_ridge, target)
    overal_score += w*score_cv
    pred_train_targets_bayes_ridge[target] = pred_train
    pred_test_targets_bayes_ridge[target] = preds_test
    
    pred_train_targets_bayes_ridge_dfs[target] = pd.Series(pred_train, name="pre_bayRidge_4D_IC1{}".format(target), index=use_idx)
    pred_train_targets_bayes_ridge_dfs[target] = pd.merge(train['Id'],
                                                 pred_train_targets_bayes_ridge_dfs[target],
                                                 left_index=True,
                                                 right_index=True)
    pred_test_targets_bayes_ridge_dfs[target] = pd.Series(preds_test, name="pre_bayRidge_4D_IC1{}".format(target))
    pred_test_targets_bayes_ridge_dfs[target] = pd.concat([test['Id'], pred_test_targets_bayes_ridge_dfs[target]], axis=1)
print('--------------------------------------------')
print("Overal score:", np.round(overal_score, 8))

N_FOLD:5
-----age-----
age_score:-260.9267164
age_mae:0.809706267990887
age_rmse:1.000288133980351
-----domain1_var1-----
domain1_var1_score:28.69068297
domain1_var1_mae:0.8011063683898183
domain1_var1_rmse:1.0000910648446772
-----domain1_var2-----
domain1_var2_score:-67.20566631
domain1_var2_mae:0.7878276378510682
domain1_var2_rmse:1.0004902606266697
-----domain2_var1-----
domain2_var1_score:25.65146476
domain2_var1_mae:0.794926885013202
domain2_var1_rmse:1.0003510644884352
-----domain2_var2-----
domain2_var2_score:353.74173565
domain2_var2_mae:0.7860494844135656
domain2_var2_rmse:1.0001613146900303
--------------------------------------------
Overal score: -18.62432693


In [57]:
for i, pred_df in enumerate(pred_train_targets_bayes_ridge_dfs.values()):
    #display(pred_df)
    if i == 0:
        pred_1st_train_df = pred_df
        #display(pred_train_dfs, head=False)
    else:
        pred_1st_train_df = pd.concat([pred_1st_train_df, pred_df.drop("Id", axis=1)], axis=1)
        #display(pred_train_dfs, head=False)

In [58]:
for i, pred_df in enumerate(pred_test_targets_bayes_ridge_dfs.values()):
    #display(pred_df)
    if i == 0:
        pred_1st_test_df = pred_df
        #display(pred_train_dfs, head=False)
    else:
        pred_1st_test_df = pd.concat([pred_1st_test_df, pred_df.drop("Id", axis=1)], axis=1)
        #display(pred_train_dfs, head=False)

In [59]:
train.describe()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,0,1,2,3,4,...,490,491,492,493,494,495,496,497,498,499
count,5877.0,5439.0,5439.0,5838.0,5838.0,5877.0,5877.0,5877.0,5877.0,5877.0,...,5877.0,5877.0,5877.0,5877.0,5877.0,5877.0,5877.0,5877.0,5877.0,5877.0
mean,50.034068,51.474692,59.244132,47.32513,51.905658,170.650916,0.188809,0.178644,-0.046663,0.042454,...,0.044856,-0.066512,0.020114,-0.023583,-0.041477,-0.002944,-0.04708,-0.015872,-0.039938,0.017858
std,13.539881,10.188354,11.387595,11.124863,11.839203,21.103078,21.638394,12.37368,10.247033,8.367502,...,3.601415,3.553862,3.529772,3.538511,3.525679,3.537701,3.553311,3.517778,3.535693,3.518701
min,14.257265,15.769168,1.021874,0.991172,0.815285,18.47325,-81.313317,-34.319162,-30.174353,-26.572686,...,-13.314759,-12.398268,-11.968855,-13.66734,-12.382285,-12.982343,-12.371768,-13.650111,-13.306902,-11.583443
25%,40.129361,44.78124,52.396805,40.122682,44.51488,168.571718,-13.984303,-9.25022,-6.642368,-5.662647,...,-2.352664,-2.443801,-2.272609,-2.325914,-2.372943,-2.288236,-2.423317,-2.320487,-2.391053,-2.278449
50%,50.427747,51.847306,60.052535,47.811205,52.572032,175.235016,0.714755,0.510325,-1.272054,-0.727623,...,0.019503,5.5e-05,0.011203,-0.032057,-0.038326,-0.016913,-0.069447,-0.004277,-0.021338,-0.052522
75%,59.580851,58.495576,67.142611,55.058014,59.910146,181.023168,14.951448,9.737134,5.199727,4.931071,...,2.442936,2.272858,2.322561,2.30289,2.281813,2.348375,2.36023,2.254954,2.315741,2.319523
max,84.491113,81.32558,94.702874,82.164478,94.509903,203.172782,81.610923,64.382312,59.825785,37.15766,...,14.047963,12.202687,13.597273,12.834045,12.820113,15.640291,13.307695,12.871637,14.142632,13.103974


In [29]:
pred_1st_train_df.describe()

Unnamed: 0,pre_lgb_PCICraw_age,pre_lgb_PCICraw_domain1_var1,pre_lgb_PCICraw_domain1_var2,pre_lgb_PCICraw_domain2_var1,pre_lgb_PCICraw_domain2_var2
count,5877.0,5439.0,5439.0,5838.0,5838.0
mean,49.710083,51.21937,59.009686,46.97969,51.661687
std,0.727831,0.65203,0.664542,0.666917,0.675982
min,47.354427,48.800542,56.548431,44.591817,48.949443
25%,49.202714,50.776891,58.552831,46.512491,51.209498
50%,49.705418,51.198239,58.999652,46.966793,51.668384
75%,50.194805,51.658942,59.453929,47.447281,52.121852
max,52.231455,53.735982,61.434653,49.293047,54.622441


In [28]:
pred_1st_test_df.describe()

Unnamed: 0,pre_lgb_PCICraw_age,pre_lgb_PCICraw_domain1_var1,pre_lgb_PCICraw_domain1_var2,pre_lgb_PCICraw_domain2_var1,pre_lgb_PCICraw_domain2_var2
count,5877.0,5877.0,5877.0,5877.0,5877.0
mean,49.694608,51.226754,59.027561,46.953189,51.678852
std,0.520677,0.458215,0.483334,0.461565,0.481859
min,47.829285,49.364666,57.099586,45.07321,49.854125
25%,49.345025,50.915732,58.697477,46.63207,51.358849
50%,49.694732,51.231029,59.023207,46.957575,51.689138
75%,50.046012,51.539621,59.353918,47.275113,52.009354
max,51.552448,52.949377,61.140199,48.371211,53.360198


In [360]:
N_FOLD:5
-----age-----
age_score:0.14700111
age_mae:7.354451609859484
age_rmse:9.317636462961504
-----domain1_var1-----
domain1_var1_score:0.15196461
domain1_var1_mae:7.822831536052648
domain1_var1_rmse:9.755431461591984
-----domain1_var2-----
domain1_var2_score:0.15157942
domain1_var2_mae:8.97954761666281
domain1_var2_rmse:11.37754696187342
-----domain2_var1-----
domain2_var1_score:0.18321671
domain2_var1_mae:8.668318085305518
domain2_var1_rmse:10.944875247555299
-----domain2_var2-----
domain2_var2_score:0.17724289
domain2_var2_mae:9.198720430743872
domain2_var2_rmse:11.730396671833388
--------------------------------------------
Overal score: 0.16030097

SyntaxError: invalid syntax (<ipython-input-360-cc4a20359016>, line 2)

In [None]:
-----age-----
age_score:0.14757656
age_mae:7.383152943924243
age_rmse:9.364733546069733
-----domain1_var1-----
domain1_var1_score:0.15186356
domain1_var1_mae:7.817631282354443
domain1_var1_rmse:9.749862894519833
-----domain1_var2-----
domain1_var2_score:0.1516539
domain1_var2_mae:8.98395596652827
domain1_var2_rmse:11.3820081905469
-----domain2_var1-----
domain2_var1_score:0.1832475
domain2_var1_mae:8.669800346789618
domain2_var1_rmse:10.945619557329753
-----domain2_var2-----
domain2_var2_score:0.17727542
domain2_var2_mae:9.200456433186778
domain2_var2_rmse:11.728454771025685
--------------------------------------------
Overal score: 0.16048003

In [None]:
Overal score: 0.16054234

# Save Pre result

In [361]:
out_base_path = '/media/hiroki/working/kaggle/trends-neuroimaging/models'

In [362]:
with open (out_base_path+'/1st_layer'+'/lgb_fncPC_and_loadingIC_and_raw/lgb_train.pkl', 'wb') as f:
  pkl.dump(pred_train_targets_lgb_dfs , f)
with open (out_base_path+'/1st_layer'+'/lgb_fncPC_and_loadingIC_and_raw/lgb_test.pkl', 'wb') as f:
  pkl.dump(pred_test_targets_lgb_dfs , f)

# Submission

In [None]:
sample_submission.head()

In [None]:
pred_df = pd.DataFrame()

for TARGET in TARGET_COLS:
    tmp = pd.DataFrame()
    tmp[ID] = [f'{c}_{TARGET}' for c in test[ID].values]
    tmp['Predicted'] = prediction_dict[TARGET]
    pred_df = pd.concat([pred_df, tmp])

print(pred_df.shape)
print(sample_submission.shape)

pred_df.head()

In [None]:
submission = sample_submission.drop(columns='Predicted').merge(pred_df, on=ID, how='left')
print(submission.shape)
submission.to_csv('submission.csv', index=False)
submission.head()