In [1]:
import pandas as pd
import joblib
import os
import gc

from evaluation_metric import amex_metric
from sklearn.model_selection import ParameterGrid

In [2]:
class Parameters:
    path = 'Models_DART_all_10corr_5folds/'

In [3]:
def predict(data, models):

    model_list = []
    for path in [Parameters.path]:
        for fname in os.listdir(path):
            for model_name in models:
                if model_name in fname:
                    model_list.append(path + fname)

    pred_list = []
    for model_path in model_list:
        if model_path.startswith(Parameters.path):

            print(model_path)
            print('{:.02f} MB'.format(os.path.getsize(model_path)/1000000))
            model = joblib.load(model_path)
            pred_list.append(model.predict(data))
    
    return pred_list, model_list

In [4]:
validation = pd.read_parquet('Models_DART_all_10corr_5folds/validation.parquet')
validation_labels = validation['target']
validation.drop('target', axis = 1, inplace=True)
corr_col = validation.columns[validation.columns.str.startswith('corr_')].to_list()

top_corr = [
    "corr_D_39-B_26",
    "corr_D_48-B_4",
    "corr_P_2-D_44",
    "corr_D_47-B_4",
    "corr_D_47-D_39",
    "corr_P_2-B_4",
    "corr_D_39-B_10",
    "corr_D_44-B_4",
    "corr_D_39-B_2",
    "corr_D_46-B_4",
]

corr_to_remove = set(corr_col).difference(set(top_corr))
validation.drop(corr_to_remove, axis=1, inplace=True)

models = [
    'HT',
]
validation.shape

(22946, 2604)

In [5]:
print('Predicting the validation set...')
pred_df_validation, model_list = predict(validation, models)

model_names = [model.split('/')[-1][:10] for model in model_list]
pred_df_validation = pd.DataFrame(pred_df_validation).T
pred_df_validation.columns = model_names
pred_df_validation.index = validation.index

del validation
_ = gc.collect()

Predicting the validation set...
Models_DART_all_10corr_5folds/HT4_fold_3_iter_12137_score_0.80167.pkl
140.80 MB
Models_DART_all_10corr_5folds/HT0_fold_3_iter_9984_score_0.80029.pkl
115.86 MB
Models_DART_all_10corr_5folds/HT1_fold_1_iter_7691_score_0.80199.pkl
89.39 MB
Models_DART_all_10corr_5folds/HT3_fold_1_iter_7754_score_0.80211.pkl
90.12 MB
Models_DART_all_10corr_5folds/HT0_fold_1_iter_7155_score_0.80129.pkl
83.19 MB
Models_DART_all_10corr_5folds/HT1_fold_2_iter_11001_score_0.79703.pkl
127.69 MB
Models_DART_all_10corr_5folds/HT4_fold_0_iter_11038_score_0.79940.pkl
128.14 MB
Models_DART_all_10corr_5folds/HT1_fold_4_iter_10836_score_0.79789.pkl
125.74 MB
Models_DART_all_10corr_5folds/HT2_fold_0_iter_10203_score_0.79902.pkl
118.47 MB
Models_DART_all_10corr_5folds/HT3_fold_4_iter_12396_score_0.79901.pkl
143.81 MB
Models_DART_all_10corr_5folds/HT4_fold_4_iter_13143_score_0.79913.pkl
152.46 MB
Models_DART_all_10corr_5folds/HT3_fold_0_iter_10655_score_0.79909.pkl
123.71 MB
Models_DART_al

In [6]:
sorted_columns = sorted(pred_df_validation.columns)
pred_df_validation = pred_df_validation[sorted_columns]

In [7]:
pred_df_validation

Unnamed: 0_level_0,HT0_fold_0,HT0_fold_1,HT0_fold_2,HT0_fold_3,HT0_fold_4,HT1_fold_0,HT1_fold_1,HT1_fold_2,HT1_fold_3,HT1_fold_4,...,HT3_fold_0,HT3_fold_1,HT3_fold_2,HT3_fold_3,HT3_fold_4,HT4_fold_0,HT4_fold_1,HT4_fold_2,HT4_fold_3,HT4_fold_4
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12c761e40348fa242de0356426fa9547f0feea34d1f57bf4de9c50e7b236c300,0.025752,0.022992,0.016944,0.016186,0.023453,0.021029,0.020166,0.015504,0.015169,0.021408,...,0.020673,0.019744,0.015472,0.015274,0.020740,0.020565,0.019701,0.015567,0.015813,0.020437
c34b0cf3175108892e42e9382e8177847ac15636870604b3f6323dc55bca9af8,0.713392,0.619633,0.690577,0.751155,0.715005,0.726586,0.623543,0.716842,0.756894,0.721196,...,0.731387,0.624878,0.724355,0.757454,0.732837,0.742409,0.625931,0.724551,0.758704,0.736576
14aff3c0e0ed2aaaef6deb127f2541b67fdd0002615bfd041562993d3bfb0ac9,0.117376,0.116085,0.151765,0.133629,0.138310,0.121989,0.113809,0.157922,0.128679,0.142873,...,0.117201,0.114587,0.155737,0.127925,0.142128,0.121049,0.113386,0.157745,0.128826,0.141252
f82be644eb90ab65d737b6dfc5de670559fa2ca23c98eabb03e4aeb85bb1d503,0.000665,0.000673,0.000901,0.000506,0.000432,0.000450,0.000539,0.000693,0.000412,0.000351,...,0.000421,0.000533,0.000683,0.000412,0.000293,0.000421,0.000534,0.000673,0.000388,0.000282
09dec6d53f0f12db6edcaecbb4d2bddf41f220ba9569d1aa41c3233b4dfb2dc8,0.000831,0.000777,0.000748,0.000640,0.000555,0.000642,0.000660,0.000603,0.000541,0.000418,...,0.000601,0.000652,0.000577,0.000536,0.000401,0.000561,0.000644,0.000564,0.000527,0.000393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3d282773d9b60959ae8f6a1035f41c600b8080584aa8d15addd5113811695fc1,0.000526,0.000720,0.000551,0.000460,0.000500,0.000348,0.000586,0.000432,0.000400,0.000392,...,0.000311,0.000570,0.000416,0.000391,0.000376,0.000300,0.000574,0.000421,0.000384,0.000399
071484115f0049ba1f9285f9fd035a6c72bf9d84fff5d48babefe08e0370d24f,0.051311,0.051600,0.058998,0.034561,0.046167,0.049355,0.046787,0.051735,0.033575,0.042333,...,0.048063,0.045776,0.050459,0.032438,0.039054,0.049591,0.045410,0.050457,0.032009,0.037592
d6f9f09ac6a5a841e46a91a909e1204e24bbafc86e828fefb5ab09aefea92933,0.001310,0.001388,0.001441,0.001068,0.001124,0.001094,0.001197,0.001248,0.000987,0.000951,...,0.001086,0.001162,0.001310,0.000974,0.000872,0.001067,0.001163,0.001294,0.000933,0.000866
f8b60c2413bb9386a9809ca58ef26837ca9ebce942345cfbb83fadac3cbb99b6,0.973083,0.975628,0.975725,0.979156,0.977411,0.975040,0.977682,0.978760,0.980733,0.979256,...,0.975171,0.978037,0.979581,0.980897,0.978839,0.975636,0.978142,0.979647,0.980938,0.979104


In [9]:
params = {
    "fold_0": ['HT0_fold_0', 'HT1_fold_0', 'HT2_fold_0', 'HT3_fold_0', 'HT4_fold_0'],
    "fold_1": ['HT0_fold_1', 'HT1_fold_1', 'HT2_fold_1', 'HT3_fold_1', 'HT4_fold_1'],
    "fold_2": ['HT0_fold_2', 'HT1_fold_2', 'HT2_fold_2', 'HT3_fold_2', 'HT4_fold_2'],
    "fold_3": ['HT0_fold_3', 'HT1_fold_3', 'HT2_fold_3', 'HT3_fold_3', 'HT4_fold_3'],
    "fold_4": ['HT0_fold_4', 'HT1_fold_4', 'HT2_fold_4', 'HT3_fold_4', 'HT4_fold_4'],
}

grid = list(ParameterGrid(params))
len(grid)

for counter, i in enumerate(grid):
    score = amex_metric(validation_labels, pred_df_validation[[i['fold_0'], i['fold_1'], i['fold_2'], i['fold_3'], i['fold_4']]].mean(axis = 1))
    grid[counter]['score'] = score

In [10]:
pd.DataFrame(grid).sort_values('score', ascending = False)

Unnamed: 0,fold_0,fold_1,fold_2,fold_3,fold_4,score
1195,HT1_fold_0,HT4_fold_1,HT2_fold_2,HT4_fold_3,HT0_fold_4,0.794904
945,HT1_fold_0,HT2_fold_1,HT2_fold_2,HT4_fold_3,HT0_fold_4,0.794898
820,HT1_fold_0,HT1_fold_1,HT2_fold_2,HT4_fold_3,HT0_fold_4,0.794898
1170,HT1_fold_0,HT4_fold_1,HT1_fold_2,HT4_fold_3,HT0_fold_4,0.794898
1045,HT1_fold_0,HT3_fold_1,HT1_fold_2,HT4_fold_3,HT0_fold_4,0.794898
...,...,...,...,...,...,...
3013,HT4_fold_0,HT4_fold_1,HT0_fold_2,HT2_fold_3,HT3_fold_4,0.792788
3008,HT4_fold_0,HT4_fold_1,HT0_fold_2,HT1_fold_3,HT3_fold_4,0.792787
2888,HT4_fold_0,HT3_fold_1,HT0_fold_2,HT2_fold_3,HT3_fold_4,0.792787
3002,HT4_fold_0,HT4_fold_1,HT0_fold_2,HT0_fold_3,HT2_fold_4,0.792783


In [11]:
test_first_half = pd.read_parquet('Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part1.parquet')
corr_col = test_first_half.columns[test_first_half.columns.str.startswith('corr_')].to_list()

top_corr = [
    "corr_D_39-B_26",
    "corr_D_48-B_4",
    "corr_P_2-D_44",
    "corr_D_47-B_4",
    "corr_D_47-D_39",
    "corr_P_2-B_4",
    "corr_D_39-B_10",
    "corr_D_44-B_4",
    "corr_D_39-B_2",
    "corr_D_46-B_4",
]

corr_to_remove = set(corr_col).difference(set(top_corr))
test_first_half.drop(corr_to_remove, axis=1, inplace=True)

models = [
    'HT1_fold_0',
    'HT4_fold_1',
    'HT2_fold_2',
    'HT4_fold_3',
    'HT0_fold_4',
]

In [12]:
print('Predicting the first half...')
pred_list_first_half, model_list_first_half = predict(test_first_half, models)

model_names = [model.split('/')[-1][:10] for model in model_list_first_half]
pred_df_first_half = pd.DataFrame(pred_list_first_half).T
pred_df_first_half.columns = model_names
pred_df_first_half.index = test_first_half.index

del test_first_half
_ = gc.collect()

test_second_half = pd.read_parquet('Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part2.parquet')
test_second_half.drop(corr_to_remove, axis=1, inplace=True)
print('\nPredicting the second half...')
pred_list_second_half, model_list_second_half = predict(test_second_half, models)

model_names = [model.split('/')[-1][:10] for model in model_list_second_half]
pred_df_second_half = pd.DataFrame(pred_list_second_half).T
pred_df_second_half.columns = model_names
pred_df_second_half.index = test_second_half.index

del test_second_half
_ = gc.collect()

Predicting the first half...
Models_DART_all_10corr_5folds/HT4_fold_3_iter_12137_score_0.80167.pkl
140.80 MB
Models_DART_all_10corr_5folds/HT0_fold_4_iter_9372_score_0.79676.pkl
108.79 MB
Models_DART_all_10corr_5folds/HT1_fold_0_iter_9611_score_0.79860.pkl
111.62 MB
Models_DART_all_10corr_5folds/HT4_fold_1_iter_7781_score_0.80213.pkl
90.43 MB
Models_DART_all_10corr_5folds/HT2_fold_2_iter_11314_score_0.79750.pkl
131.31 MB

Predicting the second half...
Models_DART_all_10corr_5folds/HT4_fold_3_iter_12137_score_0.80167.pkl
140.80 MB
Models_DART_all_10corr_5folds/HT0_fold_4_iter_9372_score_0.79676.pkl
108.79 MB
Models_DART_all_10corr_5folds/HT1_fold_0_iter_9611_score_0.79860.pkl
111.62 MB
Models_DART_all_10corr_5folds/HT4_fold_1_iter_7781_score_0.80213.pkl
90.43 MB
Models_DART_all_10corr_5folds/HT2_fold_2_iter_11314_score_0.79750.pkl
131.31 MB


In [13]:
pred_df_first_half.shape, pred_df_second_half.shape

((462310, 5), (462311, 5))

In [14]:
pred_df = pd.concat([pred_df_first_half, pred_df_second_half], axis=0)
pred_df.shape

(924621, 5)

In [None]:
# sorted_columns = sorted(pred_df.columns)
# pred_df = pred_df[sorted_columns]

In [15]:
pred_df

Unnamed: 0_level_0,HT4_fold_3,HT0_fold_4,HT1_fold_0,HT4_fold_1,HT2_fold_2
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.020554,0.023140,0.017080,0.020858,0.015026
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.000900,0.000911,0.000716,0.000635,0.000744
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.030916,0.030530,0.036347,0.038460,0.030392
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.170641,0.201568,0.170963,0.244515,0.176422
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.895854,0.825375,0.885739,0.876696,0.862192
...,...,...,...,...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,0.009353,0.009429,0.006980,0.008376,0.007084
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,0.847886,0.837254,0.805792,0.874671,0.853518
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,0.419337,0.521963,0.403771,0.475066,0.485816
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,0.321460,0.216551,0.287009,0.214992,0.278124


In [17]:
pred_df.to_csv(Parameters.path + 'p_M10_14240_folds.csv')
pred_df.mean(axis = 1).to_csv(Parameters.path + 'p_M10_14240.csv', header=['prediction'])