In [1]:
import pandas as pd
import joblib
import os
import gc

In [2]:
def predict(data, models, model_dirs=('models_DART_all_SEED42/',)):
    """Score ``data`` with every saved model whose file name matches ``models``.

    Parameters
    ----------
    data : object
        Feature matrix passed unchanged to each loaded model's ``predict``.
    models : iterable of str
        Name fragments. A file is selected when its name contains at least
        one of them (plain substring match).
    model_dirs : str or iterable of str, optional
        Directory (or directories) scanned for serialized models. Defaults to
        the single directory this notebook has always used.

    Returns
    -------
    pred_list : list
        One ``model.predict(data)`` result per selected file.
    model_list : list of str
        Paths of the selected files, in the same order as ``pred_list``.
    """
    if isinstance(model_dirs, str):
        # A bare string would otherwise be iterated character by character.
        model_dirs = [model_dirs]
    models = list(models)  # allow one-shot iterables; they are re-scanned per file

    model_list = []
    for path in model_dirs:
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # returned fold order reproducible across machines and runs.
        for fname in sorted(os.listdir(path)):
            # any() selects a file once even when several fragments match it,
            # so the same model is never loaded and scored twice.
            if any(model_name in fname for model_name in models):
                model_list.append(os.path.join(path, fname))

    pred_list = []
    for model_path in model_list:
        print(model_path)
        print('{:.02f} MB'.format(os.path.getsize(model_path)/1000000))
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only point model_dirs at trusted directories.
        model = joblib.load(model_path)
        pred_list.append(model.predict(data))
        # Drop the reference now so the previous model is not kept alive
        # while the next one is being deserialized.
        del model

    return pred_list, model_list

In [3]:
# Read the two parquet parts holding the pre-computed test features.
test_first_half, test_second_half = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part1.parquet',
        'Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part2.parquet',
    )
)

# Every column whose name carries the 'corr_' prefix (taken from the first part).
corr_col = [col for col in test_first_half.columns if col.startswith('corr_')]

# Keep-list of pairwise correlation features (named `corr_<A>-<B>`).
# Any other `corr_*` column is dropped from both test halves later in this cell.
# NOTE(review): presumably this has to match the feature set the saved models
# were trained on — confirm against the training notebook.
top_corr = [
    "corr_D_39-B_26",
    "corr_D_48-B_4",
    "corr_P_2-D_44",
    "corr_D_47-B_4",
    "corr_D_47-D_39",
    "corr_P_2-B_4",
    "corr_D_39-B_10",
    "corr_D_44-B_4",
    "corr_D_39-B_2",
    "corr_D_46-B_4",
    "corr_D_48-B_3",
    "corr_D_48-B_9",
    "corr_S_5-S_24",
    "corr_S_7-S_3",
    "corr_D_43-D_144",
    "corr_D_48-D_39",
    "corr_P_3-D_46",
    "corr_S_5-D_43",
    "corr_R_1-B_4",
    "corr_P_3-D_47",
    "corr_D_39-B_3",
    "corr_R_6-D_39",
    "corr_S_27-B_2",
    "corr_S_23-D_43",
    "corr_R_6-D_69",
    "corr_P_2-D_48",
    "corr_S_25-B_4",
    "corr_D_43-B_4",
    "corr_R_27-D_69",
    "corr_S_7-S_27",
    "corr_D_39-B_11",
    "corr_S_3-D_39",
    "corr_S_12-B_4",
    "corr_D_39-B_15",
    "corr_R_27-B_26",
    "corr_S_23-D_39",
    "corr_R_27-R_1",
    "corr_R_1-D_39",
    "corr_S_19-D_39",
    "corr_S_27-B_3",
    "corr_S_16-D_39",
    "corr_R_27-B_5",
    "corr_S_3-D_62",
    "corr_D_71-D_62",
    "corr_R_27-D_39",
    "corr_D_48-D_43",
    "corr_D_61-B_36",
    "corr_S_25-D_39",
    "corr_R_6-D_43",
    "corr_S_27-R_27",
    "corr_S_27-S_12",
    "corr_S_27-D_39",
    "corr_D_46-B_3",
    "corr_D_62-D_47",
    "corr_B_4-B_3",
    "corr_R_1-D_48",
]

# Correlation columns that are not on the keep-list are removed from both halves.
corr_to_remove = set(corr_col) - set(top_corr)
test_first_half = test_first_half.drop(columns=list(corr_to_remove))
test_second_half = test_second_half.drop(columns=list(corr_to_remove))

# Name fragments handed to predict(); a model file is used when its name contains one.
models = ['HT3']

In [4]:
# Score the two test halves one after the other.
# predict() loads every matching model file from disk on each call, so each
# model is deserialized twice here (the printed log repeats the same files).
pred_list_first_half, model_list_first_half = predict(test_first_half, models)
pred_list_second_half, model_list_second_half = predict(test_second_half, models)

models_DART_all_SEED42/HT3_fold_0_iter_12986_score_0.80539.pkl
150.49 MB
models_DART_all_SEED42/HT3_fold_4_iter_12183_score_0.80044.pkl
140.70 MB
models_DART_all_SEED42/HT3_fold_1_iter_9524_score_0.79610.pkl
110.39 MB
models_DART_all_SEED42/HT3_fold_3_iter_9260_score_0.79598.pkl
107.67 MB
models_DART_all_SEED42/HT3_fold_2_iter_15079_score_0.80073.pkl
174.41 MB
models_DART_all_SEED42/HT3_fold_0_iter_12986_score_0.80539.pkl
150.49 MB
models_DART_all_SEED42/HT3_fold_4_iter_12183_score_0.80044.pkl
140.70 MB
models_DART_all_SEED42/HT3_fold_1_iter_9524_score_0.79610.pkl
110.39 MB
models_DART_all_SEED42/HT3_fold_3_iter_9260_score_0.79598.pkl
107.67 MB
models_DART_all_SEED42/HT3_fold_2_iter_15079_score_0.80073.pkl
174.41 MB


In [5]:
def _fold_label(model_path):
    """Return the column label for a saved model file, e.g. ``'HT3_fold_0'``.

    File names look like ``<model>_fold_<k>_iter_<n>_score_<s>.pkl``; the label
    is everything before ``'_iter_'``. The previous fixed ``[:10]`` slice was
    only right for names exactly as long as ``'HT3_fold_0'`` and would truncate
    or over-read any other model name. Names without ``'_iter_'`` keep the old
    10-character prefix so existing behaviour is unchanged for them.
    """
    fname = model_path.split('/')[-1]
    stem, sep, _rest = fname.partition('_iter_')
    return stem if sep else fname[:10]


def _build_pred_df(pred_list, column_names, index):
    """Assemble one prediction column per model, labelled and indexed.

    The frame is built column by column rather than as
    ``pd.DataFrame(pred_list).T``, which first creates a models x rows frame
    (one column per test row) only to transpose it.
    Labels and index are assigned positionally, exactly as before.
    """
    pred_df = pd.DataFrame(dict(enumerate(pred_list)))
    pred_df.columns = column_names
    pred_df.index = index
    return pred_df


model_names = [_fold_label(model) for model in model_list_first_half]
pred_df_first_half = _build_pred_df(pred_list_first_half, model_names, test_first_half.index)

# The feature frame is no longer needed once its index has been copied over.
del test_first_half
_ = gc.collect()

model_names = [_fold_label(model) for model in model_list_second_half]
pred_df_second_half = _build_pred_df(pred_list_second_half, model_names, test_second_half.index)

del test_second_half
_ = gc.collect()

In [6]:
# Sanity check: each frame is (rows in that half, one column per loaded model).
pred_df_first_half.shape, pred_df_second_half.shape

((462310, 5), (462311, 5))

In [8]:
# Stack the two halves row-wise; columns are aligned by label.
pred_df = pd.concat([pred_df_first_half, pred_df_second_half], axis=0)

In [10]:
# Order the model columns lexicographically so the layout does not depend on
# the order in which the model files happened to be listed.
sorted_columns = list(pred_df.columns)
sorted_columns.sort()
pred_df = pred_df.loc[:, sorted_columns]

In [11]:
# Show the combined per-model predictions (pandas elides the middle rows).
pred_df

Unnamed: 0_level_0,HT3_fold_0,HT3_fold_1,HT3_fold_2,HT3_fold_3,HT3_fold_4
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.017995,0.021148,0.023785,0.025397,0.020555
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.000630,0.000886,0.000679,0.000748,0.000534
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.035274,0.047941,0.028546,0.029985,0.043076
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.198812,0.194224,0.178718,0.209978,0.201378
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.852665,0.874891,0.854362,0.859622,0.900359
...,...,...,...,...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,0.006854,0.006626,0.007046,0.006909,0.008667
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,0.840407,0.792118,0.837194,0.827679,0.841303
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,0.500356,0.457198,0.480847,0.517268,0.463822
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,0.230169,0.291113,0.248406,0.242624,0.235743


In [15]:
# Save the individual model predictions, one column per model file.
pred_df.to_csv('models_DART_all_SEED42/p_M7_folds.csv')

# Save the ensemble output: row-wise mean over all columns, written as a
# single 'prediction' column next to the index.
pred_df.mean(axis=1).to_frame('prediction').to_csv('models_DART_all_SEED42/p_M7.csv')