In [1]:
import pandas as pd
import numpy as np
import random
import joblib
import os
import gc
from sklearn.model_selection import ParameterGrid
from evaluation_metric import amex_metric

In [2]:
class Parameters:
    """Notebook-wide settings, read directly as class attributes."""

    # Number of folds; model names containing 'fold_0' .. 'fold_{n_folds - 1}'
    # are grouped per fold further down in the notebook.
    n_folds = 5
    # Directory scanned for saved model files; it also holds 'validation.parquet'.
    # The trailing slash matters because file names are appended by plain
    # string concatenation.
    path = 'Models_all_56corr_5folds_validation42/'

In [3]:
def predict(data, models):
    """Run every matching saved model found in ``Parameters.path`` on ``data``.

    A file is selected when at least one entry of ``models`` occurs as a
    substring of its file name, so a short tag such as ``'HT'`` selects every
    file containing it while a longer name narrows the match.

    Parameters
    ----------
    data
        Passed unchanged to each loaded model's ``predict`` method.
    models : iterable of str
        Substrings to look for in the file names of ``Parameters.path``.

    Returns
    -------
    pred_list : list
        One ``model.predict(data)`` result per selected file.
    model_list : list of str
        Paths of the selected files, in the same order as ``pred_list``.
    """
    # Materialise once so a one-shot iterable is not exhausted by the first file.
    models = tuple(models)

    # os.listdir returns entries in arbitrary order. Sorting makes repeated
    # calls return the models in the same order, which callers rely on when
    # they label predictions from one call with names obtained from another.
    # `any` keeps each file once even if several entries of `models` match it.
    model_list = [
        os.path.join(Parameters.path, fname)
        for fname in sorted(os.listdir(Parameters.path))
        if any(model_name in fname for model_name in models)
    ]

    pred_list = []
    for model_path in model_list:
        print(model_path)
        model = joblib.load(model_path)
        pred_list.append(model.predict(data))

    return pred_list, model_list

In [4]:
# Whitelist of 56 `corr_*` feature columns. When the test parquet files are
# loaded later in this notebook, every other column whose name starts with
# "corr_" is dropped, so only these correlation features are passed to the
# models.
top_corr = [
    "corr_D_39-B_26",
    "corr_D_48-B_4",
    "corr_P_2-D_44",
    "corr_D_47-B_4",
    "corr_D_47-D_39",
    "corr_P_2-B_4",
    "corr_D_39-B_10",
    "corr_D_44-B_4",
    "corr_D_39-B_2",
    "corr_D_46-B_4",
    "corr_D_48-B_3",
    "corr_D_48-B_9",
    "corr_S_5-S_24",
    "corr_S_7-S_3",
    "corr_D_43-D_144",
    "corr_D_48-D_39",
    "corr_P_3-D_46",
    "corr_S_5-D_43",
    "corr_R_1-B_4",
    "corr_P_3-D_47",
    "corr_D_39-B_3",
    "corr_R_6-D_39",
    "corr_S_27-B_2",
    "corr_S_23-D_43",
    "corr_R_6-D_69",
    "corr_P_2-D_48",
    "corr_S_25-B_4",
    "corr_D_43-B_4",
    "corr_R_27-D_69",
    "corr_S_7-S_27",
    "corr_D_39-B_11",
    "corr_S_3-D_39",
    "corr_S_12-B_4",
    "corr_D_39-B_15",
    "corr_R_27-B_26",
    "corr_S_23-D_39",
    "corr_R_27-R_1",
    "corr_R_1-D_39",
    "corr_S_19-D_39",
    "corr_S_27-B_3",
    "corr_S_16-D_39",
    "corr_R_27-B_5",
    "corr_S_3-D_62",
    "corr_D_71-D_62",
    "corr_R_27-D_39",
    "corr_D_48-D_43",
    "corr_D_61-B_36",
    "corr_S_25-D_39",
    "corr_R_6-D_43",
    "corr_S_27-R_27",
    "corr_S_27-S_12",
    "corr_S_27-D_39",
    "corr_D_46-B_3",
    "corr_D_62-D_47",
    "corr_B_4-B_3",
    "corr_R_1-D_48",
]

In [None]:
# Load the validation frame stored alongside the models and split off the
# 'target' column: the labels go to `validation_labels`, the remaining
# columns stay in `validation`.
validation = pd.read_parquet(Parameters.path + 'validation.parquet')
validation_labels = validation.pop('target')

validation.shape

In [None]:
print('Predicting the validation set...\n')

# `predict` matches these entries as substrings of the model file names, so
# 'HT' selects every saved file whose name contains it.
models = ['HT']
pred_df_validation, model_list = predict(data=validation, models=models)

In [None]:
# Shorten every model path to the first three '_'-separated tokens of its
# file name, e.g. '<dir>/HT5_fold_3_iter_13280_score_0.79846.pkl' -> 'HT5_fold_3'.
model_list_modified = [
    '_'.join(model_path.split('/')[-1].split('_')[0:3])
    for model_path in model_list
]

In [None]:
# Turn the list of per-model prediction arrays into a frame: after the
# transpose each column holds one model's predictions and each row lines up
# with a row of `validation`.
# NOTE: this cell rebinds `pred_df_validation` (list -> DataFrame) and deletes
# `validation`, so it cannot be re-run on its own.
pred_df_validation = pd.DataFrame(pred_df_validation).transpose()
pred_df_validation.index = validation.index
pred_df_validation.columns = model_list_modified

# The feature frame is no longer needed; release it before the search below.
del validation
_ = gc.collect()

In [None]:
# Score each model's validation predictions on its own, then rank the models
# from best to worst.
fold_score_dic = {
    model_name: amex_metric(validation_labels, pred_df_validation[model_name])
    for model_name in pred_df_validation.columns
}
score_df = (
    pd.DataFrame.from_dict(fold_score_dic, orient='index', columns=['score'])
    .sort_values('score', ascending=False)
)

score_df

In [None]:
# Group the ranked model names by fold, keeping the best-first order of
# `score_df` inside each group.
# The negative look-ahead `(?!\d)` stops 'fold_1' from also matching
# 'fold_10', 'fold_11', ... when `Parameters.n_folds` is larger than 10; for
# single-digit fold counts the result is the same as a plain substring match.
high_score_folds = {
    f'fold_{i}': score_df.index[score_df.index.str.contains(rf'fold_{i}(?!\d)')].tolist()
    for i in range(Parameters.n_folds)
}
high_score_folds

In [None]:
# Size of the search space: one model is picked per fold, so the number of
# combinations is the product of the per-fold candidate counts.
total_combinations = 1
for fold_idx in range(Parameters.n_folds):
    fold_candidates = high_score_folds[f'fold_{fold_idx}']
    total_combinations = total_combinations * len(fold_candidates)
total_combinations

In [None]:
# Exhaustive search over every way of picking one model per fold. Each
# combination is scored by averaging the chosen models' validation
# predictions row-wise and evaluating that blend with `amex_metric`.
#
# `max_params` is initialised up front so the summary below cannot raise a
# NameError when the grid is empty, and `max_score` starts at -inf so the
# first scored combination is always recorded.
max_score = float('-inf')
max_params = None

# `counter` is the 1-based position of the combination within the grid.
for counter, combination in enumerate(ParameterGrid(high_score_folds), start=1):
    list_folds = list(combination.values())
    score = amex_metric(validation_labels, pred_df_validation[list_folds].mean(axis=1))

    # Only report combinations that improve on the best score so far.
    if score > max_score:
        max_score = score
        max_params = list_folds
        print(counter, f':{max_score:.7f}', end=', ')

if max_params is None:
    print('\n\nNo model combination could be scored.')
else:
    print(f'\n\nMax score: {max_score:.7f}, Max params: {max_params}')

In [None]:
# Max score: 0.8094517, Max params: ['HT11_fold_0', 'HT10_fold_1', 'HT21_fold_2', 'HT5_fold_3', 'HT2_fold_4']

In [5]:
# Best one-model-per-fold combination reported by the validation search,
# hard-coded so the test-set cells below can run without repeating it.
max_params = [
    'HT11_fold_0',
    'HT10_fold_1',
    'HT21_fold_2',
    'HT5_fold_3',
    'HT2_fold_4',
]

In [6]:
def _short_model_names(model_paths):
    """Reduce each model path to the first three '_'-separated tokens of its file name."""
    return ['_'.join(os.path.basename(model_path).split('_')[0:3]) for model_path in model_paths]


def _predictions_frame(pred_list, column_names, index):
    """Build a frame with one column per model prediction and the given row index."""
    frame = pd.DataFrame(pred_list).T
    frame.columns = column_names
    frame.index = index
    return frame


# The test set is stored in two parquet parts that are predicted one after
# the other; each feature frame is deleted as soon as its predictions exist.

# ---- first half ----
test_first_half = pd.read_parquet('Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part1.parquet')
# Keep only the whitelisted correlation features: every other 'corr_*' column is dropped.
corr_col = test_first_half.columns[test_first_half.columns.str.startswith("corr_")].to_list()
corr_to_remove = set(corr_col).difference(set(top_corr))
test_first_half.drop(corr_to_remove, axis=1, inplace=True)

print('Predicting the first half...')
pred_list_first_half, model_list_first_half = predict(test_first_half, max_params)

model_list_modified_first_half = _short_model_names(model_list_first_half)
pred_df_first_half = _predictions_frame(
    pred_list_first_half, model_list_modified_first_half, test_first_half.index
)

del test_first_half
_ = gc.collect()

# ---- second half ----
test_second_half = pd.read_parquet('Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part2.parquet')
# Same column set as computed from the first half is removed here.
test_second_half.drop(corr_to_remove, axis=1, inplace=True)
print('\nPredicting the second half...')
pred_list_second_half, model_list_second_half = predict(test_second_half, max_params)

# Label the columns from THIS call's model list rather than reusing the
# first half's names, so every column is named after the model that actually
# produced it even if the two calls return the models in a different order.
model_list_modified_second_half = _short_model_names(model_list_second_half)
pred_df_second_half = _predictions_frame(
    pred_list_second_half, model_list_modified_second_half, test_second_half.index
)

del test_second_half
_ = gc.collect()

Predicting the first half...
Models_all_56corr_5folds_validation42/HT5_fold_3_iter_13280_score_0.79846.pkl
Models_all_56corr_5folds_validation42/HT10_fold_1_iter_9878_score_0.79968.pkl
Models_all_56corr_5folds_validation42/HT21_fold_2_iter_14252_score_0.79831.pkl
Models_all_56corr_5folds_validation42/HT11_fold_0_iter_15517_score_0.79775.pkl
Models_all_56corr_5folds_validation42/HT2_fold_4_iter_13027_score_0.79944.pkl

Predicting the second half...
Models_all_56corr_5folds_validation42/HT5_fold_3_iter_13280_score_0.79846.pkl
Models_all_56corr_5folds_validation42/HT10_fold_1_iter_9878_score_0.79968.pkl
Models_all_56corr_5folds_validation42/HT21_fold_2_iter_14252_score_0.79831.pkl
Models_all_56corr_5folds_validation42/HT11_fold_0_iter_15517_score_0.79775.pkl
Models_all_56corr_5folds_validation42/HT2_fold_4_iter_13027_score_0.79944.pkl


In [7]:
# (rows, columns) of the two per-half prediction frames, shown as a pair.
(pred_df_first_half.shape, pred_df_second_half.shape)

((462310, 5), (462311, 5))

In [8]:
# Stack the predictions of the two test parts row-wise into one frame.
pred_df = pd.concat([pred_df_first_half, pred_df_second_half], axis='index')
pred_df.shape

(924621, 5)

In [9]:
# Average the per-model predictions of every row and write them to CSV as a
# single 'prediction' column (the frame's index is written as the first column).
# NOTE: the file name embeds the list repr of `max_params` (brackets, quotes
# and spaces included); it is kept unchanged so existing file references stay valid.
# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs('My_Predictions', exist_ok=True)
pred_df.mean(axis=1).to_csv('My_Predictions/' + f'p_M12_{max_params}.csv', header=['prediction'])