In [1]:
import pandas as pd
import numpy as np
import random
import joblib
import os
import gc

from evaluation_metric import amex_metric
from sklearn.model_selection import ParameterGrid

In [2]:
class Parameters:
    """Notebook-wide settings read by the cells below."""

    # Number of folds; the indices 0..n_folds-1 are matched against model
    # column names as 'fold_<i>'.
    n_folds = 5

    # Directory that is listed for model files and that holds validation.parquet
    # and the exported CSVs. Paths are built by plain string concatenation, so
    # the trailing slash is required.
    path = 'archive_models/Models_DART_all_10corr_5folds_validation/'

In [None]:
def predict(data, models):
    """Predict ``data`` with every stored model whose file name matches ``models``.

    Parameters
    ----------
    data
        Feature table handed unchanged to each model's ``predict`` method.
    models : iterable of str
        Name keys. A file in ``Parameters.path`` is selected when its name
        contains at least one of them (substring match).

    Returns
    -------
    pred_list : list
        One entry per selected model: the value returned by ``model.predict(data)``.
    model_list : list of str
        Paths of the selected model files, in the same order as ``pred_list``.
    """
    # The model directory also holds data files used by this notebook
    # (validation.parquet and the exported prediction CSVs, whose names contain
    # model keys such as 'HT4'). They are not joblib models and must be skipped,
    # otherwise a re-run tries to joblib.load() them.
    non_model_suffixes = ('.csv', '.parquet')

    model_list = []
    for path in [Parameters.path]:
        # os.listdir() returns names in arbitrary order; sort for a stable result.
        for fname in sorted(os.listdir(path)):
            if fname.endswith(non_model_suffixes):
                continue
            # any() adds each file once even when several keys match it.
            if any(model_name in fname for model_name in models):
                model_list.append(path + fname)

    pred_list = []
    for model_path in model_list:
        print(model_path)
        # NOTE(review): joblib.load unpickles the file and can execute arbitrary
        # code - only point Parameters.path at trusted model archives.
        model = joblib.load(model_path)
        pred_list.append(model.predict(data))

    return pred_list, model_list

In [None]:
# Correlation features to keep: when the test partitions are loaded, every other
# column whose name starts with "corr_" is dropped before predicting.
top_corr = [
    "corr_D_39-B_26", "corr_D_48-B_4",
    "corr_P_2-D_44", "corr_D_47-B_4",
    "corr_D_47-D_39", "corr_P_2-B_4",
    "corr_D_39-B_10", "corr_D_44-B_4",
    "corr_D_39-B_2", "corr_D_46-B_4",
]

In [None]:
# Read the validation table stored next to the models and split off the label
# column, so that `validation` holds only the columns passed to the models.
validation = pd.read_parquet(Parameters.path + 'validation.parquet')
validation_labels = validation.pop('target')  # returns the column and removes it in place

validation.shape

In [None]:
# Name keys for model selection: predict() picks every file in Parameters.path
# whose name contains one of these substrings.
models = ['HT']

In [None]:
print('Predicting the validation set...\n')
validation_predictions, model_list = predict(validation, models)

# Build one column per model, labelled with the first 10 characters of the
# model's file name and aligned to the validation rows.
model_names = [model_path.rsplit('/', 1)[-1][:10] for model_path in model_list]
pred_df_validation = pd.DataFrame(validation_predictions).T
pred_df_validation.columns = model_names
pred_df_validation.index = validation.index

# The feature table is not used again in the validation stage; free its memory.
del validation
_ = gc.collect()

In [None]:
# Put the prediction columns in alphabetical order of their model names.
sorted_columns = sorted(pred_df_validation.columns)
pred_df_validation = pred_df_validation.loc[:, sorted_columns]
pred_df_validation

In [None]:
# Score every model's validation predictions on its own and list the models
# from the highest to the lowest score.
fold_score_dic = {
    model_name: amex_metric(validation_labels, pred_df_validation[model_name])
    for model_name in pred_df_validation.columns
}
score_df = pd.DataFrame.from_dict(fold_score_dic, orient='index', columns=['score'])
score_df = score_df.sort_values('score', ascending=False)

score_df

In [None]:
# For each fold index, collect the model names containing 'fold_<i>', in the
# order they appear in score_df (which is sorted by descending score).
high_score_folds = {
    f'fold_{fold}': score_df.index[score_df.index.str.contains(f'fold_{fold}')].tolist()
    for fold in range(Parameters.n_folds)
}

In [None]:
# Score every combination that takes one model per fold: the combination's
# prediction is the plain mean of the selected models' validation predictions.
grid = list(ParameterGrid(high_score_folds))
len_grid = len(grid)
print(f'Number of combinations: {len_grid}')

# Report progress about every 10% of the grid. The step is clamped to 1 because
# a grid with fewer than 10 combinations would otherwise give a step of 0 and a
# modulo-by-zero error.
progress_step = max(1, len_grid // 10)

# Use the fold keys of high_score_folds rather than hard-coding fold_0..fold_4,
# so the cell follows Parameters.n_folds.
fold_keys = list(high_score_folds)

for counter, combination in enumerate(grid):
    if counter % progress_step == 0:
        print(f'{counter}', end = ', ')
    chosen_models = [combination[fold_key] for fold_key in fold_keys]
    score = amex_metric(validation_labels, pred_df_validation[chosen_models].mean(axis = 1))
    # `combination` is the dict stored in `grid`, so the score is recorded in place.
    combination['score'] = score

In [None]:
# Tabulate the scored combinations, highest score first, with a fresh 0..n-1 index.
fold_df = pd.DataFrame(grid)
fold_df = fold_df.sort_values('score', ascending = False)
fold_df = fold_df.reset_index(drop = True)
fold_df

In [None]:
# Random search for blending weights.
# For each of the best-ranked fold combinations in fold_df, draw random weight
# vectors, score the weighted mean of the chosen models' validation predictions
# and remember the best result seen over all combinations.
n_top_combinations = 5     # number of top rows of fold_df to explore
n_weight_trials = 50000    # random weight vectors tried per combination
weight_rng = np.random.default_rng(42)  # fixed seed: a re-run finds the same weights

# Every column of fold_df except 'score' names the model chosen for one fold.
fold_columns = [column for column in fold_df.columns if column != 'score']

# Start below any real score so the first trial always initialises the max_* names.
max_score = -np.inf
# fold_df may hold fewer than n_top_combinations rows; never index past its end.
for j in range(min(n_top_combinations, len(fold_df))):
    choosen_folds = fold_df[fold_columns].iloc[j].values.tolist()
    print(choosen_folds)
    # The column selection does not depend on the weights; do it once per combination.
    fold_predictions = pred_df_validation[choosen_folds]
    for i in range(n_weight_trials):
        weights = weight_rng.random(len(choosen_folds))
        weighted_prediction = fold_predictions.multiply(weights).mean(axis = 1)
        score = amex_metric(validation_labels, weighted_prediction)
        if score > max_score:
            max_score = score
            max_weights = weights
            max_prediction = weighted_prediction
            max_choosen_folds = choosen_folds
            print(f'New max score: {max_score:.5f}')

In [None]:
# NOTE(review): manual override - replaces whatever the weight search stored in
# max_choosen_folds with a fixed name key. predict() treats the key as a
# substring of the model file names.
max_choosen_folds = ['HT4']

In [None]:
# NOTE(review): hard-coded blending weights that replace any searched values.
# In the visible notebook only the commented-out weighted export refers to them.
# Five weights are listed - presumably one per fold model; confirm they match
# the selected columns before enabling that export.
max_weights = [0.9058,0.6510,0.3286,0.8022,0.0267]

In [None]:
# Release the validation-stage objects before the test data is loaded.
# The names are removed with pop() instead of `del` so the cell can be re-run
# and also works when the weight-search cell was skipped and never created
# `max_prediction` (a plain `del` raises NameError in both cases).
for _stale_name in ('pred_df_validation', 'validation_labels', 'fold_df', 'grid', 'max_prediction'):
    globals().pop(_stale_name, None)
_ = gc.collect()

In [None]:
def _predict_test_part(parquet_path, model_keys, keep_corr):
    """Predict one test partition with the selected models.

    Reads ``parquet_path``, drops every column whose name starts with "corr_"
    and is not listed in ``keep_corr``, runs ``predict`` with ``model_keys`` and
    returns a DataFrame with one column per model (labelled with the first 10
    characters of the model file name) indexed like the partition.

    The columns to drop are derived from the partition itself, so the helper
    does not depend on another partition having been loaded first.
    """
    features = pd.read_parquet(parquet_path)
    kept = set(keep_corr)
    corr_columns = features.columns[features.columns.str.startswith("corr_")]
    unused_corr = [column for column in corr_columns if column not in kept]
    features = features.drop(columns=unused_corr)

    pred_list, model_paths = predict(features, model_keys)

    part_pred_df = pd.DataFrame(pred_list).T
    part_pred_df.columns = [model_path.split('/')[-1][:10] for model_path in model_paths]
    part_pred_df.index = features.index
    return part_pred_df


# The test set is stored in two parquet partitions that are predicted one after
# the other; the feature table of a partition is released when the helper
# returns, before the next partition is read.
print('Predicting the first half...')
pred_df_first_half = _predict_test_part(
    'Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part1.parquet',
    max_choosen_folds,
    top_corr,
)
_ = gc.collect()

print('\nPredicting the second half...')
pred_df_second_half = _predict_test_part(
    'Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part2.parquet',
    max_choosen_folds,
    top_corr,
)
_ = gc.collect()

In [None]:
# Sanity check: (rows, columns) of the prediction tables of both test partitions.
pred_df_first_half.shape, pred_df_second_half.shape

In [None]:
# Stack the predictions of the two test partitions row-wise into one table.
pred_df = pd.concat((pred_df_first_half, pred_df_second_half))
pred_df.shape

In [None]:
# Keep only the prediction columns that belong to the chosen models.
# max_choosen_folds may hold full column names (as stored by the weight search)
# or shorter keys such as 'HT4' that predict() matched as substrings of the
# model file names. The columns are the first 10 characters of the file names,
# so indexing pred_df directly with such a key raises KeyError. Resolve each
# entry to the exact column when one exists, otherwise to every column whose
# name contains it.
selected_columns = []
for fold_key in max_choosen_folds:
    if fold_key in pred_df.columns:
        matches = [fold_key]
    else:
        matches = [column for column in pred_df.columns if fold_key in column]
    if not matches:
        raise KeyError(f'No prediction column matches {fold_key!r}')
    selected_columns.extend(column for column in matches if column not in selected_columns)

pred_df = pred_df[selected_columns]
pred_df

In [None]:
# Export the per-model test predictions and their unweighted row-wise mean into
# the model directory.
folds_csv_path = Parameters.path + 'p_M10_HT4_folds.csv'
blend_csv_path = Parameters.path + 'p_M10_HT4.csv'

pred_df.to_csv(folds_csv_path)

# Disabled alternative: blend weighted by max_weights instead of the plain mean.
# pred_df[max_choosen_folds].multiply(max_weights).mean(axis = 1).to_csv(Parameters.path + 'p_M10_HT4.csv', header=['prediction'])
mean_prediction = pred_df.mean(axis = 1)
mean_prediction.to_csv(blend_csv_path, header=['prediction'])