In [14]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import joblib
import os
import numpy as np

import sys
sys.path.append("..")
from src.modeling import find_best_ensemble, predict_proba_in_batches

# Setup and data loading

- Load test set (during inference on Kaggle we'll have the full dataset)
- Combine with static features and filter using columns used during training

In [15]:
### Setup
# Path fragments; every file path in this notebook is assembled from these.
ROOT = ".."                # prefix of all paths (parent of the notebook folder)
EXTENSION = "parquet"      # file extension of the test tables
DATA_FOLDER = "data"       # folder under ROOT containing test data, models and outputs
MODEL_FOLDER = "models"    # sub-folder of DATA_FOLDER holding the serialized models
OUTPUT_FOLDER = "output"   # sub-folder of DATA_FOLDER holding OOF files and columns_to_keep.csv

In [16]:
# Read the base test table and parse `date_decision` into a datetime column.
test = (
    pd.read_parquet(f'{ROOT}/{DATA_FOLDER}/test/test_base.{EXTENSION}')
    .assign(date_decision=lambda frame: pd.to_datetime(frame['date_decision']))
)
test.head(2)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM
0,57543,2020-10-06,202010,92
1,57549,2020-10-06,202010,92


In [17]:
# Names of the columns to retain, read from the `cols` column of the CSV.
# Stored as a plain list so the variable has the same type here as in the
# merge cell, which reloads it under the same name.
columns_to_keep = pd.read_csv(f'{ROOT}/{DATA_FOLDER}/{OUTPUT_FOLDER}/columns_to_keep.csv')['cols'].tolist()

In [18]:
# Import static features (stored in two parquet files) and stack them into one table.
test_static_0 = pd.read_parquet(f'{ROOT}/{DATA_FOLDER}/test/test_static_0_0.{EXTENSION}')
test_static_1 = pd.read_parquet(f'{ROOT}/{DATA_FOLDER}/test/test_static_0_1.{EXTENSION}')
# ignore_index avoids carrying duplicated row labels from the two source files.
test_static_concat = pd.concat([test_static_0, test_static_1], ignore_index=True)

# Importing columns to keep
columns_to_keep = pd.read_csv(f'{ROOT}/{DATA_FOLDER}/{OUTPUT_FOLDER}/columns_to_keep.csv')['cols'].tolist()

# Left join: every case in the base table must survive the merge so that it
# receives a prediction, even when it has no static row (its features are NaN).
# validate='1:1' still raises if case_id is duplicated on either side.
test_merged = test.merge(test_static_concat, on='case_id', how='left', validate='1:1')
# Keep only the listed columns, in the listed order (KeyError if one is missing).
test_merged = test_merged.loc[:, columns_to_keep]
test_merged.head()

Unnamed: 0,case_id,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,...,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,totaldebt_9A,totalsettled_863A
0,57543,202010,92,,,7637.2,0.0,0.0,0.0,0.0,...,36.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0
1,57549,202010,92,,,902.60004,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,1.0,,0.0,0.0,,0.0,0.0
2,57551,202010,92,,,3610.2,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,,0.0,1.0,,0.0,0.0
3,57552,202010,92,,,6964.4,0.0,0.0,0.0,0.0,...,18.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0
4,57569,202010,92,,,5553.4,0.0,0.0,0.0,0.0,...,11.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0


In [19]:
# Feature matrix passed to the models: everything except the case identifier.
x = test_merged.drop(columns=['case_id'])

# Hill climbing: use OOF files to find best weights

- Load OOF files and models used during training
- Start with the best model in terms of predictive power
- Iteratively test weights until new best score is found (if any)
- Compare new best score with previous one
- If the new best score beats the previous one by more than the tolerance, keep the weight and the model

In [20]:
# Folders holding the out-of-fold (OOF) prediction files and the serialized models.
OOF_DIR = os.path.join(ROOT, DATA_FOLDER, OUTPUT_FOLDER)
MODEL_DIR = os.path.join(ROOT, DATA_FOLDER, MODEL_FOLDER)

# Sorted file names; OOF files are recognised by 'oof' appearing in the name.
OOF = np.sort( [f for f in os.listdir(OOF_DIR) if 'oof' in str(f)] )
# Only regular, non-hidden files are treated as models, so stray directories or
# dotfiles in the folder are never handed to joblib.load.
MODELS = np.sort( [f for f in os.listdir(MODEL_DIR)
                   if not f.startswith('.') and os.path.isfile(os.path.join(MODEL_DIR, f))] )

# OOF files and models are paired purely by position in these sorted lists, and
# the indices chosen on the OOF side are later used to pick the model for test
# prediction. Fail loudly instead of silently blending the wrong model.
if len(OOF) != len(MODELS):
    raise ValueError('Found {} oof files but {} models; they must match one-to-one.'.format(len(OOF), len(MODELS)))
for oof_name, model_name in zip(OOF, MODELS):
    # Naming convention seen in this project's files: 'oof_<model stem>.csv' <-> '<model stem>.pkl'.
    if os.path.splitext(oof_name)[0] != 'oof_' + os.path.splitext(model_name)[0]:
        raise ValueError('OOF file {} does not correspond to model file {}.'.format(oof_name, model_name))

OOF_CSV = [pd.read_csv(os.path.join(OOF_DIR, k)) for k in OOF]
# NOTE(security): joblib.load unpickles its input and can execute arbitrary code;
# only load model files that come from a trusted source.
MODELS_LOAD = [joblib.load(os.path.join(MODEL_DIR, k)) for k in MODELS]

print('We have %i oof files...'%len(OOF))
print('We have %i models...'%len(MODELS))
print(); print(OOF); print(MODELS)

We have 2 oof files...
We have 2 models...

['oof_lgb_1_202402201134.csv' 'oof_xb_1_202402201134.csv']
['lgb_1_202402201134.pkl' 'xb_1_202402201134.pkl']


In [21]:
# Score each OOF file on its own (ROC AUC of the `oof` column against `truth`).
# The labels of the first file become the reference `truth_values` used to score
# every blended ensemble later on.
oof_scores = []
truth_values = None
for i, file in enumerate(OOF_CSV):
    if truth_values is None:
        truth_values = file['truth']
    elif not np.array_equal(file['truth'].to_numpy(), truth_values.to_numpy()):
        # Ensembles are blended row by row, so every OOF file must describe the
        # same rows in the same order.
        raise ValueError('OOF file {} has different truth labels than OOF file 0.'.format(i))
    score = roc_auc_score(file['truth'], file['oof'])
    oof_scores.append(score)
    print('Model {} has AUC {:.4f}'.format(i, score))

Model 0 has AUC 0.7864
Model 1 has AUC 0.7802


In [22]:
# Find the best initial model
initial_best_index = np.argmax(oof_scores)
initial_best_index

0

In [23]:
# Build the first ensemble...
current_score = roc_auc_score(truth_values, OOF_CSV[initial_best_index].oof)
best_weights = [1]
best_models = [initial_best_index]
current_ensemble = OOF_CSV[initial_best_index].oof

# ...and perform iteratively the model search using find_best_ensemble function
for _ in range(len(OOF)):
    best_weight, best_model, best_score = find_best_ensemble(current_ensemble=current_ensemble,
                                                             best_models=best_models, 
                                                             oof_files=OOF,
                                                             oof_csv=OOF_CSV,
                                                             truth=truth_values)
    print()
    if best_score - current_score < 0.003: # If there are no significant improvements, stop the search
        print()
        print('No further improvements. Stopping.')
        break
    
    print()
    print('Ensemble AUC {:.4f} after adding model {} with weight {}. Increase of {:.4f}'.format(best_score, best_model,
                                                                                                best_weight, best_score - current_score))
    
    current_ensemble = best_weight * current_ensemble + (1-best_weight) * OOF_CSV[best_model].oof
    current_score = best_score
    
    best_weights.append(best_weight)
    best_models.append(best_model)
    print()

Searching for best model to add... 
0 , 1 , 

No further improvements. Stopping.


In [24]:
# Summarise the selected ensemble: model indices, blend weights and final OOF AUC.
summary_lines = (
    ('We are using models', best_models),
    ('with weights', best_weights),
)
for label, value in summary_lines:
    print(label, value)
print('and achieve ensemble AUC = %.5f' % (current_score,))

We are using models [0]
with weights [1]
and achieve ensemble AUC = 0.78637


# Test prediction

- Given the best weights and models from previous step, get the first prediction using starting model
- Iterate over the list of models and weights found previously and ensemble the predictions
- Predict in batches to avoid scoring the whole dataset in a single call (the dataset will be large during submission)

In [33]:
# Start from the predictions of the first selected model.
print('Getting initial predictions')
current_prediction = predict_proba_in_batches(MODELS_LOAD[best_models[0]], x)

# Replay the blend found on the OOF data: at each step the running prediction
# keeps `weight` and the newly added model contributes the remainder.
print('\nIterating over models and weights to ensemble')
for model, weight in zip(best_models[1:], best_weights[1:]):
    model_prediction = predict_proba_in_batches(MODELS_LOAD[model], x)
    current_prediction = weight * current_prediction + (1-weight) * model_prediction

print('\nDone')

# Peek at the first few blended predictions.
current_prediction[:10]

Getting initial predictions
Processing batch: 1/1

Iterating over models and weights to ensemble

Done


array([0.42952113, 0.53824271, 0.38161238, 0.40126   , 0.26709266,
       0.28235209, 0.33220087, 0.25758381, 0.49248294, 0.52387615])

In [34]:
# In Kaggle we'll add the predictions to the score column in the submission file!