In [1]:
import os
# Move the working directory one level up so that the relative paths used by
# later cells (e.g. the `.env` file) resolve against the parent directory.
#
# A bare os.chdir('..') is relative to the *current* directory, so re-running
# this cell in the same kernel would climb one level further each time.
# The starting directory is remembered on first execution and the chdir is
# skipped on every subsequent run, which makes the cell idempotent.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
    os.chdir('..')

In [2]:
# Load the `dotenv` IPython extension and read the `.env` file from the
# current working directory; later cells read DATA_DIR and RESULTS_DIR from
# os.environ.
# NOTE(review): `-o` presumably makes values from `.env` override variables
# that are already set in the environment — confirm against the dotenv docs.
%load_ext dotenv
%dotenv -o .env

In [3]:
# Both directories are mandatory: a missing variable raises KeyError here
# rather than failing later with an obscure path error.
DATA_DIR, RESULTS_DIR = (
    os.environ[variable] for variable in ('DATA_DIR', 'RESULTS_DIR')
)

### Train the model

In [4]:
import os
import json
import datetime
import argparse
import lightgbm as lgb
from ta.preprocess import get_data
from ta.training import get_feature_names, get_loss_fct_weights
from typing import Dict

# Input / output directories, taken from the environment.
DATA_DIR, RESULTS_DIR = os.environ['DATA_DIR'], os.environ['RESULTS_DIR']
data_dir, results_dir = DATA_DIR, RESULTS_DIR

# Time boundaries for the chronological split on the `ts` column:
#   ts < t_train            -> training rows
#   t_train <= ts <= t_val  -> validation rows
#   ts > t_val              -> test rows
t_train = datetime.datetime(2017, 4, 29)
t_val = datetime.datetime(2017, 5, 1)

# Parameters handed to lgb.train.
lgb_parameters = dict(
    # objective and evaluation
    boosting_type='gbdt',
    objective='binary',
    metric=['binary', 'auc'],
    first_metric_only=True,

    # number of boosting rounds and tree size
    num_iterations=500,
    num_leaves=11,
    min_data_in_leaf=30,

    # step size and row / column subsampling
    learning_rate=0.03,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=1,

    # L2 regularisation
    lambda_l2=0.1,
)

In [20]:
# Make sure the output directory exists (no error if it is already there).
os.makedirs(results_dir, exist_ok=True)

# Load the prepared table. get_data also returns two further objects
# (`le`, `le_countries` — presumably fitted label encoders; confirm in
# ta.preprocess). They are not used again in the cells shown here.
data, le, le_countries = get_data(
    data_dir=data_dir, load_data=True, save_data=False,
    countries_overwrite=False, nrows=None, google_api_key=None)

contiuous_columns, categorical_columns, target_col, data_info = get_feature_names(data)

# Chronological split on `ts`. `between` is inclusive on both ends, so every
# row lands in exactly one of the three masks.
train_cond = data['ts'] < t_train
val_cond = data['ts'].between(t_train, t_val)
test_cond = data['ts'] > t_val

# Per-row loss weights, computed separately for each split.
weights_train, weights_val, weights_test = (
    get_loss_fct_weights(data, cond, target_col)
    for cond in (train_cond, val_cond, test_cond)
)

feature_columns = contiuous_columns + categorical_columns


def _make_lgb_dataset(row_mask, sample_weights):
    """Wrap the rows selected by `row_mask` in an lgb.Dataset.

    Features are the continuous columns followed by the categorical ones,
    the label is `target_col`, and `sample_weights` are the per-row weights.
    The raw frame is kept (free_raw_data=False) so that `.data` can be fed
    back into `predict` later on.
    """
    return lgb.Dataset(
        data=data.loc[row_mask, feature_columns],
        label=data.loc[row_mask, target_col],
        weight=sample_weights,
        categorical_feature=categorical_columns,
        free_raw_data=False)


lgb_train = _make_lgb_dataset(train_cond, weights_train)
lgb_val = _make_lgb_dataset(val_cond, weights_val)
lgb_test = _make_lgb_dataset(test_cond, weights_test)

In [21]:
# Filled in by lgb.train with the per-iteration metric values of every
# evaluation set.
evals_result = {}

# Train with early stopping: both the training and the validation set are
# evaluated, a progress line is printed every 20 rounds, and training stops
# once the validation score has not improved for 50 rounds.
# NOTE(review): `early_stopping_rounds`, `verbose_eval` and `evals_result`
# were removed from lgb.train in LightGBM 4.0 in favour of callbacks —
# confirm the pinned LightGBM version before upgrading.
gbm = lgb.train(
    lgb_parameters,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    early_stopping_rounds=50,
    evals_result=evals_result,
    verbose_eval=20,
)




Training until validation scores don't improve for 50 rounds
[20]	train's binary_logloss: 0.434533	train's auc: 0.970976	val's binary_logloss: 0.420851	val's auc: 0.981318
[40]	train's binary_logloss: 0.314691	train's auc: 0.973819	val's binary_logloss: 0.296052	val's auc: 0.981814
[60]	train's binary_logloss: 0.2525	train's auc: 0.975719	val's binary_logloss: 0.232756	val's auc: 0.981837
[80]	train's binary_logloss: 0.219522	train's auc: 0.977789	val's binary_logloss: 0.204058	val's auc: 0.9826
[100]	train's binary_logloss: 0.200447	train's auc: 0.978905	val's binary_logloss: 0.189634	val's auc: 0.983181
[120]	train's binary_logloss: 0.186004	train's auc: 0.980118	val's binary_logloss: 0.18228	val's auc: 0.983089
[140]	train's binary_logloss: 0.174246	train's auc: 0.981269	val's binary_logloss: 0.178358	val's auc: 0.983075
[160]	train's binary_logloss: 0.165501	train's auc: 0.982248	val's binary_logloss: 0.17622	val's auc: 0.9831
[180]	train's binary_logloss: 0.158544	train's auc: 0.9

In [22]:
# Ground-truth labels as stored on each Dataset. Only the test labels are
# unwrapped with `.values`; train / val keep whatever type the Dataset holds.
y_train, y_val = lgb_train.label, lgb_val.label
y_test = lgb_test.label.values

# Scores from the trained booster on the raw feature frames kept by each Dataset.
y_hat_train, y_hat_val, y_hat_test = (
    gbm.predict(dataset.data) for dataset in (lgb_train, lgb_val, lgb_test)
)

In [23]:
from sklearn.metrics import auc, roc_curve, confusion_matrix

In [24]:
# AUC per split, printed in the order train, validation, test.
# The ROC curve is computed without sample weights.
for labels, scores in (
    (lgb_train.label, y_hat_train),
    (lgb_val.label, y_hat_val),
    (lgb_test.label, y_hat_test),
):
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    print(auc(fpr, tpr))

0.9823804591207745
0.9830152623262862
0.9807941603807236


In [33]:
# Confusion matrix per split (train, validation, test), laid out as
#   TN  FP
#   FN  TP
# A row is predicted positive when its score is strictly above `threshold`.
threshold = 0.6

for y_true, y_score in (
    (y_train, y_hat_train),
    (y_val, y_hat_val),
    (y_test, y_hat_test),
):
    display(confusion_matrix(y_true, (y_score > threshold) * 1))

array([[26877,  1564],
       [   80,  1262]])

array([[11012,   380],
       [   29,   275]])

array([[4051,  206],
       [  10,  152]])

In [34]:
# y_train.sum(), (y_hat_train>0.6).sum()