In [3]:
import sklearn.linear_model._logistic

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import catboost
import lightgbm
import xgboost
from scipy.stats import rankdata
from sklearn.metrics import f1_score, make_scorer

%matplotlib inline
pd.set_option('max.columns', 999)
pd.set_option('max.rows', 999)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [18]:
macro_f1_score = make_scorer(f1_score, average='macro')

def custom_f1_lgbm(preds, train_data):
    ## eval_name, eval_result, is_higher_better
    return 'F1', f1_score(train_data.label, preds > 0.5), False

def custom_f1_xgb(preds, train_data):
    ## eval_name, eval_result, is_higher_better
    return 'F1', f1_score(train_data.get_label(), preds > 0.5)

In [3]:
train = pd.read_csv('../data/callcenter_train.csv', index_col='id')
time_columns = [
    'Время окончания разговора с оператором',
    'Время переключения на оператора',
    'Время постановки в очередь',
    'Время окончания вызова',
    'Время начала вызова',
]

In [4]:
for col in time_columns:
    train[col] = train[col].apply(pd.Timestamp) - pd.Timestamp('00:00:00')
    train[col] = train[col].fillna(pd.Timestamp('00:00:00') - pd.Timestamp('00:00:00'))

In [5]:
for i, col_1 in enumerate(time_columns[:-1]):
    for col_2 in time_columns[i + 1:]:
        train[f'{col_1}-{col_2}'] = abs(train[col_1] - train[col_2])

In [6]:
for col in train.columns.drop(['Метка', 'Длительность разговора с оператором, сек']):
    train[f'{col}_seconds'] = train[col].apply(lambda x: x.seconds)

In [7]:
new_train = train[
    ['Метка', 'Длительность разговора с оператором, сек'] + [x for x in train.columns if 'seconds' in x]
]

In [8]:
N_SPLITS = 5
SEED = 42
valid_target = new_train[['Метка']].copy()
cv = StratifiedKFold(N_SPLITS, shuffle=True, random_state=SEED)

In [9]:
def get_thresh(label, pred):
    all_thresh = np.linspace(pred.min(), pred.max(), 100)
    results = [(th, f1_score(label, pred > th)) for th in all_thresh]
    return max(results, key=lambda x: x[1])

## CatBoost

In [11]:
models_catboost = []
valid_target['oof_prediction_catboost'] = 0

for i, (train_index, valid_index) in enumerate(cv.split(new_train.index, new_train['Метка'])):
    print(f'START {i} FOLD')
    
    ## Create train, valid data pools
    train_pool = catboost.Pool(
        new_train.loc[train_index].drop('Метка', axis=1),
        new_train.loc[train_index, 'Метка'],
        thread_count = 1,
    )
    valid_pool = catboost.Pool(
        new_train.loc[valid_index].drop('Метка', axis=1),
        new_train.loc[valid_index, 'Метка'],
        thread_count = 1,
    )
    ## train model
    model = catboost.train(
        params = {
            'loss_function':'Logloss',
            'custom_metric': 'F1',
            'verbose': False,
            'use_best_model': True,
            'random_state': SEED,
            'learning_rate': 0.03,
        },
        pool = train_pool,
        eval_set = valid_pool,
        iterations = 2000,
        early_stopping_rounds=20,
    )
    
    ## add model to list of models
    models_catboost.append(model)
    
    ## predict valid
    valid_target.loc[valid_index, 'oof_prediction_catboost'] = model.predict(
        new_train.loc[valid_index].drop('Метка', axis=1)
    )

## show val score
valid_target['oof_prediction_catboost'] = rankdata(valid_target['oof_prediction_catboost'])
valid_target['oof_prediction_catboost'] = valid_target['oof_prediction_catboost']/valid_target['oof_prediction_catboost'].max()

thresh_cat, val_score_cat = get_thresh(valid_target['Метка'], valid_target['oof_prediction_catboost'])
print(f'VALIDATION CATBOOST THRESH: {thresh_cat}')
print(f'VALIDATION CATBOOST SCORE: {val_score_cat}')

START 0 FOLD
START 1 FOLD
START 2 FOLD
START 3 FOLD
START 4 FOLD
VALIDATION CATBOOST THRESH: 0.6464761233947054
VALIDATION CATBOOST SCORE: 0.8097818784431405


### Lightgbm

In [20]:
models_lgbm = []
valid_target['oof_prediction_lightgbm'] = 0

for i, (train_index, valid_index) in enumerate(cv.split(new_train, new_train['Метка'])):
    print(f'START {i} FOLD')
    
    ## Create train, valid data sets
    train_set = lightgbm.Dataset(
        new_train.loc[train_index].drop('Метка', axis=1),
        new_train.loc[train_index, 'Метка'],
    )
    valid_set = lightgbm.Dataset(
        new_train.loc[valid_index].drop('Метка', axis=1),
        new_train.loc[valid_index, 'Метка'],
    )
    ## train model
    model = lightgbm.train(
        params = {
            'loss_function':'logloss',
            'learning_rate': 0.01,
            'random_state': SEED,
        },
        train_set = train_set,
        valid_sets = valid_set,
        feval = custom_f1_lgbm,
        num_boost_round = 2000,
        early_stopping_rounds = 20,
        verbose_eval = False,
    )
    
    ## add model to list of models
    models_lgbm.append(model)
    
    ## predict valid
    valid_target.loc[valid_index, 'oof_prediction_lightgbm'] = model.predict(
        new_train.loc[valid_index].drop('Метка', axis=1)
    )
    
## show val score
valid_target['oof_prediction_lightgbm'] = rankdata(valid_target['oof_prediction_lightgbm'])
valid_target['oof_prediction_lightgbm'] = valid_target['oof_prediction_lightgbm']/valid_target['oof_prediction_lightgbm'].max()

thresh_lgbm, val_score_lgbm = get_thresh(valid_target['Метка'], valid_target['oof_prediction_lightgbm'])
print(f'VALIDATION LIGHtGBM THRESH: {thresh_lgbm}')
print(f'VALIDATION LIGHtGBM SCORE: {val_score_lgbm}')

START 0 FOLD
START 1 FOLD
START 2 FOLD
START 3 FOLD
START 4 FOLD
VALIDATION LIGHtGBM THRESH: 0.6506003204898233
VALIDATION LIGHtGBM SCORE: 0.8070097406929679


### Xgboost

In [21]:
models_xgb = []
valid_target['oof_prediction_xgboost'] = 0

for i, (train_index, valid_index) in enumerate(cv.split(new_train, new_train['Метка'])):
    print(f'START {i} FOLD')
    
    ## Create train, valid data sets
    train_set = xgboost.DMatrix(
        new_train.loc[train_index].drop('Метка', axis=1),
        new_train.loc[train_index, 'Метка'],
    )
    valid_set = [(
        xgboost.DMatrix(
            new_train.loc[valid_index].drop('Метка', axis=1),
            new_train.loc[valid_index, 'Метка'],
        ),
        'valid'
    )]
 
    ## train model
    model = xgboost.train(
        params = {
            'loss_function':'logloss',
            'max_depth': 10,
            'random_state': SEED,
            'learning_rate': 0.01,
        },
        dtrain = train_set,
        evals = valid_set,
        feval = custom_f1_xgb,
        maximize = False,
        num_boost_round = 2000,
        early_stopping_rounds = 20,
        verbose_eval = False,
    )
    
    ## add model to list of models
    models_xgb.append(model)
    
    ## predict valid
    valid_target.loc[valid_index, 'oof_prediction_xgboost'] = model.predict(
        xgboost.DMatrix(new_train.loc[valid_index].drop('Метка', axis=1))
    )

## show val score
valid_target['oof_prediction_xgboost'] = rankdata(valid_target['oof_prediction_xgboost'])
valid_target['oof_prediction_xgboost'] = valid_target['oof_prediction_xgboost']/valid_target['oof_prediction_xgboost'].max()

thresh_xgb, val_score_xgb = get_thresh(valid_target['Метка'], valid_target['oof_prediction_xgboost'])
print(f'VALIDATION XGBOOST THRESH: {thresh_xgb}')
print(f'VALIDATION XGBOOST SCORE: {val_score_xgb}')

START 0 FOLD


  if getattr(data, 'base', None) is not None and \


START 1 FOLD
START 2 FOLD
START 3 FOLD
START 4 FOLD
VALIDATION XGBOOST THRESH: 0.6574467467009464
VALIDATION XGBOOST SCORE: 0.8037966101694916


In [22]:
all_models = ['CATBOOST', 'LIGHTGBM', 'XGBOOST']

## Validate score

In [23]:
mean_prediction = valid_target[[f'oof_prediction_{model_name.lower()}' for model_name in all_models]].mean(axis=1)

thresh, val_score = get_thresh(valid_target['Метка'], mean_prediction)

print(f'VALIDATION THRESH: {thresh}')
print(f'VALIDATION SCORE: {val_score}')

VALIDATION THRESH: 0.641025108038818
VALIDATION SCORE: 0.8071583122971511
