# "The Thera bank" Churn Prediction

In this notebook I build an XGBoost model that predicts which customers are likely to churn. The metric of interest is log loss.

# Load libraries

In [10]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import ClusterCentroids

from xgboost import XGBClassifier

from sklearn.model_selection import cross_validate
from sklearn import metrics

import os
import pickle
import sys

sys.path.append(os.path.join(os.environ['PWD'],'scripts'))
from Data_Prep import Data_Prep
from utils import get_metrics_score
from utils import make_confusion_matrix

from ray.tune.suggest.hyperopt import HyperOptSearch
import ray
from ray import tune



## Load and split dataset

In [2]:
# Read the training split from <PWD>/data/train.csv.
train_csv_path = os.path.join(os.environ['PWD'], 'data/train.csv')
train = pd.read_csv(train_csv_path)

# Binary-encode the target: existing customers -> 0, attrited customers -> 1.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
y_train = train['Attrition_Flag'].map(attrition_codes).astype('int')

# Every column except the target is kept as a feature.
X_train = train.drop('Attrition_Flag', axis=1)

## Hyperparameter tuning of XGBoost

In [3]:
# Place the training features and labels in Ray's object store once; the
# returned references are resolved with ray.get() wherever the data is needed.
object_ref_X_train, object_ref_y_train = (ray.put(obj) for obj in (X_train, y_train))



In [4]:
# Log-loss scorer evaluated on predicted probabilities. With
# greater_is_better=False scikit-learn negates the loss, so a larger
# (closer to zero) score means a better model.
ll_scorer = metrics.make_scorer(
    metrics.log_loss,
    needs_proba=True,
    greater_is_better=False,
)

Hyperparameters are tuned with the Ray Tune library, using HyperOptSearch as the search algorithm.

In [5]:
def training_function(config):
    """Ray Tune trainable: cross-validate one XGBoost hyperparameter set.

    Parameters
    ----------
    config : dict
        Hyperparameters sampled by Ray Tune. ``n_estimators`` is cast to
        ``int`` and ``learning_rate`` is treated as a base-10 exponent (the
        classifier receives ``10 ** learning_rate``). Every other entry is
        forwarded to ``XGBClassifier`` unchanged.

    Reports
    -------
    ``Logloss Training`` and ``Logloss Test`` via ``tune.report``: the mean
    train/test scores of a 3-fold cross-validation scored with ``ll_scorer``.
    That scorer is created with ``greater_is_better=False``, so the reported
    values are negated log losses and larger is better.
    """
    # Work on a copy so the dict handed over by Ray Tune is never mutated.
    params = dict(config)
    params['n_estimators'] = int(params['n_estimators'])
    params['learning_rate'] = 10 ** params['learning_rate']

    # Resolve the shared training data from Ray's object store.
    X_train = ray.get(object_ref_X_train)
    y_train = ray.get(object_ref_y_train)

    # Presumably this runs in a Ray worker process, so the project's scripts
    # directory is (re-)registered here; the guard avoids duplicate entries
    # when the function is called repeatedly in the same process.
    scripts_dir = os.path.join(os.environ['PWD'], 'scripts')
    if scripts_dir not in sys.path:
        sys.path.append(scripts_dir)
    from Data_Prep import Data_Prep

    # Seed the oversampler as well as the classifier so that the
    # cross-validation scores of a given configuration are reproducible.
    estimator = Pipeline([
        ('dp', Data_Prep()),
        ('ros', RandomOverSampler(random_state=1)),
        ('xgb', XGBClassifier(random_state=1, eval_metric='logloss', use_label_encoder=False, **params)),
    ])

    cv_results = cross_validate(estimator, X_train, y_train, scoring=ll_scorer, cv=3, return_train_score=True)

    tune.report(**{
        'Logloss Training': np.mean(cv_results['train_score']),
        'Logloss Test': np.mean(cv_results['test_score']),
    })

# Search space handed to Ray Tune. "learning_rate" is sampled as a base-10
# exponent; training_function converts it with 10 ** value.
config = dict(
    n_estimators=tune.randint(80, 120),
    gamma=tune.uniform(0, 3),
    subsample=tune.quniform(0.7, 0.95, 0.01),
    colsample_bytree=tune.quniform(0.7, 0.95, 0.01),
    colsample_bylevel=tune.quniform(0.7, 0.95, 0.01),
    learning_rate=tune.quniform(-2.0, -1.0, 0.2),  # powers of 10
)

# Directory under which the experiment is stored (and looked up on resume).
ray_results_dir = os.path.join(os.environ['PWD'], 'data/ray_results')

# 'Logloss Test' is maximised: the value reported by training_function comes
# from a scorer built with greater_is_better=False, i.e. a negated log loss.
analysis = tune.run(
    training_function,
    name='XGBoost_ros_tuning',
    local_dir=ray_results_dir,
    config=config,
    search_alg=HyperOptSearch(random_state_seed=1),
    num_samples=48,
    metric='Logloss Test',
    mode="max",
    resume='AUTO',  # alternative: "ERRORED_ONLY"
    verbose=1,
)

2022-02-11 12:32:44,229	INFO tune.py:636 -- Total run time: 1.02 seconds (0.00 seconds for the tuning loop).


In [6]:
best_config = analysis.best_config
if best_config is None:
    raise RuntimeError("Tuning produced no best configuration; cannot fit the final model.")

# Copy instead of mutating the dict owned by `analysis`: editing it in place
# would apply 10 ** to the learning rate again every time this cell is re-run.
config = dict(best_config)
config['n_estimators'] = int(config['n_estimators'])      # same conversion as in training_function
config['learning_rate'] = 10 ** config['learning_rate']  # tuned value is a base-10 exponent

# Fit the best algorithm to the data. The oversampler is seeded like the
# classifier so that refitting yields the same model.
estimator = Pipeline([
    ('dp', Data_Prep()),
    ('ros', RandomOverSampler(random_state=1)),
    ('xgb', XGBClassifier(random_state=1, eval_metric='logloss', use_label_encoder=False, **config)),
])

estimator.fit(X_train, y_train)

Pipeline(steps=[('dp', Data_Prep()), ('ros', RandomOverSampler()),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=0.9400000000000001,
                               colsample_bynode=1, colsample_bytree=0.79,
                               enable_categorical=False, eval_metric='logloss',
                               gamma=1.5379798088848569, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.1, max_delta_step=0, max_depth=6,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=110,
                               n_jobs=12, num_parallel_tree=1, predictor='auto',
                               random_state=1, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=0.79,
                             

In [7]:
# Persist the fitted pipeline to <PWD>/models/xgboost_ros.pkl.
model_path = os.path.join(os.environ['PWD'], 'models/xgboost_ros.pkl')

# Create the target directory if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The context manager closes the file even if pickling raises.
with open(model_path, "wb") as pickling_on:
    pickle.dump(estimator, pickling_on)
print('estimator pickled successfully!')

estimator pickled successfully!
