# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from catboost import Pool, CatBoostClassifier, cv, sum_models

%matplotlib inline

# Read csv

In [2]:
df_train = pd.read_csv('/data/train.csv')
df_test = pd.read_csv('/data/test.csv')
df_sample_s = pd.read_csv('/data/sample_submission.csv')

# Add features

In [3]:
df_train[['HasCrCard', 'IsActiveMember']] = df_train[['HasCrCard', 'IsActiveMember']].astype(int)
df_test[['HasCrCard', 'IsActiveMember']] = df_test[['HasCrCard', 'IsActiveMember']].astype(int)

In [4]:
bins = [300, 579, 669, 739, 799, 850]
labels = [0, 1, 2, 3, 4]
df_train['Cred_grp'] = pd.cut(df_train['CreditScore'], bins=bins, labels=labels)

In [5]:
bins = [0, 50000, 100000, 150000, 200000]
labels = [0, 1, 2, 3]
df_train['Salary_grp'] = pd.cut(df_train['EstimatedSalary'], bins=bins, labels=labels)

In [8]:
def get_group_age(df):
    if df['Age'] <= 21:
        return 0
    if df['Age'] > 21 and df['Age'] <= 35:
        return 1
    if df['Age'] > 35 and df['Age'] <= 60:
        return 2
    if df['Age'] > 60 and df['Age'] <= 75:
        return 3
    if df['Age'] > 75 and df['Age'] <= 90:
        return 4
    if df['Age'] > 90:
        return 5

# def get_agg

In [9]:
df_train['Age_grp'] = df_train.apply(get_group_age, axis=1)

In [10]:
df_test['Age_grp'] = df_test.apply(get_group_age, axis=1)

# Sampling data

In [11]:
cat_col_pl = ['Cred_grp', 'Salary_grp']

cat_col = ['Gender', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Geography', 'Surname', 'Age_grp']
info_col = ['Exited', 'id', 'CustomerId']
features = [f for f in df_train.columns if f not in info_col]

In [12]:
dev, test_oos = train_test_split(df_train, test_size=0.001, stratify=df_train['Exited'], random_state=55, shuffle=True)
train, valid = train_test_split(dev, test_size=0.01, stratify=dev['Exited'], random_state=55)

# Prepare pools for model

In [13]:
train_pool, valid_pool, oos_pool, test_pool = (
    Pool(
        train[features],
        label=train['Exited'],
        cat_features=cat_col,
        text_features=None,
    ),
    Pool(
        valid[features],
        label=valid['Exited'],
        cat_features=cat_col,
        text_features=None,
    ),
    Pool(
        test_oos[features],
        label=test_oos['Exited'],
        cat_features=cat_col,
        text_features=None,
    ),
     Pool(
        df_test[features],
        cat_features=cat_col,
        text_features=None,
    )
)

# Prepare model

In [14]:
cbc = CatBoostClassifier(
    iterations=1000,
    early_stopping_rounds=100,
    depth=4,
    eval_metric='AUC',
)

# Learning model

In [15]:
cbc.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.111548
0:	test: 0.8460679	best: 0.8460679 (0)	total: 250ms	remaining: 4m 9s
1:	test: 0.8518228	best: 0.8518228 (1)	total: 347ms	remaining: 2m 53s
2:	test: 0.8571644	best: 0.8571644 (2)	total: 436ms	remaining: 2m 25s
3:	test: 0.8570586	best: 0.8571644 (2)	total: 520ms	remaining: 2m 9s
4:	test: 0.8633183	best: 0.8633183 (4)	total: 599ms	remaining: 1m 59s
5:	test: 0.8638439	best: 0.8638439 (5)	total: 670ms	remaining: 1m 50s
6:	test: 0.8662883	best: 0.8662883 (6)	total: 754ms	remaining: 1m 47s
7:	test: 0.8693906	best: 0.8693906 (7)	total: 859ms	remaining: 1m 46s
8:	test: 0.8701929	best: 0.8701929 (8)	total: 949ms	remaining: 1m 44s
9:	test: 0.8697146	best: 0.8701929 (8)	total: 1.04s	remaining: 1m 42s
10:	test: 0.8697432	best: 0.8701929 (8)	total: 1.12s	remaining: 1m 40s
11:	test: 0.8697994	best: 0.8701929 (8)	total: 1.21s	remaining: 1m 39s
12:	test: 0.8709037	best: 0.8709037 (12)	total: 1.3s	remaining: 1m 38s
13:	test: 0.8727375	best: 0.8727375 (13)	total: 1.4s	remain

<catboost.core.CatBoostClassifier at 0x1fafad5a3a0>

# Check model quality

In [16]:
def get_metric(model, pool, metric):
    return metric(pool.get_label(), model.predict(pool, prediction_type="RawFormulaVal"))


def print_cb_metrics(model, train_pool, valid_pool=None, oos_pool=None, oot_pool=None, metric=None):
    
    metrics = {
        'train': None,
        'valid': None,
        'oos': None,
        'oot': None,
    }
    
    if metric is None:
        metric = {'name': 'AUC', 'func': roc_auc_score}
    
    tr_m = get_metric(model, train_pool, metric.get('func'))
    print(f"TRAIN {metric.get('name')}: [{tr_m}]")
    metrics['train'] = tr_m
    
    if valid_pool is not None:
        v_m = get_metric(model, valid_pool, metric.get('func'))
        print(f"VALID {metric.get('name')}: [{v_m}]")
        metrics['valid'] = v_m
    
    if oos_pool is not None:
        o_m = get_metric(model, oos_pool, metric.get('func'))
        print(f"OOS {metric.get('name')}: [{o_m}]")
        metrics['oos'] = o_m
        
    if oot_pool is not None:
        ot_m = get_metric(model, oot_pool, metric.get('func'))
        print(f"OOT {metric.get('name')}: [{ot_m}]")
        metrics['oot'] = ot_m
    
    return metrics

In [17]:
auc = {'name': 'AUC', 'func': roc_auc_score}

In [18]:
print_cb_metrics(model=cbc, train_pool=train_pool, valid_pool=valid_pool, oos_pool=oos_pool, oot_pool=None, metric=auc)

TRAIN AUC: [0.9040704375724191]
VALID AUC: [0.8966472819708138]
OOS AUC: [0.9006267542622566]


{'train': 0.9040704375724191,
 'valid': 0.8966472819708138,
 'oos': 0.9006267542622566,
 'oot': None}

## Check additional Information

In [29]:
cbc.get_feature_importance()

array([ 6.62166262,  2.61134041,  4.9191447 ,  4.52601347, 20.4597822 ,
        0.84038425, 10.9664171 , 31.28607846,  0.59946371, 11.5892311 ,
        2.12116493,  3.45931705])

In [30]:
cbc.feature_names_

['Surname',
 'CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Age_grp']

# Collecting submit

In [16]:
def get_norm_score(score):
    mass = []
    for i in score:
        mass.append(i[1])
    return mass

def get_sample_submission(model, pool, df):
    pred = model.predict_proba(pool)
    pred1 = get_norm_score(pred)
    df['Exited'] = pred1
    df_fin = df[['id', 'Exited']]
    return df_fin

In [17]:
df = get_sample_submission(cbc, test_pool, df_test)

# Save submit

In [18]:
df.to_csv('/conda_proj/kaggle_cup/data/my_score_v_1.2.6.csv', index=False)