In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import roc_auc_score

In [2]:
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.2f}'.format

In [16]:
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [17]:
cat_cols = [column for column in train_df.columns if train_df[column].dtype == 'object']
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()

num_cols.remove('id')
num_cols.remove('CustomerId')
num_cols.remove('IsActiveMember')
num_cols.remove('HasCrCard')
num_cols.remove('Exited')
cat_cols.remove('Surname')
cat_cols.append('IsActiveMember')
cat_cols.append('HasCrCard')

train_df['HasCrCard'] = train_df['HasCrCard'].astype(bool)
train_df['IsActiveMember'] = train_df['IsActiveMember'].astype(bool)
test_df['HasCrCard'] = test_df['HasCrCard'].astype(bool)
test_df['IsActiveMember'] = test_df['IsActiveMember'].astype(bool)
train_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,True,False,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,True,True,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,True,False,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,True,True,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,True,True,15068.83,0


In [18]:
_num_cols = []
for col in num_cols:
    train_df[f'log_{col}'] = np.log1p(train_df[col])
    test_df[f'log_{col}'] = np.log1p(test_df[col])
    _num_cols.append(f'log_{col}')
num_cols = _num_cols

In [19]:
# Split the data into features (X) and target variable (y)
X = train_df[num_cols+cat_cols]
X_test = test_df[num_cols+cat_cols]
y = train_df['Exited']

# Initialize the CatBoost Classifier
model = CatBoostClassifier(iterations=100,  # Specify the number of boosting iterations
                           depth=6,         # Specify the depth of the trees
                           learning_rate=0.1,  # Specify the learning rate
                           cat_features=cat_cols,
                           eval_metric='AUC',
                           loss_function='Logloss')  # Specify categorical features

# Create a Pool object for CatBoost
pool = Pool(data=X, label=y, cat_features=cat_cols)

# Perform cross-validation
cv_results = cv(pool,
                model.get_params(),
                fold_count=5,  # Number of folds for cross-validation
                verbose=10)  # Set verbose level for progress

# Print cross-validation results
print("Cross-Validation AUC: {:.4f} +/- {:.4f}".format(cv_results['test-AUC-mean'].max(),
                                                       cv_results['test-AUC-std'].max()))


Training on fold [0/5]
0:	test: 0.8616146	best: 0.8616146 (0)	total: 23ms	remaining: 2.28s
10:	test: 0.8740477	best: 0.8740477 (10)	total: 286ms	remaining: 2.31s
20:	test: 0.8845013	best: 0.8845013 (20)	total: 553ms	remaining: 2.08s
30:	test: 0.8859750	best: 0.8859750 (30)	total: 787ms	remaining: 1.75s
40:	test: 0.8865106	best: 0.8865106 (40)	total: 1.01s	remaining: 1.46s
50:	test: 0.8868516	best: 0.8868516 (50)	total: 1.24s	remaining: 1.19s
60:	test: 0.8870478	best: 0.8870478 (60)	total: 1.49s	remaining: 955ms
70:	test: 0.8871576	best: 0.8871576 (70)	total: 1.7s	remaining: 694ms
80:	test: 0.8873024	best: 0.8873024 (80)	total: 1.91s	remaining: 447ms
90:	test: 0.8873682	best: 0.8873682 (90)	total: 2.13s	remaining: 210ms
99:	test: 0.8874543	best: 0.8874543 (99)	total: 2.33s	remaining: 0us

bestTest = 0.8874543088
bestIteration = 99

Training on fold [1/5]
0:	test: 0.8671335	best: 0.8671335 (0)	total: 21.3ms	remaining: 2.11s
10:	test: 0.8781564	best: 0.8781564 (10)	total: 218ms	remaining:

In [21]:
model.fit(X,y, cat_features=cat_cols)

0:	total: 10.6ms	remaining: 1.05s
1:	total: 18.1ms	remaining: 885ms
2:	total: 26.8ms	remaining: 868ms
3:	total: 34.3ms	remaining: 823ms
4:	total: 41.3ms	remaining: 785ms
5:	total: 48.6ms	remaining: 761ms
6:	total: 56.2ms	remaining: 746ms
7:	total: 63.2ms	remaining: 726ms
8:	total: 72ms	remaining: 728ms
9:	total: 78.9ms	remaining: 710ms
10:	total: 86ms	remaining: 695ms
11:	total: 92.6ms	remaining: 679ms
12:	total: 99.7ms	remaining: 667ms
13:	total: 107ms	remaining: 655ms
14:	total: 114ms	remaining: 643ms
15:	total: 121ms	remaining: 633ms
16:	total: 127ms	remaining: 622ms
17:	total: 135ms	remaining: 614ms
18:	total: 142ms	remaining: 604ms
19:	total: 149ms	remaining: 595ms
20:	total: 159ms	remaining: 597ms
21:	total: 170ms	remaining: 603ms
22:	total: 178ms	remaining: 596ms
23:	total: 186ms	remaining: 589ms
24:	total: 193ms	remaining: 580ms
25:	total: 201ms	remaining: 572ms
26:	total: 209ms	remaining: 566ms
27:	total: 218ms	remaining: 560ms
28:	total: 226ms	remaining: 554ms
29:	total: 236m

<catboost.core.CatBoostClassifier at 0x16415e290>

In [24]:
test_pool = Pool(data=X_test, cat_features=cat_cols)

test_probabilities = model.predict_proba(test_pool)

test_probabilities = [prob[1] for prob in test_probabilities]
test_probabilities

[0.023136511242585953,
 0.8360631303062893,
 0.029314981240086115,
 0.21434286073834333,
 0.37955320830835165,
 0.03919057011239596,
 0.03714749126382444,
 0.0670053180264965,
 0.6056381779950756,
 0.015015962441136985,
 0.11653588251728571,
 0.032747058956357517,
 0.01620879028530582,
 0.21825755535746436,
 0.5984975780502025,
 0.04698793242102833,
 0.07830993608464218,
 0.3678400517294848,
 0.02327314724969324,
 0.08293832479344536,
 0.023576303455874813,
 0.1032010649869941,
 0.24956219636962743,
 0.011360058286929708,
 0.49234239807250885,
 0.1293789307072648,
 0.9364968134536313,
 0.5016169735347931,
 0.16854421312831952,
 0.3872248453159526,
 0.025429616585863932,
 0.04258531104855792,
 0.1483558953910509,
 0.1584481590049076,
 0.04849196394437393,
 0.013604647516482748,
 0.7924206127644329,
 0.03038150927478064,
 0.09034399599236158,
 0.8865463532237581,
 0.5596969047492611,
 0.18490161621871118,
 0.1516565424446558,
 0.015303398238834065,
 0.3714121937507328,
 0.170927513050048

In [28]:
sub = pd.DataFrame()
sub['id'] = test_df['id']
sub['Exited'] = test_probabilities

In [30]:
sub.to_csv('submission.csv')