In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score


import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
import kagglehub

# Fetch the latest version of the churn dataset and report where kagglehub
# stored the files.
dataset_handle = "shubhammeshram579/bank-customer-churn-prediction"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shubhammeshram579/bank-customer-churn-prediction?dataset_version_number=1...


100%|██████████| 262k/262k [00:00<00:00, 33.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/shubhammeshram579/bank-customer-churn-prediction/versions/1





In [None]:
# Read the churn CSV from the downloaded dataset directory.
file_path = "{}/Churn_Modelling.csv".format(path)
df = pd.read_csv(file_path)

In [None]:
# Remove the CustomerId and Surname columns; they are not needed downstream.
df = df.drop(columns=['CustomerId', 'Surname'])

In [None]:
# Discard rows with a missing Age or Geography, then treat a missing
# HasCrCard / IsActiveMember flag as 0.
df = df.dropna(subset=['Age', 'Geography'])
df = df.assign(
    HasCrCard=df['HasCrCard'].fillna(0),
    IsActiveMember=df['IsActiveMember'].fillna(0),
)

In [None]:
# Normalise column names: lower-case them and swap spaces for underscores.
df.columns = [col_name.lower().replace(' ', '_') for col_name in df.columns]

In [None]:
# Turn the 0/1 flag columns into 'No'/'Yes' labels.
translate_values = {0: 'No', 1: 'Yes'}
for flag_column in ('isactivemember', 'hascrcard'):
    df[flag_column] = df[flag_column].map(translate_values)

In [None]:
# Column names grouped by type.
categorical = [
    'geography',
    'gender',
    'isactivemember',
    'hascrcard',
]
numerical = [
    'creditscore',
    'age',
    'tenure',
    'balance',
    'numofproducts',
    'estimatedsalary',
]

In [None]:
# 60/20/20 split: hold out 20% for test, then 25% of the remainder for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

# Renumber the rows of each split from 0.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the `exited` target out of each frame so it is not left among the features.
y_train = df_train.pop('exited').values
y_val = df_val.pop('exited').values
y_test = df_test.pop('exited').values

In [None]:
# Vectorise only the declared model inputs. Converting whole rows would turn
# every remaining column into a feature as well, including row-index style
# columns such as `rownumber`, which carry no information about the customer.
feature_columns = categorical + numerical

# Fit the vectorizer on the training records only; sparse=False makes it
# return dense arrays.
train_dicts = df_train[feature_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

# Reuse the fitted vectorizer so validation columns line up with training.
dicts_val = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(dicts_val)

In [None]:
# Feature names reported by the fitted DictVectorizer, collected into a list.
features = [name for name in dv.get_feature_names_out()]


In [None]:
# XGBoost classifier: trees of depth 4, learning rate 0.05, log-loss as the
# evaluation metric.
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_model = xgb.XGBClassifier(max_depth=4, learning_rate=0.05, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Column 1 of predict_proba holds the probability of the positive class.
val_proba = xgb_model.predict_proba(X_val)
y_prob_xgb = val_proba[:, 1]

# Hard 0/1 labels at a 0.5 cut-off.
y_pred_xgb = (y_prob_xgb >= 0.5).astype(int)

# ROC AUC is computed from the probabilities, not the thresholded labels.
roc_auc_xgb = roc_auc_score(y_val, y_prob_xgb)

In [None]:
# Show the ROC AUC obtained on the validation split.
roc_auc_xgb

0.8635277275467149

Save Model

In [None]:
import pickle

In [None]:
# File name the fitted vectorizer and model are pickled to.
# NOTE(review): 'Mideterm' looks like a typo for 'Midterm'; renaming it also
# requires updating `model_file`, which loads the same file.
output = 'Mideterm_model.bin'
output

'Mideterm_model.bin'

In [None]:
# Persist the vectorizer and the classifier together as one pickled tuple so
# they can be restored as a pair.
model_bundle = (dv, xgb_model)
with open(output, 'wb') as bundle_file:
    pickle.dump(model_bundle, bundle_file)

In [None]:
import pickle

In [None]:
# File name the pickled (vectorizer, model) pair is read back from; it must
# match the name used when saving (`output`).
model_file = 'Mideterm_model.bin'

In [None]:
# Restore the (vectorizer, classifier) pair from disk.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
with open(model_file, 'rb') as bundle_file:
    loaded_bundle = pickle.load(bundle_file)
dv, xgb_model = loaded_bundle

In [None]:
# Display the restored vectorizer and model to confirm both were unpickled.
dv,xgb_model

(DictVectorizer(sparse=False),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.05, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, random_state=None, ...))

In [None]:
# One example customer record, in the same dict form as the training records.
customer = dict(
    age=35.0,
    balance=103502.22,
    creditscore=852,
    estimatedsalary=146191.82,
    gender='Female',
    geography='France',
    hascrcard='Yes',
    isactivemember='Yes',
    numofproducts=2,
    rownumber=252,
    tenure=5,
)

In [None]:
# Vectorise the single customer; the record is wrapped in a one-element list
# so the result is a one-row feature matrix.
X = dv.transform([customer])

In [None]:
# Predicted probability of class 1 (`exited`) for the first (only) row of X.
xgb_model.predict_proba(X)[0,1]

0.0537814