# Import modules

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    balanced_accuracy_score,
    make_scorer,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay

# Import data
Load data from https://www.kaggle.com/datasets/shrutimechlearn/churn-modelling/data

In [3]:
# Read the Kaggle churn-modelling CSV (expected next to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


I'm going to drop the CustomerId, RowNumber and Surname columns because they are (near-)unique identifiers and irrelevant to the model.

In [5]:
# Remove the identifier columns (RowNumber, CustomerId, Surname).
# The result is reassigned instead of using inplace=True, and errors="ignore"
# lets the cell be re-run after the columns are already gone without raising
# a KeyError, so the cell is idempotent.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Show the dtype of every remaining column; object-typed columns are the
# non-numeric ones that are inspected in the following cells.
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Check the distinct values of all non-numeric columns.

In [7]:
# Distinct values of the Geography column, in order of first appearance.
pd.unique(df["Geography"])

array(['France', 'Spain', 'Germany'], dtype=object)

In [8]:
# Distinct values of the Gender column, in order of first appearance.
pd.unique(df["Gender"])

array(['Female', 'Male'], dtype=object)

In [10]:
# Preview the one-hot encoding of Geography (one indicator column per value).
df["Geography"].pipe(pd.get_dummies)

Unnamed: 0,France,Germany,Spain
0,True,False,False
1,False,False,True
2,True,False,False
3,True,False,False
4,False,False,True
...,...,...,...
9995,True,False,False
9996,True,False,False
9997,True,False,False
9998,False,True,False


# Format data

In [11]:
# Predictors: every column except the target.
X = df.drop(columns=["Exited"]).copy()

# Target: the Exited column.
y = df.loc[:, "Exited"].copy()

# One-hot encode the two object-typed columns; numeric columns pass through unchanged.
X_encoded = pd.get_dummies(data=X, columns=["Geography", "Gender"])

# Split data

In [12]:
# Share of rows with Exited == 1. The mean of a 0/1 column is exactly that
# share, and Series.mean() is vectorised, unlike the builtin sum(), which
# iterates the Series element by element in Python.
y.mean()

0.2037

In [13]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Preview the first five rows of the encoded training features.
X_train.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
2151,753,57,7,0.0,1,1,0,159475.08,True,False,False,False,True
8392,739,32,3,102128.27,1,1,0,63981.37,False,True,False,False,True
5006,755,37,0,113865.23,2,1,1,117396.25,False,True,False,True,False
4117,561,37,5,0.0,2,1,0,83093.25,True,False,False,False,True
7182,692,49,6,110540.43,2,0,1,107472.99,False,True,False,False,True


In [18]:
# Largest EstimatedSalary in the training split. Series.max() is vectorised
# and skips NaN, whereas the builtin max() iterates in Python and gives an
# order-dependent result if a NaN is present.
X_train["EstimatedSalary"].max()

199992.48