<a href="https://colab.research.google.com/github/Mohanandu/Data-Analytics-on-FIFA/blob/main/bank_customer_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Auto-generated data-source cell: run it to fetch the Kaggle dataset; it may
# be deleted afterwards.
# This environment is not Kaggle's Python image, so libraries the notebook
# relies on may be missing here.
import kagglehub

dataset_handle = 'shantanudhakadd/bank-customer-churn-prediction'
shantanudhakadd_bank_customer_churn_prediction_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

In [None]:
# Seed reused by the train/test split and by the estimators further down.
random_state = 42

# Data preprocessing

In [None]:
# Locate the dataset directory. Kaggle kernels mount it under /kaggle/input;
# anywhere else (e.g. Colab) fall back to the directory kagglehub downloaded
# it to in the first cell, so the notebook runs in both environments.
kaggle_input_dir = Path('/kaggle/input/bank-customer-churn-prediction')
data_dir = (
    kaggle_input_dir
    if kaggle_input_dir.exists()
    else Path(shantanudhakadd_bank_customer_churn_prediction_path)
)
# NOTE(review): assumes the kagglehub download keeps the same file name as the
# Kaggle mount — confirm.
df = pd.read_csv(data_dir / 'Churn_Modelling.csv')
df.head()

In [None]:
# Row number, surname and customer id are dropped before any modelling.
identifier_columns = ['RowNumber', 'Surname', 'CustomerId']
df = df.drop(columns=identifier_columns)
df.describe()

In [None]:
# Count missing values per column; per the author's note there are none.
df.isnull().sum()

In [None]:
# Class balance of the target column.
exited = df['Exited'].value_counts()
# Build the slice labels from the counted values themselves rather than
# relying on the order in which value_counts() happens to return them.
# NOTE(review): assumes `Exited` is encoded as 0/1 — confirm.
labels = exited.index.map({0: 'False', 1: 'True'})
plt.pie(exited, labels=labels, autopct='%1.1f%%')
plt.title('Share of customers who exited')
plt.show()

In [None]:
# Number of customers per gender.
df['Gender'].value_counts()

In [None]:
# One-hot encode the remaining categorical columns as int64 indicator columns.
df = pd.get_dummies(data=df, dtype=np.int64)
df.head()

In [None]:
# Replace the two monetary columns with a single balance-to-salary ratio.
df = df.assign(
    BalanceSalaryRatio=df['Balance'] / df['EstimatedSalary']
).drop(columns=['Balance', 'EstimatedSalary'])

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_mat = df.corr()
plt.figure(figsize=(10, 5))
sns.set(font_scale=0.7)
heatmap_ax = sns.heatmap(corr_mat, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation matrix')
plt.show()

In [None]:
# Separate the feature matrix from the target column.
X = df.drop(columns='Exited')
Y = df['Exited']

In [None]:
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training
# rows, leaking test-set statistics into training.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=random_state
)

# Learn the scaling statistics on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Estimators

## Random forest

In [None]:
# Exhaustive search over forest size and tree-shape settings. No `scoring`
# argument is given, so candidates are ranked by the estimator's own `score`.
param_grid = dict(
    n_estimators=[100, 200, 1000],
    max_depth=[10, 15, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5, 10],
    class_weight=['balanced'],
    random_state=[random_state],
)
random_forest_sklearn = RandomForestClassifier()

grid_search = GridSearchCV(
    estimator=random_forest_sklearn,
    param_grid=param_grid,
    n_jobs=4,
)
grid_search.fit(x_train, y_train)

In [None]:
# Hyper-parameter combination selected by the random-forest grid search.
best_params = grid_search.best_params_
best_params

In [None]:
# Hard class predictions, reused by the accuracy/recall cells below.
y_pred = grid_search.predict(x_test)
# ROC AUC is a ranking metric: it needs a continuous score per sample. Passing
# hard 0/1 labels collapses the curve to a single threshold, so score with the
# predicted probability of the positive class instead.
y_score = grid_search.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, y_score)

In [None]:
# Fraction of test rows whose predicted label equals the true label
# (uses the `y_pred` assigned in the preceding prediction cell).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Fraction of actual positives that were predicted positive
# (uses the `y_pred` assigned in the preceding prediction cell).
recall_score(y_true=y_test, y_pred=y_pred)

### What if we restrict the search to 200 estimators only?


In [None]:
# Same random-forest search as before, with the forest size pinned to 200 trees.
param_grid = dict(
    n_estimators=[200],
    max_depth=[10, 15, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5, 10],
    class_weight=['balanced'],
    random_state=[random_state],
)
random_forest_sklearn = RandomForestClassifier()

grid_search_200 = GridSearchCV(
    estimator=random_forest_sklearn,
    param_grid=param_grid,
    n_jobs=4,
)
grid_search_200.fit(x_train, y_train)

In [None]:
# Hard class predictions, reused by the accuracy/recall cells below.
y_pred = grid_search_200.predict(x_test)
# ROC AUC needs a continuous score per sample; hard 0/1 labels reduce the
# curve to one threshold. Use the positive-class probability instead.
y_score = grid_search_200.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, y_score)

In [None]:
# Test-set accuracy for the `y_pred` assigned in the preceding prediction cell.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set recall for the `y_pred` assigned in the preceding prediction cell.
recall_score(y_true=y_test, y_pred=y_pred)

The 200-estimator model isn't much worse, but now we'll look at the feature importances of the 1000-estimator version.

In [None]:
# One-row table: feature importances of the best forest from the first grid
# search, labelled with the feature column names.
feature_names = df.columns.drop('Exited')
best_forest = grid_search.best_estimator_
feature_importances = pd.DataFrame([best_forest.feature_importances_], columns=feature_names)
feature_importances.head()

## SVC

In [None]:
# Support-vector classifier with a degree-7 polynomial kernel.
svc = SVC(
    kernel='poly',
    degree=7,
    random_state=random_state,
)
svc.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions, reused by the accuracy/recall cells below.
y_pred = svc.predict(x_test)
# ROC AUC needs a continuous score per sample. The SVC was fitted without
# probability=True, so rank samples by their signed distance to the decision
# boundary instead of by hard 0/1 labels.
y_score = svc.decision_function(x_test)
roc_auc_score(y_test, y_score)

In [None]:
# Test-set accuracy for the `y_pred` assigned in the preceding prediction cell.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set recall for the `y_pred` assigned in the preceding prediction cell.
recall_score(y_true=y_test, y_pred=y_pred)

## XGBoost

In [None]:
# Grid over the number of boosting rounds, tree depth and learning rate.
xgb = XGBClassifier()
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20, 30],
    'learning_rate': [1e-3, 5e-2, 1e-2, 5e-1],
    'random_state': [random_state]
}

grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, n_jobs=4)
grid_search_xgb.fit(x_train, y_train)

# Hard class predictions, reused by the accuracy/recall cells below.
y_pred = grid_search_xgb.predict(x_test)
# ROC AUC needs a continuous score per sample; hard 0/1 labels reduce the
# curve to one threshold. Use the positive-class probability instead.
y_score = grid_search_xgb.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, y_score)

In [None]:
# One-row table: feature importances of the best XGBoost model, labelled with
# the feature column names.
feature_names = df.columns.drop('Exited')
best_xgb = grid_search_xgb.best_estimator_
feature_importances = pd.DataFrame([best_xgb.feature_importances_], columns=feature_names)
feature_importances.head()

In [None]:
# Test-set accuracy for the most recently assigned `y_pred`.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set recall for the most recently assigned `y_pred`.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Hyper-parameter combination selected by the XGBoost grid search.
grid_search_xgb.best_params_

## MLP

In [None]:
# Grid over hidden-layer width, L2 penalty (alpha) and initial learning rate
# for a multi-layer perceptron trained with Adam and early stopping.
mlp = MLPClassifier()
param_grid = {
    'hidden_layer_sizes': [24, 32, 64, 128],
    'solver': ['adam'],
    'alpha': [1e-4, 5e-3, 1e-3],
    'learning_rate': ['adaptive'],
    'learning_rate_init': [1e-2, 5e-2, 1e-3],
    'early_stopping': [True],
    'random_state': [random_state]
}

grid_search_mlp = GridSearchCV(estimator=mlp, param_grid=param_grid, n_jobs=4)
grid_search_mlp.fit(x_train, y_train)

# Hard class predictions, reused by the accuracy/recall cells below.
y_pred = grid_search_mlp.predict(x_test)
# ROC AUC needs a continuous score per sample; hard 0/1 labels reduce the
# curve to one threshold. Use the positive-class probability instead.
y_score = grid_search_mlp.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, y_score)

In [None]:
# Test-set accuracy for the most recently assigned `y_pred`.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set recall for the most recently assigned `y_pred`.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Hyper-parameter combination selected by the MLP grid search.
grid_search_mlp.best_params_

## Logistic regression with L1

In [None]:
# L1-penalised logistic regression with balanced class weights, fitted on the
# training split and then used to predict the test split.
logi = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.01,
    class_weight='balanced',
    random_state=random_state,
).fit(x_train, y_train)
y_pred = logi.predict(x_test)

In [None]:
# Test-set accuracy for the `y_pred` assigned in the preceding prediction cell.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set recall for the `y_pred` assigned in the preceding prediction cell.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# One-row table: fitted logistic-regression coefficients, labelled with the
# feature column names.
feature_names = df.columns.drop('Exited')
feature_importances = pd.DataFrame(logi.coef_, columns=feature_names)
feature_importances.head()

# Conclusion
## Why recall?
Recall is a really important metric for this task: it shows how many of all the actual positives we detected. We need to detect as many of the customers who will potentially leave as possible, because retaining a client is more important than anything else.
## Classifier model
- RandomForestClassifier reached an accuracy of 0.87 and a recall of 0.46, which means that almost half of the positives (customers who exited) were detected.
- The best parameters for this classifier are:
  1. class_weight = balanced
  2. max_depth = 20
  3. min_samples_leaf = 1
  4. min_samples_split = 2
  5. n_estimators = 1000
- LogisticRegression showed the best recall among all classifiers (0.75), although its accuracy is somewhat lower (0.7). I think this estimator is much better, because the large gain in recall outweighs the relatively small drop in accuracy.
## Features
- Adding the new feature "BalanceSalaryRatio" and removing the correlated "Balance" and "EstimatedSalary" features improves the model's performance.
- The top 3 most important features are:
  1. Age (0.28)
  2. CreditScore (0.18)
  3. BalanceSalaryRatio (0.16)
- But if we use Logistic Regression with an L1 penalty:
  1. Age (0.7)
  2. Geography_Germany (the client is from Germany) (0.3)
  3. IsActiveMember (-0.37) (negative coefficient, i.e. an inverse relationship)
- All in all, because LogisticRegression showed better results, I'll base my conclusions on it.
- The most important features are:
  1. Age
  2. Geography_Germany
  3. IsActiveMember
  
## Experiments
- I tried RandomForest with different n_estimators values; here are the results of the 2 experiments:
  1. With n_estimators = 200: accuracy = 0.867, recall = 0.46
  2. With n_estimators = 1000: accuracy = 0.8675, recall = 0.46
