# Bank Customer Churn Prediction 

## This project aims to build a classification model to predict whether a bank customer will churn. Identify the most influential features contributing to churn and develop a high-performance model to detect at-risk customers in advance.

______________________________________________________________________________________________________________________________________

## Problem Statement

### Objective

To predict future churn (`Exited`) for each customer (`id`) based on their information.
This will contribute to:

・Developing strategies and approaches for customers who are likely to churn



### Problem Type
Classification problem

### Objective Variable (Target)
Exited: whether the customer churned (left the bank)

### Evaluation Metric

ROC AUC (Area Under the Receiver Operating Characteristic Curve):
Calculated from the predicted probability of churn and the actual `Exited` label.


In [None]:
# import libraries

# 1. to handle the data
import pandas as pd
import numpy as np

# to visualize the dataset
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# this is for jupyter notebook to show the plot in the notebook itself instead of opening a new window
%matplotlib inline

# To preprocess the data
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#Model
import lightgbm as lgb

#Evaluation
from sklearn.metrics import roc_auc_score

# ignore warnings   
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

# Walk '/kaggle/input' recursively and print the full path of every file found.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

In [None]:
import zipfile

# Unpack the competition archive into the local 'data' directory.
with zipfile.ZipFile('playground-series-s4e1.zip', mode='r') as archive:
    archive.extractall(path='data')

In [None]:
# Read the three CSV files that were extracted into the 'data' directory.
df_train = pd.read_csv('data/train.csv')                   # training data
df_test = pd.read_csv('data/test.csv')                     # test data
df_submission = pd.read_csv('data/sample_submission.csv')  # sample submission

## EDA

In [None]:
# Shape, column summary and first rows of the sample submission frame.
print(f'Sample submission file: {df_submission.shape}\n')

# DataFrame.info() writes its report straight to stdout and returns None.
# Embedding it in an f-string therefore printed the report first and then
# "Infomation :" followed by a stray "None"; print the header, then call it.
print('Infomation :')
df_submission.info()
df_submission.head()

In [None]:
# Shape, column summary and first rows of the test frame.
print(f'Test file:\n{df_test.shape}\n')

# DataFrame.info() writes its report straight to stdout and returns None.
# Embedding it in an f-string therefore printed the report first and then
# "Infomation :" followed by a stray "None"; print the header, then call it.
print('Infomation :')
df_test.info()
df_test.head()

In [None]:
# Shape, column summary and first rows of the training frame.
print(f'Train file:\n{df_train.shape}')

# DataFrame.info() writes its report straight to stdout and returns None.
# Embedding it in an f-string therefore printed the report first and then
# "Infomation :" followed by a stray "None"; print the header, then call it.
print('Infomation :')
df_train.info()
df_train.head()

In [None]:
# For each loaded frame, report whether it contains any missing values and,
# if it does, show the per-column count of missing entries.
files = [df_submission, df_train, df_test]
names = ['df_submission', 'df_train', 'df_test']

for name, df in zip(names, files):
    nulls_per_column = df.isnull().sum()
    if nulls_per_column.sum() != 0:
        print(f'{name} has null values:')
        print(nulls_per_column)
        continue
    print(f'{name} does not have null values.')

In [None]:
# For each loaded frame, report whether it contains fully duplicated rows and,
# if it does, show how many.
files = [df_submission, df_train, df_test]
names = ['df_submission', 'df_train', 'df_test']

for name, df in zip(names, files):
    duplicate_rows = df.duplicated().sum()
    if duplicate_rows != 0:
        print(f'{name} has duplicated values:')
        print(duplicate_rows)
        continue
    print(f'{name} does not have duplicated values.')

In [None]:
df_test.describe()

In [None]:
df_train.describe()

## Distribution of Target Feature

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of training rows for each value of 'Exited'.
sns.countplot(data=df_train, x='Exited')
plt.show()

In [None]:
import math

import matplotlib.pyplot as plt
import seaborn as sns

# Keep 'Exited' in the frame (it is used as the hue) and drop only the
# identifier-like columns before choosing what to plot.
train_columns = df_train.drop(['id', 'Surname', 'CustomerId'], axis=1)

# Every remaining column except the target gets its own histogram panel.
num_cols = train_columns.drop('Exited', axis=1).columns
n_cols = 3  # panels per row
# Derive the number of rows from the number of columns to plot. The grid was
# previously fixed at 4 x 3, which raises IndexError as soon as more than 12
# columns are present (e.g. when this cell is re-run after later cells have
# added columns to df_train).
n_rows = max(1, math.ceil(len(num_cols) / n_cols))

# 3 inches of height per row keeps the original 20 x 12 figure for 4 rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows))
axes = axes.flatten()

# One stacked histogram per column, split by the target value.
for ax, col in zip(axes, num_cols):
    sns.histplot(data=train_columns, x=col, hue='Exited', multiple='stack', bins=30, ax=ax)
    ax.set_title(f'{col}')
    ax.set_xlabel('')
    ax.set_ylabel('')

# Remove any panels left over at the end of the grid.
for unused_ax in axes[len(num_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


## Visualization

## Age vs Exited

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Define ten-year age bins and assign every training row to one.
#    right=False makes the intervals left-closed: [10, 20), ..., [70, 80).
#    NOTE(review): ages below 10 or at/above 80 fall outside every bin and get
#    NaN in 'Age_bin' — confirm this is intended for this dataset.
bin_edges = list(range(10, 81, 10))  # [10, 20, 30, ..., 80]
bin_labels = [f"{i}〜{i+10}" for i in bin_edges[:-1]]
df_train['Age_bin'] = pd.cut(df_train['Age'], bins=bin_edges, labels=bin_labels, right=False)

# 2. Count rows per (age bin, Exited) pair; the columns are Exited = 0 and 1.
#    observed=False is stated explicitly so that empty bins stay in the table
#    regardless of the pandas version's default for categorical groupers.
age_bin_counts = (
    df_train.groupby(['Age_bin', 'Exited'], observed=False).size().unstack(fill_value=0)
)

# 3. Per-bin total and the percentage of each class within the bin.
age_bin_counts['Total'] = age_bin_counts[0] + age_bin_counts[1]
age_bin_counts['Exited_0_pct'] = (age_bin_counts[0] / age_bin_counts['Total'] * 100).round(1)
age_bin_counts['Exited_1_pct'] = (age_bin_counts[1] / age_bin_counts['Total'] * 100).round(1)

# 4. Stacked bar chart: Exited=0 at the bottom, Exited=1 on top.
ax = age_bin_counts[[0, 1]].plot(kind='bar', stacked=True, figsize=(13, 5))

# 5. Annotate each stack once, just above its top segment.
#    pandas draws the bars one column at a time, so ax.patches holds all
#    Exited=0 rectangles followed by all Exited=1 rectangles; it is NOT
#    interleaved per bin. Indexing patches with idx // 2 and idx % 2 therefore
#    placed labels on the wrong bars with the wrong bin's numbers. The last
#    bar container is the top (Exited=1) layer, and its rectangles follow the
#    row order of age_bin_counts.
top_layer = ax.containers[-1]
for rect, (_, row) in zip(top_layer, age_bin_counts.iterrows()):
    total = row['Total']
    pct_0 = row['Exited_0_pct']
    pct_1 = row['Exited_1_pct']
    x_pos = rect.get_x() + rect.get_width() / 2
    y_pos = rect.get_y() + rect.get_height()
    ax.text(x_pos, y_pos + 1, f"Total: {int(total)}\n {pct_0}%,  {pct_1}%",
            ha='center', va='bottom', fontsize=10)

# 6. Titles, axis labels and legend.
plt.title("Customer Exits by Age Group (with total and %)")
plt.xlabel("Age Group")
plt.ylabel("Number of Customers")
plt.xticks(rotation=45)
plt.legend(title='Exited', labels=['Stayed (0)', 'Exited (1)'])
plt.tight_layout()
plt.show()


## Gender vs Exited

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# Count rows per (Gender, Exited) pair, then add the per-gender total and the
# percentage of each class within the gender.
gender_exit = df_train.groupby(['Gender', 'Exited']).size().unstack(fill_value=0)
gender_exit['Total'] = gender_exit[0] + gender_exit[1]
for exited_value in (0, 1):
    share = gender_exit[exited_value] / gender_exit['Total'] * 100
    gender_exit[f'Exited_{exited_value}_pct'] = share.round(1)

# Bar colours: first entry for Exited=0, second for Exited=1.
colors = ['royalblue', 'crimson']

# Stacked bar chart of the two classes per gender.
ax = gender_exit[[0, 1]].plot(kind='bar', stacked=True, figsize=(9, 5), color=colors)

plt.title("Customer Exits by Gender")
plt.xlabel("Gender")
plt.ylabel("Number of Customers")
plt.xticks(rotation=0)

# Hand-built legend: two colour keys followed by one invisible (white) entry
# per gender whose label carries that gender's total and class percentages.
legend_elements = [
    Patch(facecolor='royalblue', label='Stayed (0)'),
    Patch(facecolor='crimson', label='Exited (1)'),
]
for gender in ('Female', 'Male'):
    total = int(gender_exit.loc[gender, 'Total'])
    pct_0 = gender_exit.loc[gender, 'Exited_0_pct']
    pct_1 = gender_exit.loc[gender, 'Exited_1_pct']
    legend_elements.append(
        Patch(facecolor='white', edgecolor='white',
              label=f"{gender}: Total {total}\n  {pct_0}%,  {pct_1}%")
    )

# Place the legend outside the axes, centred on the right edge.
plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1.0, 0.5), title='Exited Info', frameon=False)

plt.tight_layout()
plt.show()


## Feature Engineering

In [None]:
# Tag each frame with its origin (0 = train, 1 = test) so the rows can be told
# apart later, then stack train on top of test with a fresh 0..n-1 index.
for frame, origin_flag in ((df_train, 0), (df_test, 1)):
    frame['is_test'] = origin_flag
df_merged = pd.concat(objs=[df_train, df_test], ignore_index=True)


In [None]:

# 'Age_bin' is only ever computed on df_train, so after the train/test concat
# every test row carries NaN in that column and would receive all-zero Age_bin
# dummies. Re-derive the bin from 'Age' on the merged frame, using the same
# ten-year, left-closed bins and labels, so train and test rows are encoded
# the same way. assign() leaves df_merged itself untouched.
age_edges = list(range(10, 81, 10))  # [10, 20, 30, ..., 80]
age_labels = [f"{i}〜{i+10}" for i in age_edges[:-1]]
df_binned = df_merged.assign(
    Age_bin=pd.cut(df_merged['Age'], bins=age_edges, labels=age_labels, right=False)
)

# One-hot encode the categorical columns as 0/1 integers, then drop the
# identifier-like columns that are not used as features.
df_encoded = pd.get_dummies(df_binned, columns=["Gender", "Geography", 'Age_bin'], dtype=int)
df_encoded = df_encoded.drop(['Surname', 'id', 'CustomerId'], axis=1)

df_encoded.head()


In [None]:
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of the encoded frame.
corr = df_encoded.corr()

# Cell text for the heatmap: each coefficient rounded to two decimals.
annot_matrix = corr.round(2).astype(str)

# Leave weak correlations (between -0.2 and +0.2 inclusive) unlabelled so that
# only the stronger ones carry a number.
annot_matrix[corr.abs() <= 0.2] = ""

# Draw the heatmap; fmt='' because the annotations are already strings.
plt.figure(figsize=(15, 10))
sb.heatmap(data=corr, annot=annot_matrix, fmt='', cmap='coolwarm', square=True, cbar=True)
plt.title('Correlation of Features')
plt.show()


## ML

In [None]:
# Use the 'is_test' flag to recover the rows that came from each source frame.
origin = df_encoded['is_test']
kaggle_test_df = df_encoded[origin.eq(1)]
full_train_df = df_encoded[origin.eq(0)]

In [None]:
from sklearn.model_selection import train_test_split

# Number of rows that go into the training part (80% of the labelled rows).
split_index = int(len(full_train_df) * 0.8)

# 80% train, 20% validation.
# A positional split (first 80% / last 20%) only gives a fair validation set
# if the file order is random, and it does not keep the churn rate equal in
# the two parts. Shuffle with a fixed seed and stratify on the target so the
# split is reproducible and both parts share the same class balance.
train, valid = train_test_split(
    full_train_df,
    train_size=split_index,
    stratify=full_train_df['Exited'],
    random_state=42,
)

# Separate features from the target; 'is_test' is bookkeeping, not a feature.
drop_cols = ['Exited', 'is_test']

X_train = train.drop(columns=drop_cols)
y_train = train['Exited']

X_valid = valid.drop(columns=drop_cols)
y_valid = valid['Exited']

In [None]:
from sklearn.linear_model import LogisticRegression


# Baseline logistic regression on the training split.
# The features are fed in unscaled, and the default iteration cap (100) can
# stop the solver before it has converged. The resulting ConvergenceWarning
# would not be shown because this notebook silences all warnings, so allow
# more iterations explicitly.
# NOTE(review): standardising the features before fitting would likely help
# further, but the later cells pass the raw X_valid to `lr`, so that change
# has to be made across cells together.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Show the fitted coefficients and the intercept of the logistic regression.
for caption, fitted in (("coefficient = ", lr.coef_), ("intercept = ", lr.intercept_)):
    print(caption, fitted)

In [None]:
# Class predictions of the logistic regression for the validation rows.
Y_pred = lr.predict(X=X_valid)
print(Y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Evaluate the logistic-regression predictions on the validation split:
# each entry pairs the printed caption with the metric function to call.
lr_metrics = (
    ('confusion matrix = \n', confusion_matrix),
    ('accuracy = ', accuracy_score),
    ('precision = ', precision_score),
    ('recall = ', recall_score),
    ('f1 score = ', f1_score),
)
for caption, metric_fn in lr_metrics:
    print(caption, metric_fn(y_true=y_valid, y_pred=Y_pred))

In [None]:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with class_weight='balanced' and a fixed seed, fitted on the
# training split.
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt.fit(X=X_train, y=y_train)

# Score on the validation split; the bare name on the last line makes the
# notebook display the value.
score = dt.score(X=X_valid, y=y_valid)
score

In [None]:
# from sklearn.tree import plot_tree
# plot_tree(dt, feature_names=X_train.columns, filled=True)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Decision-tree predictions for the validation rows.
pred = dt.predict(X_valid)

# Pass the label order explicitly so rows and columns are always [0, 1].
class_labels = [0, 1]
cm = confusion_matrix(y_valid, pred, labels=class_labels)

# Wrap the matrix in a DataFrame: rows are true classes, columns are predictions.
df_cm = pd.DataFrame(
    cm,
    index=[f'y{label}' for label in class_labels],
    columns=[f'pred{label}' for label in class_labels],
)
print(df_cm)

In [None]:
# Text report of the decision-tree predictions against the validation labels.
report_text = classification_report(y_true=y_valid, y_pred=pred)
print(report_text)