<a href="https://colab.research.google.com/github/MuhammadJundullah/Data-Analysis/blob/main/Bank%20Customers%20Churn%20Analysis/Bank_Customers_Churn_Analysis_(Logistic_Regressionn).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset Source : https://www.kaggle.com/models/nidhibarao/bank-customer-churn/code

In [None]:
import kagglehub
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

import matplotlib.pyplot as plt
import seaborn as sns

# Data Importing

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# EDA

In [None]:
# Location of the raw CSV on the mounted Drive, named so it is easy to spot and change.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Bank Churn Customers_Dataset/Bank Churn Modelling.csv"

# Read the dataset into the working frame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Remove the CustomerId and Surname columns from the working frame.
columns_to_remove = ["CustomerId", "Surname"]
df = df.drop(columns=columns_to_remove)
df.head(2)

# Data Preprocessing

## Data Encoding


In [None]:
# Frequency of each Geography value.
df["Geography"].value_counts()

In [None]:
# One-hot encode Geography into integer `geo_*` columns, then swap them in for
# the original column (new columns are appended at the end of the frame).
geography_enc = pd.get_dummies(df['Geography'], prefix='geo').astype(int)
df = pd.concat([df.drop(columns='Geography'), geography_enc], axis=1)
df.head(2)

In [None]:
# Frequency of each Gender value.
df['Gender'].value_counts()

In [None]:
# One-hot encode Gender (no prefix, so the new columns are named after the
# category values) and replace the original column with the integer dummies.
gender_enc = pd.get_dummies(df['Gender']).astype(int)
df = pd.concat([df.drop(columns='Gender'), gender_enc], axis=1)
df.head(2)

In [None]:
# Frequency of each Num Of Products value.
df['Num Of Products'].value_counts()

In [None]:
# One-hot encode Num Of Products into integer `nop_*` columns and replace the
# original column with them.
nop_enc = pd.get_dummies(df['Num Of Products'], prefix='nop').astype(int)
df = pd.concat([df.drop(columns='Num Of Products'), nop_enc], axis=1)
df.head(2)

In [None]:
# Frequency of each Tenure value.
df['Tenure'].value_counts()

In [None]:
# One-hot encode Tenure into integer `Ten_*` columns and replace the original
# column with them. The dummies get their own name instead of reusing `nop_enc`,
# so the Num Of Products encoding is not silently overwritten by Tenure data.
tenure_enc = pd.get_dummies(df['Tenure'], prefix = 'Ten').astype(int)
df = pd.concat([df, tenure_enc],axis = 1)
df.drop(columns=['Tenure'], inplace = True)
df.head(2)

In [None]:
# Frequency of each Has Credit Card value.
df['Has Credit Card'].value_counts()

In [None]:
# Frequency of each Is Active Member value.
df['Is Active Member'].value_counts()

In [None]:
# df['Zero Balance'] = np.where(df['Balance'] > 0, 1, 0)
# df.drop(columns=['Balance'], inplace = True)
# df.head(2)

## Data Standarization

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Summary statistics of the numeric columns before scaling.
df.describe()

In [None]:
# Histogram with KDE overlay of Balance, split by the Churn column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Balance', hue="Churn", kde=True, ax=ax)
ax.set_title('Balance Distribution (Saldo Nasabah)')
ax.set_xlabel('Balance')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
# Standardize Balance in place of the raw values.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler()
balance_scaled = scaler.fit_transform(df[["Balance"]])
df["Balance"] = balance_scaled
df.head(2)

In [None]:
# Histogram with KDE overlay of Estimated Salary, split by the Churn column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Estimated Salary', hue="Churn", kde=True, ax=ax)
ax.set_title('Estimated Salary Distribution')
ax.set_xlabel('Estimated Salary')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
# Standardize Estimated Salary. `fit_transform` re-fits the shared `scaler`, so
# the statistics it learned from the previously scaled column are replaced.
salary_scaled = scaler.fit_transform(df[["Estimated Salary"]])
df["Estimated Salary"] = salary_scaled
df.head(2)

In [None]:
# Histogram with KDE overlay of Age, split by the Churn column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Age', hue="Churn", kde=True, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
# Standardize Age. `fit_transform` re-fits the shared `scaler`, so the
# statistics it learned from the previously scaled column are replaced.
age_scaled = scaler.fit_transform(df[["Age"]])
df["Age"] = age_scaled
df.head(2)

In [None]:
# Histogram with KDE overlay of CreditScore, split by the Churn column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='CreditScore', hue="Churn", kde=True, ax=ax)
ax.set_title('Credit Score Distribution')
ax.set_xlabel('Credit Score')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
# Standardize CreditScore. `fit_transform` re-fits the shared `scaler`, so the
# statistics it learned from the previously scaled column are replaced.
credit_score_scaled = scaler.fit_transform(df[["CreditScore"]])
df["CreditScore"] = credit_score_scaled
df.head(2)

# Building Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Separate the target column from the feature matrix.
target_column = "Churn"
y = df[target_column]
x = df.drop(columns=[target_column])

In [None]:
# Split into training (70%) and test (30%) sets with a fixed seed.
# `stratify=y` keeps the Churn class ratio the same in both splits; the later
# under-sampling section suggests the classes are imbalanced, and a plain random
# split can leave the two sets with different class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Logistic regression classifier with the library's default settings.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict labels for the test split.
y_pred = model.predict(X_test)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi: {accuracy}")

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Per-class metrics report for the test split.
print(classification_report(y_test, y_pred))

# Random Under Sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Random under-sampler with a fixed seed so the drawn subset is repeatable.
rus = RandomUnderSampler(random_state = 2529)


In [None]:
# Apply the random under-sampler to the full feature matrix / target, then show
# the shapes after and before resampling for comparison.
x_rus, y_rus = rus.fit_resample(x, y)
x_rus.shape, y_rus.shape, x.shape, y.shape

In [None]:
# Build the training / test data for the under-sampled model.
# Only the training split is under-sampled. Resampling the whole dataset before
# splitting would put an artificially balanced sample into the test set, so the
# metrics would not reflect the real class distribution and could not be
# compared with the first model.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# The hold-out split is deliberately left untouched; the `*_rus` names are kept
# so the evaluation cells below keep working unchanged.
X_test_rus, y_test_rus = X_test, y_test

In [None]:
# Re-fit the same estimator object on the under-sampled training data (this
# replaces its earlier fit), then predict on the matching test split.
y_pred_rus = model.fit(X_train_rus, y_train_rus).predict(X_test_rus)

In [None]:
# Share of test rows whose predicted label matches the true label (under-sampled model).
accuracy_rus = accuracy_score(y_test_rus, y_pred_rus)
print(f"Akurasi: {accuracy_rus}")

In [None]:
# Confusion matrix of true vs. predicted labels for the under-sampled model.
print(confusion_matrix(y_test_rus, y_pred_rus))

In [None]:
# Per-class metrics report for the under-sampled model.
print(classification_report(y_test_rus, y_pred_rus))

In [None]:
pip install shap

In [None]:
import shap

# Here `model` is the estimator that has already been trained in the cells above
explainer = shap.Explainer(model, X_train_rus)
shap_values = explainer.shap_values(X_train_rus)

# Plot global feature importance
shap.summary_plot(shap_values, X_train_rus)


In [None]:
# Coefficients of the fitted model (first row of `coef_`), one per feature.
coef = model.coef_[0]

# Pair each feature name with its coefficient and order from the largest to the
# smallest coefficient; the head shows the largest values.
feature_importance = pd.DataFrame(
    {'Fitur': X_train_rus.columns, 'Koefisien': coef}
).sort_values(by='Koefisien', ascending=False)
feature_importance.head()

In [None]:
# Last rows of the coefficient table.
feature_importance.tail()