# Bank Customer Segmentation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load the raw transaction data into `df`. The path is relative, so the CSV
# must be present in the current working directory when this cell runs.
df = pd.read_csv("bank_transactions.csv")

In [2]:
# Clean the raw transaction frame in place before any aggregation.

# Missing numeric values are imputed with 0 so that sums and means computed
# later never see NaN in these two columns.
df['CustAccountBalance'] = df['CustAccountBalance'].fillna(0)
df['TransactionAmount (INR)'] = df['TransactionAmount (INR)'].fillna(0)

# Missing categories get an explicit "Unknown" label; groupby drops NaN keys
# by default, so this keeps those rows in any per-category aggregation.
df['CustGender'] = df['CustGender'].fillna("Unknown")
df['CustLocation'] = df['CustLocation'].fillna("Unknown")

# Parse the transaction date once. errors='coerce' turns unparseable values
# into NaT instead of raising.
# NOTE(review): no explicit `format`/`dayfirst` is passed, so pandas infers
# the date layout -- confirm the inferred day/month order against the raw data.
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], errors='coerce')


In [None]:
# Collapse the transaction log to one row per customer: number of
# transactions, total and mean transaction amount, and mean account balance.
# NOTE(review): this cell repeats the aggregation performed in the next cell.
cust = (
    df.groupby(by='CustomerID')
    .agg(
        txn_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        total_amount=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='sum'),
        avg_amount=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='mean'),
        avg_balance=pd.NamedAgg(column='CustAccountBalance', aggfunc='mean'),
    )
    .reset_index()  # CustomerID becomes a regular column again
)

In [4]:
# Build the per-customer feature table: one row per CustomerID holding the
# transaction count, the summed and mean transaction amount, and the mean
# account balance.
cust = df.groupby(by='CustomerID').agg(
    txn_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    total_amount=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='sum'),
    avg_amount=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='mean'),
    avg_balance=pd.NamedAgg(column='CustAccountBalance', aggfunc='mean'),
)
# Move CustomerID out of the index so it is available as an ordinary column.
cust = cust.reset_index()


In [5]:
# Columns of `cust` that are fed to the clustering model.
features = ['txn_count', 'total_amount', 'avg_amount', 'avg_balance']

# Feature matrix; any remaining missing value is treated as 0.
X = cust.loc[:, features].fillna(0)

# Standardize the features so that columns on very different scales
# (counts vs. currency amounts) contribute comparably to distances.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [6]:
# Scan candidate cluster counts and record two diagnostics per k:
# the KMeans inertia (for the elbow plot) and the silhouette score.
inertia = []
silhouette_scores = []
k_values = range(2, 8)

for k in k_values:
    km = KMeans(n_clusters=k, random_state=0)
    labels = km.fit_predict(X_scaled)
    inertia.append(km.inertia_)
    # silhouette_score needs pairwise distances between the scored rows, so
    # its cost grows quadratically with the row count. Scoring a fixed-size
    # random sample keeps each iteration tractable on a large customer
    # table; random_state makes the sample (and the score) reproducible.
    silhouette_scores.append(
        silhouette_score(
            X_scaled,
            labels,
            sample_size=min(10000, len(X_scaled)),
            random_state=0,
        )
    )

# Elbow plot: look for the k after which inertia stops dropping sharply.
plt.plot(k_values, inertia, marker='o')
plt.title("Elbow Method")
plt.xlabel("k")
plt.ylabel("Inertia")
plt.show()

# Silhouette plot: higher values indicate better-separated clusters.
plt.plot(k_values, silhouette_scores, marker='o')
plt.title("Silhouette Scores")
plt.xlabel("k")
plt.ylabel("Silhouette")
plt.show()

KeyboardInterrupt: 

In [None]:
# Fit the final model with the selected number of clusters and attach each
# customer's cluster label to the per-customer table.
k = 4  # choose based on the elbow/silhouette
km = KMeans(n_clusters=k, random_state=0).fit(X_scaled)
cust['cluster'] = km.labels_

# Preview the first rows, now including the `cluster` column.
cust.head()

In [None]:
# Per-location summary: transaction count plus total and mean amount.
loc_grp = (
    df.groupby(by='CustLocation')
    .agg(
        total_txn=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        total_amount=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='sum'),
        avg_amount=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='mean'),
    )
    .reset_index()
)

# Show the 20 locations with the largest total transaction amount.
print(
    loc_grp
    .sort_values(by='total_amount', ascending=False)
    .head(20)
)

In [None]:
# Recency is measured against the most recent transaction date in the data.
reference_date = df['TransactionDate'].max()

# Per-customer raw RFM inputs: latest transaction date, number of
# transactions, and total amount transacted.
rfm = (
    df.groupby(by='CustomerID')
    .agg(
        last_txn=pd.NamedAgg(column='TransactionDate', aggfunc='max'),
        frequency=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        monetary=pd.NamedAgg(column='TransactionAmount (INR)', aggfunc='sum'),
    )
    .reset_index()
)

# recency_days = whole days between the customer's latest transaction and
# the reference date; then keep only the identifier and the three RFM columns.
rfm = rfm.assign(
    recency_days=(reference_date - rfm['last_txn']).dt.days
)[['CustomerID', 'recency_days', 'frequency', 'monetary']]

rfm.head()
