# K-Means Clustering on BankChurners.csv 

In [4]:

import pandas as pd
import numpy as np
import math


In [5]:

# Load dataset
df = pd.read_csv('BankChurners.csv')

# Select only numerical columns relevant for clustering
features = ['Customer_Age','Dependent_count','Months_on_book','Total_Relationship_Count',
            'Months_Inactive_12_mon','Contacts_Count_12_mon','Credit_Limit','Total_Revolving_Bal',
            'Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Total_Trans_Amt','Total_Trans_Ct',
            'Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']

data = df[features].values

# Normalize data for fair clustering
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

print("Data Shape:", data_scaled.shape)


Data Shape: (10127, 14)


In [6]:

def distance(x1, x2):
    return math.sqrt(sum((a-b)**2 for a,b in zip(x1,x2)))


In [7]:

def update_cluster_center(cluster_data):
    if len(cluster_data) == 0:
        return [0]*len(cluster_data[0])  
    sums = [0]*len(cluster_data[0])
    for row in cluster_data:
        for i in range(len(row)):
            sums[i] += row[i]
    return [s/len(cluster_data) for s in sums]


In [None]:

def kmeans_du(k, data):
    center_data = [data[np.random.randint(0,len(data))] for i in range(k)]
    print("Initial Centers:", center_data)
    
    for j in range(0,5):
        cluster_data = [[] for _ in range(k)]
        
        for d in data:
            mindistance = []
            for i in range(k):
                mindistance.append(distance(center_data[i], d))
            cluster_data[mindistance.index(min(mindistance))].append(d)
        
        # Print clusters
        for i in range(k):
            print(f"Iteration {j}, Cluster {i} --> Size: {len(cluster_data[i])}")
        
        # Update centers
        for i in range(k):
            if len(cluster_data[i]) > 0:
                center_data[i] = update_cluster_center(cluster_data[i])
        print("NEW Cluster Centers:", center_data)
    
    return cluster_data, center_data


In [9]:

clusters, centers = kmeans_du(3, data_scaled)


Initial Centers: [array([ 0.95729034, -0.26654715,  1.76202755,  0.12057905, -0.33759792,
        0.49240377,  0.09760281,  0.94875599,  0.01252557,  0.77127041,
       -0.5637645 , -0.12179474,  1.20877214, -0.26078827]), array([-0.16540558,  0.50336813,  0.00896451,  0.12057905,  0.65194019,
        0.49240377, -0.55983386,  0.61131077, -0.61452083,  0.32418185,
        3.1630955 ,  1.96585133,  0.09147405,  0.70410605]), array([-0.91386953, -0.26654715,  0.00896451,  1.40730617, -0.33759792,
        0.49240377, -0.77615528, -0.49428247, -0.7316795 , -0.23239779,
       -0.7027122 , -0.93129016,  1.01975554,  0.75126254])]


Iteration 0, Cluster 0 --> Size: 3924
Iteration 0, Cluster 1 --> Size: 1256
Iteration 0, Cluster 2 --> Size: 4947
NEW Cluster Centers: [[0.6958811031622044, -0.14921551167897754, 0.6731921072835612, -0.11568162541275114, -0.000943138245843275, 0.0019200639870371947, 0.3898657810913382, 0.12956339774410525, 0.3781684611475541, 0.15695961840149586, -0.2308024907759325, -0.14994836923309077, 0.10855919093919301, -0.161707136425155], [-0.18377949595122525, 0.19871216323555219, -0.18773665715068621, -0.9796955722666, 0.09493107510996328, -0.24103263912053977, 0.3075878369935541, 0.28662575166510224, 0.2818270242537995, 0.060511904411671265, 2.1194766809582735, 1.5349503757887166, 0.031819451857190865, -0.09745029848366948], [-0.5053184560124769, 0.06790765935000839, -0.48631667426710284, 0.34049572203084527, -0.023354064253374777, 0.05967306724282085, -0.387338517943466, -0.1755424938021539, -0.371519665252839, -0.13986506863726056, -0.3550421947602246, -0.2707702185405547, -0.0941886995710