# Customer Segmentation dengan K-Means
Notebook ini menggunakan dataset **Mall Customers** dari Kaggle untuk melakukan segmentasi (clustering) pelanggan berdasarkan dua fitur numerik, yaitu *Annual Income* dan *Spending Score*, menggunakan algoritma **K-Means**.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

%matplotlib inline

## 1. Load Dataset

In [None]:
# Load the dataset into a DataFrame. Make sure Mall_Customers.csv is located
# in the same folder as this notebook.
df = pd.read_csv('Mall_Customers.csv')
# Preview the first rows (displayed as the cell output)
df.head()

## 2. Exploratory Data Analysis Singkat

In [None]:
# Print structural information about the dataset
df.info()

In [None]:
# Descriptive statistics of the numeric features
df.describe()

## 3. Pemilihan Fitur untuk Clustering

In [None]:
# We use two commonly used features: Annual Income and Spending Score
features = ['Annual Income (k$)', 'Spending Score (1-100)']
# Feature matrix restricted to the selected columns
X = df[features]
# Preview the first rows of the selected features
X.head()

## 4. Normalisasi Fitur

In [None]:
# Learn the scaling parameters from X, then apply them to the same data so
# both features are standardized before clustering.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Peek at the first five scaled rows
X_scaled[:5]

## 5. Menentukan Jumlah Cluster (k) dengan Elbow Method

In [None]:
# Candidate cluster counts: k from 2 to 10
K_range = range(2, 11)
inertia = []

# Fit one K-Means model per candidate k and record its inertia
for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against k to look for the "elbow"
plt.figure()
plt.plot(list(K_range), inertia, marker='o')
plt.gca().set(
    xlabel='Jumlah Cluster (k)',
    ylabel='Inertia (Within-Cluster SSE)',
    title='Elbow Method untuk Menentukan k',
)
plt.grid(True)
plt.show()

## 6. Evaluasi Silhouette Score per k

In [None]:
# Silhouette score for every candidate k in K_range
sil_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    sil = silhouette_score(X_scaled, cluster_labels)
    # Report the score for this k, then keep it for the plot below
    print(f'k={k}, silhouette score={sil:.4f}')
    sil_scores.append(sil)

# Plot silhouette score against k
plt.figure()
plt.plot(list(K_range), sil_scores, marker='o')
plt.gca().set(
    xlabel='Jumlah Cluster (k)',
    ylabel='Silhouette Score',
    title='Silhouette Score untuk Berbagai Nilai k',
)
plt.grid(True)
plt.show()

## 7. Training K-Means dengan k Optimal

In [None]:
# Example choice: the elbow and silhouette plots suggest k = 5.
# Change this value if your own analysis points to a different k.
k_opt = 5

# Fit the final model, then read the per-row cluster assignments from it
kmeans_final = KMeans(n_clusters=k_opt, n_init=10, random_state=42).fit(X_scaled)
cluster_labels = kmeans_final.labels_

# Attach the cluster assignment to the original rows and preview the result
df['Cluster'] = cluster_labels
df.head()

## 8. Analisis Karakteristik Tiap Cluster

In [None]:
# Per-cluster mean of each clustering feature, rounded to two decimals
cluster_summary = (
    df.groupby('Cluster')[features]
    .mean()
    .round(decimals=2)
)
cluster_summary

## 9. Visualisasi Cluster dalam 2D

In [None]:
# Take the labels and the number of clusters from the fitted model itself.
# The global `cluster_labels` is also reassigned inside the silhouette loop,
# so re-running that cell after training would leave stale labels (for a
# different k) here, and `k_opt` may have been edited without retraining.
# Reading everything from `kmeans_final` keeps the plot consistent with the
# model whose centroids are drawn.
final_labels = kmeans_final.labels_
centroids = kmeans_final.cluster_centers_

plt.figure()

# One scatter series per cluster, plotted in the scaled feature space
for c in range(kmeans_final.n_clusters):
    cluster_data = X_scaled[final_labels == c]
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Cluster {c}')

# Overlay the cluster centres (same scaled space as the points)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='X', s=200, linewidths=2, edgecolors='black', label='Centroid')

plt.xlabel(features[0] + ' (scaled)')
plt.ylabel(features[1] + ' (scaled)')
plt.title('Visualisasi Cluster Pelanggan (K-Means)')
plt.legend()
plt.grid(True)
plt.show()

## 10. Menyimpan Hasil Cluster

In [None]:
# Keep the output filename in one place so the confirmation message can never
# drift from the path that was actually written.
output_path = 'Mall_Customers_with_clusters.csv'

# index=False: do not write the DataFrame index as an extra column
df.to_csv(output_path, index=False)
print(f'File disimpan sebagai {output_path}')