# Praktikum 2 — Clustering dan k-Means

Pada praktikum ini akan dipraktikkan beberapa teknik clustering menggunakan k-Means, implementasi manual (bergaya Expectation–Maximization), optimasi jumlah klaster, kasus nonlinier, serta aplikasi pada dataset angka dan kompresi citra.

In [None]:
# Import library
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np

from sklearn.datasets import make_blobs, make_moons, load_digits, load_sample_image
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering
from sklearn.metrics import pairwise_distances_argmin, accuracy_score, confusion_matrix
from scipy.stats import mode
from sklearn.manifold import TSNE
import warnings
warnings.simplefilter('ignore')

## Contoh: k-Means pada data sintetis (make_blobs)
Kita mulai dengan dataset sintetis yang memiliki 4 pusat cluster.

In [None]:
# Draw 300 synthetic 2-D samples around 4 blob centers; y_true holds the
# generating blob index of each sample.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Scatter the raw, uncolored points (X.T unpacks into the two coordinate rows).
plt.figure(figsize=(6,5))
plt.scatter(*X.T, s=50)
plt.gca().set(title='Data Sintetis (make_blobs)', xlabel='x1', ylabel='x2')
plt.show()

### Jalankan k-Means (n_clusters=4)

In [None]:
# Fit a 4-cluster k-Means model (fit() returns the estimator itself), then
# assign every sample to its nearest learned centroid.
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
y_kmeans = kmeans.predict(X)

In [None]:
# Color every sample by its predicted cluster and overlay the fitted
# centroids as large translucent black markers.
centers = kmeans.cluster_centers_
plt.figure(figsize=(6,5))
plt.scatter(*X.T, c=y_kmeans, s=50, cmap='viridis')
plt.scatter(*centers.T, c='black', s=200, alpha=0.5)
plt.gca().set(title='Hasil k-Means (k=4)', xlabel='x1', ylabel='x2')
plt.show()

## Algoritma Expectation–Maximization (implementasi sederhana)
Implementasi sederhana: pilih centroid awal secara acak, kemudian lakukan penugasan label dan pembaruan centroid secara bergantian sampai konvergen.

In [None]:
def find_clusters(X, n_clusters, rseed=2):
    """Cluster the rows of ``X`` with a minimal k-Means (EM-style) loop.

    The initial centers are ``n_clusters`` distinct rows of ``X`` picked with
    a ``RandomState`` seeded by ``rseed``. The loop then alternates between
    assigning every sample to its nearest center and moving each center to
    the mean of the samples assigned to it, and stops once the centers no
    longer change (as judged by ``np.allclose``).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Samples to cluster, one per row.
    n_clusters : int
        Number of clusters; must satisfy ``1 <= n_clusters <= n_samples``.
    rseed : int, default=2
        Seed for the random choice of the initial centers.

    Returns
    -------
    centers : ndarray of shape (n_clusters, n_features)
        Final cluster centers.
    labels : ndarray of shape (n_samples,)
        Index of the nearest center for every sample.

    Raises
    ------
    ValueError
        If ``n_clusters`` is smaller than 1 or larger than the number of
        samples (there would not be enough rows to seed the centers).
    """
    n_samples = X.shape[0]
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            f"n_clusters must be between 1 and n_samples={n_samples}, "
            f"got {n_clusters}"
        )

    rng = np.random.RandomState(rseed)
    centers = X[rng.permutation(n_samples)[:n_clusters]]

    while True:
        # Assignment step: nearest center for every sample.
        labels = pairwise_distances_argmin(X, centers)

        # Update step: move each center to the mean of its members. A cluster
        # that lost all its points keeps its previous center; checking this
        # explicitly avoids taking the mean of an empty slice (NaN + warning).
        updated = []
        for j in range(n_clusters):
            members = X[labels == j]
            updated.append(members.mean(axis=0) if len(members) else centers[j])
        new_centers = np.array(updated)

        # Converged: `labels` was computed from `centers`, so both can be
        # returned as-is without another distance pass.
        if np.allclose(centers, new_centers):
            return centers, labels
        centers = new_centers

# Run the hand-written k-Means with its default seed and plot the resulting
# cluster assignment.
centers, labels = find_clusters(X, n_clusters=4)
plt.figure(figsize=(6,5))
plt.scatter(*X.T, c=labels, s=50, cmap='viridis')
plt.gca().set(title='Hasil Implementasi Manual find_clusters')
plt.show()

### Perubahan seed random (hasil bisa berbeda)
Contoh mengganti `rseed` untuk melihat perbedaan inisialisasi.

In [None]:
# Same algorithm, but seeded with 0 so a different set of rows is picked as
# the initial centers.
centers2, labels2 = find_clusters(X, n_clusters=4, rseed=0)
plt.figure(figsize=(6,5))
plt.scatter(*X.T, c=labels2, s=50, cmap='viridis')
plt.gca().set(title='find_clusters with rseed=0')
plt.show()

### Optimisasi jumlah klaster
Jika kita tidak tahu jumlah cluster, kita bisa eksperimen dengan beberapa nilai `k`.

In [None]:
# Ask k-Means for 6 clusters on data that was generated from 4 blobs.
kmeans_k6 = KMeans(n_clusters=6, random_state=0)
labels_k6 = kmeans_k6.fit_predict(X)

plt.figure(figsize=(6,5))
plt.scatter(*X.T, c=labels_k6, s=50, cmap='viridis')
plt.gca().set(title='k-Means with k=6')
plt.show()

## Batas klaster yang tidak selalu linier
Contoh data `make_moons`, di mana klaster berbentuk nonlinier (half-moon).

In [None]:
# Two interleaving half-moons with a little noise, clustered with plain
# 2-cluster k-Means.
X_moons, y_moons = make_moons(n_samples=200, noise=.05, random_state=0)
kmeans_moons = KMeans(n_clusters=2, random_state=0)
labels_moons = kmeans_moons.fit_predict(X_moons)

plt.figure(figsize=(6,5))
plt.scatter(*X_moons.T, c=labels_moons, s=50, cmap='viridis')
plt.gca().set(title='k-Means pada make_moons (k=2)')
plt.show()

In [None]:
# Spectral clustering on a nearest-neighbors affinity graph; the final
# label assignment inside the estimator is done with k-Means.
model = SpectralClustering(
    n_clusters=2,
    affinity='nearest_neighbors',
    assign_labels='kmeans',
    random_state=0,
)
labels_spec = model.fit_predict(X_moons)

plt.figure(figsize=(6,5))
plt.scatter(*X_moons.T, c=labels_spec, s=50, cmap='viridis')
plt.gca().set(title='Spectral Clustering pada make_moons (k=2)')
plt.show()

## Studi Kasus 1 — Clustering pada dataset digit (load_digits)
Gunakan k-Means untuk mengelompokkan gambar digit (0-9).

In [None]:
# Load the digits dataset bundled with scikit-learn; the trailing expression
# makes the notebook display the shape of its feature matrix.
digits = load_digits()
digits.data.shape

In [None]:
# Terapkan K-Means (k=10)
kmeans_digits = KMeans(n_clusters=10, random_state=0)
clusters = kmeans_digits.fit_predict(digits.data)
kmeans_digits.cluster_centers_.shape

In [None]:
# View each of the 10 centroids as an 8x8 image, one per panel of a 2x5 grid.
centers = kmeans_digits.cluster_centers_.reshape(10, 8, 8)
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
for panel, centroid_image in zip(ax.ravel(), centers):
    panel.imshow(centroid_image, interpolation='nearest', cmap=plt.cm.binary)
    panel.set(xticks=[], yticks=[])
plt.suptitle('Centroids (kmeans) sebagai gambar 8x8')
plt.show()

In [None]:
# Permutasi label ke label mayoritas pada tiap cluster
labels = np.zeros_like(clusters)
for i in range(10):
    mask = (clusters == i)
    if mask.any():
        labels[mask] = mode(digits.target[mask])[0]

accuracy_score(digits.target, labels)

In [None]:
# Confusion matrix (true vs predicted after permutasi)
mat = confusion_matrix(digits.target, labels)
plt.figure(figsize=(8,6))
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=digits.target_names, yticklabels=digits.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.title('Confusion Matrix (permute clusters -> labels)')
plt.show()

### t-SNE untuk visualisasi dan clustering pada proyeksi 2D

In [None]:
# Project the digit features to 2-D with t-SNE, then cluster the projection.
tsne = TSNE(n_components=2, init='random', random_state=0)
digits_proj = tsne.fit_transform(digits.data)

kmeans_proj = KMeans(n_clusters=10, random_state=0)
clusters_proj = kmeans_proj.fit_predict(digits_proj)

# Relabel every cluster with the most frequent true digit among its members.
labels_proj = np.zeros_like(clusters_proj)
for cluster_id in np.unique(clusters_proj):  # only clusters that occur
    members = clusters_proj == cluster_id
    majority = mode(digits.target[members]).mode
    labels_proj[members] = majority

accuracy_score(digits.target, labels_proj)

## Studi Kasus 2 — Kompresi Citra (reduksi warna)
Gunakan `load_sample_image` untuk mengambil contoh gambar, kemudian lakukan reduksi warna dengan k-Means / MiniBatchKMeans.

In [None]:
# Load the sample photo shipped with scikit-learn and show it without axes.
flower = load_sample_image("flower.jpg")
plt.figure(figsize=(6,5))
plt.imshow(flower)
plt.gca().set(title='Sample image: flower.jpg')
plt.axis('off')
plt.show()

In [None]:
# Display the dimensions of the image array.
flower.shape

In [None]:
# Rescale the channel values by 1/255 and flatten the image into one row of
# 3 color components per pixel; the trailing expression displays the shape.
n_pixels = flower.shape[0] * flower.shape[1]
data = (flower / 255.0).reshape(n_pixels, 3)
data.shape

In [None]:
def plot_pixels(data, title, colors=None, N=10000):
    """Scatter a random sample of pixels in Red-Green and Red-Blue space.

    Parameters
    ----------
    data : ndarray with 3 columns
        Pixel values, one pixel per row; the columns are unpacked as the
        red, green and blue coordinates of each point.
    title : str
        Figure-level title.
    colors : ndarray, optional
        Color used to draw each pixel, row-aligned with ``data``. When
        omitted, every pixel is drawn in its own color (``data``).
    N : int, default=10000
        Maximum number of pixels to plot; they are picked with a fixed-seed
        permutation, so the same subset is drawn on every call.
    """
    palette = data if colors is None else colors

    # Fixed seed: repeated calls sample exactly the same pixels.
    sample = np.random.RandomState(0).permutation(data.shape[0])[:N]
    palette = palette[sample]
    red, green, blue = data[sample].T

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    panels = ((green, 'Green'), (blue, 'Blue'))
    for axis, (channel, channel_name) in zip(axes, panels):
        axis.scatter(red, channel, color=palette, marker='.')
        axis.set(xlabel='Red', ylabel=channel_name, xlim=(0, 1), ylim=(0, 1))

    fig.suptitle(title, size=20)
    plt.show()

In [None]:
# Plot a sample of the original pixels, each drawn in its own color.
plot_pixels(data, title='Input color space: 16 million possible colors')

In [None]:
# Learn a 16-color palette with mini-batch k-Means (fit() returns the
# estimator), then replace every pixel by the centroid of its cluster.
kmeans_mb = MiniBatchKMeans(n_clusters=16, random_state=0).fit(data)
palette_16 = kmeans_mb.cluster_centers_
new_colors = palette_16[kmeans_mb.predict(data)]
plot_pixels(data, colors=new_colors, title='Reduced color space: 16 colors')

In [None]:
# Fold the recolored pixels back into the image shape and show the original
# and the 16-color version side by side, without axis ticks.
flower_recolored = new_colors.reshape(flower.shape)
fig, ax = plt.subplots(1, 2, figsize=(16, 6), subplot_kw=dict(xticks=[], yticks=[]))
fig.subplots_adjust(wspace=0.05)
comparison = (
    (flower, 'Original Image'),
    (flower_recolored, '16-color Image'),
)
for panel, (image, caption) in zip(ax, comparison):
    panel.imshow(image)
    panel.set_title(caption, size=16)
plt.show()

### Selesai
Notebook ini menunjukkan berbagai pendekatan clustering: k-Means standar, implementasi manual (EM-like), pemilihan k, pendekatan untuk klaster non-linier (Spectral Clustering), aplikasi pada digit, serta aplikasi pada kompresi citra.