<a href="https://colab.research.google.com/github/JinzhiT/5750-project-2/blob/main/project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Math 5750/6880: Mathematics of Data Science \
Project 2

# 1. Clustering Gaussian Blobs using $k$-means

In [None]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Synthetic benchmark: 1000 samples drawn from 5 Gaussian blobs in 10 dimensions.
blob_params = dict(
    n_samples=1000,
    centers=5,
    n_features=10,
    cluster_std=1.5,
    random_state=1,        # reproducibility
)
X, y_true = make_blobs(**blob_params)

# Standardize every feature (zero mean, unit variance) before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Sanity check: report container type and shape of the features and labels.
for arr in (X, y_true):
    print(type(arr), arr.shape)

<class 'numpy.ndarray'> (1000, 10)
<class 'numpy.ndarray'> (1000,)


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment


# Number of clusters to fit (matches the number of generated blobs).
k = 5

# Regenerate the blob data inside this cell so it runs on its own.
X, y_true = make_blobs(
    n_samples=1000,
    centers=k,
    n_features=10,
    cluster_std=1.5,
    random_state=1,
)
X = StandardScaler().fit_transform(X)

# Fit k-means with 20 restarts; scikit-learn keeps the run with the lowest inertia.
kmeans = KMeans(n_clusters=k, n_init=20, random_state=1)
kmeans.fit(X)
labels_pred = kmeans.labels_

best_inertia = round(kmeans.inertia_, 3)
print("Smallest inertia (k=5):", best_inertia)  # ~924.316

# --- Align the arbitrary k-means cluster ids with the ground-truth blob ids ---
# Rows of cm_raw are true classes, columns are predicted cluster ids.
cm_raw = confusion_matrix(y_true, labels_pred, labels=np.arange(k))

# Optimal one-to-one assignment: the counts are negated so that minimizing the
# cost maximizes the number of samples on the matched diagonal.
row_ind, col_ind = linear_sum_assignment(-cm_raw)     # maximize correct

# cluster id -> true class id, kept as a dict for inspection.
mapping = {pred: true for true, pred in zip(row_ind, col_ind)}

# Relabel with an integer lookup table and fancy indexing instead of
# np.vectorize over a Python lambda: this is a single vectorized operation and
# also works when labels_pred is empty. col_ind is a permutation of 0..k-1
# (cm_raw is k x k), so every entry of the table is filled.
cluster_to_class = np.empty_like(col_ind)
cluster_to_class[col_ind] = row_ind
labels_matched = cluster_to_class[labels_pred]

cm_matched = confusion_matrix(y_true, labels_matched, labels=np.arange(k))
# Accuracy after matching = diagonal mass / total number of samples.
print("Matched accuracy:", np.trace(cm_matched) / np.sum(cm_matched))

# Project the data and the fitted centroids onto the first two principal
# components to get a 2-D view of the clustering.
pca = PCA(n_components=2, random_state=1)
pca.fit(X)
X_2d = pca.transform(X)
centers_2d = pca.transform(kmeans.cluster_centers_)

fig, ax = plt.subplots(figsize=(7, 5))
# Samples colored by their k-means cluster id.
ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels_pred, s=10)
# Cluster centers drawn as large black-edged "X" markers.
ax.scatter(
    centers_2d[:, 0],
    centers_2d[:, 1],
    marker="X",
    s=200,
    linewidths=1,
    edgecolors="black",
)
ax.set_title("K-means (k=5) on Gaussian Blobs — PCA 2D view")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.tight_layout()
fig.savefig("pca_clusters_centers.png", dpi=200)
plt.close(fig)  # figure is written to disk only, not shown inline

# Elbow analysis: k-means inertia for k = 1..7 (3 restarts per value of k).
k_values = list(range(1, 8))
inertias = [
    KMeans(n_clusters=n_clusters, n_init=3, random_state=1).fit(X).inertia_
    for n_clusters in k_values
]

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(k_values, inertias, marker="o")
ax.set_title("Elbow Analysis (k vs. inertia)")
ax.set_xlabel("k")
ax.set_ylabel("Inertia")
ax.set_xticks(k_values)  # one tick per evaluated k
fig.tight_layout()
fig.savefig("elbow_plot.png", dpi=200)
plt.close(fig)  # figure is written to disk only, not shown inline

# Heatmap of the confusion matrix computed after cluster/label matching.
fig, ax = plt.subplots(figsize=(6, 5))
heatmap = ax.imshow(cm_matched, aspect="auto")
fig.colorbar(heatmap, ax=ax)
ax.set_title("Confusion Matrix (true vs. matched predicted labels)")
ax.set_xlabel("Predicted (matched)")
ax.set_ylabel("True")

# One tick per class id on both axes, labelled 0..k-1.
class_ticks = np.arange(k)
class_names = [str(i) for i in range(k)]
ax.set_xticks(class_ticks)
ax.set_xticklabels(class_names)
ax.set_yticks(class_ticks)
ax.set_yticklabels(class_names)

fig.tight_layout()
fig.savefig("confusion_matrix_matched.png", dpi=200)
plt.close(fig)  # figure is written to disk only, not shown inline


Smallest inertia (k=5): 924.316
Matched accuracy: 1.0


# 2. Clustering Fashion-MNIST using $k$-means

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Fetch the Fashion-MNIST data set from OpenML as plain NumPy arrays.
# Classes (0-9): T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle boot
X, y = fetch_openml(
    name="Fashion-MNIST",
    version=1,
    return_X_y=True,
    as_frame=False,
    parser="auto",
)
# Cast the targets to integer class ids.
y = y.astype(int)

# Sanity check: report container type and shape of the features and labels.
for arr in (X, y):
    print(type(arr), arr.shape)

<class 'numpy.ndarray'> (70000, 784)
<class 'numpy.ndarray'> (70000,)


In [None]:
# your code here

# 3. Dimensionality reduction for Fashion-MNIST

In [None]:
# your code here

# 4. Clustering Fashion-MNIST using spectral clustering

In [None]:
# your code here