### Implementing K-Means Clustering

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# Synthetic 2-D dataset: 300 points scattered around 3 blob centers.
X, y_true = make_blobs(
    n_samples=300,     # total number of points
    centers=3,         # number of blobs to generate
    n_features=2,      # two features so the data can be drawn directly
    cluster_std=1.0,   # spread of each blob
    random_state=42,   # fixed seed so the data is reproducible
)

# Scatter the points, coloured by the generating blob label.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', s=50)
ax.set_title("Dummy 2D dataset with 3 clusters")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()


### Scikit-learn's implementation

In [None]:
from sklearn.cluster import KMeans

# Fit a 3-cluster K-Means model (seeded) on the data; fit() returns the estimator itself.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)

# Per-sample cluster labels and the fitted cluster centers.
y_pred, centers = kmeans.labels_, kmeans.cluster_centers_

# Draw the points coloured by assigned cluster, with the centers overlaid as red X markers.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis', s=50)
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X')
ax.set_title("K-Means clustering on 2D dataset")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()


In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the labels in y_pred (set from kmeans.labels_ when cells run in order).
score = silhouette_score(X, y_pred)
print("Silhouette score:", score)


### Implementation from scratch

In [None]:
## Available from the cells above: X (data), y_pred (KMeans labels), y_true (blob labels)
# Show the shape of the data array as the cell output.
X.shape

In [None]:
# Unpack the two dimensions of X; n_features sizes the centroid arrays created below.
n_examples, n_features = X.shape

In [None]:
# Number of clusters for the from-scratch version (same value given to KMeans earlier).
n_clusters = 3

In [None]:
# Naive first attempt: standard-normal centroids, ignoring the scale of the data.
# Drawn from a seeded Generator so the displayed values are identical on every run.
centroids = np.random.default_rng(0).standard_normal((n_clusters, n_features))

In [None]:
# Display the standard-normal initial centroids.
centroids

In [None]:
# Per-feature lower and upper bounds of the data.
X_min = X.min(axis=0)
X_max = X.max(axis=0)

# Initialize centroids uniformly at random within the data range.
# np.random.rand() accepts only dimensions (a `seed` keyword raises TypeError),
# so a seeded Generator provides the reproducible uniform [0, 1) draws instead.
centroids = X_min + (X_max - X_min) * np.random.default_rng(2).random((n_clusters, n_features))

In [None]:
# Display the centroids initialised within the data range.
centroids

In [None]:
# Pairwise Euclidean distances of shape (n_examples, n_clusters):
# D[i, j] is the distance from sample X[i] to centroids[j].
# (D was previously used without being defined, which raised NameError.)
D = np.linalg.norm(X[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2)

# Assign every sample to the index of its nearest centroid.
y_train_pred = np.argmin(D, axis=1)
print(y_train_pred.shape)


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of the cluster indices stored in y_train_pred.
plt.hist(y_train_pred)

In [None]:
# Silhouette score of the y_train_pred labelling.
# NOTE(review): this rebinds `score`, replacing the value computed for y_pred earlier.
score = silhouette_score(X, y_train_pred)
print("Silhouette score:", score)


In [None]:
def loop_until_convergence(X, centroids, n_clusters=3, tol=1e-3, n_iter=100):
    """Refine centroids with K-Means (Lloyd) iterations until they stop moving.

    Each iteration assigns every sample to its nearest centroid (Euclidean
    distance, ties go to the lowest cluster index) and then moves each
    centroid to the mean of the samples assigned to it. A centroid with no
    assigned samples is left where it is. Iteration stops early once the
    norm of the total centroid movement falls below ``tol``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data to cluster.
    centroids : array-like with at least ``n_clusters`` rows of n_features values
        Initial centroid positions. The input is not modified; the function
        works on its own float copy.
    n_clusters : int, default=3
        Number of centroids (the leading rows of ``centroids``) to use.
    tol : float, default=1e-3
        Convergence threshold on the centroid shift.
    n_iter : int, default=100
        Maximum number of iterations.

    Returns
    -------
    centroids : ndarray of float
        Updated centroid positions.
    y_pred : ndarray of int, shape (n_samples,)
        Cluster index per sample, as assigned at the start of the last
        executed iteration (all zeros if ``n_iter`` is 0).
    """
    n_samples = X.shape[0]

    # Private float copy: keeps the caller's initial centroids intact and
    # prevents integer-typed input from truncating the computed means.
    centroids = np.array(centroids, dtype=float)

    # Defined before the loop so a result exists even when n_iter == 0.
    y_pred = np.zeros(n_samples, dtype=int)

    for iteration in range(n_iter):
        # Step 1: assign points to the nearest centroid.
        # distances[i, j] = ||X[i] - centroids[j]|| for the first n_clusters centroids.
        distances = np.linalg.norm(
            X[:, np.newaxis, :] - centroids[np.newaxis, :n_clusters, :], axis=2
        )
        y_pred = np.argmin(distances, axis=1)

        # Step 2: store previous centroids
        old_centroids = centroids.copy()

        # Step 3: update centroids based on current assignments
        for j in range(n_clusters):
            points = X[y_pred == j]
            if len(points) > 0:          # avoid empty cluster
                centroids[j] = points.mean(axis=0)

        # Step 4: check convergence (centroid shift)
        shift = np.linalg.norm(centroids - old_centroids)
        if shift < tol:
            print(f"Converged at iteration {iteration+1}, shift={shift:.6f}")
            break

    return centroids, y_pred


In [None]:
# Hand the function its own copy of the initial centroids so `centroids` stays
# intact and this cell gives the same result every time it is re-run.
# Note: y_pred is rebound here, replacing the scikit-learn labels from earlier.
final_centroids, y_pred = loop_until_convergence(X, centroids.copy(), n_clusters=n_clusters)
print("Final centroids:\n", final_centroids)

In [None]:
# Silhouette score of the labels returned by loop_until_convergence
# (y_pred was rebound to those labels in the preceding cell).
score = silhouette_score(X, y_pred)
print("Silhouette score:", score)
