### K-Means Clustering

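K-Means is an unsupervised machine learning algorithm that partitions a dataset into K clusters based on similarity (here, Euclidean distance), assigning each data point to the cluster with the nearest centroid. It iteratively seeks to minimize the sum of squared distances from each data point to the centroid of its assigned cluster; the result is a local (not necessarily global) minimum that depends on the initial centroids.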

In [4]:
from sklearn.cluster import KMeans
import numpy as np

# Reproducible toy dataset: 100 two-dimensional points sampled uniformly from [0, 1)
np.random.seed(42)
X = np.random.rand(100, 2)

# Build a 3-cluster estimator and fit it in one step (fit() returns the estimator itself)
kmeans = KMeans(n_clusters=3, random_state=42, n_init='auto').fit(X)

# Pull the learned cluster centers and the per-sample labels off the fitted estimator
centroids, cluster_assignments = kmeans.cluster_centers_, kmeans.labels_

print("Centroids:", centroids)
print("Cluster Assignments:", cluster_assignments)

Centroids: [[0.8039633  0.57026999]
 [0.36376248 0.20008043]
 [0.18520943 0.72228065]]
Cluster Assignments: [2 0 1 2 0 2 0 1 2 1 1 1 2 2 1 1 2 0 1 0 2 2 2 2 1 0 0 0 1 1 1 0 2 2 2 0 2
 0 1 1 0 1 1 0 0 2 0 0 1 1 2 2 0 2 1 1 0 0 0 0 0 1 1 0 2 1 1 0 0 2 0 1 1 0
 1 0 2 0 0 2 2 0 2 1 2 1 1 1 0 0 1 0 2 0 1 2 0 1 0 0]


#### K-Means clustering algorithm from scratch 

In [5]:
import numpy as np

class KMeansScratch:
    """Minimal K-Means clustering (Lloyd's algorithm) implemented with NumPy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and centroids) to form.
    max_iters : int, default=100
        Upper bound on the number of assign/update iterations run by ``fit``.
    random_state : int or None, default=None
        Seed used to pick the initial centroids. With ``None`` the draw comes
        from NumPy's global random state.

    Attributes
    ----------
    centroids : ndarray of shape (n_clusters, n_features) or None
        Current centroid coordinates; ``None`` until ``fit`` is called.
    cluster_assignments : ndarray of shape (n_samples,) or None
        Cluster index of every training sample from the last assignment step
        of ``fit``; ``None`` until ``fit`` has run at least one iteration.
    """

    def __init__(self, n_clusters, max_iters=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.random_state = random_state
        self.centroids = None
        self.cluster_assignments = None

    def initialize_centroids(self, X):
        """Pick ``n_clusters`` distinct rows of ``X`` as the starting centroids.

        Raises ``ValueError`` (from NumPy) when ``n_clusters`` exceeds the
        number of rows in ``X``.
        """
        if self.random_state is not None:
            # A private RandomState yields the same draw that seeding the global
            # generator would, without clobbering the caller's global RNG state.
            rng = np.random.RandomState(self.random_state)
        else:
            rng = np.random
        indices = rng.choice(len(X), self.n_clusters, replace=False)
        return X[indices]

    def assign_to_clusters(self, X):
        """Return, for every row of ``X``, the index of the nearest centroid."""
        # Broadcasting (n_samples, 1, n_features) against (n_clusters, n_features)
        # gives a (n_samples, n_clusters) matrix of Euclidean distances.
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def update_centroids(self, X):
        """Return each cluster's new centroid as the mean of its members.

        A cluster with no members keeps its current centroid; averaging an
        empty selection would otherwise turn that centroid into NaN.
        """
        new_centroids = []
        for k in range(self.n_clusters):
            members = X[self.cluster_assignments == k]
            if len(members) > 0:
                new_centroids.append(members.mean(axis=0))
            else:
                new_centroids.append(self.centroids[k])
        return np.array(new_centroids)

    def fit(self, X):
        """Cluster ``X`` and store the resulting centroids and assignments.

        Alternates assignment and centroid-update steps until the assignments
        stop changing or ``max_iters`` iterations have been run.

        Returns
        -------
        self : KMeansScratch
            The fitted instance, so calls can be chained.
        """
        X = np.asarray(X)
        self.centroids = self.initialize_centroids(X)
        # Forget labels from any earlier fit so they cannot trigger a false
        # "converged" match on the very first iteration.
        self.cluster_assignments = None
        for _ in range(self.max_iters):
            prev_assignments = self.cluster_assignments
            self.cluster_assignments = self.assign_to_clusters(X)
            self.centroids = self.update_centroids(X)
            if np.array_equal(prev_assignments, self.cluster_assignments):
                break
        return self

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.centroids is None:
            raise ValueError("KMeansScratch is not fitted yet; call fit() before predict().")
        return self.assign_to_clusters(np.asarray(X))

# Example usage:
# Reproducible toy dataset: 100 two-dimensional points sampled uniformly from [0, 1)
np.random.seed(42)
X = np.random.rand(100, 2)

# Build the from-scratch model with 3 clusters, then run the clustering on X
kmeans_scratch = KMeansScratch(n_clusters=3, random_state=42)
kmeans_scratch.fit(X)

# Read the fitted centroids and per-sample cluster indices off the model
centroids_scratch, cluster_assignments_scratch = (
    kmeans_scratch.centroids,
    kmeans_scratch.cluster_assignments,
)

print("Centroids (from scratch):", centroids_scratch)
print("Cluster Assignments (from scratch):", cluster_assignments_scratch)

Centroids (from scratch): [[0.36376248 0.20008043]
 [0.19671223 0.72161646]
 [0.81167067 0.56668218]]
Cluster Assignments (from scratch): [1 2 0 1 2 1 2 0 1 0 0 0 1 1 0 0 1 2 0 2 1 1 1 1 0 2 2 2 0 0 0 2 1 1 1 2 1
 2 0 0 2 0 0 2 2 1 2 2 0 0 1 1 2 1 0 0 2 2 2 2 2 0 0 2 1 0 0 2 1 1 2 0 0 2
 0 2 1 2 2 1 1 2 1 0 1 0 0 0 2 2 0 2 1 2 0 1 2 0 2 2]
