In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
import numpy as np

class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) on NumPy arrays.

    Parameters
    ----------
    n_clusters : int, default 3
        Number of clusters (and centroids) to form.
    max_iter : int, default 300
        Upper bound on assignment/update iterations performed by ``fit``.
    tol : float, default 0.0001
        Iteration stops once every centroid moved less than ``tol``
        (Euclidean distance) in a single update.
    random_state : int or None, default 42
        Seed for the private random generator that picks the initial
        centroids. The default reproduces the previously hard-coded seed;
        pass ``None`` for a non-deterministic initialization.

    Attributes
    ----------
    centroids : ndarray of shape (n_clusters, n_features), or None before fit
    labels : ndarray of shape (n_samples,), or None before fit
        Cluster index assigned to each training point in the last iteration.
    """

    def __init__(self, n_clusters=3, max_iter=300, tol=0.0001, random_state=42):
        self.n_clusters = n_clusters        # Number of clusters
        self.max_iter = max_iter            # Maximum number of iterations
        self.tol = tol                      # Tolerance for convergence
        self.random_state = random_state    # Seed used for centroid initialization
        self.centroids = None               # Will hold the centroids after fitting
        self.labels = None                  # Labels for each point

    def fit(self, X):
        """Compute centroids for ``X`` and store them on the instance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``n_clusters`` is not in
            ``[1, n_samples]``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if not 1 <= self.n_clusters <= len(X):
            raise ValueError(
                f"n_clusters={self.n_clusters} must be between 1 and "
                f"the number of samples ({len(X)})"
            )

        # Step 1: Pick distinct data points as the initial centroids.
        # A private RandomState is used instead of np.random.seed so that
        # fitting does not reset the caller's global NumPy random state.
        rng = np.random.RandomState(self.random_state)
        random_indices = rng.choice(len(X), self.n_clusters, replace=False)
        self.centroids = X[random_indices]

        for _ in range(self.max_iter):
            # Step 2: Assign labels based on closest centroid
            self.labels = self._assign_labels(X)

            # Step 3: Move each centroid to the mean of its members.
            # A cluster with no members keeps its previous centroid;
            # taking the mean of an empty selection would yield NaN.
            new_centroids = self.centroids.copy()
            for j in range(self.n_clusters):
                members = X[self.labels == j]
                if len(members):
                    new_centroids[j] = members.mean(axis=0)

            # Step 4: Store the update, then stop if no centroid moved by tol or more
            shift = np.linalg.norm(new_centroids - self.centroids, axis=1)
            self.centroids = new_centroids
            if np.all(shift < self.tol):
                break

    def _assign_labels(self, X):
        """Return, for each row of ``X``, the index of its nearest centroid."""
        X = np.asarray(X, dtype=float)
        # distances[k, i] = Euclidean distance from point i to centroid k
        distances = np.linalg.norm(X[None, :, :] - self.centroids[:, None, :], axis=2)

        # Ties resolve to the lowest centroid index (argmin returns the first minimum)
        return np.argmin(distances, axis=0)

    def predict(self, X):
        """Return the index of the closest fitted centroid for each row of ``X``."""
        return self._assign_labels(X)

# Example usage
if __name__ == "__main__":
    # Small hand-made dataset: nine 2-D points in three visibly
    # separated groups (swap in your own data here)
    X = np.array(
        [
            [1, 2],
            [2, 3],
            [3, 4],
            [8, 8],
            [9, 10],
            [10, 11],
            [25, 30],
            [24, 28],
            [27, 29],
        ]
    )

    # Fit a three-cluster model on the sample points
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)

    # Report the fitted centroids and the per-point cluster assignments
    print("Centroids:", kmeans.centroids)
    print("Labels:", kmeans.labels)


Centroids: [[25.33333333 29.        ]
 [ 2.          3.        ]
 [ 9.          9.66666667]]
Labels: [1 1 1 2 2 2 0 0 0]
