In [1]:
#Importing Necessary Libraries
import numpy as np
import pandas as pd

## Creating a custom k-means class

In [2]:
class KMeansCustom:
    """
    Custom implementation of the k-means clustering algorithm.

    Each iteration assigns every sample to its nearest centroid (Euclidean
    distance) and then moves every centroid to the mean of its assigned samples.

    Parameters:
    - n_clusters (int): The number of clusters.
    - initial_centers (array-like, optional): Initial cluster centers with shape
      (n_clusters, n_features). If not provided, n_clusters distinct rows of the
      data passed to fit() are drawn at random.
    - random_state (int, optional): Seed for the random initialization. Defaults to None.

    Attributes:
    - k (int): The number of clusters.
    - centroids (ndarray or None): Current cluster centers; None until centers
      are supplied or fit() has been called.
    - labels (ndarray or None): Cluster index of each sample from the last fit().
    - random_state (int or None): Seed used for the random initialization.
    """

    def __init__(self, n_clusters, initial_centers=None, random_state=None):
        self.k = n_clusters
        self.centroids = initial_centers
        self.labels = None
        self.random_state = random_state

    def _assign_labels(self, X):
        """Return, for each row of X, the index of the nearest current centroid."""
        # Broadcasting (n, 1, d) - (k, d) -> (n, k, d); the norm over the last
        # axis gives the (n, k) matrix of sample-to-centroid distances.
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def fit(self, X, max_iterations=100):
        """
        Fit the k-means model to the input data.

        If centroids are already set (passed to the constructor or left over
        from an earlier fit), they are used as the starting point; otherwise
        n_clusters distinct rows of X are sampled as the initial centroids.

        Parameters:
        - X (array-like): Input data with shape (n_samples, n_features).
        - max_iterations (int, optional): Maximum number of iterations. Defaults to 100.

        Raises:
        - ValueError: If X is not 2-dimensional, or if the centroids do not have
          shape (n_clusters, n_features).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features), got shape {X.shape}")

        if self.centroids is None:
            # A private RandomState yields the same draws as seeding the global
            # generator with the same seed, but leaves the caller's global
            # np.random state untouched.
            rng = np.random.RandomState(self.random_state)
            self.centroids = X[rng.choice(X.shape[0], self.k, replace=False)]
        else:
            self.centroids = np.asarray(self.centroids, dtype=float)

        expected_shape = (self.k, X.shape[1])
        if self.centroids.shape != expected_shape:
            raise ValueError(
                f"centroids must have shape {expected_shape}, got {self.centroids.shape}")

        for _ in range(max_iterations):
            # Assign each data point to the nearest centroid
            self.labels = self._assign_labels(X)

            # Update centroids based on mean of assigned points. A cluster that
            # received no points keeps its previous centroid: the mean of an
            # empty selection is NaN, which would corrupt every later distance
            # computation and make the convergence test below unsatisfiable.
            new_centroids = self.centroids.copy()
            for i in range(self.k):
                members = X[self.labels == i]
                if len(members) > 0:
                    new_centroids[i] = members.mean(axis=0)

            # Check for convergence (centroids stopped moving exactly)
            if np.array_equal(new_centroids, self.centroids):
                break

            self.centroids = new_centroids
        else:
            # Iteration budget exhausted without convergence: the labels above
            # were computed against the previous centroids, so refresh them to
            # match the centroids that are actually stored.
            self.labels = self._assign_labels(X)

    def get_centers(self):
        """
        Get the cluster centers.

        Returns:
        - ndarray: Cluster centers.
        """
        return self.centroids

    def get_labels(self):
        """
        Get the labels assigned to each data point.

        Returns:
        - ndarray: Array of labels.
        """
        return self.labels


## Testing the code

In [3]:
# Load the iris dataset from the local CSV file
df = pd.read_csv("datasets/iris.csv")

# Feature matrix: every column except the last one (the category label)
X = df.iloc[:, :-1].values


### With randomly initialized centroids (at most 200 iterations)

In [4]:
# One randomly-initialised model per cluster count, both seeded identically
k_means_random_2 = KMeansCustom(random_state=42, n_clusters=2)
k_means_random_3 = KMeansCustom(random_state=42, n_clusters=3)

# Fit each model, allowing at most 200 iterations
k_means_random_2.fit(X, max_iterations=200)
k_means_random_3.fit(X, max_iterations=200)

In [5]:
# Report centres and labels for both randomly-initialised models,
# one item per output line, each section introduced by a dashed rule.
print(
    "-" * 100,
    "For k = 2",
    f"Cluster centers:- {k_means_random_2.get_centers()}",
    f"Labels:- {k_means_random_2.get_labels()}",
    "-" * 100,
    "For k = 3",
    f"Cluster centers:- {k_means_random_3.get_centers()}",
    f"Labels:- {k_means_random_3.get_labels()}",
    sep="\n",
)


----------------------------------------------------------------------------------------------------
For k = 2
Cluster centers:- [[6.30103093 2.88659794 4.95876289 1.69587629]
 [5.00566038 3.36981132 1.56037736 0.29056604]]
Labels:- [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
----------------------------------------------------------------------------------------------------
For k = 3
Cluster centers:- [[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.428      1.462      0.246     ]
 [6.85       3.07368421 5.74210526 2.07105263]]
Labels:- [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 

### With user-supplied initial centroids (at most 200 iterations)

In [6]:
# Hand-picked starting centres, one row per cluster
initial_centroid_2 = np.array([
    [4.0, 3.0, 1.3, 0.2],
    [6.0, 3.0, 4.8, 1.8],
])
initial_centroid_3 = np.array([
    [4.0, 3.0, 1.3, 0.2],
    [6.0, 3.0, 4.8, 1.8],
    [7.0, 3.2, 6.0, 2.0],
])

# One model per cluster count, each starting from its supplied centres
k_means_custom_2 = KMeansCustom(
    n_clusters=2, initial_centers=initial_centroid_2, random_state=42)
k_means_custom_3 = KMeansCustom(
    n_clusters=3, initial_centers=initial_centroid_3, random_state=42)

# Fit each model, allowing at most 200 iterations
k_means_custom_2.fit(X, max_iterations=200)
k_means_custom_3.fit(X, max_iterations=200)

In [7]:
# Report centres and labels for both models fitted from supplied centres,
# one item per output line, each section introduced by a dashed rule.
print(
    "-" * 100,
    "For k = 2",
    f"Cluster centers:- {k_means_custom_2.get_centers()}",
    f"Labels:- {k_means_custom_2.get_labels()}",
    "-" * 100,
    "For k = 3",
    f"Cluster centers:- {k_means_custom_3.get_centers()}",
    f"Labels:- {k_means_custom_3.get_labels()}",
    sep="\n",
)


----------------------------------------------------------------------------------------------------
For k = 2
Cluster centers:- [[5.00566038 3.36981132 1.56037736 0.29056604]
 [6.30103093 2.88659794 4.95876289 1.69587629]]
Labels:- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]
----------------------------------------------------------------------------------------------------
For k = 3
Cluster centers:- [[5.006      3.428      1.462      0.246     ]
 [5.9016129  2.7483871  4.39354839 1.43387097]
 [6.85       3.07368421 5.74210526 2.07105263]]
Labels:- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 