1. sklearn

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train & Predict
clf = KMeans(n_clusters=2, random_state=0, n_init="auto")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6888888888888889


2. scratch

In [4]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

class KMeansScratch:
    def __init__(self, n_clusters=3, max_iters=100, tol=1e-4):
        self.k = n_clusters #有几个类
        self.max_iters = max_iters #迭代次数
        self.tol = tol #容差，容错性
        self.centroids = None # kmeans中心
        self.labels_ = None

    def fit(self, X):
        X = np.array(X)
        n_samples, n_features = X.shape

        # Initialize centroids randomly from samples
        random_indices = np.random.choice(n_samples, self.k, replace=False) #不放回取样
        print(random_indices)
        self.centroids = X[random_indices]

        for iteration in range(self.max_iters):
            # Assign clusters
            labels = self._assign_clusters(X)

            # Recompute centroids
            new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(self.k)])

            # Check convergence
            if np.all(np.abs(new_centroids - self.centroids) < self.tol):
                break

            self.centroids = new_centroids

        self.labels_ = self._assign_clusters(X)

    def _assign_clusters(self, X):
        distances = self._compute_distances(X)
        return np.argmin(distances, axis=1)

    def _compute_distances(self, X):
        return np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)

    def predict(self, X):
        return self._assign_clusters(np.array(X))



# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Reduce to 2 clusters for comparison with your sklearn example
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train KMeans from scratch
clf = KMeansScratch(n_clusters=2)
clf.fit(X_train)
y_pred = clf.predict(X_test)

# Accuracy
print("Scratch KMeans Accuracy:", accuracy_score(y_test, y_pred))


[94 19]
Scratch KMeans Accuracy: 0.022222222222222223
