In [96]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [120]:
def cost_func(x, centroids, idx):
    """Computes the cost function (inertia) for K-means."""
    m = len(x)
    J = np.sum(np.linalg.norm(x - centroids[idx], axis=1) ** 2)
    return J / m

In [121]:
def c_cent(x, centroids):
    """Assigns each point to the nearest centroid."""
    distances = np.linalg.norm(x[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)

In [122]:
def comp_cent(x, idx, k):
    """Computes new centroids as the mean of assigned points."""
    centroids = np.array([x[idx == i].mean(axis=0) if len(x[idx == i]) > 0 else np.zeros(x.shape[1]) for i in range(k)])
    return centroids

In [123]:
def init_cent(x, k):
    """K-Means++ initialization for better centroids."""
    centroids = [x[np.random.randint(x.shape[0])]]
    for _ in range(1, k):
        distances = np.min(np.linalg.norm(x[:, np.newaxis] - np.array(centroids), axis=2), axis=1)
        prob = distances ** 2 / np.sum(distances ** 2)
        next_centroid = x[np.random.choice(x.shape[0], p=prob)]
        centroids.append(next_centroid)
    return np.array(centroids)

In [124]:
def k_means(x, centroids, iter):
    """Performs K-means clustering."""
    k = len(centroids)
    for _ in range(iter):
        idx = c_cent(x, centroids)
        centroids = comp_cent(x, idx, k)
    return centroids, idx

In [133]:
def load_data(file_path):
    """Loads dataset from CSV and applies feature scaling."""
    data = pd.read_csv(file_path)
    x = data.values  # Load all columns as features
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    x = (x - mean) / std
    return x  # Ensure this is a NumPy array, not a tuple


In [140]:
def fit(x, k, epoch, iter):
    J_hist = []
    cent_hist = []
    for _ in range(epoch):
        centroids = init_cent(x, k)
        cent, idx = k_means(x, centroids, iter)
        cent_hist.append((cent, idx))
        J_hist.append(cost_func(x, cent, idx))
    
    best_idx = np.argmin(J_hist)
    best_centroids, best_labels = cent_hist[best_idx]
    
    return best_centroids, best_labels, J_hist[best_idx]  # Only returning 3 values


In [141]:
def silhouette_score(x, idx, k):
    """Computes silhouette score manually to find the best k."""
    scores = []
    for i in range(len(x)):
        same_cluster = x[idx == idx[i]]
        other_clusters = [x[idx == j] for j in range(k) if j != idx[i]]
        a = np.mean(np.linalg.norm(same_cluster - x[i], axis=1))
        b = min([np.mean(np.linalg.norm(cluster - x[i], axis=1)) for cluster in other_clusters])
        scores.append((b - a) / max(a, b))
    return np.mean(scores)

In [142]:
def find_optimal_k(x, min_k, max_k, epoch, iter):
    """Finds the optimal k using silhouette score."""
    best_k, best_score = None, -1
    for k in range(min_k, max_k + 1):
        _, idx, _ = fit(x, k, epoch, iter)
        score = silhouette_score(x, idx, k)
        print(f"k = {k}, Silhouette Score: {score}")
        if score > best_score:
            best_k, best_score = k, score
    return best_k

In [143]:
def plot_clusters(x, idx, centroids):
    """Plots the clustered data if it's 2D."""
    if x.shape[1] == 2:
        plt.scatter(x[:, 0], x[:, 1], c=idx, cmap='viridis', alpha=0.5)
        plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x', s=100)
        plt.title("K-Means Clustering")
        plt.show()
    else:
        print("Cannot plot data with more than 2 dimensions.")


In [144]:
def train_and_evaluate(file_path, min_k=2, max_k=10, epoch=10, iter=100):
    """Loads data, finds optimal k, trains K-means, and evaluates cost."""
    x = load_data(file_path)
    best_k = find_optimal_k(x, min_k, max_k, epoch, iter)
    print(f"Optimal k found: {best_k}")
    cent, idx, J_hist = fit(x, best_k, epoch, iter)
    print(f"Final Cost: {J_hist}")
    plot_clusters(x, idx, cent)
    return cent, idx, J_hist

In [145]:
train_and_evaluate(r"C:\Users\user\Downloads\driver-data.csv", min_k=2, max_k=10, epoch=10, iter=100)


k = 2, Silhouette Score: 0.4947151504802449
k = 3, Silhouette Score: 0.4263135004459815
k = 4, Silhouette Score: 0.4820089738869869
k = 5, Silhouette Score: 0.5098747841368713
k = 6, Silhouette Score: 0.5144932283416057
k = 7, Silhouette Score: 0.4542618015677831
k = 8, Silhouette Score: 0.46673773237995464
k = 9, Silhouette Score: 0.42352679629827833
k = 10, Silhouette Score: 0.42229739799063054
Optimal k found: 6
Final Cost: 0.5219678550653898
Cannot plot data with more than 2 dimensions.


(array([[ 0.86699059, -0.48395103, -0.38829204],
        [ 0.05142487, -0.47849131,  1.68282789],
        [-0.88341832, -0.48973022, -0.37249507],
        [ 0.88186648,  1.94911212,  0.00913077],
        [ 0.14599255,  1.90269816,  4.36656909],
        [-0.85728173,  1.95571619, -0.02695281]]),
 array([1, 1, 1, ..., 3, 3, 5]),
 np.float64(0.5219678550653898))