In [None]:
import numpy as np
import matplotlib . pyplot as plt
import random

In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The operands are subtracted element-wise, so they must support NumPy
    arithmetic with each other (for example two arrays of the same shape).
    """
    delta = x1 - x2
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)

def initialize_centroids(X, k):
    """Pick ``k`` distinct rows of ``X`` to serve as the starting centroids.

    Row indices are drawn without replacement with ``random.sample``, so the
    result depends on the state of the standard-library ``random`` generator.

    Parameters
    ----------
    X : two-dimensional array, one row per sample.
    k : number of centroids to draw.

    Returns
    -------
    Array holding the ``k`` selected rows of ``X``.
    """
    # Unpacking the shape also enforces that X is two-dimensional.
    row_count, _ = X.shape
    chosen_rows = random.sample(range(row_count), k)
    return X[chosen_rows]

def assign_clusters(X, centroids):
    """Label every sample with the index of its nearest centroid.

    All sample-to-centroid Euclidean distances are computed in one broadcast
    operation instead of a Python loop over every (sample, centroid) pair.

    Parameters
    ----------
    X : array-like, one row per sample (expected shape ``(n_samples, n_features)``).
    centroids : array-like, one row per centroid (expected shape ``(k, n_features)``).

    Returns
    -------
    numpy.ndarray of shape ``(n_samples,)`` and dtype ``int``
        Index of the closest centroid for each sample. Ties go to the lowest
        centroid index. A sample whose distances are all NaN or infinite is
        labelled 0, as is every sample when there are no centroids.
    """
    X = np.asarray(X)
    centroids = np.asarray(centroids)
    n_samples = X.shape[0]
    n_centroids = centroids.shape[0]

    # Nothing to compare: keep the all-zero labels.
    if n_samples == 0 or n_centroids == 0:
        return np.zeros(n_samples, dtype=int)

    # (n_samples, 1, ...) - (1, k, ...) broadcasts to every sample/centroid pair.
    deltas = X[:, np.newaxis] - centroids[np.newaxis]
    squared = (deltas ** 2).reshape(n_samples, n_centroids, -1)
    distances = np.sqrt(np.sum(squared, axis=2))

    # A NaN distance must never win the comparison; treat it as "infinitely far".
    distances = np.where(np.isnan(distances), np.inf, distances)

    # argmin returns the first minimum, i.e. the lowest centroid index on ties.
    return np.argmin(distances, axis=1).astype(int)

def update_centroids(X, clusters, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Parameters
    ----------
    X : two-dimensional array, one row per sample.
    clusters : array with one cluster label per row of ``X``.
    k : number of clusters.

    Returns
    -------
    numpy.ndarray of shape ``(k, n_features)``
        Row ``j`` is the mean of the samples labelled ``j``.

    Notes
    -----
    A cluster with no members has no mean. Rather than leaving its centroid at
    the all-zero origin (an arbitrary location that usually keeps the cluster
    empty), the centroid is re-seeded on a randomly chosen sample (via the
    standard-library ``random`` generator) so it can attract members on the
    next assignment pass. When ``X`` has no rows the centroid stays at zero.
    """
    n_samples, n_features = X.shape[0], X.shape[1]
    centroids = np.zeros((k, n_features))
    for j in range(k):
        members = X[clusters == j]
        if len(members) > 0:
            centroids[j] = np.mean(members, axis=0)
        elif n_samples > 0:
            # Empty cluster: restart it from an actual data point.
            centroids[j] = X[random.randrange(n_samples)]
    return centroids

In [None]:
def kmeans(X, k, max_iterations=100, tolerance=1e-4):
    """Cluster ``X`` into ``k`` groups by alternating assignment and update steps.

    Parameters
    ----------
    X : two-dimensional array, one row per sample.
    k : number of clusters.
    max_iterations : upper bound on assignment/update rounds (must be >= 1).
    tolerance : the loop stops once the summed Euclidean movement of all
        centroids in one round drops below this value.

    Returns
    -------
    (centroids, clusters)
        ``centroids`` is the result of the last update step; ``clusters`` is
        the assignment computed at the start of that same round, i.e. against
        the centroids from before the final update.

    Raises
    ------
    ValueError
        If ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        # Without at least one round there would be no assignment to return.
        raise ValueError("max_iterations must be at least 1")

    centroids = initialize_centroids(X, k)
    for iteration in range(1, max_iterations + 1):
        clusters = assign_clusters(X, centroids)
        prev_centroids = centroids.copy()
        centroids = update_centroids(X, clusters, k)
        centroid_diff = np.sum([
            euclidean_distance(centroids[j], prev_centroids[j])
            for j in range(k)
        ])
        if centroid_diff < tolerance:
            print(f" Converged after {iteration} iterations ")
            break
    else:
        # Only reached when the loop ran out without converging, so the two
        # status messages can never both be printed.
        print(f" Maximum iterations ({max_iterations}) reached ")
    return centroids, clusters
    
def plot_clusters(X, clusters, centroids, feature_names=None):
    """Scatter-plot the first two features of ``X`` coloured by cluster.

    Parameters
    ----------
    X : two-dimensional array; columns 0 and 1 are plotted.
    clusters : array with one cluster label per row of ``X``.
    centroids : two-dimensional array, one row per cluster; drawn as black
        ``X`` markers.
    feature_names : optional sequence; its first two entries label the axes.
        When missing or shorter than two entries, generic labels are used.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure(figsize=(10, 8))
    k = len(centroids)
    colors = plt.cm.rainbow(np.linspace(0, 1, k))
    for i in range(k):
        cluster_points = X[clusters == i]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                    s=50, c=[colors[i]], label=f'Cluster {i+1} ')

    # Draw the centroids once, after all clusters, so they sit on top and
    # contribute a single legend entry.
    plt.scatter(centroids[:, 0], centroids[:, 1],
                s=200, marker='X', c='black', label='Centroids')

    # Explicit None/length check so array-like feature_names work as well.
    if feature_names is not None and len(feature_names) >= 2:
        x_label, y_label = feature_names[0], feature_names[1]
    else:
        x_label, y_label = 'Feature 1', 'Feature 2'
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.title('K- means Clustering Results')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    return plt

def generate_material_data(n_samples=200):
    """Draw a synthetic two-feature data set made of three Gaussian blobs.

    The first two blobs each receive ``int(n_samples * 0.33)`` samples and the
    third receives the remainder. Samples come from NumPy's global random
    generator (``np.random.multivariate_normal``), one blob after another.

    Returns
    -------
    (X, y)
        ``X`` stacks the three blobs row-wise; ``y`` holds the float labels
        0.0, 1.0 and 2.0 identifying the blob each row came from.
    """
    share = int(n_samples * 0.33)
    remainder = n_samples - share - share

    # (mean, covariance, sample count) per material, in draw order.
    blob_specs = (
        ([8.5, 2.1], [[0.5, 0.1], [0.1, 0.3]], share),
        ([5.2, 8.7], [[0.4, -0.1], [-0.1, 0.6]], share),
        ([2.8, 5.5], [[0.3, 0.1], [0.1, 0.4]], remainder),
    )

    blobs = []
    for mean, cov, count in blob_specs:
        blobs.append(np.random.multivariate_normal(mean, cov, count))
    X = np.vstack(blobs)

    label_runs = [
        np.ones(count) * label
        for label, (_, _, count) in enumerate(blob_specs)
    ]
    y = np.hstack(label_runs)
    return X, y



In [None]:
if __name__ == "__main__":
    # Seed both generators used by the pipeline (``random`` picks the initial
    # centroids, ``np.random`` draws the data) so a fresh run reproduces the
    # same figures and numbers.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)

    X, true_labels = generate_material_data(300)
    k = 3
    feature_names = ['Hardness', 'Thermal Conductivity']
    centroids, clusters = kmeans(X, k)
    plt_obj = plot_clusters(X, clusters, centroids, feature_names)

    # Ground-truth labels arrive as floats; convert once for indexing.
    true_classes = true_labels.astype(int)
    n_true = int(true_classes.max()) + 1

    # Second figure: the same points coloured by their true material type.
    plt.figure(figsize=(10, 8))
    for i in range(n_true):
        cluster_points = X[true_classes == i]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                    s=50, alpha=0.7, label=f'True Material Type {i + 1}')
    plt.xlabel('Hardness')
    plt.ylabel('Thermal Conductivity')
    plt.title('Original Material Types (Ground Truth)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Contingency table: rows are true classes, columns are found clusters.
    purity_matrix = np.zeros((n_true, k), dtype=int)
    for true_class, cluster in zip(true_classes, clusters):
        purity_matrix[true_class, cluster] += 1

    # Purity: each cluster is credited with its most frequent true class.
    purity = np.sum(np.max(purity_matrix, axis=0)) / len(X)
    print(f"Clustering purity: {purity:.4f}")

    print("\nCluster Statistics:")
    for i in range(k):
        cluster_points = X[clusters == i]
        print(f"Cluster {i + 1}:")
        print(f"  Number of samples: {len(cluster_points)}")
        if len(cluster_points) == 0:
            # Mean/std of an empty selection would be NaN with a RuntimeWarning.
            print("  No samples assigned; statistics unavailable.")
            print()
            continue
        print(f"  Average Hardness: {np.mean(cluster_points[:, 0]):.2f}")
        print(f"  Average Thermal Conductivity: {np.mean(cluster_points[:, 1]):.2f}")
        print(f"  Standard Deviation - Hardness: {np.std(cluster_points[:, 0]):.2f}")
        print(f"  Standard Deviation - Thermal Conductivity: {np.std(cluster_points[:, 1]):.2f}")
        print()
