## 1.  

## PCA   

In [1]:
import numpy as np  
import random     
import math  
from collections import defaultdict   

In [2]:
# Read the raw dataset: one sample per line, whitespace-separated numbers.
with open('/Users/azura/Desktop/DDA3020_ass4/seeds_dataset.txt') as file:
    lines = file.readlines()

# One row per sample; every field is parsed as a float.
data = np.array([list(map(float, line.split())) for line in lines])

# Feature matrix with one sample per column (all columns but the last).
X = data[:, :-1].T

# true labels (the last column of the file)
Y = data[:, -1]

In [3]:
# Number of principal components kept by the PCA projection below.
K = 2

In [4]:
# Per-feature mean, kept as a column vector so it broadcasts over the samples
# (X stores one sample per column).
mu = X.mean(axis=1, keepdims=True)
X_centered = X - mu
# Covariance matrix of the features, normalised by the number of samples.
sigma = np.matmul(X_centered, X_centered.T) / X.shape[1]

In [5]:
# sigma is a symmetric covariance matrix, so use the symmetric solver: eigh
# guarantees real eigenvalues and orthonormal eigenvectors, whereas the general
# eig solver may return complex values because of round-off.
eigenvalues, eigenvectors = np.linalg.eigh(sigma)
# Indices of the K largest eigenvalues, in descending order.
idx = eigenvalues.argsort()[::-1][:K]
eigenvalues_sorted = eigenvalues[idx]

In [6]:
# Projection basis: the columns of U are the eigenvectors selected by idx
# (those belonging to the K largest eigenvalues).
U = eigenvectors[:, idx]

In [7]:
# Project the centered data onto the selected eigenvectors; z has one row per
# kept component and one column per sample.
z = U.T @ X_centered

## K-means      

In [8]:
def distance(a, b):
    """Return the Euclidean distance between two coordinate sequences.

    The sequences are paired element by element with ``zip``, so if their
    lengths differ the extra trailing coordinates of the longer one are
    ignored.
    """
    squared_total = 0
    for x, y in zip(a, b):
        squared_total += (x - y) ** 2
    return math.sqrt(squared_total)

In [9]:
def initialize_centroids(X, k):
    """Pick ``k`` distinct samples of ``X`` at random as starting centroids.

    Parameters
    ----------
    X : np.ndarray, shape (n_features, n_samples)
        Data matrix with one sample per column.
    k : int
        Number of centroids to draw (sampling is without replacement).

    Returns
    -------
    np.ndarray, shape (n_features, k)
        The selected columns of ``X``. The draw uses NumPy's global random
        state, so results vary between runs unless that state is seeded.
    """
    sample_count = X.shape[1]
    picked = np.random.choice(sample_count, k, replace=False)
    return X[:, picked]

In [10]:
def assign_lables(X, centroids, k):
    """Assign every sample to its nearest centroid.

    Parameters
    ----------
    X : np.ndarray, shape (n_features, n_samples)
        Data matrix with one sample per column.
    centroids : np.ndarray, shape (n_features, k)
        Current centroids, one per column.
    k : int
        Number of centroids to consider.

    Returns
    -------
    list of int
        For each sample (in column order) the index of the closest centroid
        by Euclidean distance; ties go to the lowest index.
    """
    nearest = []
    for sample in X.T:
        gaps = [distance(sample, centroids[:, c]) for c in range(k)]
        # index() returns the first occurrence, i.e. the lowest tied index.
        nearest.append(gaps.index(min(gaps)))
    return nearest

In [11]:
def update_centroids(X, labels, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Parameters
    ----------
    X : np.ndarray, shape (n_features, n_samples)
        Data matrix with one sample per column.
    labels : sequence of int, length n_samples
        Cluster index of every sample.
    k : int
        Number of clusters.

    Returns
    -------
    np.ndarray, shape (n_features, k)
        Updated centroids, one per column.

    Notes
    -----
    A cluster that received no samples is re-seeded with a randomly chosen
    sample of ``X``. Taking the mean of an empty selection would instead give
    a NaN centroid, and NaN compares false against everything, which breaks
    both the nearest-centroid search and the convergence test. Re-seeding
    requires ``X`` to contain at least one sample.
    """
    labels = np.asarray(labels)
    updated_centroids = np.empty((X.shape[0], k), dtype=float)

    for cluster in range(k):
        members = np.flatnonzero(labels == cluster)
        if members.size:
            updated_centroids[:, cluster] = X[:, members].mean(axis=1)
        else:
            # Empty cluster: restart it from an existing data point.
            updated_centroids[:, cluster] = X[:, np.random.choice(X.shape[1])]

    return updated_centroids

In [12]:
def k_means(X, k, max_iterations):
    """Cluster the columns of ``X`` into ``k`` groups with k-means.

    Alternates between assigning samples to their nearest centroid and moving
    each centroid to the mean of its samples, until the centroids stop
    changing or ``max_iterations`` rounds have run.

    Parameters
    ----------
    X : np.ndarray, shape (n_features, n_samples)
        Data matrix with one sample per column.
    k : int
        Number of clusters.
    max_iterations : int
        Maximum number of assign/update rounds.

    Returns
    -------
    list of int
        Cluster index of every sample. If the round limit is hit before
        convergence, these are the labels computed at the start of the last
        round (i.e. before the final centroid update).
    """
    centroids = initialize_centroids(X, k)

    # With no rounds to run the loop body never executes and `labels` would be
    # unbound; fall back to the assignment for the initial centroids.
    if max_iterations <= 0:
        return assign_lables(X, centroids, k)

    for _ in range(max_iterations):
        labels = assign_lables(X, centroids, k)

        new_centroids = update_centroids(X, labels, k)

        # Centroids unchanged => assignments can no longer change either.
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

    return labels

In [13]:
# Cluster the PCA-projected samples (columns of z).
k = 3  # number of clusters
max_iterations = 50  # upper bound on the number of k-means rounds

# NOTE(review): initialize_centroids draws from NumPy's global random state and
# no seed is set here, so the labels can differ from run to run.
new_lables = k_means(z, k, max_iterations)

In [14]:
# Display the cluster index assigned to each sample.
new_lables

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

## 2  

## Silhouette Coefficient  

In [15]:
def silhouette_coefficient(X, labels):
    """Return the mean silhouette coefficient of a clustering.

    For sample ``i``, ``a(i)`` is the mean Euclidean distance to the other
    members of its own cluster, ``b(i)`` is the smallest mean distance to the
    members of any other cluster, and ``s(i) = (b - a) / max(a, b)``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        One sample per ROW. This is the transpose of the column-per-sample
        layout used by the k-means functions in this file.
    labels : sequence, length n_samples
        Cluster label of every sample.

    Returns
    -------
    numpy.float64
        Mean of ``s(i)`` over all samples.

    Raises
    ------
    ValueError
        If ``labels`` and ``X`` disagree in length, or if there are fewer
        than two distinct clusters (the coefficient is undefined then).

    Notes
    -----
    * A sample that is alone in its cluster gets ``s(i) = 0`` instead of the
      NaN produced by averaging an empty list of distances.
    * When ``max(a, b) == 0`` (all relevant points coincide) ``s(i)`` is 0
      rather than the result of 0/0.
    * All pairwise distances are computed at once, which needs memory
      proportional to ``n_samples**2 * n_features``.
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    n = len(X)
    if labels.shape[0] != n:
        raise ValueError("labels must contain exactly one entry per sample")

    clusters, membership = np.unique(labels, return_inverse=True)
    if clusters.size < 2:
        raise ValueError("silhouette coefficient needs at least two clusters")

    # Full pairwise Euclidean distance matrix, shape (n, n).
    diffs = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    dists = np.sqrt((diffs ** 2).sum(axis=-1))

    masks = [membership == c for c in range(clusters.size)]
    sizes = [int(mask.sum()) for mask in masks]

    s = np.zeros(n)
    for i in range(n):
        own = membership[i]
        if sizes[own] == 1:
            continue  # singleton cluster: s(i) stays 0
        # dists[i, i] is 0, so summing over the whole cluster and dividing by
        # (size - 1) is the mean distance to the *other* members.
        a_i = dists[i, masks[own]].sum() / (sizes[own] - 1)
        b_i = min(dists[i, masks[c]].mean()
                  for c in range(clusters.size) if c != own)
        scale = max(a_i, b_i)
        if scale > 0:
            s[i] = (b_i - a_i) / scale
    return s.mean()

In [16]:
# z stores one sample per column, while silhouette_coefficient indexes samples
# by row, hence the transpose.
silhouette_coefficient(z.T, new_lables)

0.4765771593987354

## Rand Index    

In [17]:
def rand_index(labels_true, labels_pred):
    """Return the Rand index between two labelings of the same samples.

    The Rand index is the fraction of sample pairs on which the two labelings
    agree: both put the pair in the same cluster, or both put it in different
    clusters.

    Parameters
    ----------
    labels_true : sequence
        Reference labels, one per sample.
    labels_pred : sequence
        Predicted labels, one per sample, in the same order.

    Returns
    -------
    float
        Value in [0, 1]; 1.0 means the partitions are identical up to a
        renaming of the labels. With fewer than two samples there are no
        pairs to disagree on, so 1.0 is returned.

    Raises
    ------
    ValueError
        If the two label sequences have different lengths.
    """
    n = len(labels_true)
    if len(labels_pred) != n:
        raise ValueError("labels_true and labels_pred must have the same length")

    total_pairs = n * (n - 1) // 2
    if total_pairs == 0:
        # Avoids dividing by zero for empty or single-sample input.
        return 1.0

    agreements = 0
    for i in range(n):
        for j in range(i + 1, n):
            same_true = labels_true[i] == labels_true[j]
            same_pred = labels_pred[i] == labels_pred[j]
            # Agreement = both "same cluster" or both "different cluster".
            if same_true == same_pred:
                agreements += 1

    return agreements / total_pairs

In [18]:
# Compare the k-means partition with the true labels Y.
a = rand_index(Y, new_lables)
a

0.8713602187286398