In [41]:
import torch
from torch import Tensor, linalg
from sklearn.decomposition import PCA
from IPython.display import Latex
import math
import os
import sklearn
from sklearn.cluster import KMeans
from sklearn import cluster, metrics
import pandas as pd

# Generate the Data Matrix and the Label vector
Read data into torch.Tensor

In [42]:
torch.set_default_dtype(torch.float64)

# Specify the top-level folder
top_folder = "data"

# Initialize an empty list to store flattened arrays
flattened_arrays = []
labels = torch.zeros(9120)
example_cnt, example_label = 0, 0

for root, dirs, files in os.walk(top_folder):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            lines = []
            with open(file_path, "r") as file:
                for line in file:
                    values = line.strip().split(",")
                    lines.append([float(value) for value in values])

            flattened_array = torch.tensor(lines).view(-1)
            labels[example_cnt] = example_label

            flattened_arrays.append(flattened_array)
            example_cnt += 1

            if example_cnt % 480 == 0:
                example_label += 1

all_data = torch.stack(flattened_arrays)

# all_data_1 is a 2D tensor of shape (9120, 45) containing the mean of each column in each segment resulting in 45 features for each data point
all_data_1 = torch.zeros((all_data.shape[0], 45))

for i in range(all_data_1.shape[1]):
    all_data_1[:, i] = all_data[:, i * 125 : i * 125 + 125].mean(1)
    
# all_data_2 is a 2D tensor of shape (9120, 5625) containing 45 x 125 features for each data point
all_data_2 = all_data

# Split the Dataset into Training and Test sets
Split dataset into training and test data

In [43]:
training_indices = [i for i in range(len(all_data_1)) if (i % 60) < 48]
test_indices = [i for i in range(len(all_data_1)) if (i % 60) >= 48]

training_data_1 = all_data_1[training_indices]
test_data_1 = all_data_1[test_indices]

training_data_2 = all_data_2[training_indices]
test_data_2 = all_data_2[test_indices]

training_labels = labels[training_indices]
test_labels = labels[test_indices]

### Reduced data using PCA

In [44]:
pca = PCA(n_components=45)
pca.fit(training_data_2)
training_data_2_reduced = Tensor(pca.transform(training_data_2))

pca.fit(test_data_2)
test_data_2_reduced = Tensor(pca.transform(test_data_2))

# K-Means Algorithm

In [45]:
def k_means(points: Tensor, k: int, relative_error: float = 1e-6, max_iterations: int = 1000):
    n = len(points)
    means = points[torch.randperm(n)[:k]]

    error = 1
    iterations = 0
    while error > relative_error and iterations < max_iterations:
        iterations += 1
        distances = torch.cdist(points, means)
        closest_means = torch.argmin(distances, dim=1)

        new_means = torch.zeros_like(means)
        for i in range(k):
            new_means[i] = points[closest_means == i].mean(dim=0)

        error = torch.norm(means - new_means) / torch.norm(new_means)

        means = new_means

    return means, closest_means

## K-Ways normalised Cut Algorithm

In [46]:
def rbf_graph(data: Tensor, gamma: float):
    
    return torch.exp(-gamma*torch.cdist(data,data)**2)

In [47]:
def k_ways_normalised_cut(a: Tensor, k: int):
    delta = a.sum(dim=1).diag()
    inverse_delta = torch.diag(1 / delta.diag())

    # Replace inf and nan values with 0
    inverse_delta.masked_fill_(torch.isnan(inverse_delta) | torch.isinf(inverse_delta), 0)

    l_a = inverse_delta @ (delta - a)
        
    eigen_values, eigen_vectors = torch.linalg.eig(l_a)
    eigen_values = eigen_values.real
    eigen_vectors = eigen_vectors.real

    indices = torch.argsort(eigen_values)
    eigen_values = eigen_values[indices]
    eigen_vectors = eigen_vectors[:, indices]

    u = eigen_vectors[:, :k]
    y = u / torch.norm(u, dim=1, keepdim=True)
    y.masked_fill_(torch.isnan(y) | torch.isinf(y), 0)
    
    # Create KMeans object
    kmeans = KMeans(n_clusters=k)

    # Fit the model to the data
    kmeans.fit(y)

    # Predict the cluster labels
    centroids = Tensor(kmeans.cluster_centers_)
    predicted_labels = Tensor(kmeans.labels_)
    #means, closest_means = k_means(y, k)

    return centroids,predicted_labels

# Evaluation fucntions

#### Precision

In [48]:
def precision(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)

    total_precision = 0.0

    for cluster_label in cluster_labels:
        cluster_indices = (clustering == cluster_label).nonzero()

        actual_cluster_labels = labels[cluster_indices]
        mode = actual_cluster_labels.mode(dim=0)[0]
        total_precision += len(actual_cluster_labels[actual_cluster_labels == mode])

    return total_precision / len(clustering)

#### recall

In [49]:
def recall(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)

    total_recall = 0.0

    for cluster_label in cluster_labels:
        cluster_indices = (clustering == cluster_label).nonzero()

        actual_cluster_labels = labels[cluster_indices]
        mode = actual_cluster_labels.mode(dim=0)[0]
        total_recall += len(actual_cluster_labels[actual_cluster_labels == mode]) / len(labels[labels == mode])

    return total_recall / len(cluster_indices)

#### F1 score

In [50]:
def f1_score(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)

    total_f = 0.0

    for cluster_label in cluster_labels:
        cluster_indices = (clustering == cluster_label).nonzero()

        actual_cluster_labels = labels[cluster_indices]
        mode = actual_cluster_labels.mode(dim=0)[0]

        precision = len(actual_cluster_labels[actual_cluster_labels == mode]) / len(cluster_indices)
        recall = len(actual_cluster_labels[actual_cluster_labels == mode]) / len(labels[labels == mode])
        total_f += 2 * precision * recall / (precision + recall)

    return total_f / len(cluster_labels)

#### conditional entropy

In [51]:
def conditional_entropy(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)
    partition_labels = torch.unique(labels)

    total_entropy = 0.0
    for cluster_label in cluster_labels:
        cluster_entropy = 0.0
        cluster_indices = (clustering == cluster_label).nonzero()

        for partition_label in partition_labels:
            partition_indices = (labels == partition_label).nonzero()
            cluster_in_partition_count = (clustering[partition_indices] == cluster_label).sum()
            cluster_entropy -= cluster_in_partition_count / len(cluster_indices) * torch.log2(torch.Tensor([cluster_in_partition_count / len(cluster_indices)])) if cluster_in_partition_count > 0 else 0        
        
        total_entropy += len(cluster_indices) / len(labels) * cluster_entropy

    return total_entropy

# Clustering Using K-Means and Normalized Cut

### Solution1:Taking the mean of each column in each segment for each data point

##### Using kmeans

In [52]:
ks = [8, 13, 19, 28,38]
for k in ks:
    centroids,training_predicted_labels = k_means(training_data_1,k)
    test_predicted_labels = torch.empty_like(test_labels, dtype=torch.long)
    for i,point in enumerate(test_data_1):
        distances = torch.norm(point - centroids, dim=1)
        test_predicted_labels[i] = torch.argmin(distances)
        
    prec    = precision(test_predicted_labels,test_labels)
    rec     = recall(test_predicted_labels,test_labels)
    f_score = f1_score(test_predicted_labels,test_labels)
    entropy = conditional_entropy(test_predicted_labels,test_labels)
    
    print(f'Precision for k:{k} = {prec}')
    print(f'Recall for k:{k} = {rec}')
    print(f'Fscore for k:{k} = {f_score}')  
    print(f'Entropy for k:{k} = {entropy.item()}')  
    print("------------------------------------------")        

Precision for k:8 = 0.21875
Recall for k:8 = 0.01998197115384616
Fscore for k:8 = 0.31391652799917263
Entropy for k:8 = 3.1855471383950524
------------------------------------------
Precision for k:13 = 0.2461622807017544
Recall for k:13 = 0.3897569444444444
Fscore for k:13 = 0.280575096430404
Entropy for k:13 = 3.036060786441438
------------------------------------------
Precision for k:19 = 0.27576754385964913
Recall for k:19 = 0.23816287878787878
Fscore for k:19 = 0.2837556492824596
Entropy for k:19 = 2.7441339904987005
------------------------------------------
Precision for k:28 = 0.32730263157894735
Recall for k:28 = 0.32730263157894735
Fscore for k:28 = 0.23840293636602516
Entropy for k:28 = 2.6531349922563163
------------------------------------------
Precision for k:38 = 0.36239035087719296
Recall for k:38 = 3.442708333333334
Fscore for k:38 = 0.22662427662052004
Entropy for k:38 = 2.44323685016379
------------------------------------------


##### Using Normalized Cut

In [53]:
alpha,k = 0.01,19

sim_graph = rbf_graph(test_data_1, alpha)
centroids,test_predicted_labels = k_ways_normalised_cut(sim_graph,k)
#sim_graph = rbf_graph(training_data_1, alpha)
#centroids,training_predicted_labels = k_ways_normalised_cut(sim_graph,k)

# test_predicted_labels = torch.empty_like(test_labels, dtype=torch.long)
# for i,point in enumerate(test_data_1):
#     distances = torch.norm(point - centroids, dim=1)
#     test_predicted_labels[i] = torch.argmin(distances)
    
prec    = precision(test_predicted_labels,test_labels)
rec     = recall(test_predicted_labels,test_labels)
f_score = f1_score(test_predicted_labels,test_labels)
entropy = conditional_entropy(test_predicted_labels,test_labels)

print(f'Precision for k:{k} = {prec}')
print(f'Recall for k:{k} = {rec}')
print(f'Fscore for k:{k} = {f_score}')  
print(f'Entropy for k:{k} = {entropy.item()}')  

Precision for k:19 = 0.34594298245614036
Recall for k:19 = 0.05816740412979352
Fscore for k:19 = 0.3381320919832668
Entropy for k:19 = 2.4356764035651883


### solution2:Flattening all the features together for each data point

##### Using kmeans

In [54]:
ks = [8, 13, 19, 28,38]
centroids,training_predicted_labels = None,None
for k in ks:
    centroids,training_predicted_labels = k_means(training_data_2_reduced,k)
    
    test_predicted_labels = torch.empty_like(test_labels, dtype=torch.long)
    for i,point in enumerate(test_data_2_reduced):
        distances = torch.norm(point - centroids, dim=1)
        test_predicted_labels[i] = torch.argmin(distances)
        
    prec    = precision(test_predicted_labels,test_labels)
    rec     = recall(test_predicted_labels,test_labels)
    f_score = f1_score(test_predicted_labels,test_labels)
    entropy = conditional_entropy(test_predicted_labels,test_labels)
    
    print(f'Precision for k:{k} = {prec}')
    print(f'Recall for k:{k} = {rec}')
    print(f'Fscore for k:{k} = {f_score}')  
    print(f'Entropy for k:{k} = {entropy.item()}')  
    print("------------------------------------------")   

Precision for k:8 = 0.29660087719298245
Recall for k:8 = 0.04143688725490196
Fscore for k:8 = 0.49927880170146366
Entropy for k:8 = 2.56907576189462
------------------------------------------
Precision for k:13 = 0.31743421052631576
Recall for k:13 = 0.1884765625
Fscore for k:13 = 0.4198562789257785
Entropy for k:13 = 2.497385821915409
------------------------------------------
Precision for k:19 = 0.36019736842105265
Recall for k:19 = 0.14257812499999997
Fscore for k:19 = 0.33401574465458606
Entropy for k:19 = 2.351616560749551
------------------------------------------
Precision for k:28 = 0.4621710526315789
Recall for k:28 = 0.049332865168539325
Fscore for k:28 = 0.3488378016051017
Entropy for k:28 = 1.9742813900558873
------------------------------------------
Precision for k:38 = 0.48739035087719296
Recall for k:38 = 0.2893880208333333
Fscore for k:38 = 0.3279790862821624
Entropy for k:38 = 1.8983126571875855
------------------------------------------


##### Using Normalized Cut

In [55]:
alpha,k = 0.01,19

sim_graph = rbf_graph(test_data_2_reduced, alpha)
centroids,test_predicted_labels = k_ways_normalised_cut(sim_graph,k)

# sim_graph = rbf_graph(training_data_2_reduced, alpha)
# centroids,training_predicted_labels = k_ways_normalised_cut(sim_graph,k)

# test_predicted_labels = torch.empty_like(test_labels, dtype=torch.long)
# for i,point in enumerate(test_data_2_reduced):
#     distances = torch.norm(point - centroids, dim=1)
#     test_predicted_labels[i] = torch.argmin(distances)
    
prec    = precision(test_predicted_labels,test_labels)
rec     = recall(test_predicted_labels,test_labels)
f_score = f1_score(test_predicted_labels,test_labels)
entropy = conditional_entropy(test_predicted_labels,test_labels)

print(f'Precision for k:{k} = {prec}')
print(f'Recall for k:{k} = {rec}')
print(f'Fscore for k:{k} = {f_score}')  
print(f'Entropy for k:{k} = {entropy.item()}')  

Precision for k:19 = 0.2894736842105263
Recall for k:19 = 0.16666666666666669
Fscore for k:19 = 0.28931303351584203
Entropy for k:19 = 2.779322705846486


# Hierarchical clustering

In [56]:
def power_method(a: Tensor, k: int):
    n = len(a)
    v = torch.randn(n)
    v /= torch.norm(v)

    for _ in range(1000):
        v = a @ v
        v /= torch.norm(v)

    return k_means(v[:, None], k)