In [172]:
import torch
from torch import Tensor, linalg
from sklearn.decomposition import PCA
from IPython.display import Latex
import math
import queue
import os
from sklearn.cluster import KMeans
from sklearn import cluster, metrics

# Generate the Data Matrix and the Label vector
Read data into torch.Tensor

In [152]:
torch.set_default_dtype(torch.float64)

# Specify the top-level folder
top_folder = "data"

# Initialize an empty list to store flattened arrays
flattened_arrays = []
labels = torch.zeros(9120)
example_cnt, example_label = 0, 0

for root, dirs, files in os.walk(top_folder):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            lines = []
            with open(file_path, "r") as file:
                for line in file:
                    values = line.strip().split(",")
                    lines.append([float(value) for value in values])

            flattened_array = torch.tensor(lines).view(-1)
            labels[example_cnt] = example_label

            flattened_arrays.append(flattened_array)
            example_cnt += 1

            if example_cnt % 480 == 0:
                example_label += 1

all_data = torch.stack(flattened_arrays)

# all_data_1 is a 2D tensor of shape (9120, 45) containing the mean of each column in each segment resulting in 45 features for each data point
all_data_1 = torch.zeros((all_data.shape[0], 45))

for i in range(all_data_1.shape[1]):
    all_data_1[:, i] = all_data[:, i * 125 : i * 125 + 125].mean(1)
    
# all_data_2 is a 2D tensor of shape (9120, 5625) containing 45 x 125 features for each data point
all_data_2 = all_data

# Split the Dataset into Training and Test sets
Split dataset into training and test data

In [153]:
training_indices = [i for i in range(len(all_data)) if (i % 60) < 48]
test_indices = [i for i in range(len(all_data)) if (i % 60) >= 48]

training_data_1 = all_data_1[training_indices]
test_data_1 = all_data_1[test_indices]

training_data_2 = all_data_2[training_indices]
test_data_2 = all_data_2[test_indices]

training_labels = labels[training_indices]
test_labels = labels[test_indices]

### Reduced data using PCA

In [154]:
pca = PCA(n_components=45)
pca.fit(training_data_2)
training_data_2_reduced = Tensor(pca.transform(training_data_2))

pca.fit(test_data_2)
test_data_2_reduced = Tensor(pca.transform(test_data_2))

# K-Means Algorithm

In [155]:
def k_means(points: Tensor, k: int, relative_error: float = 1e-6, max_iterations: int = 1000):
    n = len(points)
    means = points[torch.randperm(n)[:k]]

    error = 1
    iterations = 0
    while error > relative_error and iterations < max_iterations:
        iterations += 1
        distances = torch.cdist(points, means)
        closest_means = torch.argmin(distances, dim=1)

        new_means = torch.zeros_like(means)
        for i in range(k):
            new_means[i] = points[closest_means == i].mean(dim=0)

        error = torch.norm(means - new_means) / torch.norm(new_means)

        means = new_means

    return means, closest_means

## K-Ways normalised Cut Algorithm

In [221]:
def KNN_similarity_graph(data,k):
    n = data.shape[0]
    sim_graph = torch.zeros((n,n))
    distances = torch.cdist(data, data)
    distances.view(-1)[::distances.size(0) + 1] = float('inf')
    _, indices = torch.topk(distances, k, largest=False)
    row_indices = torch.arange(n).unsqueeze(1).expand(n,k)
    sim_graph[row_indices,indices] = 1
        
    return sim_graph  

In [156]:
def rbf_graph(data: Tensor, gamma: float):
    
    return torch.exp(-gamma*torch.cdist(data,data)**2)

In [157]:
def k_ways_normalised_cut(a: Tensor, k: int):
    delta = a.sum(dim=1).diag()
    inverse_delta = torch.diag(1 / delta.diag())

    # Replace inf and nan values with 0
    inverse_delta.masked_fill_(torch.isnan(inverse_delta) | torch.isinf(inverse_delta), 0)

    l_a = inverse_delta @ (delta - a)
        
    eigen_values, eigen_vectors = torch.linalg.eig(l_a)
    eigen_values = eigen_values.real
    eigen_vectors = eigen_vectors.real

    indices = torch.argsort(eigen_values)
    eigen_values = eigen_values[indices]
    eigen_vectors = eigen_vectors[:, indices]

    u = eigen_vectors[:, :k]
    y = u / torch.norm(u, dim=1, keepdim=True)
    y.masked_fill_(torch.isnan(y) | torch.isinf(y), 0)
    
    # Create KMeans object
    kmeans = KMeans(n_clusters=k)

    # Fit the model to the data
    kmeans.fit(y)

    # Predict the cluster labels
    centroids = Tensor(kmeans.cluster_centers_)
    predicted_labels = Tensor(kmeans.labels_)
    #means, closest_means = k_means(y, k)

    return centroids,predicted_labels

# Evaluation fucntions

#### Precision

In [158]:
def precision(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)

    total_precision = 0.0

    for cluster_label in cluster_labels:
        cluster_indices = (clustering == cluster_label).nonzero()

        actual_cluster_labels = labels[cluster_indices]
        mode = actual_cluster_labels.mode(dim=0)[0]
        total_precision += len(actual_cluster_labels[actual_cluster_labels == mode])

    return total_precision / len(clustering)

#### recall

In [159]:
def recall(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)

    total_recall = 0.0

    for cluster_label in cluster_labels:
        cluster_indices = (clustering == cluster_label).nonzero()

        actual_cluster_labels = labels[cluster_indices]
        mode = actual_cluster_labels.mode(dim=0)[0]
        total_recall += len(actual_cluster_labels[actual_cluster_labels == mode]) / len(labels[labels == mode])

    return total_recall / len(cluster_indices)

#### F1 score

In [160]:
def f1_score(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)

    total_f = 0.0

    for cluster_label in cluster_labels:
        cluster_indices = (clustering == cluster_label).nonzero()

        actual_cluster_labels = labels[cluster_indices]
        mode = actual_cluster_labels.mode(dim=0)[0]

        precision = len(actual_cluster_labels[actual_cluster_labels == mode]) / len(cluster_indices)
        recall = len(actual_cluster_labels[actual_cluster_labels == mode]) / len(labels[labels == mode])
        total_f += 2 * precision * recall / (precision + recall)

    return total_f / len(cluster_labels)

#### conditional entropy

In [161]:
def conditional_entropy(clustering: Tensor, labels: Tensor):
    cluster_labels = torch.unique(clustering)
    partition_labels = torch.unique(labels)

    total_entropy = 0.0
    for cluster_label in cluster_labels:
        cluster_entropy = 0.0
        cluster_indices = (clustering == cluster_label).nonzero()

        for partition_label in partition_labels:
            partition_indices = (labels == partition_label).nonzero()
            cluster_in_partition_count = (clustering[partition_indices] == cluster_label).sum()
            cluster_entropy -= cluster_in_partition_count / len(cluster_indices) * torch.log2(torch.Tensor([cluster_in_partition_count / len(cluster_indices)])) if cluster_in_partition_count > 0 else 0        
        
        total_entropy += len(cluster_indices) / len(labels) * cluster_entropy

    return total_entropy

# Clustering Using K-Means and Normalized Cut

### Solution1:Taking the mean of each column in each segment for each data point

##### Using kmeans

In [169]:
ks = [8, 13, 19, 28,38]
for k in ks:
    centroids,training_predicted_labels = k_means(training_data_1,k)
    test_predicted_labels = torch.empty_like(test_labels, dtype=torch.long)
    for i,point in enumerate(test_data_1):
        distances = torch.norm(point - centroids, dim=1)
        test_predicted_labels[i] = torch.argmin(distances)
    
    prec_train     = precision(training_predicted_labels,training_labels)
    rec_train      = recall(training_predicted_labels,training_labels)
    f_score_train  = f1_score(training_predicted_labels,training_labels)
    entropy_train  = conditional_entropy(training_predicted_labels,training_labels)
    
    prec_test    = precision(test_predicted_labels,test_labels)
    rec_test     = recall(test_predicted_labels,test_labels)
    f_score_test = f1_score(test_predicted_labels,test_labels)
    entropy_test  = conditional_entropy(test_predicted_labels,test_labels)
    
    print(f'------ For k = {k} ------')
    print("training:")
    print(f'Precision for training set = {prec_train}')
    print(f'Recall for training set = {rec_train}')
    print(f'Fscore for training set = {f_score_train}')  
    print(f'Entropy for training set= {entropy_train.item()}')     
    print("test:")
    print(f'Precision for test set = {prec_test}')
    print(f'Recall for test set = {rec_test}')
    print(f'Fscore for test set = {f_score_test}')  
    print(f'Entropy for test set = {entropy_test.item()}')    

------ For k = 8 ------
training:
Precision for training set = 0.23012609649122806
Recall for training set = 0.0036804678731762065
Fscore for training set = 0.33235224169793587
Entropy for training set= 3.1477321000007628
test:
Precision for test set = 0.22149122807017543
Recall for test set = 0.014663182346109178
Fscore for test set = 0.32636551041255585
Entropy for test set = 3.166743999840403
------ For k = 13 ------
training:
Precision for training set = 0.24246162280701755
Recall for training set = 0.0041614912676904555
Fscore for training set = 0.24587193125743811
Entropy for training set= 3.0717479349558277
test:
Precision for test set = 0.22149122807017543
Recall for test set = 0.01535888077858881
Fscore for test set = 0.21714736960285735
Entropy for test set = 3.1383983635236783
------ For k = 19 ------
training:
Precision for training set = 0.28686951754385964
Recall for training set = 0.013694775963149075
Fscore for training set = 0.26076386313952643
Entropy for training set

##### Using Normalized Cut

In [239]:
alpha,k = 0.1,19

#sim_graph = rbf_graph(test_data_1, alpha)
sim_graph = KNN_similarity_graph(test_data_1,100)
centroids,test_predicted_labels = k_ways_normalised_cut(sim_graph,k)
    
prec    = precision(test_predicted_labels,test_labels)
rec     = recall(test_predicted_labels,test_labels)
f_score = f1_score(test_predicted_labels,test_labels)
entropy = conditional_entropy(test_predicted_labels,test_labels)

print(f'Precision for k:{k} = {prec}')
print(f'Recall for k:{k} = {rec}')
print(f'Fscore for k:{k} = {f_score}')  
print(f'Entropy for k:{k} = {entropy.item()}')  

Precision for k:19 = 0.32127192982456143
Recall for k:19 = 0.07536008230452676
Fscore for k:19 = 0.3161804259185218
Entropy for k:19 = 2.590691426867207


### solution2:Flattening all the features together for each data point

##### Using kmeans

In [170]:
ks = [8, 13, 19, 28,38]
centroids,training_predicted_labels = None,None
for k in ks:
    centroids,training_predicted_labels = k_means(training_data_2_reduced,k)
    
    test_predicted_labels = torch.empty_like(test_labels, dtype=torch.long)
    for i,point in enumerate(test_data_2_reduced):
        distances = torch.norm(point - centroids, dim=1)
        test_predicted_labels[i] = torch.argmin(distances)
        
    prec_train     = precision(training_predicted_labels,training_labels)
    rec_train      = recall(training_predicted_labels,training_labels)
    f_score_train  = f1_score(training_predicted_labels,training_labels)
    entropy_train  = conditional_entropy(training_predicted_labels,training_labels)
    
    prec_test    = precision(test_predicted_labels,test_labels)
    rec_test     = recall(test_predicted_labels,test_labels)
    f_score_test = f1_score(test_predicted_labels,test_labels)
    entropy_test  = conditional_entropy(test_predicted_labels,test_labels)
    
    print(f'------ For k = {k} ------')
    print("training:")
    print(f'Precision for training set = {prec_train}')
    print(f'Recall for training set = {rec_train}')
    print(f'Fscore for training set = {f_score_train}')  
    print(f'Entropy for training set= {entropy_train.item()}')     
    print("test:")
    print(f'Precision for test set = {prec_test}')
    print(f'Recall for test set = {rec_test}')
    print(f'Fscore for test set = {f_score_test}')  
    print(f'Entropy for test set = {entropy_test.item()}')  

------ For k = 8 ------
training:
Precision for training set = 0.31537828947368424
Recall for training set = 0.015604654947916666
Fscore for training set = 0.5335763981555788
Entropy for training set= 2.4628966977251396
test:
Precision for test set = 0.25548245614035087
Recall for test set = 0.050564236111111105
Fscore for test set = 0.41208124477861324
Entropy for test set = 2.70392040891322
------ For k = 13 ------
training:
Precision for training set = 0.35635964912280704
Recall for training set = 0.017632378472222224
Fscore for training set = 0.43072130245822415
Entropy for training set= 2.2824933094104822
test:
Precision for test set = 0.3267543859649123
Recall for test set = 0.06467013888888888
Fscore for test set = 0.384606765084729
Entropy for test set = 2.4709029559561007
------ For k = 19 ------
training:
Precision for training set = 0.42269736842105265
Recall for training set = 0.004972910216718266
Fscore for training set = 0.406460595120177
Entropy for training set= 2.06545

##### Using Normalized Cut

In [240]:
alpha,k = 0.1,19

#sim_graph = rbf_graph(test_data_2_reduced, alpha)
sim_graph = KNN_similarity_graph(test_data_2_reduced,100)
centroids,test_predicted_labels = k_ways_normalised_cut(sim_graph,k)
    
prec    = precision(test_predicted_labels,test_labels)
rec     = recall(test_predicted_labels,test_labels)
f_score = f1_score(test_predicted_labels,test_labels)
entropy = conditional_entropy(test_predicted_labels,test_labels)

print(f'Precision for k:{k} = {prec}')
print(f'Recall for k:{k} = {rec}')
print(f'Fscore for k:{k} = {f_score}')  
print(f'Entropy for k:{k} = {entropy.item()}')  

Precision for k:19 = 0.4780701754385965
Recall for k:19 = 0.133578431372549
Fscore for k:19 = 0.48242725869926173
Entropy for k:19 = 1.8723464335154814


# Hierarchical clustering

In [166]:
def power_method(a: Tensor, k: int):
    n = len(a)
    v = torch.randn(n)
    v /= torch.norm(v)

    for _ in range(1000):
        v = a @ v
        v /= torch.norm(v)

    return k_means(v[:, None], k)