In [13]:
from PIL import Image, ImageOps
import torch
from torch import Tensor, linalg
import numpy as np
from sklearn.decomposition import PCA
from IPython.display import Latex
import matplotlib.pyplot as plt
import math
import os

# Generate the Data Matrix and the Label vector
Read data into torch.Tensor

In [14]:
torch.set_default_dtype(torch.float64)

# Specify the top-level folder
top_folder = 'data'

# Initialize an empty list to store flattened arrays
flattened_arrays = []
labels = torch.zeros(9120)
example_cnt,example_label = 0,0

# Recursively iterate over each file in each folder
for root, dirs, files in os.walk(top_folder):
    
    for file in files:
        # Check if the file is a text file
        if file.endswith('.txt'):
            file_path = os.path.join(root, file)
            
            # Initialize an empty list to store lines
            lines = []
            
            # Open the file in read mode
            with open(file_path, 'r') as file:
                # Read each line in the file
                for line in file:
                    # Remove newline character and split the line by comma
                    values = line.strip().split(',')
                    # Convert values to integers and append to the list of lines
                    lines.append([float(value) for value in values])
            
            # Flatten the list of lines into a 1D array
            flattened_array = torch.tensor(lines).view(-1)
            labels[example_cnt] = example_label
            
            # Append the flattened array to the list of flattened arrays
            flattened_arrays.append(flattened_array)
            example_cnt += 1
            
            if example_cnt%480 == 0:
                example_label += 1

# Convert the list of flattened arrays to a 2D tensor
all_data_1 = torch.stack(flattened_arrays)

all_data_2 = torch.zeros((all_data_1.shape[0],45))

for i in range(all_data_2.shape[1]):
    all_data_2[:,i] = all_data_1[:,i*125:i*125+125].mean(1)

# Print the resulting 2D tensor
print(labels)


tensor([ 0.,  0.,  0.,  ..., 18., 18., 18.])


# Split the Dataset into Training and Test sets
Split dataset into training and test data

In [15]:
training_indices = [i for i in range(len(all_data_1)) if (i % 60) < 48]
test_indices = [i for i in range(len(all_data_1)) if (i % 60) >= 48]

training_data_1 = all_data_1[training_indices]
test_data_1 = all_data_1[test_indices]

training_data_2 = all_data_2[training_indices]
test_data_2 = all_data_2[test_indices]

training_labels = labels[training_indices]
test_labels = labels[test_indices]

### Reduced data using PCA

In [16]:
pca = PCA(n_components=100)
pca.fit(training_data_1)
training_data_1_reduced = Tensor(pca.transform(training_data_1))

# Clustering Using K-Means and Normalized Cut

In [17]:
def k_means(points: Tensor, k: int, relative_error: float = 1e-6, max_iterations: int = 1000):
    n = len(points)
    means = points[torch.randperm(n)[:k]]

    error = 1
    iterations = 0
    while error > relative_error and iterations < max_iterations:
        iterations += 1
        distances = torch.cdist(points, means)
        closest_means = torch.argmin(distances, dim=1)

        new_means = torch.zeros_like(means)
        for i in range(k):
            new_means[i] = points[closest_means == i].mean(dim=0)

        error = torch.norm(means - new_means) / torch.norm(new_means)

        means = new_means

    return means, closest_means

In [18]:
def k_means_2(points:Tensor,k):
    
    '''
        points: dataset
        k: number of clusters
    '''
    plen = len(points[0]) if points.ndim > 1 else 1
    means = points[torch.randperm(len(points))[:k]]
    itr = 0
    while True:
        itr+=1
        classes = [[] for _ in range(k)]
        
        for j,point in enumerate(points):
            min_distance = torch.inf
            nearest_mean = None
        
            for i in range (len(means)):
                distance = torch.linalg.norm(point - means[i])
                if distance < min_distance:
                    min_distance = distance
                    nearest_mean = i
            
            classes[nearest_mean].append((j+1,point))
           
        new_means = torch.tensor([[0.0 for _ in range(plen)] for _ in range(k)]) 
        for i,c in enumerate(classes):
            n = float(len(c))
            new_mean = torch.tensor([0.0 for _ in range(plen)])
            for j,point in c:
                new_mean+=point
            new_mean = new_mean/n
            new_means[i] = new_mean
        
        if(torch.equal(new_means,means)):
            return classes
        
        means = new_means

### solution1:Taking the mean of each column in each segment for each data point

### solution2:Flattening all the features together for each data point

In [59]:
ks = [8, 13, 19, 28,38]
ks = [13]
classes = [[] for i in range(19)]
for k in ks:
    means, mylabels = k_means(training_data_1_reduced,k)
    # for i,lbl in enumerate(mylabels):
    #     classes[lbl].append(i)
    
    # classes = k_means_2(training_data_1_reduced,k)
    
    # for i in range(len(classes)):
    #     classes[i] = [classes[i][j][0] - 1 for j in range(len(classes[i]))]
               
    # mylabels = np.zeros(19)
    # for i,c in  enumerate(classes):
    #     for idx in c:
    #         mylabels[idx-1] = i
        
    #accuracy = torch.sum(labels == mylabels)/len(labels)
    
    #print(f'accuracy for k = {k} is {accuracy}')

In [77]:
def conditional_entropy(clusters,labels,sizes):
    '''
        labels: true labels for each point
        sizes : size of each true label(no. of points in each true label)
    '''
    conditional_entropy_t = 0
    num_of_samples = len(labels)
    for cluster in clusters:
        if(len(cluster) == 0): continue
        count = torch.zeros(len(sizes))
        for sample in cluster:
            count[torch.tensor(labels[sample], dtype=torch.long)]+=1
        htc_i = 0
        for cnt in count:    
            div = cnt/len(cluster)
            htc_i += (-cnt/len(cluster) * (math.log2(div) if div else 0))

        conditional_entropy_t += (len(cluster)/num_of_samples)*htc_i
        
    return conditional_entropy_t

In [78]:
sizes = torch.full((19,), 384)

val = conditional_entropy(classes,training_labels,sizes)

print(val)


tensor(2.0956)


  count[torch.tensor(labels[sample], dtype=torch.long)]+=1
