In [2]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from functools import partial as partial_func
from sklearn.cluster import KMeans

# Data Preprocessing

### Generating training data and testing data

In [3]:
# Root folder of the dataset. Set the CLUSTERING_DATA_DIR environment variable
# to run the notebook on another machine; the original location stays the
# default so existing setups behave exactly as before.
path = os.environ.get(
    "CLUSTERING_DATA_DIR",
    "D:\\CSED\\semester-6\\pattern-recognetion\\labs\\lab2-clustering\\data",
)

In [4]:
def read_file(filename):
  """
    Parse a comma-separated text file of numbers into a list of rows.

    Every non-blank line becomes one list of floats. Blank lines (for example
    an empty line at the end of the file) are skipped instead of crashing
    with ValueError on float('').

    filename: path of the file to read.
    Returns: list of rows, each row a list of floats
             (segment files are expected to hold 125 rows of 45 values).
  """
  data = []
  with open(filename, 'r') as file:
    for line in file:
      stripped = line.strip()
      if not stripped:
        # Nothing to parse on an empty line.
        continue
      data.append([float(value) for value in stripped.split(',')])  # Converting to float
  return data

In [5]:
def approach_2_generator(list_of_rows):
    """
        Flatten a list of rows into a single feature list.

        The values of the first row come first, followed by the values of the
        second row, and so on.
    """
    flattened = []
    for row in list_of_rows:
        flattened.extend(row)
    return flattened

In [6]:
def approach_1_generator(list_of_rows):
    """
        Collapse a list of rows into one row holding the mean of each column.

        The number of columns is taken from the first row; each column total
        is divided by the number of rows.
    """
    row_count = len(list_of_rows)
    column_totals = [0] * len(list_of_rows[0])
    for row in list_of_rows:
        # Accumulate row by row so values are added in the order they appear.
        for column, value in enumerate(row):
            column_totals[column] += value
    return [total / row_count for total in column_totals]

In [7]:
def generate_data(approach): 
    """
        Generate and split the training and testing data for the desired approach.
        approach = 1 -> Taking the mean of each column in each segment resulting in 45 features for each data point.
        approach = 2 -> Flattening all the features together in 45 x 125 = 5625 features for each data point.

        The folder `path` is walked as <activity>/<subject>/<segment file>, in sorted order.
        The label of a sample is the number following the first character of the
        activity folder name, shifted to be zero-based. A segment goes to the
        training set when the two characters after the first character of its
        file name form a number <= 48, otherwise to the testing set.

        Returns: (training_data, training_labels, testing_data, testing_labels), all lists.
        Raises: ValueError if approach is neither 1 nor 2.
    """
    # Fail fast: previously an unknown approach read every file and returned empty samples.
    if approach not in (1, 2):
        raise ValueError("approach must be 1 or 2, got {!r}".format(approach))

    # Pick the feature builder once instead of re-testing `approach` for every file.
    build_sample = approach_1_generator if approach == 1 else approach_2_generator
    last_training_segment = 48

    training_data, training_labels, testing_data, testing_labels = [], [], [], []

    for activity in sorted(os.listdir(path)):
        label = int(activity[1:]) - 1   # To make it zero-based
        subjects_path = os.path.join(path, activity)
        for subject in sorted(os.listdir(subjects_path)):
            segments_path = os.path.join(subjects_path, subject)

            for segment in sorted(os.listdir(segments_path)):
                file_name = os.path.join(segments_path, segment)
                data_sample = build_sample(read_file(file_name))

                if int(segment[1:3]) <= last_training_segment:  # Belongs to training data
                    training_data.append(data_sample)
                    training_labels.append(label)
                else:
                    testing_data.append(data_sample)
                    testing_labels.append(label)
    return training_data, training_labels, testing_data, testing_labels

### Data generated by taking the mean of each column in each segment

In [8]:
# Build the train/test samples and labels with approach 1 (generate_data(1)).
training_data_1 , training_labels_1 , testing_data_1 , testing_labels_1 = generate_data(1)

### Data generated by flattening all the features together

In [9]:
# Build the train/test samples and labels with approach 2 (generate_data(2)).
training_data_2 , training_labels_2 , testing_data_2 , testing_labels_2 = generate_data(2)

### Normalizing features

In [10]:
# Each approach gets its own scaler, fitted on the training split only and
# then applied to both splits, so the testing data never influences the scaling.

# Normalizing approach 1
scaler_1 = StandardScaler().fit(training_data_1)
normalized_training_data_1 = scaler_1.transform(training_data_1)
normalized_testing_data_1 = scaler_1.transform(testing_data_1)

# Normalizing approach 2
scaler_2 = StandardScaler().fit(training_data_2)
normalized_training_data_2 = scaler_2.transform(training_data_2)
normalized_testing_data_2 = scaler_2.transform(testing_data_2)

### Applying dimensionality reduction using PCA

In [11]:
# A float n_components asks PCA to keep as many components as needed to explain
# that fraction (95%) of the variance. The projection is learned from the
# training split only and then applied to both splits.
pca = PCA(n_components=0.95).fit(normalized_training_data_2)
reduced_training_data_2 = pca.transform(normalized_training_data_2)
reduced_testing_data_2 = pca.transform(normalized_testing_data_2)

In [12]:
# Report the shape of the PCA-transformed training data.
print("Reduced training data shape: ", reduced_training_data_2.shape)

Reduced training data shape:  (7296, 870)


# K-means

In [13]:
def kmeans(data, k, max_iter=1000, tol=1e-6, random_state=None):
    """
    Cluster `data` into `k` groups by alternating assignment and centroid updates.

    Parameters
    ----------
    data : array-like, one row per data point.
    k : number of clusters; the initial centroids are k distinct rows of `data`.
    max_iter : maximum number of assignment/update rounds.
    tol : stop once the norm of the centroid movement between two rounds is below this.
    random_state : optional seed for the initial centroid choice. When None
        (the default) the global numpy random state is used, as before.

    Returns
    -------
    clusters : float array with, for each point, the index of its nearest centroid
        (kept as floats so existing callers see the same type as before).
    centroids : float array with one row per centroid.

    A cluster that ends up with no points keeps its previous centroid. Taking
    the mean of an empty selection would give NaN and corrupt every later
    distance computation and the convergence check.
    """
    data = np.asarray(data)  # accept plain lists as well as arrays

    # Initialize centroids randomly
    if random_state is None:
        chosen = np.random.choice(len(data), k, replace=False)
    else:
        chosen = np.random.default_rng(random_state).choice(len(data), k, replace=False)
    # Float copy so mean updates are never truncated when `data` holds integers.
    centroids = data[chosen].astype(float)

    # Initialize cluster assignments
    clusters = np.zeros(len(data))
    distances = np.empty((len(data), k))

    for _ in range(max_iter):
        # Assign each data point to the nearest centroid. Looping over the k
        # centroids (instead of over every point) keeps the work inside numpy
        # without building an (n_points, k, n_features) temporary.
        for j in range(k):
            distances[:, j] = np.linalg.norm(data - centroids[j], axis=1)
        clusters = np.argmin(distances, axis=1).astype(float)

        # Update centroids; empty clusters keep their previous position.
        new_centroids = centroids.copy()
        for j in range(k):
            members = data[clusters == j]
            if len(members) > 0:
                new_centroids[j] = members.mean(axis=0)

        # Check convergence
        if np.linalg.norm(new_centroids - centroids) < tol:
            break

        centroids = new_centroids

    return clusters, centroids

In [14]:
# simple example usage for the kmeans function
data = np.array([[1, 1], [2, 1], [3, 2], [7, 5], [6, 4], [8, 6], [10, 1], [11, 3], [12, 2]])
# expected results (cluster ids can come out in a different order, because the
# initial centroids are picked at random)
# clusters = [0, 0, 0, 1, 1, 1, 2, 2, 2]
# centroids = [[2, 1.33], [7, 5], [11, 2]]  -> the mean of each group of three points
clusters, centroids = kmeans(data, 3, tol=1e-6)
print(clusters)
print(centroids)

[0. 0. 0. 1. 1. 1. 2. 2. 2.]
[[ 2.          1.33333333]
 [ 7.          5.        ]
 [11.          2.        ]]


In [15]:
def predict(data, centroids):
    """
    Return, for every point in `data`, the index of its nearest centroid.

    Distances are Euclidean norms between the point and each row of
    `centroids`. The result is a float array with one entry per point.
    """
    nearest = [
        np.argmin(np.linalg.norm(point - centroids, axis=1))
        for point in data
    ]
    return np.array(nearest, dtype=float)

In [32]:
def get_confusion_mat(clusters, labels, k, n):
    """
    Count how often each cluster index occurs together with each label.

    Returns a (k, n) float array whose entry [c][l] is the number of positions
    i with int(clusters[i]) == c and labels[i] == l.
    """
    counts = np.zeros((k, n))
    for idx, label in enumerate(labels):
        counts[int(clusters[idx]), label] += 1
    return counts

In [33]:

def assign_clusters_labels(clusters, labels, k, n):
    """
    Map every cluster index to the label that occurs most often inside it.

    Returns an array of length k; entry c is the column with the largest count
    in row c of the confusion matrix built from `clusters` and `labels`.
    """
    counts = get_confusion_mat(clusters, labels, k, n)
    majority_labels = counts.argmax(axis=1)
    return majority_labels

In [35]:
def get_accuracy(clusters, labels, cluster_assignments):
    """
    Fraction of points whose cluster's assigned label equals their true label.

    `cluster_assignments[c]` is the label assigned to cluster c; the number of
    matching points is divided by len(labels).
    """
    hits = sum(
        1
        for idx in range(len(clusters))
        if cluster_assignments[int(clusters[idx])] == labels[idx]
    )
    return hits / len(labels)

In [39]:
def print_separator():
    """Print a horizontal rule of dashes to separate the experiment reports."""
    separator = "--------------------------------------------------------------------------------------------------------------------"
    print(separator)

In [41]:
# KMeans for approach 1, then for approach 2.
# Both approaches go through the same steps, so they are described as rows of
# (report header, training features, training labels, testing features, testing labels)
# and run by one loop. The per-run variables keep their original names so that
# anything reading them after this cell still finds the values of the last run.
Ks = [8, 13, 19, 28, 38]
experiments = [
    ("KMeans for approach 1 with K =",
     normalized_training_data_1, training_labels_1,
     normalized_testing_data_1, testing_labels_1),
    ("KMeans for approach 2 with K =",
     reduced_training_data_2, training_labels_2,
     reduced_testing_data_2, testing_labels_2),
]

for header, train_features, train_labels, test_features, test_labels in experiments:
    for k in Ks:
        print(header, k)
        # Training
        training_predictions, training_centroids = kmeans(train_features, k)

        # Testing
        testing_predictions = predict(test_features, training_centroids)

        # Assigning labels (19 is the number of label columns in the confusion matrix)
        cluster_assignments = assign_clusters_labels(training_predictions, train_labels, k, 19)

        # Calculating accuracy
        training_accuracy = get_accuracy(training_predictions, train_labels, cluster_assignments)
        testing_accuracy = get_accuracy(testing_predictions, test_labels, cluster_assignments)

        print("Training accuracy: ", training_accuracy)
        print("Testing accuracy: ", testing_accuracy)
        print_separator()
    


KMeans for approach 1 with K = 8
Training accuracy:  0.3404605263157895
Testing accuracy:  0.3393640350877193
--------------------------------------------------------------------------------------------------------------------
KMeans for approach 1 with K = 13
Training accuracy:  0.47423245614035087
Testing accuracy:  0.4758771929824561
--------------------------------------------------------------------------------------------------------------------
KMeans for approach 1 with K = 19
Training accuracy:  0.5677083333333334
Testing accuracy:  0.5581140350877193
--------------------------------------------------------------------------------------------------------------------
KMeans for approach 1 with K = 28
Training accuracy:  0.6136239035087719
Testing accuracy:  0.6112938596491229
--------------------------------------------------------------------------------------------------------------------
KMeans for approach 1 with K = 38
Training accuracy:  0.696546052631579
Testing accuracy