In [42]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from functools import partial as partial_func
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

<h1 style='color:rgb(100, 149, 237)'> Data preprocessing<h1>

### Generating taining and testing data

In [10]:
path = "/home/mohamed/CSED_25/Year_3/term_2/PR-ML/my_labs/lab_2/data"

In [11]:
print(os.listdir("/home/mohamed/CSED_25/Year_3/term_2/PR-ML/my_labs/lab_2/data/a03/p1"))

['s41.txt', 's11.txt', 's04.txt', 's50.txt', 's43.txt', 's35.txt', 's20.txt', 's15.txt', 's31.txt', 's37.txt', 's49.txt', 's07.txt', 's26.txt', 's21.txt', 's44.txt', 's58.txt', 's53.txt', 's25.txt', 's48.txt', 's34.txt', 's19.txt', 's17.txt', 's30.txt', 's42.txt', 's47.txt', 's13.txt', 's51.txt', 's14.txt', 's27.txt', 's09.txt', 's28.txt', 's12.txt', 's18.txt', 's59.txt', 's56.txt', 's46.txt', 's22.txt', 's33.txt', 's08.txt', 's54.txt', 's05.txt', 's55.txt', 's10.txt', 's03.txt', 's24.txt', 's57.txt', 's29.txt', 's23.txt', 's45.txt', 's32.txt', 's60.txt', 's36.txt', 's16.txt', 's52.txt', 's01.txt', 's38.txt', 's06.txt', 's02.txt', 's40.txt', 's39.txt']


In [12]:
def read_file(filename):
  """
    Return list of all 125 rows (125 * 45)
  """
  data = []
  with open(filename, 'r') as file:
    for line in file:
      row = [float(value) for value in line.strip().split(',')] # Converting to float
      data.append(row)
  return data

In [13]:
def approach_2_generator(list_of_rows):
    return [item for row in list_of_rows for item in row]

In [14]:
def approach_1_generator(list_of_rows):
    n = len(list_of_rows)
    mean_sample = [0 for _ in range(len(list_of_rows[0]))]
    for row in list_of_rows:
        for i in range(len(row)):
            mean_sample[i] += row[i]
    return [x / n for x in mean_sample]  

In [15]:
def generate_data(approach): 
    """
        This function generates and splits tainging and tesing data condering the approach desired. 
        approach = 1 -> Taking the mean of each column in each segment resulting in 45 features for each data point.
        approach = 2 -> Flattening all the features together in 45 x 125 = 5625 features for each data point.
    """
    training_data, training_labels, testing_data, testing_labels = [], [], [], []

    for activity in sorted(os.listdir(path)):
        label = int(activity[1:]) - 1   # To make it zero-based
        subjects_path = os.path.join(path, activity)
        # subject_path = path + '/' + activity
        for subject in sorted(os.listdir(subjects_path)):
            segments_path = os.path.join(subjects_path, subject)
            
            for segment in sorted(os.listdir(segments_path)):
                file_name =  os.path.join(segments_path, segment)
                data_sample = []
                
                if approach == 1:
                    data_sample = approach_1_generator(read_file(file_name))
                elif approach == 2:
                    data_sample = approach_2_generator(read_file(file_name))
    
                if int(segment[1:3]) <= 48: # Belongs to training data
                    training_data.append(data_sample)
                    training_labels.append(label)
                else:
                    testing_data.append(data_sample)
                    testing_labels.append(label)
    return  training_data , training_labels , testing_data , testing_labels
                
            

### Data generated by taking the mean of each column in each segment

In [16]:
training_data_1 , training_labels_1 , testing_data_1 , testing_labels_1 = generate_data(1)

### Data generated by flattening all the features together

In [17]:
training_data_2 , training_labels_2 , testing_data_2 , testing_labels_2 = generate_data(2)

### Testing the data

In [18]:
print(f"Training data in approach-1 is considered to be (7296 * 45) but it's ({len(training_data_1)} * {len(training_data_1[0])})")

Training data in approach-1 is considered to be (7296 * 45) but it's (7296 * 45)


In [19]:
print(f"Training data in approach-2 is considered to be (7296 * 5625) but it's ({len(training_data_2)} * {len(training_data_2[0])})")

Training data in approach-2 is considered to be (7296 * 5625) but it's (7296 * 5625)


### Normalizing featrues

In [20]:
scaler_1, scaler_2 = MinMaxScaler(), MinMaxScaler()
# Normalizing approach 1
# TODO: remove
scaler_1.fit(training_data_1)
normalized_training_data_1 = scaler_1.transform(training_data_1)
normalized_testing_data_1 = scaler_1.transform(testing_data_1)

# Normalizing approach 2
scaler_2.fit(training_data_2)
normalized_training_data_2 = scaler_2.transform(training_data_2)
normalized_testing_data_2 = scaler_2.transform(testing_data_2)

### Applying dimensionality reduction using PCA

In [21]:
# pca = PCA(n_components=0.95)
pca = PCA(n_components=0.95)
pca.fit(normalized_training_data_2)
reduced_training_data_2 = pca.transform(normalized_training_data_2)
reduced_testing_data_2 = pca.transform(normalized_testing_data_2)

### Showing the effect of PCA

In [22]:
print(f"The dimensions of reduced flattened training data : {len(reduced_training_data_2)} * {len(reduced_training_data_2[0])}")
print(f"The dimensions of reduced flattened testing data : {len(reduced_testing_data_2)} * {len(reduced_testing_data_2[0])}")

The dimensions of reduced flattened training data : 7296 * 299
The dimensions of reduced flattened testing data : 1824 * 299


<h1 style='color:rgb(100, 149, 237)'> Clustering evaluation<h1>

### External evaluation

In [23]:
def contingency_table(cluster_ids, labels):
    """
        Computes contingency_table given cluster ids (zero-based), labels (zero-based).
        returns contingency_table of size: |unique cluster_ids| * |unique labels|
    """    
    assert len(cluster_ids) == len(labels)
    n = max(cluster_ids) + 1
    m = max(labels) + 1
    contingency_table = np.zeros((n, m))
    for cluster, label in zip(cluster_ids, labels):
        contingency_table[cluster][label] += 1
    return contingency_table    

In [24]:
def match_cluster_label(contingency_table):
    """The index of the list is the cluster id, whereas list element is the corresponding label"""
    return np.argmax(contingency_table, axis = 1) # 0-based index labels if you want to match the real label add plus

In [25]:
def true_positive(contingency_table):
    tp = 0
    for row in contingency_table:
        for elem in row:
            tp += (elem * (elem - 1) / 2)
    return tp

In [26]:
def false_positive(contingency_table):
    fp = 0
    for row in contingency_table:
        for i in range(len(row)):
            for j in range(i + 1, len(row)):
                fp += (row[i] * row [j])
    return fp            

In [27]:
def confusion_matrix(contingency_table):
    """
        Calculates confusion matrix from contingency table
        return: True positive, True negative, False positive, False negative
    """
    tp, tn, fp, fn = 0 ,0 ,0 ,0
    # True positive
    tp  = true_positive(contingency_table)
    
    # False positive 
    fp = false_positive(contingency_table)
                
    # False negative
    fn = false_positive(contingency_table.T)
    
     
    # True negative
    tn = np.sum(contingency_table) - (tp + fp + fn)
    
    return tp, tn, fp, fn
            

In [28]:
def precision(contingency_table, number_of_samples):
    return sum(np.max(contingency_table, axis=1)) / number_of_samples

In [29]:
def precision_confusion(tp, tn, fp, fn):
    return tp / (tp + fp)

In [30]:
def recall(tp, tn, fp, fn):
    return tp / (tp + fn)

In [31]:
def f_measure(contingency_table):
    f_score = 0
    col_sum_list = np.sum(contingency_table, axis=0)
    for row in contingency_table:
        max_element = np.max(row)
        max_ind = np.argmax(row)
        purity = (max_element / np.sum(row))
        recall = (max_element / col_sum_list[max_ind])
        f_score += 2 * ((purity * recall) / (purity + recall))
    return f_score / len(contingency_table)
        

In [32]:
def conditional_entropy(contingency_table, number_of_samples):
    entropy = 0
    for row in contingency_table:
        num_elements_of_cluster = np.sum(row)
        cluster_entropy = 0
        for col in row:
            p = col / num_elements_of_cluster
            if p != 0:
                cluster_entropy -=  (p * np.log2(p))
        entropy += ((num_elements_of_cluster / number_of_samples) * cluster_entropy)
    return entropy    

In [33]:
def perform_external_measures(contingency_table, number_of_samples):
    tp, tn, fp, fn = confusion_matrix(contingency_table)
    print(f"Precision is {precision(contingency_table, number_of_samples)}")
    print(f"Precision confusion is {precision_confusion(tp, tn, fp, fn)}")
    print(f"Recall is {recall(tp, tn, fp, fn)}")
    print(f"F-score is {f_measure(contingency_table)}")
    print(f"Conditional entropy is {conditional_entropy(contingency_table, number_of_samples)}")

# K-means

## Work flow

In [34]:
k_list = [8, 13, 19, 28, 38]

## Approach 1

### Evaluation 1

## Approach 2

### Evaluation 2

<h2 style="color:purple">Spectral clustering</h2>

## Work flow

In [35]:
def RBF_kernel(v1, v2, gamma):
    d = np.linalg.norm(v1 - v2)
    return np.exp(-gamma * d**2)

In [59]:
def knn_similarity_k(k):
    def knn_similarity(D):
        model = NearestNeighbors(n_neighbors=k, metric='euclidean')
        model.fit(D)

        # Compute k-NN indices
        distances, indices = model.kneighbors(D)
        
        n = len(D)
        # Create an empty adjacency matrix
        A = np.zeros((n, n))

        # Fill in the similarity matrix
        for i in range(n):
            for j in indices[i]:
                A[i, j] = 1
                A[j, i] = 1  # For symmetry: j is also a neighbor of i

        return A
    return knn_similarity

In [57]:
def spectral_clustering(D, k, sim_func):
  epsilon = 1e-10
  # Compute similarity matrix
  A = sim_func(D)

  # Compute degree matrix
  degrees = np.sum(A, axis=1)
  degree_mat = np.diag(degrees)

  # Compute Laplacian asymetric matrix
  la = np.eye(len(D)) - np.linalg.inv(degree_mat) @ A

  eigenvalues, eigenvectors = np.linalg.eig(la)

  # Sort eigenvalues and eigenvectors in ascending order
  sorted_indices = np.argsort(eigenvalues)
  sorted_eigenvalues = eigenvalues[sorted_indices]
  sorted_eigenvectors = eigenvectors[:, sorted_indices]

  # Choose the smallest K eigenvectors
  smallest_eigenvectors = sorted_eigenvectors[:k, :]
  
  # Normalize each row
  reduced_data  = (np.real(smallest_eigenvectors).T)
  
  for i in range(len(reduced_data)):
        reduced_data[i] = reduced_data[i] / (np.linalg.norm(reduced_data[i]) + epsilon)
  return reduced_data
      

In [61]:
# State init
gamma_list = [0.0001] # Don't ever use gamma = 1
k = 19
# Function init
knn_sim = knn_similarity_k(20)
sim_func_list = [knn_sim, cosine_similarity, np.corrcoef]

<h2 style="color:white">Applying spectral clustering to app. 1</h2>

In [62]:
number_of_samples_spectral_app_1 = len(normalized_training_data_1)
for sim_func in sim_func_list:
    # RBF_kernel_gamma = partial_func(RBF_kernel, gamma=gamma)
    print(f"Evaluation of spectral clustering using similarity measure : {sim_func.__name__}")
    reduced_training_data_spectral_cosine_app_1 = spectral_clustering(normalized_training_data_1, k, sim_func) # Replace test

    # Cluster traing data using k-means and return cluster id for each training sample
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(reduced_training_data_spectral_cosine_app_1)
    cluster_ids_spectral_app_1 = kmeans.labels_
    
    # Create contingency table matrix for matching training data clusters
    contingency_table_spectral_app_1 = contingency_table(cluster_ids_spectral_app_1, training_labels_1)
    
    # Evaluate clustering
    perform_external_measures(contingency_table_spectral_app_1, number_of_samples_spectral_app_1)
    print("\n")
    

Evaluation of spectral clustering using similarity measure : knn_similarity
Precision is 0.05646929824561404
Precision confusion is 0.052501777261782824
Recall is 0.9893392709907929
F-score is 0.012920064489737421
Conditional entropy is 4.2288508697757194
Evaluation of spectral clustering using similarity measure : cosine_similarity
Precision is 0.4041940789473684
Precision confusion is 0.27964679575782936
Recall is 0.3314567014795474
F-score is 0.40658191423190837
Conditional entropy is 2.127430300505754
Evaluation of spectral clustering using similarity measure : corrcoef
Precision is 0.40803179824561403
Precision confusion is 0.2605719377591839
Recall is 0.31017460835509136
F-score is 0.41287654117202466
Conditional entropy is 2.1739462543140062


## Applying spectral clustering to app. 2

In [40]:
gamma_list = [0.0001] # Don't ever use gamma = 1
k = 19
number_of_samples_spectral_app_2 = len(reduced_training_data_2)
for gamma in gamma_list:
    # RBF_kernel_gamma = partial_func(RBF_kernel, gamma=gamma)
    reduced_training_data_spectral_cosine_app_2 = spectral_clustering(reduced_training_data_2, k, cosine_sim) # Replace test

    # Cluster traing data using k-means and return cluster id for each training sample
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(reduced_training_data_spectral_cosine_app_2)
    cluster_ids_spectral_app_2 = kmeans.labels_
    
    # Create contingency table for matching training data clusters
    contingency_table_spectral_app_2 = contingency_table(cluster_ids_spectral_app_2, training_labels_2)
    
    # Evaluate clustering
    perform_external_measures(contingency_table_spectral_app_2, number_of_samples_spectral_app_2)
    

Precision is 0.44051535087719296
Precision confusion is 0.25394631765858255
Recall is 0.37166901424579724
F-score is 0.46152189650248354
Conditional entropy is 1.8807243670517555
