In [12]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from functools import partial as partial_func
from sklearn.cluster import KMeans

# Data preprocessing

### Generating training and testing data

In [14]:
# Root directory of the dataset. generate_data() lists this directory and expects
# one sub-directory per activity inside it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = "/home/mohamed/CSED_25/Year_3/term_2/PR-ML/my_labs/lab_2/data"

In [15]:
# Sanity check: show the segment files stored under one activity/subject folder (a03/p1).
# os.listdir returns entries in arbitrary order, hence the unsorted listing.
print(os.listdir("/home/mohamed/CSED_25/Year_3/term_2/PR-ML/my_labs/lab_2/data/a03/p1"))

['s41.txt', 's11.txt', 's04.txt', 's50.txt', 's43.txt', 's35.txt', 's20.txt', 's15.txt', 's31.txt', 's37.txt', 's49.txt', 's07.txt', 's26.txt', 's21.txt', 's44.txt', 's58.txt', 's53.txt', 's25.txt', 's48.txt', 's34.txt', 's19.txt', 's17.txt', 's30.txt', 's42.txt', 's47.txt', 's13.txt', 's51.txt', 's14.txt', 's27.txt', 's09.txt', 's28.txt', 's12.txt', 's18.txt', 's59.txt', 's56.txt', 's46.txt', 's22.txt', 's33.txt', 's08.txt', 's54.txt', 's05.txt', 's55.txt', 's10.txt', 's03.txt', 's24.txt', 's57.txt', 's29.txt', 's23.txt', 's45.txt', 's32.txt', 's60.txt', 's36.txt', 's16.txt', 's52.txt', 's01.txt', 's38.txt', 's06.txt', 's02.txt', 's40.txt', 's39.txt']


In [16]:
def read_file(filename):
  """
    Parse a comma-separated text file of numbers into a list of rows.

    Every non-blank line becomes one row (a list of floats), in file order.
    Blank lines, e.g. an empty line at the end of the file, are skipped
    instead of crashing on float('').

    Parameters
    ----------
    filename : str or path-like
        Path of the segment file to read.

    Returns
    -------
    list[list[float]]
        One inner list per data line. Segment files of this dataset are
        expected to give 125 rows of 45 values; that is not enforced here.

    Raises
    ------
    ValueError
        If a field of a non-blank line cannot be converted to float.
  """
  data = []
  with open(filename, 'r') as file:
    for line in file:
      stripped = line.strip()
      if not stripped:
        # Nothing to parse on an empty / whitespace-only line.
        continue
      data.append([float(value) for value in stripped.split(',')])  # Converting to float
  return data

In [17]:
def approach_2_generator(list_of_rows):
    """
        Flatten a list of rows into a single list.

        The values of the first row come first, followed by those of the
        second row, and so on (row-major order).
    """
    flattened = []
    for row in list_of_rows:
        flattened.extend(row)
    return flattened

In [18]:
def approach_1_generator(list_of_rows):
    """
        Return the column-wise mean of a list of rows.

        The result has one entry per column of the first row; entry i is the
        sum of the i-th values of all rows divided by the number of rows.
    """
    row_count = len(list_of_rows)
    column_totals = [0] * len(list_of_rows[0])
    for row in list_of_rows:
        for col_idx, value in enumerate(row):
            column_totals[col_idx] += value
    return [total / row_count for total in column_totals]

In [20]:
def generate_data(approach):
    """
        Generate the training and testing sets for the requested approach.

        Walks the module-level `path` directory as
        <activity>/<subject>/<segment file>, in sorted order at every level,
        reads each segment file with read_file() and turns it into one data point:

        approach = 1 -> mean of each column of the segment
                        (approach_1_generator), 45 features per data point.
        approach = 2 -> all rows of the segment flattened together
                        (approach_2_generator), 45 x 125 = 5625 features per data point.

        The label of a data point is the number in its activity directory name
        minus one (zero-based). Segments numbered up to 48 (two digits taken
        from the file name) go to the training set, the others to the testing set.

        Returns
        -------
        tuple of four lists
            (training_data, training_labels, testing_data, testing_labels)

        Raises
        ------
        ValueError
            If `approach` is neither 1 nor 2. Checked before any file is
            touched, so a wrong value no longer yields empty samples silently.
    """
    if approach not in (1, 2):
        raise ValueError(f"approach must be 1 or 2, got {approach!r}")

    # Pick the feature builder once instead of re-testing `approach` per file.
    build_sample = approach_1_generator if approach == 1 else approach_2_generator
    last_training_segment = 48  # segments above this number are held out for testing

    training_data, training_labels, testing_data, testing_labels = [], [], [], []

    for activity in sorted(os.listdir(path)):
        label = int(activity[1:]) - 1   # To make it zero-based
        subjects_path = os.path.join(path, activity)

        for subject in sorted(os.listdir(subjects_path)):
            segments_path = os.path.join(subjects_path, subject)

            for segment in sorted(os.listdir(segments_path)):
                file_name = os.path.join(segments_path, segment)
                data_sample = build_sample(read_file(file_name))

                # Characters 1-2 of the file name hold the segment number (e.g. 's07.txt' -> 7).
                if int(segment[1:3]) <= last_training_segment:  # Belongs to training data
                    training_data.append(data_sample)
                    training_labels.append(label)
                else:
                    testing_data.append(data_sample)
                    testing_labels.append(label)

    return training_data, training_labels, testing_data, testing_labels
                
            

### Data generated by taking the mean of each column in each segment

In [21]:
# Approach 1: every segment is summarised by its per-column means.
training_data_1, training_labels_1, testing_data_1, testing_labels_1 = generate_data(1)

### Data generated by flattening all the features together

In [22]:
# Approach 2: every segment is flattened into one long feature vector.
training_data_2, training_labels_2, testing_data_2, testing_labels_2 = generate_data(2)

### Testing the data

In [23]:
# Sanity check: compare the expected shape of the approach-1 training set with the actual one.
print(f"Training data in approach-1 is considered to be (7296 * 45) but it's ({len(training_data_1)} * {len(training_data_1[0])})")

Training data in approach-1 is considered to be (7296 * 45) but it's (7296 * 45)


In [24]:
# Sanity check: compare the expected shape of the approach-2 training set with the actual one.
print(f"Training data in approach-2 is considered to be (7296 * 5625) but it's ({len(training_data_2)} * {len(training_data_2[0])})")

Training data in approach-2 is considered to be (7296 * 5625) but it's (7296 * 5625)


In [25]:
# Show the first approach-1 training sample (one mean value per column of its segment).
print(training_data_1[0])

[7.975714400000001, 1.0831504800000005, 5.6068464, 0.004897151999999998, 0.026122679999999985, -0.003726408, -0.7907260800000004, -0.06849034400000001, 0.13589679999999996, 0.67913376, 5.713088000000003, 7.926788800000002, 0.012881520000000002, -0.0010626799999999997, -0.0029510400000000007, -0.5693321600000001, -0.56161984, -0.21113576000000006, 3.4033168000000016, -8.375712799999999, 3.8520752000000007, -0.0048876879999999985, -2.1167999999999978e-05, -0.007604448000000001, -0.64785496, 0.34177119999999994, 0.07227340800000005, -3.5357359999999978, 9.0633328, -0.9345346399999999, 0.00044342399999999993, 0.007741840000000001, -0.004339024, 0.7302159200000004, -0.25215672000000006, -0.035894191999999984, -2.8148328, -9.085131199999996, 2.618207200000001, -0.0050357039999999985, 0.002166143999999999, -0.003154824000000001, 0.7396147199999998, 0.3013141599999999, -0.05711876800000003]


### Normalizing features for approach 2

In [26]:
# Fit the scaler on the training set only, then apply that same transform to
# both the training and the testing set.
scaler = StandardScaler().fit(training_data_2)
normalized_training_data_2 = scaler.transform(training_data_2)
normalized_testing_data_2 = scaler.transform(testing_data_2)

### Applying dimensionality reduction using PCA

In [27]:
# A fractional n_components asks PCA to keep as many components as needed to
# explain that share (here 95%) of the variance. The projection is learned from
# the training set only and then applied to both sets.
pca = PCA(n_components=0.95).fit(normalized_training_data_2)
reduced_training_data_2 = pca.transform(normalized_training_data_2)
reduced_testing_data_2 = pca.transform(normalized_testing_data_2)

### Showing the effect of PCA

In [28]:
# Report (samples * features) of both sets after the PCA projection.
print(f"The dimensions of reduced flattened training data : {len(reduced_training_data_2)} * {len(reduced_training_data_2[0])}")
print(f"The dimensions of reduced flattened testing data : {len(reduced_testing_data_2)} * {len(reduced_testing_data_2[0])}")

The dimensions of reduced flattened training data : 7296 * 870
The dimensions of reduced flattened testing data : 1824 * 870


# Spectral clustering

In [29]:
def RBF_kernel(v1, v2, gamma):
    """
        Radial basis function similarity between two vectors.

        Computes exp(-gamma * ||v1 - v2||^2), where ||.|| is the norm returned
        by np.linalg.norm. v1 and v2 must support subtraction (numpy arrays).
    """
    distance = np.linalg.norm(v1 - v2)
    squared_distance = distance ** 2
    return np.exp(-gamma * squared_distance)

In [30]:
def dot_product(v1, v2):
    """Similarity function: the dot product of v1 and v2, as computed by np.dot."""
    similarity = np.dot(v1, v2)
    return similarity

In [40]:
def spectrlar_clustering(D, k, sim_func):
    """
        Embed the samples of D into k dimensions (normalized-cut spectral embedding).

        Steps:
          1. Build the n x n similarity matrix A, A[i][j] = sim_func(D[i], D[j]).
          2. Form the asymmetric Laplacian  L = I - Deg^-1 A, where Deg is the
             diagonal matrix of the row sums of A.
          3. Take the eigenvectors of L that belong to the k smallest
             eigenvalues as the COLUMNS of an n x k matrix.
          4. Scale every row of that matrix to unit Euclidean length.

        Parameters
        ----------
        D : sequence of samples (indexable, with len()).
        k : int, number of eigenvectors to keep.
        sim_func : callable(sample_a, sample_b) -> similarity value.

        Returns
        -------
        numpy.ndarray of shape (n, k)
            Row i is the embedding of D[i]; the rows are meant to be clustered
            afterwards (e.g. with k-means).

        Raises
        ------
        numpy.linalg.LinAlgError
            If a sample has a total similarity (degree) of zero, because the
            degree matrix then cannot be inverted.
    """
    n = len(D)

    # Compute similarity matrix
    A = np.zeros((n, n))
    for ind_row in range(n):
        for ind_col in range(n):
            A[ind_row, ind_col] = sim_func(D[ind_row], D[ind_col])

    # Compute degrees (row sums). The degree matrix is diagonal, so multiplying
    # by its inverse is just a division of every row of A by that row's degree.
    degrees = np.sum(A, axis=1)
    if np.any(degrees == 0):
        raise np.linalg.LinAlgError("Singular degree matrix: a sample has zero total similarity")

    # Compute Laplacian asymmetric matrix
    la = np.eye(n) - A / degrees[:, np.newaxis]

    # eig may return complex values for a non-symmetric matrix; only the real
    # parts are used, both for ordering and for the embedding.
    eigenvalues, eigenvectors = np.linalg.eig(la)

    # Sort eigenvalues in ascending order and keep the k smallest.
    sorted_indices = np.argsort(np.real(eigenvalues))

    # Eigenvectors are the columns of `eigenvectors`, so select columns (not rows).
    smallest_eigenvectors = np.real(eigenvectors[:, sorted_indices[:k]])

    # Normalize each row; an all-zero row is left as zeros instead of becoming NaN.
    row_norms = np.linalg.norm(smallest_eigenvectors, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    reduced_data = smallest_eigenvectors / row_norms

    return reduced_data
      

### Applying Normalized Cut to app. 1

In [31]:
def confusion_matrix(cluster_ids, labels):
    """
        Count how many samples of every label fall in every cluster.

        cluster_ids and labels are zero-based integer sequences of equal length.
        Returns a float matrix of shape (max(cluster_ids) + 1, max(labels) + 1)
        whose entry [c][l] is the number of samples with cluster id c and label l.
    """
    assert len(cluster_ids) == len(labels)
    cluster_count = max(cluster_ids) + 1
    label_count = max(labels) + 1
    counts = np.zeros((cluster_count, label_count))
    for cell in zip(cluster_ids, labels):
        # cell is the (cluster, label) pair addressing one matrix entry
        counts[cell] += 1
    return counts

In [32]:
def match_cluster_label(confusion_mat):
    """
        Map every cluster to its most frequent label.

        Returns an array indexed by cluster id (row of confusion_mat) whose
        elements are the column index holding that row's maximum.
    """
    majority_labels = np.argmax(confusion_mat, axis=1)
    return majority_labels

In [4]:
# gamma_list = [1] # [0.001, 0.01, 0.1, 1, 10, 100]
# k = 19

# for gamma in gamma_list:
#     RBF_kernel_gamma = partial_func(RBF_kernel, gamma=gamma)
#     reduced_training_data_spectral_RBF = spectrlar_clustering(reduced_training_data_2, k, RBF_kernel_gamma)

#     # Cluster training data using k-means and return cluster id for each training sample
#     kmeans = KMeans(n_clusters=k)
#     kmeans.fit(reduced_training_data_spectral_RBF)
#     cluster_ids = kmeans.labels_
    
#     # Create confusion matrix for matching training data clusters
#     confusion_mat = confusion_matrix(cluster_ids, training_labels_2)
    
    
#     # # Match the clusters with labels and return the mapping : cluster_id -> label
#     # cluster_label_mapping_list = match_cluster_label(confusion_mat)
    
#     # # Predict testing data clusters
#     # kmeans.predict(reduced_testing_data_2)
#     # # Transform cluster_id into label
#     # # Compute accuracy
    

### Applying Normalized Cut to app. 2

In [45]:
# Candidate RBF gammas; only a single value is evaluated for now.
gamma_list = [1] # [0.001, 0.01, 0.1, 1, 10, 100]
# k is used both as the number of eigenvectors kept and as the number of k-means clusters.
k = 19

# for gamma in gamma_list:
gamma = 1
RBF_kernel_gamma = partial_func(RBF_kernel, gamma=gamma)

# NOTE: the similarity matrix is built with one Python call per pair of samples,
# so this step is quadratic in the number of training samples.
reduced_training_data_spectral_RBF = spectrlar_clustering(reduced_training_data_2, k, RBF_kernel_gamma)

# Cluster training data using k-means and return cluster id for each training sample.
# random_state is fixed so that re-running the notebook gives the same clusters.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(reduced_training_data_spectral_RBF)
cluster_ids = kmeans.labels_

# Create confusion matrix for matching training data clusters
confusion_mat = confusion_matrix(cluster_ids, training_labels_2)
# NOTE: perform_external_measures is defined in a later cell of this notebook;
# that cell has to be executed before this one.
perform_external_measures(confusion_mat, len(reduced_training_data_spectral_RBF))

# Clustering evaluation

### External evaluation

In [42]:
def precision(confusion_mat, number_of_samples):
    """
        Fraction of samples that carry the majority label of their cluster.

        Adds up the largest entry of every row (cluster) of confusion_mat and
        divides the total by number_of_samples.
    """
    majority_total = 0
    for cluster_row in confusion_mat:
        majority_total += np.max(cluster_row)
    return majority_total / number_of_samples

In [None]:
# def recall(confusion_mat, number_of_samples):

In [43]:
def f_measure(confusion_mat):
    """
        F-measure of a clustering: the mean of the per-cluster F-scores.

        For every non-empty cluster (row of confusion_mat) the majority label is
        the column with the largest count, and

            purity  = majority count / size of the cluster (row sum)
            recall  = majority count / number of samples with that label (column sum)
            F       = 2 * purity * recall / (purity + recall)

        The result is the average of F over the non-empty clusters, so it lies
        in [0, 1]. Rows that contain no samples are ignored instead of
        producing 0/0. Returns 0.0 when there is no non-empty cluster.
    """
    confusion_mat = np.asarray(confusion_mat, dtype=float)
    col_sum_list = np.sum(confusion_mat, axis=0)

    cluster_scores = []
    for row in confusion_mat:
        cluster_size = np.sum(row)
        if cluster_size == 0:
            # An id without members is not a cluster; skip it.
            continue
        max_ind = np.argmax(row)
        max_element = row[max_ind]
        purity = max_element / cluster_size
        recall = max_element / col_sum_list[max_ind]
        cluster_scores.append(2 * ((purity * recall) / (purity + recall)))

    if not cluster_scores:
        return 0.0
    return float(np.mean(cluster_scores))
        

In [35]:
def conditional_entropy(confusion_mat, number_of_samples):
    """
        Conditional entropy (in bits) of the labels given the clustering.

        For every cluster (row of confusion_mat) the entropy of its label
        distribution is computed with base-2 logarithms; the result is the sum
        of these entropies weighted by cluster size / number_of_samples.

        Zero counts contribute nothing (0 * log 0 is taken as 0) and empty
        clusters are skipped, so the result is never NaN for valid counts.

        Returns
        -------
        float
            0.0 when every cluster contains a single label; larger values mean
            more mixed clusters.
    """
    entropy = 0.0
    for row in confusion_mat:
        num_elements_of_cluster = np.sum(row)
        if num_elements_of_cluster == 0:
            continue
        cluster_entropy = 0.0
        for count in row:
            if count == 0:
                # log2(0) is -inf; by convention this term is zero.
                continue
            p = count / num_elements_of_cluster
            cluster_entropy -= p * np.log2(p)
        entropy += (num_elements_of_cluster / number_of_samples) * cluster_entropy
    return float(entropy)

In [44]:
def perform_external_measures(confusion_mat, number_of_samples):
    """
        Print the external evaluation measures of a clustering.

        Calls precision(), f_measure() and conditional_entropy() on the given
        confusion matrix, in that order, printing each value right after it is
        computed. Returns nothing.
    """
    precision_value = precision(confusion_mat, number_of_samples)
    print(f"Precision is {precision_value}")

    f_score_value = f_measure(confusion_mat)
    print(f"F-score is {f_score_value}")

    entropy_value = conditional_entropy(confusion_mat, number_of_samples)
    print(f"Conditional entropy is {entropy_value}")