## Project Phase 3 - Optimization

#### Import Modules

In [1]:
from pathlib import Path
import os
import sys
import time
import warnings
from itertools import product

import pandas as pd
import numpy as np

In [2]:
path = Path(os.getcwd())
path = str(path)
print(path)
sys.path.insert(1, path)

/Users/leon/Documents/Repos/Gesture-Recognition-Reading-Muscle-Activity


In [3]:
from utils import processing
import utils.clustering as clustering

In [4]:
warnings.filterwarnings("ignore")

#### Set up to time script run time

In [5]:
start = time.time()

#### Configuration

In [6]:
DESIGN_FILE_PATH = 'data/emg_design.csv'
TARGET_ATTR = 'gesture class (0-3)'
SOLUTION_PICKLE_FILE_PATH = 'data/optimization_results.pkl'

In [7]:
# Attr list from phase_2
NUMERICAL_ATTRS = ['muscle reading 1 sensor 1', 'muscle reading 1 sensor 2', 'muscle reading 1 sensor 3', 'muscle reading 1 sensor 4', 'muscle reading 1 sensor 5', 'muscle reading 1 sensor 6', 'muscle reading 1 sensor 7', 'muscle reading 1 sensor 8', 'muscle reading 2 sensor 1', 'muscle reading 2 sensor 2', 'muscle reading 2 sensor 3', 'muscle reading 2 sensor 4', 'muscle reading 2 sensor 5', 'muscle reading 2 sensor 6', 'muscle reading 2 sensor 7', 'muscle reading 2 sensor 8', 'muscle reading 3 sensor 1', 'muscle reading 3 sensor 2', 'muscle reading 3 sensor 3', 'muscle reading 3 sensor 4', 'muscle reading 3 sensor 5', 'muscle reading 3 sensor 6', 'muscle reading 3 sensor 7', 'muscle reading 3 sensor 8', 'muscle reading 4 sensor 1', 'muscle reading 4 sensor 2', 'muscle reading 4 sensor 3', 'muscle reading 4 sensor 4', 'muscle reading 4 sensor 5', 'muscle reading 4 sensor 6', 'muscle reading 4 sensor 7', 'muscle reading 4 sensor 8', 'muscle reading 5 sensor 1', 'muscle reading 5 sensor 2', 'muscle reading 5 sensor 3', 'muscle reading 5 sensor 4', 'muscle reading 5 sensor 5', 'muscle reading 5 sensor 6', 'muscle reading 5 sensor 7', 'muscle reading 5 sensor 8', 'muscle reading 6 sensor 1', 'muscle reading 6 sensor 2', 'muscle reading 6 sensor 3', 'muscle reading 6 sensor 4', 'muscle reading 6 sensor 5', 'muscle reading 6 sensor 6', 'muscle reading 6 sensor 7', 'muscle reading 6 sensor 8', 'muscle reading 7 sensor 1', 'muscle reading 7 sensor 2', 'muscle reading 7 sensor 3', 'muscle reading 7 sensor 4', 'muscle reading 7 sensor 5', 'muscle reading 7 sensor 6', 'muscle reading 7 sensor 7', 'muscle reading 7 sensor 8', 'muscle reading 8 sensor 1', 'muscle reading 8 sensor 2', 'muscle reading 8 sensor 3', 'muscle reading 8 sensor 4', 'muscle reading 8 sensor 5', 'muscle reading 8 sensor 6', 'muscle reading 8 sensor 7', 'muscle reading 8 sensor 8']
NOMINAL_ATTRS = []

#### Import Data File

In [8]:
df = pd.read_csv(DESIGN_FILE_PATH)
print(df.shape)
df.head()

(11678, 65)


Unnamed: 0,id,muscle reading 1 sensor 1,muscle reading 1 sensor 2,muscle reading 1 sensor 3,muscle reading 1 sensor 4,muscle reading 1 sensor 5,muscle reading 1 sensor 6,muscle reading 1 sensor 7,muscle reading 1 sensor 8,muscle reading 2 sensor 1,...,muscle reading 7 sensor 7,muscle reading 7 sensor 8,muscle reading 8 sensor 1,muscle reading 8 sensor 2,muscle reading 8 sensor 3,muscle reading 8 sensor 4,muscle reading 8 sensor 5,muscle reading 8 sensor 6,muscle reading 8 sensor 7,muscle reading 8 sensor 8
0,0,26.0,4.0,5.0,8.0,-1.0,-13.0,-109.0,-66.0,-9.0,...,21.0,-28.0,61.0,4.0,8.0,5.0,4.0,-7.0,-59.0,16.0
1,1,-47.0,-6.0,-5.0,-7.0,13.0,-1.0,35.0,-10.0,10.0,...,-105.0,-25.0,47.0,6.0,6.0,5.0,13.0,21.0,111.0,15.0
2,2,-19.0,-8.0,-8.0,-8.0,-21.0,-6.0,-79.0,12.0,0.0,...,-128.0,-83.0,7.0,7.0,1.0,-8.0,7.0,21.0,114.0,48.0
3,3,2.0,3.0,0.0,2.0,0.0,22.0,106.0,-14.0,-16.0,...,-54.0,-38.0,-11.0,4.0,7.0,11.0,33.0,39.0,119.0,43.0
4,4,6.0,0.0,0.0,-2.0,-14.0,10.0,-51.0,5.0,7.0,...,60.0,38.0,-35.0,-8.0,2.0,6.0,-13.0,-24.0,-112.0,-69.0


#### Apply the preprocessing pipeline built in phase_2 to the data frame

In [9]:
preprocessor = processing.get_default_preprocessor(NUMERICAL_ATTRS, NOMINAL_ATTRS)
df_transformed = pd.DataFrame(
    data=preprocessor.fit_transform(df), 
    columns=NUMERICAL_ATTRS + NOMINAL_ATTRS
)
print(df_transformed.shape)
df_transformed.head()

(11678, 64)


Unnamed: 0,muscle reading 1 sensor 1,muscle reading 1 sensor 2,muscle reading 1 sensor 3,muscle reading 1 sensor 4,muscle reading 1 sensor 5,muscle reading 1 sensor 6,muscle reading 1 sensor 7,muscle reading 1 sensor 8,muscle reading 2 sensor 1,muscle reading 2 sensor 2,...,muscle reading 7 sensor 7,muscle reading 7 sensor 8,muscle reading 8 sensor 1,muscle reading 8 sensor 2,muscle reading 8 sensor 3,muscle reading 8 sensor 4,muscle reading 8 sensor 5,muscle reading 8 sensor 6,muscle reading 8 sensor 7,muscle reading 8 sensor 8
0,1.428445,0.401724,1.150179,1.173139,-0.04711,-0.482211,-4.293826,-4.24047,-0.459858,0.22416,...,0.894454,-1.785637,3.396947,0.394898,1.76449,0.772653,0.232428,-0.259303,-2.278454,1.069524
1,-2.503492,-0.448155,-0.853937,-0.842622,0.73722,-0.017247,1.445765,-0.60605,0.588529,-0.282565,...,-4.087375,-1.587727,2.627872,0.561499,1.362038,0.772653,0.736892,0.836588,4.451912,1.00513
2,-0.995352,-0.618131,-1.455172,-0.977006,-1.167581,-0.210982,-3.098077,0.821758,0.036746,0.477522,...,-4.996756,-5.414002,0.430515,0.6448,0.35591,-0.987888,0.400583,0.836588,4.570683,3.130128
3,0.135753,0.316736,0.148121,0.366835,0.008913,0.873935,4.275702,-0.865651,-0.846106,-0.113656,...,-2.07092,-2.44534,-0.558296,0.394898,1.563264,1.585211,1.857924,1.541089,4.768635,2.808159
4,0.351202,0.061772,0.148121,-0.170702,-0.775416,0.408971,-1.982046,0.367456,0.422994,0.055252,...,2.436448,2.5684,-1.87671,-0.604712,0.557136,0.908079,-0.720448,-0.924666,-4.376745,-4.403956


In [10]:
# Sample for testing time cost
# df_transformed = df_transformed.sample(frac=0.01, axis=0, random_state=42)
# print(df_transformed.shape)

In [11]:
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11678 entries, 0 to 11677
Data columns (total 64 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   muscle reading 1 sensor 1  11678 non-null  float64
 1   muscle reading 1 sensor 2  11678 non-null  float64
 2   muscle reading 1 sensor 3  11678 non-null  float64
 3   muscle reading 1 sensor 4  11678 non-null  float64
 4   muscle reading 1 sensor 5  11678 non-null  float64
 5   muscle reading 1 sensor 6  11678 non-null  float64
 6   muscle reading 1 sensor 7  11678 non-null  float64
 7   muscle reading 1 sensor 8  11678 non-null  float64
 8   muscle reading 2 sensor 1  11678 non-null  float64
 9   muscle reading 2 sensor 2  11678 non-null  float64
 10  muscle reading 2 sensor 3  11678 non-null  float64
 11  muscle reading 2 sensor 4  11678 non-null  float64
 12  muscle reading 2 sensor 5  11678 non-null  float64
 13  muscle reading 2 sensor 6  11678 non-null  flo

#### Hyperparameters for UMAP dimensionality reduction

In [12]:
n_neighbors_list = np.logspace(0.5, 2, 5).astype(int)
n_neighbors_list

array([  3,   7,  17,  42, 100])

In [13]:
min_dist_list = [0.0] + np.logspace(-5, -1, 3)
min_dist_list

array([1.e-05, 1.e-03, 1.e-01])

In [14]:
metric_list = ['euclidean', 'cosine']
metric_list

['euclidean', 'cosine']

In [15]:
n_components_list = [2, 3, 5, 10, 15]
n_components_list

[2, 3, 5, 10, 15]

#### Optimize the clustering pipeline by searching over hyperparameter values and clustering algorithms

In [16]:
clustering_results_dict_list = []
for n_neighbors, min_dist, metric, n_components in product(n_neighbors_list, min_dist_list, metric_list, n_components_list):
    print('\n')
    print('*' * 60)
    print(f'n_neighbors={n_neighbors}, min_dist={min_dist}, metirc={metric}, n_components={n_components}\n')

    umap_results_dict = clustering.umap_dim_red(
        df_transformed, 
        n_neighbors, 
        min_dist, 
        metric, 
        n_components
    )
    clustering_results_dict = clustering.clustering(umap_results_dict )
    if clustering_results_dict is not None:
        clustering_results_dict_list.append(clustering_results_dict)

clustering_results_df = pd.DataFrame(clustering_results_dict_list)

clustering_results_df['sort_key'] = clustering_results_df.apply(lambda x: x['silhouette_score'] if x['algo'] == 'k_means' else x['validity_index'], axis=1)
clustering_results_df.sort_values(by=['sort_key'], ascending=False, inplace=True)
clustering_results_df.drop(columns=['sort_key'], inplace=True)

clustering_results_df



************************************************************
n_neighbors=3, min_dist=1e-05, metirc=euclidean, n_components=2
[92mSucceed[0m to find an elbow at 5 in the inertia curve
Elbow slope change test [92mpassed[0m
n_clusters_db_score_is_min=14
n_clusters_ch_score_is_max=4
n_clusters_silhouette_score_is_max=2
K-Means first test [91mfailed[0m
K-Means second test [91mfailed[0m

Try applying DBSCAN method:
   index  k       eps
0  11577  3  0.175833
1  11599  4  0.214479
2  11603  5  0.246470
3  11569  6  0.250923
max_eps: 0.25092335752623957, min_samples: 6
[92mSucceed[0m to find n_clusters=16, eps=0.3262003647841114, min_samples=6


************************************************************
n_neighbors=3, min_dist=1e-05, metirc=euclidean, n_components=3
[92mSucceed[0m to find an elbow at 6 in the inertia curve
Elbow slope change test [92mpassed[0m
n_clusters_db_score_is_min=15
n_clusters_ch_score_is_max=4
n_clusters_silhouette_score_is_max=15
K-Means first test 

Unnamed: 0,algo,eps,dbscan_min_samples,n_clusters_found,validity_index,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,fitted_dbscan,embedding,cluster_labels,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,fitted_k_means
47,k_means,,,3,,0.734961,7,0.00100,cosine,5,0.823645,,"[[3.2748647, 7.6493006, 3.9626865, 4.479932, 6...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3.0,3.0,3.0,0.549104,"KMeans(n_clusters=3, n_init='auto', random_sta..."
48,k_means,,,3,,0.761623,7,0.00100,cosine,10,0.823610,,"[[3.227192, 7.637264, 3.9158268, 4.851348, 6.4...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3.0,3.0,3.0,0.549001,"KMeans(n_clusters=3, n_init='auto', random_sta..."
38,k_means,,,3,,0.760266,7,0.00001,cosine,10,0.824708,,"[[3.2835078, 7.658021, 3.8094757, 4.7666316, 6...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3.0,3.0,3.0,0.548391,"KMeans(n_clusters=3, n_init='auto', random_sta..."
39,k_means,,,3,,0.763695,7,0.00001,cosine,15,0.824834,,"[[3.2331347, 7.6844172, 3.920934, 4.8214865, 6...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3.0,3.0,3.0,0.547967,"KMeans(n_clusters=3, n_init='auto', random_sta..."
37,k_means,,,3,,0.736636,7,0.00001,cosine,5,0.823768,,"[[3.161631, 7.5509453, 3.8223517, 4.6232133, 6...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3.0,3.0,3.0,0.545115,"KMeans(n_clusters=3, n_init='auto', random_sta..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,dbscan,0.115614,6.0,3,-0.736907,0.453044,42,0.10000,euclidean,2,0.593051,"DBSCAN(eps=0.1156137251191497, min_samples=6)","[[-0.36745512, 7.3653316], [-0.5968582, 7.0834...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",,,,,
85,dbscan,0.152404,6.0,3,-0.743036,0.424897,17,0.10000,cosine,2,0.750826,"DBSCAN(eps=0.1524039997582118, min_samples=6)","[[2.2316277, 0.73641324], [2.1696315, 0.589929...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,,
100,dbscan,0.096328,6.0,2,-0.752965,0.449576,42,0.00100,euclidean,2,0.597704,"DBSCAN(eps=0.09632816797667515, min_samples=6)","[[0.5836985, 8.050481], [0.41431034, 7.96385],...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",,,,,
25,dbscan,0.473131,6.0,13,-0.773474,0.332102,3,0.10000,cosine,2,0.755032,"DBSCAN(eps=0.473131018951767, min_samples=6)","[[9.46398, 3.5633922], [9.801375, 2.924061], [...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,,


#### Save the optimization results for following false discovery checking and external indices validation 

In [17]:
clustering_results_df.to_pickle(SOLUTION_PICKLE_FILE_PATH)

#### Time to run script

In [18]:
end = time.time()
print(f'script run time: {(end - start)/60} minutes')

script run time: 333.93652676741283 minutes
