GAIT ANALYSIS (Oversampled data (INSIDE PIPELINE) + Stratified KFold + No Train/Test Split) - same as in gait_final_output_v1_7 with R1, R2, R3 transfomed into mean, mediam and std, BUT USING ONLY 107 features at a time either mean, median or mode.

Data Credits: https://archive.ics.uci.edu/dataset/604/gait+classification

Dataset
- Size: 48 samples (16 subjects, 3 samples/class)
- No train/test split: All 48 samples used for cross-validation
- CV Strategy: StratifiedKFold (n_splits=2)

SUMMARY:
- mean features:   k=25, accuracy:0.8958333333333334
- median features: k=20, accuracy:0.7708333333333334
- std features:    k=20, accuracy:0.8958333333333334

In [16]:
# Import lib
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from imblearn.pipeline import Pipeline      
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
import pprint
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load and identify X and y
df = pd.read_csv("gait_final_output_updated.csv")
print(f'df.shape: {df.shape}')
print("---")
df.sample(3)

df.shape: (48, 322)
---


Unnamed: 0,Speed_R1,Variability_R1,Symmetry_R1,HeelPressTime_R1,CycleTime_R1,Cadence_R1,Posture_R1,Oscillation_R1,Loading_R1,FootPress_R1,...,P99_R3,P100_R3,P101_R3,P102_R3,P103_R3,P104_R3,P105_R3,P106_R3,P107_R3,Subject_ID_Y
38,1.26,0.0,-3.3,1.114,1.112,1.07,1.073,0.374,0.265,0.069,...,0.015,0.022,0.019,0.041,0.066,0.174,0.177,0.932,1.073,12
43,1.22,0.0,7.5,1.165,1.167,1.13,1.13,0.328,0.332,0.04,...,0.026,0.032,0.032,0.098,0.097,0.199,0.23,0.824,1.214,14
14,1.49,0.0,1.8,1.05,1.033,0.99,0.985,0.502,0.381,0.045,...,0.028,0.025,0.022,0.057,0.034,0.212,0.234,0.938,1.066,4


In [18]:
# Check for class imbalance

print( df['Subject_ID_Y'].value_counts().to_list() )

min_y_count = df['Subject_ID_Y'].value_counts().min()
max_y_count = df['Subject_ID_Y'].value_counts().max()

if min_y_count/max_y_count > 5: 
    print(f"Classes are imbalanced. Max-to-min count ratio is: {min_y_count/max_y_count}")
else:
    print(f"Classes are balanced. Max-to-min count ratio is: {min_y_count/max_y_count}")

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Classes are balanced. Max-to-min count ratio is: 1.0


In [19]:
# Form a new df, with mean, meadian and mode using df_new

cols_to_aggregate, prefixes = [], []

# identify columns to aggregate (excluding target)
cols_to_aggregate = [col for col in df.columns if col!='Subject_ID_Y']
# print( "cols_to_aggregate:",cols_to_affregate )

# extract unique prefix before the underscore
prefixes =  [col.split('_') for col in cols_to_aggregate] 
# print("prefixes:", prefixes ) # list of lists

# create a new dictionary
new_columns_mean = {}
new_columns_median = {}
new_columns_std = {}

for prefix in prefixes:
    # print(f"prefix:{prefix} | first_val:{prefix[0]}")
    # find all columns with prefix prefix[0]
    # print("prefix[0]:", prefix[0])
    matching_cols = [ col for col in cols_to_aggregate if col.startswith( prefix[0] ) ]
    # print( "matching_cols", matching_cols )    
    
    if len(matching_cols) <3 :
        print(f"Either the matching_cols list is empty or has less than 3 columns")    
        
    elif len(matching_cols) == 3: # we know it has 3 columns, R1, R2, R3
        # calculate mean, median and mode
        new_columns_mean[f'{prefix[0]}_mean']     = df[matching_cols].mean(axis=1)
        new_columns_median[f'{prefix[0]}_median'] = df[matching_cols].median(axis=1)
        new_columns_std[f'{prefix[0]}_std']       = df[matching_cols].std(axis=1)
        
# create a dataframe from this dictionary 
df_mean   = pd.DataFrame(new_columns_mean)
df_median = pd.DataFrame(new_columns_median)
df_std    = pd.DataFrame(new_columns_std)

print(f"df_mean.shape:{df_mean.shape} | df_median.shape:{df_median.shape} | df_std.shape:{df_std.shape}")


df_mean.shape:(48, 107) | df_median.shape:(48, 107) | df_std.shape:(48, 107)


In [20]:
y =  df['Subject_ID_Y']

for X_iter, X in enumerate([df_mean, df_median, df_std]): 
    print(f"==========================iteration i: {X_iter+1}==========================")

    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    
    accuracy = {}
    
    k_vals = [2, 3, 4, 5, 10, 15, 20, 25]
    for k in k_vals:
        accuracy[k] = {}
        
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("oversample", RandomOverSampler()),
            ("feature_selector", SelectKBest(mutual_info_classif, k=k)),
            ( 'clf'           , KNeighborsClassifier(
                                                    n_neighbors=2,  
                                                    weights='distance', 
                                                    algorithm='auto', 
                                                    leaf_size=30, 
                                                    p=2, 
                                                    metric='manhattan', 
                                                    metric_params=None, 
                                                    n_jobs=-1) )  
        ])
        
        y_pred = cross_val_predict(pipeline, X, y, cv=cv, n_jobs=-1)   
        score = accuracy_score(y, y_pred)
        # print("Accuracy:", score)
        accuracy[k]['score']  =  score
        accuracy[k]['report'] =  classification_report(y, y_pred, zero_division=0)
        
    # pprint.pprint("accuracy: ", accuracy)
    
    best_acc = []
    for i, k in enumerate(k_vals):
        print(f"i:{i}, k:{k}, accuracy:{accuracy[k]['score']}" )
        best_acc.append( accuracy[k]['score'] )
    
    best_idx = np.argmax(best_acc)
    print("best_idx:", best_idx)
    
    print("="*50)
    print(f"Classification report for best accuracy at k={ k_vals[best_idx] }, accuracy:{accuracy[ k_vals[best_idx] ]['score']}" )
    pprint.pprint( accuracy[ k_vals[best_idx] ]['report'] )
    print("="*50)

i:0, k:2, accuracy:0.2708333333333333
i:1, k:3, accuracy:0.625
i:2, k:4, accuracy:0.5625
i:3, k:5, accuracy:0.7083333333333334
i:4, k:10, accuracy:0.75
i:5, k:15, accuracy:0.7708333333333334
i:6, k:20, accuracy:0.8541666666666666
i:7, k:25, accuracy:0.8958333333333334
best_idx: 7
Classification report for best accuracy at k=25, accuracy:0.8958333333333334
('              precision    recall  f1-score   support\n'
 '\n'
 '           0       1.00      1.00      1.00         3\n'
 '           1       1.00      1.00      1.00         3\n'
 '           2       1.00      0.67      0.80         3\n'
 '           3       1.00      0.33      0.50         3\n'
 '           4       1.00      1.00      1.00         3\n'
 '           5       1.00      1.00      1.00         3\n'
 '           6       1.00      1.00      1.00         3\n'
 '           7       1.00      1.00      1.00         3\n'
 '           8       0.60      1.00      0.75         3\n'
 '           9       1.00      1.00      1.00 