- GAIT ANALYSIS (Optimized Approach with GridSearchCV, No train/test split) - same as in gait_final_output_v1_2, with R1, R2, R3 transformed into mean, median and std, BUT USING ONLY 107 features at a time: either mean, median or std.

- Aim: Multi-class classification to identify subjects (16 individuals) based on gait characteristics, using model comparison and hyperparameter tuning WITHOUT train/test split.

- Data Credits: https://archive.ics.uci.edu/dataset/604/gait+classification

SUMMARY:
- The best performing model is KNeighborsClassifier
- mean features: Mean CV Accuracy: 0.8125
- median features: Mean CV Accuracy: 0.5416666666666666
- std features: Mean CV Accuracy: 0.7083333333333334

In [8]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict
from sklearn.model_selection import LeaveOneOut, LeaveOneGroupOut, StratifiedKFold, GroupKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import NearestNeighbors

from imblearn.pipeline import Pipeline

# these clean up the noisy data
from imblearn.combine import SMOTETomek, SMOTEENN

# these do not clean up the noisy data
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE

# avoid as it only duplicates data
# from imblearn.over_sampling import RandomOverSampler 

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

import time

import warnings
warnings.filterwarnings("ignore")

In [9]:
# Load data
# Read the gait dataset, report its dimensions, and preview a few random rows.

df = pd.read_csv('gait_final_output_updated.csv')
print(f'df.shape: {df.shape}')
print("---")
# Seed the preview so the same rows are shown after every Restart & Run All.
df.sample(3, random_state=42)

df.shape: (48, 322)
---


Unnamed: 0,Speed_R1,Variability_R1,Symmetry_R1,HeelPressTime_R1,CycleTime_R1,Cadence_R1,Posture_R1,Oscillation_R1,Loading_R1,FootPress_R1,...,P99_R3,P100_R3,P101_R3,P102_R3,P103_R3,P104_R3,P105_R3,P106_R3,P107_R3,Subject_ID_Y
22,1.34,4.71,-0.7,1.132,1.133,1.135,1.125,0.073,0.053,0.067,...,0.024,0.045,0.018,0.057,-0.033,0.208,0.125,1.316,0.76,7
2,1.25,5.06,-3.8,1.109,1.109,1.105,1.115,0.048,0.056,0.041,...,0.034,0.048,0.037,0.101,0.056,0.225,0.236,0.989,1.011,0
43,1.22,0.0,7.5,1.165,1.167,1.13,1.13,0.328,0.332,0.04,...,0.026,0.032,0.032,0.098,0.097,0.199,0.23,0.824,1.214,14


In [10]:
# Check for class imbalance
# Compare the largest and smallest class sizes of the target column.

# A max-to-min class-size ratio above this value is reported as imbalanced.
IMBALANCE_RATIO_THRESHOLD = 5

class_counts = df['Subject_ID_Y'].value_counts()
print( class_counts.to_list() )

min_y_count = class_counts.min()
max_y_count = class_counts.max()

# The ratio must be max/min: min/max can never exceed 1, so comparing it
# against the threshold would report every dataset as balanced.
max_to_min_ratio = max_y_count / min_y_count

if max_to_min_ratio > IMBALANCE_RATIO_THRESHOLD:
    print(f"Classes are imbalanced. Max-to-min count ratio is: {max_to_min_ratio}")
else:
    print(f"Classes are balanced. Max-to-min count ratio is: {max_to_min_ratio}")

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Classes are balanced. Max-to-min count ratio is: 1.0


In [11]:
# Form new dataframes with the mean, median and std of each measurement.
# Every measurement is expected to appear as three columns that share the text
# before the first underscore (e.g. Speed_R1, Speed_R2, Speed_R3); those three
# columns are collapsed row-wise into a single feature per statistic.

# identify columns to aggregate (excluding target)
cols_to_aggregate = [col for col in df.columns if col != 'Subject_ID_Y']

# Group the columns by their exact prefix (text before the first underscore).
# Exact matching is required: a startswith() test would let one measurement
# absorb another whose name merely begins with the same characters.
# Insertion order follows the column order, so the output column order is stable.
cols_by_prefix = {}
for col in cols_to_aggregate:
    cols_by_prefix.setdefault(col.split('_')[0], []).append(col)

# one dictionary of new columns per statistic
new_columns_mean = {}
new_columns_median = {}
new_columns_std = {}

for prefix, matching_cols in cols_by_prefix.items():
    if len(matching_cols) < 3:
        print(f"Either the matching_cols list is empty or has less than 3 columns")

    elif len(matching_cols) > 3:
        # Do not drop the measurement silently: say which one was skipped and why.
        print(f"Skipping '{prefix}': expected 3 columns but found {len(matching_cols)}")

    else:  # exactly 3 columns: R1, R2, R3
        # calculate mean, median and std across the three repetitions
        new_columns_mean[f'{prefix}_mean']     = df[matching_cols].mean(axis=1)
        new_columns_median[f'{prefix}_median'] = df[matching_cols].median(axis=1)
        new_columns_std[f'{prefix}_std']       = df[matching_cols].std(axis=1)

# create one dataframe per statistic from the dictionaries
df_mean   = pd.DataFrame(new_columns_mean)
df_median = pd.DataFrame(new_columns_median)
df_std    = pd.DataFrame(new_columns_std)

print(f"df_mean.shape:{df_mean.shape} | df_median.shape:{df_median.shape} | df_std.shape:{df_std.shape}")


df_mean.shape:(48, 107) | df_median.shape:(48, 107) | df_std.shape:(48, 107)


In [12]:
# Model comparison on each aggregated feature set (mean, median, std).
# For every feature set: tune several classifiers with GridSearchCV inside an
# oversample -> scale -> select -> model pipeline, tabulate the results, then
# re-evaluate the top pipeline with cross-validation. There is deliberately no
# train/test hold-out; all scores come from stratified cross-validation.
y = df['Subject_ID_Y']


def seeded_mutual_info(features, target):
    """Score features with mutual information using a fixed random seed.

    mutual_info_classif uses random numbers unless random_state is given, so an
    unseeded selector can keep different features (and produce different scores)
    on every run.
    """
    return mutual_info_classif(features, target, random_state=42)


# Candidate models and their hyperparameter grids. GridSearchCV clones the
# estimators, so the same definitions can be reused for every run.
models = {
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(-10, -7, num=4)
        }
    },
    'SVC-RBF': {
        'model': SVC(random_state=42, decision_function_shape='ovr'),
        'params': {
            'C':      [0.1, 1, 10],
            'gamma':  ['scale', 0.1],
            'kernel': ['rbf', 'linear']
        }
    },
    'Logistic': {
        'model': LogisticRegression(max_iter=1000, random_state=42, multi_class='ovr'),
        'params': {
            'C':       [0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            'solver':  ['liblinear', 'saga']
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [1, 2, 3],
            'weights':     ['uniform', 'distance'],
            'metric':      ['euclidean', 'manhattan', 'minkowski'],
            'p':           [1, 2]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42, n_jobs=-1),
        'params': {
            'n_estimators':      [50, 100],        # Not too many
            'max_depth':         [2, 3, 4],        # VERY shallow (key!)
            'min_samples_split': [2, 3, 5],        # Require multiple samples
            'min_samples_leaf':  [1, 2]            # Prevent tiny leaves
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth':         [2, 3, 4, 5],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf':  [1, 2, 3]
        }
    }
}

# Class sizes do not depend on the feature set, so compute them once.
class_counts = y.value_counts()
n_samples = int(class_counts.min())
k_neighbors = max(1, min(3, n_samples - 1))
# Target size per class for SMOTE: bring every class up to the largest one.
target_samples_per_class = int(class_counts.max())

feature_sets = {'mean': df_mean, 'median': df_median, 'std': df_std}

for i, (feature_set_name, X) in enumerate(feature_sets.items(), start=1):
    print(f"==========================iteration i: {i}==========================")
    print(f"Feature set: {feature_set_name}")
    print("===STEP 0: Split X and y===")

    print(f"Total samples: {len(y)}")
    print(f"Total features: {X.shape[1]}")
    print(f"Number of classes: {len(y.unique())}")
    print(f"Class distribution in full dataset:")
    print(class_counts.sort_index().to_list())

    print("===Define oversampler TYPES===")
    # Number of features kept by SelectKBest.
    k_best = min(10, X.shape[1], len(X) // 10)
    print( "min(10, X.shape[1], len(X) // 10) :", k_best )

    oversampler_types = ['smoteenn', 'smotetomek']

    # Define the CV splitter once per feature set so the grid search and the
    # final evaluation below share the same folds.
    # n_splits cannot be greater than the number of members in each class.
    cv = StratifiedKFold(n_splits=n_samples, shuffle=True, random_state=42)

    results = {}

    for key in oversampler_types:
        print(f'\n=== {key} ===')
        results[key] = {}

        print("k_neighbors:", k_neighbors)
        print("y.value_counts().min():", n_samples)

        # Choose appropriate oversampler based on sample size
        if n_samples < 6:  # Too few samples for SMOTEENN/SMOTETomek
            print(f"Only {n_samples} samples per class - using RandomOverSampler instead of {key}")
            oversampler = RandomOverSampler(random_state=42)
        else:
            sampling_strategy = {cls: target_samples_per_class for cls in np.unique(y)}

            if key == 'smoteenn':
                oversampler = SMOTEENN(
                    smote=SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors=k_neighbors),
                    random_state=42,
                    n_jobs=-1
                )
            elif key == 'smotetomek':
                oversampler = SMOTETomek(
                    smote=SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors=k_neighbors),
                    random_state=42,
                    n_jobs=-1
                )

        print("oversampler used: ", oversampler)

        # Check all model's performance
        for models_name, models_config in models.items():
            print('---')
            model_start_time = time.time()

            # Pipeline
            pipeline = Pipeline([
                ('oversample', oversampler),
                ('scaler',     StandardScaler()),
                ('selector',   SelectKBest(seeded_mutual_info, k=k_best)),
                ('model',      models_config['model'])
            ])
            # Prefix the parameter names so they address the 'model' step.
            param_grid = {f'model__{k}': v for k, v in models_config['params'].items()}

            # GridSearchCV
            try:
                grid_search = GridSearchCV(
                    estimator  = pipeline,
                    param_grid = param_grid,
                    cv         = cv,
                    scoring    = 'accuracy', # can explore this later for precision/recall
                    n_jobs     = -1,
                    verbose    = 0
                )
                grid_search.fit(X, y)

                # Accuracy on the same data the search was fitted on (optimistic).
                train_acc = grid_search.score(X, y)

                results[key][models_name] = {
                    'grid'          : grid_search,
                    # Class actually used; it may differ from the requested key.
                    'oversampler'   : type(oversampler).__name__,
                    'train_acc'     : train_acc,
                    'best_param'    : grid_search.best_params_,
                    'best_estimator': grid_search.best_estimator_,
                    'best_cv_score' : grid_search.best_score_,
                    'model_run_time': time.time() - model_start_time
                }

            except Exception as e:
                # Best effort: report the failure and move on to the next model.
                print(f"{models_name} ERROR: {str(e)}")
                continue

            print("-------end of model for loop iter---------")
        print("-------end of sampler for loop iter---------")
    print("---------end of execution---------")

    print("===Print results after the execution ends, nice looking table===")
    data = [
        {
            'sampler':        sampler_key,
            'oversampler':    run['oversampler'],
            'model':          model_name,
            'train_acc':      round(run['train_acc'], 2),
            'best_cv_score':  round(run['best_cv_score'], 2),
            'model_run_time': round(run['model_run_time'], 2)
        }
        for sampler_key, runs in results.items()
        for model_name, run in runs.items()
    ]

    if not data:
        # Every model raised; there is nothing to rank or evaluate.
        print("No model finished successfully for this feature set - skipping summary.")
        continue

    # Use a dedicated name: reusing `df` here would overwrite the raw dataset
    # and break any re-run of this cell or of the cells above.
    results_df = pd.DataFrame(data).sort_values(by='best_cv_score', ascending=False)
    print( results_df )

    # Get the first (best-scoring) row of the table
    first_sampler = results_df['sampler'].iloc[0]
    first_model   = results_df['model'].iloc[0]

    # Report the winner from the results instead of a hard-coded conclusion.
    print(f"Top performer for this feature set: {first_model} with {first_sampler} "
          f"(best CV score: {results_df['best_cv_score'].iloc[0]})")

    # Get the best estimator and params from results
    estimator   = results[first_sampler][first_model]['best_estimator']
    model_param = results[first_sampler][first_model]['best_param']

    print('-'*50)
    print("Best param: ", model_param)
    print('-'*50)
    print("\nBest estimator: ", estimator)
    print('-'*50)

    # Verify they match by extracting params from estimator
    best_estimator_params = estimator.get_params()
    tuned_params = {k: v for k, v in best_estimator_params.items() if k in model_param}

    print("\nBest params from results:", model_param)
    print('-'*50)
    print("\nBest params from estimator:", tuned_params)
    print('-'*50)

    print("===Define the best model/params to see if I get the same result===")

    print("===Use the best_estimator directly===")
    best_pipeline = estimator

    print("===Fit on the full data & get out-of-fold predictions===")
    best_pipeline.fit(X, y)
    # Predict each row with a model that did not see it during fitting.
    # Predicting on the rows used for fitting would make the report below
    # reflect memorisation rather than generalisation.
    y_pred = cross_val_predict(estimator=best_pipeline, X=X, y=y, cv=cv)

    print("===Evaluate===")
    cv_accuracy = cross_val_score(
        estimator = best_pipeline,
        X=X,
        y=y,
        scoring='accuracy',
        cv=cv
    )
    print('Mean CV Accuracy:', cv_accuracy.mean())

    report = classification_report(y, y_pred)
    print(report)
           

===STEP 0: Split X and y===
Total samples: 48
Total features: 107
Number of classes: 16
Class distribution in full dataset:
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
===Define oversampler TYPES===
min(10, X.shape[1], len(X) // 10) : 4

=== smoteenn ===
k_neighbors: 2
y.value_counts().min(): 3
Only 3 samples per class - using RandomOverSampler instead of smoteenn
oversampler used:  RandomOverSampler(random_state=42)
---
-------end of model for loop iter---------
---
-------end of model for loop iter---------
---
-------end of model for loop iter---------
---
-------end of model for loop iter---------
---
-------end of model for loop iter---------
---
-------end of model for loop iter---------
-------end of sampler for loop iter---------

=== smotetomek ===
k_neighbors: 2
y.value_counts().min(): 3
Only 3 samples per class - using RandomOverSampler instead of smotetomek
oversampler used:  RandomOverSampler(random_state=42)
---
-------end of model for loop iter---------
---
-------e