Gait Analysis (baseline approach, as in gait_final_output_v1_1_modified_features.ipynb, with the R1, R2, R3 trial columns transformed into mean, median and std, BUT USING ONLY 107 features at a time: either mean, median or std)

Aim: 
- Multi-class classification (16 subjects) using train/test split with SMOTE-based oversampling in pipeline - the first attempt/baseline approach.

Data Source/Credit: 
- https://archive.ics.uci.edu/dataset/604/gait+classification

Dataset
- Size:         48 samples (16 subjects * 3 trials)
- Features:     The R1, R2 and R3 values of each feature are aggregated into mean, median and std, deriving 321 new features for classification. ONLY 107 ARE USED AT A TIME (EITHER MEAN, MEDIAN OR STD)
- Target:       Subject identification (16 classes)
- Missing data: 1 NA value in CycleTime_R2 (imputed with median)
- Split:        67% train / 33% test -> ~2 training samples/class + 1 test sample/class 

Conclusion from this analysis: 
- same performance as gait_final_output_v1_1.ipynb

In [2]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict
from sklearn.model_selection import LeaveOneOut, LeaveOneGroupOut, StratifiedKFold, GroupKFold
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import NearestNeighbors

from imblearn.pipeline import Pipeline

# these clean up the noisy data
from imblearn.combine import SMOTETomek, SMOTEENN

# these do not clean up the noisy data
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE

# avoid as it only duplicates data
# from imblearn.over_sampling import RandomOverSampler 


from sklearn.feature_selection import SelectKBest, mutual_info_classif


In [4]:
# Load data
# Reads the pre-processed gait table (one row per subject/trial sample) from
# the notebook's working directory.

df = pd.read_csv('gait_final_output_updated.csv')
print(f'df.shape: {df.shape}')
print("---")
# Seeded so the preview shows the same rows on every Restart & Run All.
df.sample(3, random_state=42)


df.shape: (48, 322)
---


Unnamed: 0,Speed_R1,Variability_R1,Symmetry_R1,HeelPressTime_R1,CycleTime_R1,Cadence_R1,Posture_R1,Oscillation_R1,Loading_R1,FootPress_R1,...,P99_R3,P100_R3,P101_R3,P102_R3,P103_R3,P104_R3,P105_R3,P106_R3,P107_R3,Subject_ID_Y
11,1.58,0.0,-7.4,1.114,1.093,1.045,1.035,0.56,0.401,0.055,...,0.015,0.035,0.019,0.105,0.162,0.213,0.227,0.867,1.153,3
30,1.32,6.19,1.9,1.169,1.113,1.105,1.113,0.461,0.069,0.073,...,0.015,0.027,0.021,0.125,0.16,0.207,0.224,0.857,1.167,10
37,1.26,0.0,-3.2,1.128,1.146,1.075,1.08,0.421,0.597,0.056,...,0.012,0.028,0.018,0.06,0.105,0.17,0.171,0.918,1.09,12


In [5]:
# Check for class imbalance

# Number of samples per subject; computed once and reused below.
class_counts = df['Subject_ID_Y'].value_counts()
print( class_counts.to_list() )

min_y_count = class_counts.min()
max_y_count = class_counts.max()

# Max-to-min ratio: 1.0 means perfectly balanced, larger means more imbalance.
# (A min/max ratio is always <= 1 and could never exceed the threshold of 5.)
imbalance_ratio = max_y_count / min_y_count

if imbalance_ratio > 5:
    print(f"Classes are imbalanced. Max-to-min count ratio is: {imbalance_ratio}")
else:
    print(f"Classes are balanced. Max-to-min count ratio is: {imbalance_ratio}")
    

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Classes are balanced. Max-to-min count ratio is: 1.0


In [10]:
# Build three new dataframes holding, for every feature, the row-wise mean,
# median and std across its trial columns (<feature>_R1, <feature>_R2, <feature>_R3).

# identify columns to aggregate (excluding target)
cols_to_aggregate = [col for col in df.columns if col != 'Subject_ID_Y']

# split each column name on underscores; element 0 is the feature prefix
prefixes = [col.split('_') for col in cols_to_aggregate]  # list of lists

# Group columns by their EXACT prefix. A startswith() match would also pull in
# longer names sharing the same leading characters (e.g. 'P1' matching 'P10_R1'),
# and the resulting oversized group would then be dropped without any message.
# A dict keeps first-seen order and visits each feature once instead of once
# per trial column.
cols_by_prefix = {}
for col, parts in zip(cols_to_aggregate, prefixes):
    cols_by_prefix.setdefault(parts[0], []).append(col)

# one dictionary of new columns per statistic
new_columns_mean = {}
new_columns_median = {}
new_columns_std = {}

for prefix, matching_cols in cols_by_prefix.items():
    if len(matching_cols) < 3:
        print(f"Either the matching_cols list is empty or has less than 3 columns")

    elif len(matching_cols) > 3:
        # Report instead of silently skipping, so a lost feature is visible.
        print(f"Skipping prefix '{prefix}': expected 3 columns but found {len(matching_cols)}")

    else:  # exactly 3 columns: R1, R2, R3
        # calculate mean, median and std across the three trials for each row
        new_columns_mean[f'{prefix}_mean']     = df[matching_cols].mean(axis=1)
        new_columns_median[f'{prefix}_median'] = df[matching_cols].median(axis=1)
        new_columns_std[f'{prefix}_std']       = df[matching_cols].std(axis=1)

# create one dataframe per statistic from the dictionaries
df_mean   = pd.DataFrame(new_columns_mean)
df_median = pd.DataFrame(new_columns_median)
df_std    = pd.DataFrame(new_columns_std)

print(f"df_mean.shape:{df_mean.shape} | df_median.shape:{df_median.shape} | df_std.shape:{df_std.shape}")


df_mean.shape:(48, 107) | df_median.shape:(48, 107) | df_std.shape:(48, 107)


In [21]:
# STEP 0: Split X and y
# For each aggregated feature set (mean / median / std): make a stratified
# train/test split, build an oversample -> scale -> select -> GaussianNB
# pipeline for each requested oversampler, and print classification reports.

y =  df['Subject_ID_Y']

for X in [df_mean, df_median, df_std]:
    # process each dataframe one by one
    print('-'*50)
    print(f"Total samples: {len(y)}")
    print(f"Total features: {X.shape[1]}")
    print(f"Number of classes: {len(y.unique())}")
    print(f"Class distribution in full dataset:")
    print(y.value_counts().sort_index().to_list())

    # Define oversampler TYPES
    oversampler_types = ['smoteenn', 'smotetomek']
    oversampler_count = len(oversampler_types)

    # Train-test split. The split depends only on X, y and the fixed seed, so
    # it is done once per feature set rather than once per oversampler.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42 )

    # Get sample counts in train data
    train_class_counts = y_train.value_counts()
    n_samples = train_class_counts.min()
    # SMOTE needs k_neighbors < samples in the smallest class; cap at 3, floor at 1
    k_neighbors = max(1, min(3, n_samples - 1))

    # Per-class target size for SMOTE. Oversampling every class up to the
    # current majority count is the smallest target valid for all classes.
    # NOTE(review): this name was referenced but never defined; confirm the
    # intended target size.
    target_samples_per_class = int(train_class_counts.max())

    # Number of features kept by SelectKBest; floor of 1 so small training
    # sets (fewer than 10 rows) do not select zero features.
    max_k = max(1, min(10, X_train.shape[1], len(X_train) // 10))

    for key in oversampler_types:
        print(f'\n=== {key} ===')
        print("y_train distribution:", y_train.value_counts().sort_index().to_list())
        print("k_neighbors:", k_neighbors)
        print("y_train.value_counts().min():", n_samples)

        # Choose appropriate oversampler based on sample size.
        # When n_samples < 6 both keys get the same RandomOverSampler, so the
        # two iterations build identical pipelines.
        if n_samples < 6:  # Too few samples for SMOTEENN/SMOTETomek
            print(f"Only {n_samples} samples per class - using RandomOverSampler instead of {key}")
            oversampler = RandomOverSampler(random_state=42)
        else:
            sampling_strategy = {cls: target_samples_per_class for cls in np.unique(y_train)}

            if key == 'smoteenn':
                oversampler = SMOTEENN(
                    smote=SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors=k_neighbors),
                    random_state=42,
                    n_jobs=-1
                )
            elif key == 'smotetomek':
                oversampler = SMOTETomek(
                    smote=SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors=k_neighbors),
                    random_state=42,
                    n_jobs=-1
                )

        print("oversampler used: ", oversampler)

        # Pipeline with GaussianNB. Resampling is a pipeline step, so it is
        # applied only when fitting, never when predicting.
        # NOTE(review): mutual_info_classif is called without a random_state,
        # so the selected features may differ between runs - confirm whether
        # it should be seeded.
        pipeline = Pipeline([
            ('oversample', oversampler),
            ('scaler', StandardScaler()),
            ('selector', SelectKBest(mutual_info_classif, k=max_k)),
            ('model', GaussianNB())
        ])

        try:
            # Check if we have enough samples for cross-validation
            if n_samples < 3:
                print(f"Skipping cross-validation - only {n_samples} samples per class")
                print("Training on training set and evaluating on test set only...")
            else:
                # Cross-validation on the training split only
                n_splits = min(5, n_samples)
                cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

                y_pred_train = cross_val_predict(pipeline, X_train, y_train, cv=cv, n_jobs=-1)
                print("\nTraining CV Report:")
                print(classification_report(y_train, y_pred_train, zero_division=0))

            # Final evaluation: fit on the full training split, score on the test split
            pipeline.fit(X_train, y_train)
            y_pred_test = pipeline.predict(X_test)

            print("\nTest Report:")
            print(classification_report(y_test, y_pred_test, zero_division=0))

        except ValueError as e:
            # Report the failure with context and move on to the next oversampler
            print(f"Error with {key}: {e}")
            print("Debug info:")
            print(f"  - X_train shape: {X_train.shape}")
            print(f"  - y_train distribution: {y_train.value_counts().to_dict()}")

--------------------------------------------------
Total samples: 48
Total features: 107
Number of classes: 16
Class distribution in full dataset:
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

=== smoteenn ===
y_train distribution: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
k_neighbors: 1
y_train.value_counts().min(): 2
Only 2 samples per class - using RandomOverSampler instead of smoteenn
oversampler used:  RandomOverSampler(random_state=42)
Skipping cross-validation - only 2 samples per class
Training on training set and evaluating on test set only...

Test Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1
           2       0.50      1.00      0.67         1
           3       0.50      1.00      0.67         1
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00         1
           6       0.20      1.00    