In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import shap

# -------------------------
# Common Parameters
# -------------------------
target_task = "read text aloud"

def extract_rows(folder_path, label):
    all_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    rows = []            

    for file in all_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        
        # Filter task
        filtered = df[df['recording_name'] == target_task]
        if not filtered.empty:
            row = filtered.iloc[0]
            row['label'] = label  # Add class label
            rows.append(row)
    
    return rows

DATA_DIR = "../data"

# -------------------------
# Paths to both classes
# -------------------------
confirmed_path = os.path.join(DATA_DIR, "raw", "Confirmed HF Patients")
suspected_path = os.path.join(DATA_DIR, "raw", "Suspected HF Patients")

# Extract data
confirmed_rows = extract_rows(confirmed_path, label=1)
suspected_rows = extract_rows(suspected_path, label=0)

# Combine into one DataFrame
df_all = pd.DataFrame(confirmed_rows + suspected_rows)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['label'] = label  # Add class label
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['label'] = label  # Add class label
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['label'] = label  # Add class label
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['label'] = label  # Add class label
A value 

In [18]:
# Drop non-feature columns
df_all = df_all.drop(['subject_number', 'recording_name', 'total_duration'], axis=1, errors='ignore')

# Shuffle data
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

# -------------------------
# Classification
# -------------------------

# Separate features and labels
X = df_all.drop('label', axis=1)
y = df_all['label']

# Handle missing values if any
# X = X.fillna(X.mean())

# Check if any missing values
print(X.isnull().values.any())

# Handle missing values using KNNImputer
imputer = KNNImputer(n_neighbors=2, weights="uniform")
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)

# Check if any missing values 
print(X.isnull().values.any())

# Start Leave-One-Out approach
loo = LeaveOneOut()

y_true = []
y_pred = []
y_pred_reduced = []

# To store SHAP values and corresponding test samples
all_shap_values = []
all_test_samples = []

for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Model
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Store prediction
    y_true.append(y_test.iloc[0])
    y_pred.append(pred[0])

    # SHAP explainer
    explainer = shap.TreeExplainer(clf, feature_perturbation="tree_path_dependent")
    shap_values = explainer.shap_values(X_train)

        
    # Save SHAP values and test sample
    all_shap_values.append(shap_values)  # shape: (1, n_features, n_classes)
    all_test_samples.append(X_test)


    # We'll compute mean absolute SHAP values for each feature across all classes
    mean_abs_shap = np.abs(shap_values).mean(axis=(0, 2))  # shape: (num_features,)
    assert mean_abs_shap.shape[0] == X_train.shape[-1]
    # Get non-zero SHAP features
    threshold = 0.006
    non_zero_indices = np.where(mean_abs_shap > threshold)[0]
    print("The non zero indices")
    print(non_zero_indices)
    non_zero_features = X_test.columns[non_zero_indices]

    print(f"Selected {len(non_zero_features)} non-zero SHAP features:")
    print(non_zero_features.tolist())

    ### Now use these reduced features to train the classifier again ###
    # Here we are transforming input X according to the reduced features
    X_train_reduced = X_train[non_zero_features]
    X_test_reduced = X_test[non_zero_features]

      
    
    # Model
    clf_reduced = RandomForestClassifier(random_state=42)

    clf_reduced.fit(X_train_reduced, y_train)
    pred_reduced = clf_reduced.predict(X_test_reduced)
    
    y_pred_reduced.append(pred_reduced[0])

     
    
# Combine all SHAP values and samples after the loop
all_shap_values = np.concatenate(all_shap_values, axis=0)  # shape: (n_samples, n_features, n_classes)
all_test_samples = pd.concat(all_test_samples, axis=0)

# Final evaluation
accuracy = accuracy_score(y_true, y_pred)
print(f"Leave-One-Out Accuracy Before Feature Reduction: {accuracy:.4f}")

report = classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'])
print("\nClassification Report Before Feature Reduction:")
print(report)

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("\n🧾 Confusion Matrix Before Feature Reduction:")
print(cm)

# Final Evaluation after Reduced Features
accuracy_reduced_features = accuracy_score(y_true, y_pred_reduced)
print(f"Leave-One-Out Accuracy After Feature Reduction: {accuracy_reduced_features:.4f}")


# Evaluation
report_reduced_features = classification_report(y_true, y_pred_reduced, target_names=['Class 0', 'Class 1'])

# Generate the report
print("\n Classification Report After Reduced Features:")
print(report_reduced_features)

# Create the confusion matrix
cm_reduced_features = confusion_matrix(y_true, y_pred_reduced)

# Print the Confusion Matrix
print("\n🧾 Confusion Matrix After Feature Reduction:")
print(cm_reduced_features)






True
False
The non zero indices
[ 7  9 11 13 17 18 22 26 29 31 42 54 55 58 59 60 66 73 75 78 79 83 91 92]
Selected 24 non-zero SHAP features:
['whole_second_longest_continuous_phonation', 'whole_std_phonation_length', 'whole_pitch_mean', 'whole_loudness_mean', 'whole_mfcc_2_mean', 'whole_mfcc_2_std', 'whole_mfcc_4_std', 'whole_mfcc_6_std', 'whole_mfcc_8_mean', 'whole_mfcc_9_mean', 'whole_cpp_std', '5-95_second_longest_continuous_phonation', '5-95_third_longest_continuous_phonation', '5-95_pitch_mean', '5-95_pitch_std', '5-95_loudness_mean', '5-95_mfcc_3_mean', '5-95_mfcc_6_std', '5-95_mfcc_7_std', '5-95_mfcc_9_mean', '5-95_mfcc_9_std', '5-95_mfcc_11_std', '5-95_stdev_f0', '5-95_jitter']
The non zero indices
[ 7  9 11 12 13 18 22 25 26 31 36 40 42 54 55 58 59 60 66 73 79 80 81 83
 89 91]
Selected 26 non-zero SHAP features:
['whole_second_longest_continuous_phonation', 'whole_std_phonation_length', 'whole_pitch_mean', 'whole_pitch_std', 'whole_loudness_mean', 'whole_mfcc_2_std', 'whole_m