In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [4]:
# Load the VOC-ALS data sheet. header=1 makes the sheet's second row
# (0-indexed row 1) the column-name row.
df = pd.read_excel(
    "VOC-ALS/VOC-ALS.xlsx",
    sheet_name="VOC-ALS_Data",
    header=1,
)

In [5]:
# Sound tokens and per-sound measurements; they are combined into column
# names of the form "<metric>_<sound>".
vowels = [
    'A',
    'E',
    'I',
    'O',
    'U',
]
syllables = [
    'PA',
    'TA',
    'KA',
]
metrics = [
    'meanF0Hz',
    'stdevF0Hz',
    'HNR',
    'localJitter',
    'localShimmer',
]


In [6]:
# Column names "<metric>_<sound>", grouped by sound (vowels first, then
# syllables) with the metrics in their listed order inside each group.
acoustic_features = [
    f"{metric}_{sound}"
    for sound in vowels + syllables
    for metric in metrics
]

In [25]:
# Sanity check: number of acoustic feature columns (8 sounds x 5 metrics).
# Only the final expression of a cell is displayed, so the bare
# `acoustic_features` line that used to precede this one had no effect.
len(acoustic_features)

40

In [8]:
# Feature matrix: an independent copy of the acoustic columns, so later
# edits to X never write back into df.
X = df.loc[:, acoustic_features].copy()

# Binary target: 1 for ALS, 0 for HC. Any other category value maps to NaN.
label_codes = {'ALS': 1, 'HC': 0}
y = df['Category'].map(label_codes)

In [9]:
# Mean-impute missing acoustic values column by column.
# NOTE(review): the means are taken over every row, including rows that the
# later train/test split assigns to the test set.
column_means = X.mean()
X = X.fillna(column_means)

In [10]:
# Demographic covariates. These are added after the mean imputation, so any
# missing Age/Sex value (or a Sex value other than 'M'/'F') remains NaN.
sex_codes = {'M': 1, 'F': 0}
X['Age'] = df['Age (years)']
X['Sex'] = df['Sex'].map(sex_codes)

In [11]:
# Two derived ratios per sound:
#   jitter_shimmer_ratio = localJitter / localShimmer
#   f0_variability       = stdevF0Hz / meanF0Hz
for sound in vowels + syllables:
    jitter_col = f'localJitter_{sound}'
    shimmer_col = f'localShimmer_{sound}'
    f0_std_col = f'stdevF0Hz_{sound}'
    f0_mean_col = f'meanF0Hz_{sound}'

    X[f'jitter_shimmer_ratio_{sound}'] = X[jitter_col] / X[shimmer_col]
    X[f'f0_variability_{sound}'] = X[f0_std_col] / X[f0_mean_col]

In [12]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# Learn the standardisation statistics from the training rows only, then
# apply the same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Candidate classifiers keyed by display name. Every one is seeded with
# random_state=42 so repeated runs give the same fits.
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    'SVM': SVC(
        kernel='rbf',
        probability=True,
        random_state=42,
    ),
    'XGBoost': XGBClassifier(
        random_state=42,
    ),
    'Neural Network': MLPClassifier(
        hidden_layer_sizes=(100, 50),
        max_iter=500,
        random_state=42,
    ),
}

In [15]:
# Fit each candidate on the scaled training split, score it on the scaled
# test split, and record the test accuracy under the model's name.
results = {}
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.7097
              precision    recall  f1-score   support

           0       0.55      0.60      0.57        10
           1       0.80      0.76      0.78        21

    accuracy                           0.71        31
   macro avg       0.67      0.68      0.68        31
weighted avg       0.72      0.71      0.71        31

SVM Accuracy: 0.6774
              precision    recall  f1-score   support

           0       0.50      0.10      0.17        10
           1       0.69      0.95      0.80        21

    accuracy                           0.68        31
   macro avg       0.59      0.53      0.48        31
weighted avg       0.63      0.68      0.60        31

XGBoost Accuracy: 0.6452
              precision    recall  f1-score   support

           0       0.45      0.50      0.48        10
           1       0.75      0.71      0.73        21

    accuracy                           0.65        31
   macro avg       0.60      0.61      0.60        

In [16]:
# Name of the candidate with the highest recorded accuracy; on a tie the
# first such entry in `results` wins.
# NOTE(review): `results` holds test-split accuracies, so this selection is
# made on the test set.
best_model_name = max(results, key=lambda candidate: results[candidate])
print(f"\nTuning hyperparameters for {best_model_name}...")


Tuning hyperparameters for Random Forest...


In [17]:
# Pick the candidate with the highest recorded accuracy and grid-search its
# hyperparameters with 5-fold cross-validation on the scaled training split.
best_model_name = max(results, key=results.get)
print(f"\nTuning hyperparameters for {best_model_name}...")

# One (estimator factory, parameter grid) pair per tunable model. Factories
# are used so that only the selected estimator is actually constructed.
tuning_specs = {
    'XGBoost': (
        lambda: XGBClassifier(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
        },
    ),
    'Random Forest': (
        lambda: RandomForestClassifier(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
        },
    ),
    'SVM': (
        lambda: SVC(kernel='rbf', probability=True, random_state=42),
        {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.1, 0.01],
        },
    ),
}

# Any name not listed above falls through to the neural-network search.
neural_network_spec = (
    lambda: MLPClassifier(max_iter=500, random_state=42),
    {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate_init': [0.001, 0.01],
    },
)

make_estimator, param_grid = tuning_specs.get(best_model_name, neural_network_spec)
grid_search = GridSearchCV(make_estimator(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
best_pred = best_model.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test, best_pred)
print(f"Tuned {best_model_name} Accuracy: {best_accuracy:.4f}")
print(classification_report(y_test, best_pred))


Tuning hyperparameters for Random Forest...
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Tuned Random Forest Accuracy: 0.6774
              precision    recall  f1-score   support

           0       0.50      0.60      0.55        10
           1       0.79      0.71      0.75        21

    accuracy                           0.68        31
   macro avg       0.64      0.66      0.65        31
weighted avg       0.70      0.68      0.68        31



In [18]:
# Persist the tuned classifier together with the scaler it was trained
# behind; both are needed to score new samples. `joblib` is imported in the
# notebook's top import cell with the other dependencies.
joblib.dump(best_model, 'als_detection_model.pkl')
joblib.dump(scaler, 'als_detection_scaler.pkl')

['als_detection_scaler.pkl']

In [19]:
def predict_als(new_data):
    """
    Predict ALS from acoustic features

    Parameters:
    new_data (dict): Maps training column names to numeric values. The
        expected keys are the raw "<metric>_<sound>" acoustic features plus
        'Age' and 'Sex', with 'Sex' already encoded as in training
        (1 for 'M', 0 for 'F'). Keys that are absent, None or NaN are
        replaced by the corresponding column mean of ``X``. Keys that are
        not training columns are ignored. The engineered ratio features are
        always recomputed here, so supplied values for them are discarded.

    Returns:
    tuple: (prediction, probability) - the predicted class label
        (1 = ALS, 0 = HC) and the model's probability estimate for class 1.

    Raises:
    ValueError: if a supplied value cannot be converted to float.
    """
    # Convert to a one-row DataFrame laid out exactly like the training
    # matrix. reindex() creates every missing column as NaN (fillna alone
    # only fills columns that already exist), drops unknown keys, and fixes
    # the column order that the fitted scaler expects.
    new_df = pd.DataFrame([new_data]).reindex(columns=X.columns).astype(float)

    # Fill missing values with the training-matrix column means
    new_df = new_df.fillna(X.mean())

    # Add engineered features. These assignments overwrite the placeholder
    # values written by the fill above, in place, so column order is kept.
    for sound in vowels + syllables:
        new_df[f'jitter_shimmer_ratio_{sound}'] = new_df[f'localJitter_{sound}'] / new_df[f'localShimmer_{sound}']
        new_df[f'f0_variability_{sound}'] = new_df[f'stdevF0Hz_{sound}'] / new_df[f'meanF0Hz_{sound}']

    # Scale features with the scaler fitted on the training split
    new_scaled = scaler.transform(new_df)

    # Predict; column 1 of predict_proba belongs to class label 1
    prediction = best_model.predict(new_scaled)[0]
    probability = best_model.predict_proba(new_scaled)[0][1]

    return prediction, probability