In [1]:
import numpy as np
import pandas as pd
import os
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import IPython.display as ipd
from tqdm import tqdm

In [2]:
from hnr import *
from jitters import *
from shimmers import *
from sound import Waveform

In [3]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

In [4]:
data_path = "/home/bmis/Documents/AI-Workspace/ALS/bmis_data/All"

control_path = 'Control'
als_with_dysarthria_path = 'ALSwDysarthria'
als_without_dysarthria_path = 'ALSwoDysarthria'

control_dir = os.path.join(data_path, control_path)
control = os.listdir(control_dir)[0]

als_with_dir = os.path.join(data_path, als_with_dysarthria_path)
als_with_dys = os.listdir(als_with_dir)[1]

als_without_dir = os.path.join(data_path, als_without_dysarthria_path)
als_without_dys = os.listdir(als_without_dir)[1]

In [5]:
def extract_td_feature(audio_path):
    td_features = []
    _, sample_rate = librosa.load(audio_path)
    #print(audio_path)
    sound = Waveform(path=audio_path, sample_rate=sample_rate)

    td_features.append(sound.jitters()['localabsoluteJitter'])
    td_features.append(sound.jitters()['localJitter'])
    td_features.append(sound.jitters()['rapJitter'])
    td_features.append(sound.jitters()['ppq5Jitter'])

    td_features.append(sound.shimmers()['localShimmer'])
    td_features.append(sound.shimmers()['localdbShimmer'])
    td_features.append(sound.shimmers()['apq3Shimmer'])
    td_features.append(sound.shimmers()['apq5Shimmer'])
    td_features.append(sound.shimmers()['apq11Shimmer'])

    _ =(sound.hnr())
    hnr = sound.hnr()
    td_features.append(hnr)

    return np.array(td_features)

def get_all_td_features(data_path, label):
    data = []
    labels = []

    for file in tqdm(os.listdir(data_path)):
        #print(f'loading file: {file}')
        try:
            feature = extract_td_feature(os.path.join(data_path, file))
            data.append(feature)
            labels.append(label)
        except:
            print("Error encountered while parsing file: ", file)
            continue
    return data, labels

In [6]:
def extract_MFCC_features(audio_path):
    audio, sample_rate = librosa.load(audio_path)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=128)
    #mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate)
    mfccs_scaled = np.mean(mfccs.T,axis=0)
    return mfccs_scaled


def get_MFCC_data(data_path, label):
    data = []
    labels = []
    for file in tqdm(os.listdir(data_path)):
        try:
            feature = extract_MFCC_features(os.path.join(data_path, file))
            data.append(feature)
            labels.append(label)
        except:
            print("Error encountered while parsing file: ", file)
            continue
    return data, labels

## 2 Class Classification

In [7]:
#control_data, control_labels = get_all_td_features(control_dir, 0)
#als_with_data, als_with_labels = get_all_td_features(als_with_dir, 1)
#als_without_data, als_without_labels = get_all_td_features(als_without_dir, 1)

control_data, control_labels = get_MFCC_data(control_dir, 0)
als_with_data, als_with_labels = get_MFCC_data(als_with_dir, 1)
als_without_data, als_without_labels = get_MFCC_data(als_without_dir, 1)

X = np.concatenate([control_data, als_with_data, als_without_data], axis=0)
y = np.concatenate([control_labels, als_with_labels, als_without_labels], axis=0)
X = np.array(X)


# Data Cleaning

X[X == None] = np.nan
X = np.nan_to_num(X, nan=0.0)
df = pd.DataFrame(X)
df.fillna(0.0, inplace=True)
X = df.to_numpy()

X, y = shuffle(X, y, random_state=42)
#X = X.reshape(-1,10,1)

100%|██████████| 199/199 [00:05<00:00, 39.50it/s]
100%|██████████| 291/291 [00:11<00:00, 25.69it/s]
100%|██████████| 176/176 [00:04<00:00, 38.20it/s]


In [8]:
scorers = ['accuracy', 'precision', 'recall', 'f1']

## Random Forest Classifier

In [9]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cv_scores = cross_validate(rf_classifier, X, y, cv=10)

print("Random Forest Classifier:")
print(f"Cross-validation scores: {rf_cv_scores['test_score']}")
print(f"Mean accuracy: {rf_cv_scores['test_score'].mean()}\n")


scores = cross_validate(rf_classifier, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 2 Class RF:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Random Forest Classifier:
Cross-validation scores: [0.8358209  0.89552239 0.94029851 0.88059701 0.89552239 0.82089552
 0.84848485 0.87878788 0.75757576 0.83333333]
Mean accuracy: 0.858683853459973
Cross-validation scores for 2 Class RF:
fit_time: 0.4223
score_time: 0.0088
test_accuracy: 0.8587
test_precision: 0.8516
test_recall: 0.9699
test_f1: 0.9063


## Logistic Regression

In [10]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg_cv_scores = cross_validate(log_reg, X, y, cv=10)

print("Logistic Regression:")
print(f"Cross-validation scores: {log_reg_cv_scores['test_score']}")
print(f"Mean accuracy: {log_reg_cv_scores['test_score'].mean()}\n")

scores = cross_validate(log_reg, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 2 Class LR:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression:
Cross-validation scores: [0.80597015 0.8358209  0.89552239 0.82089552 0.89552239 0.8358209
 0.81818182 0.92424242 0.83333333 0.83333333]
Mean accuracy: 0.849864314789688


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores for 2 Class LR:
fit_time: 0.5863
score_time: 0.0085
test_accuracy: 0.8499
test_precision: 0.8875
test_recall: 0.9016
test_f1: 0.8940


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Linear Discriminant Analysis

In [11]:
lda = LinearDiscriminantAnalysis()
lda_cv_scores = cross_validate(lda, X, y, cv=10)

print("Linear Discriminant Analysis:")
print(f"Cross-validation scores: {lda_cv_scores['test_score']}")
print(f"Mean accuracy: {lda_cv_scores['test_score'].mean()}\n")

scores = cross_validate(lda, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 2 Class LDA:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Linear Discriminant Analysis:
Cross-validation scores: [0.85074627 0.80597015 0.79104478 0.82089552 0.89552239 0.8358209
 0.81818182 0.89393939 0.78787879 0.87878788]
Mean accuracy: 0.8378787878787879
Cross-validation scores for 2 Class LDA:
fit_time: 0.0312
score_time: 0.0168
test_accuracy: 0.8379
test_precision: 0.8708
test_recall: 0.9059
test_f1: 0.8868


## Multi-layer Perceptron

In [12]:
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)
mlp_cv_scores = cross_validate(mlp_classifier, X, y, cv=10)

print("Neural Network (MLPClassifier):")
print(f"Cross-validation scores: {mlp_cv_scores['test_score']}")
print(f"Mean accuracy: {mlp_cv_scores['test_score'].mean()}\n")

scores = cross_validate(mlp_classifier, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 2 Class MLP:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Neural Network (MLPClassifier):
Cross-validation scores: [0.92537313 0.95522388 0.92537313 0.92537313 0.91044776 0.92537313
 0.90909091 0.84848485 0.90909091 0.89393939]
Mean accuracy: 0.9127770239710538
Cross-validation scores for 2 Class MLP:
fit_time: 1.9456
score_time: 0.0106
test_accuracy: 0.9128
test_precision: 0.9330
test_recall: 0.9442
test_f1: 0.9382


## Support Vector Machine with RBF Kernel

In [13]:
svm_rbf = SVC(kernel='rbf', gamma='scale', random_state=42)
svm_rbf_cv_scores = cross_validate(svm_rbf, X, y, cv=10)

print("Support Vector Machine with RBF Kernel:")
print(f"Cross-validation scores: {svm_rbf_cv_scores['test_score']}")
print(f"Mean accuracy: {svm_rbf_cv_scores['test_score'].mean()}\n")

scores = cross_validate(svm_rbf, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 2 Class SVM RBF:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Support Vector Machine with RBF Kernel:
Cross-validation scores: [0.70149254 0.70149254 0.70149254 0.70149254 0.70149254 0.70149254
 0.71212121 0.6969697  0.6969697  0.6969697 ]
Mean accuracy: 0.7011985526910901
Cross-validation scores for 2 Class SVM RBF:
fit_time: 0.0156
score_time: 0.0066
test_accuracy: 0.7012
test_precision: 0.7012
test_recall: 1.0000
test_f1: 0.8244


## Support Vector Machine with Linear Kernel

In [14]:
svm_linear = SVC(kernel='linear', random_state=42)
svm_linear_cv_scores = cross_validate(svm_linear, X, y, cv=10)

print("Support Vector Machine with Linear Kernel:")
print(f"Cross-validation scores: {svm_linear_cv_scores['test_score']}")
print(f"Mean accuracy: {svm_linear_cv_scores['test_score'].mean()}\n")

scores = cross_validate(svm_linear, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 2 Class SVM Linear Kernel:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Support Vector Machine with Linear Kernel:
Cross-validation scores: [0.82089552 0.79104478 0.85074627 0.8358209  0.86567164 0.82089552
 0.84848485 0.86363636 0.81818182 0.84848485]
Mean accuracy: 0.836386250565355
Cross-validation scores for 2 Class SVM Linear Kernel:
fit_time: 2.6810
score_time: 0.0043
test_accuracy: 0.8364
test_precision: 0.8908
test_recall: 0.8759
test_f1: 0.8825


## 3 Class Classification

In [15]:
#control_data, control_labels = get_all_td_features(control_dir, 0)
#als_with_data, als_with_labels = get_all_td_features(als_with_dir, 1)
#als_without_data, als_without_labels = get_all_td_features(als_without_dir, 2)


control_data, control_labels = get_MFCC_data(control_dir, 0)
als_with_data, als_with_labels = get_MFCC_data(als_with_dir, 1)
als_without_data, als_without_labels = get_MFCC_data(als_without_dir, 2)


X = np.concatenate([control_data, als_with_data, als_without_data], axis=0)
y = np.concatenate([control_labels, als_with_labels, als_without_labels], axis=0)
X = np.array(X)


# Data Cleaning

X[X == None] = np.nan
X = np.nan_to_num(X, nan=0.0)
df = pd.DataFrame(X)
df.fillna(0.0, inplace=True)
X = df.to_numpy()

X, y = shuffle(X, y, random_state=42)
#X = X.reshape(-1,10,1)

100%|██████████| 199/199 [00:05<00:00, 38.76it/s]
100%|██████████| 291/291 [00:11<00:00, 25.18it/s]
100%|██████████| 176/176 [00:04<00:00, 35.45it/s]


In [16]:
scorers = ['precision_weighted', 'recall_weighted', 'f1_weighted']

## Random Forest Classifier

In [17]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cv_scores = cross_validate(rf_classifier, X, y, cv=10)

print("Random Forest Classifier:")
print(f"Cross-validation scores: {rf_cv_scores['test_score']}")
print(f"Mean accuracy: {rf_cv_scores['test_score'].mean()}\n")

scores = cross_validate(rf_classifier, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 3 Class RF:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Random Forest Classifier:
Cross-validation scores: [0.94029851 0.89552239 0.91044776 0.82089552 0.8358209  0.82089552
 0.84848485 0.8030303  0.75757576 0.87878788]
Mean accuracy: 0.8511759384893713
Cross-validation scores for 3 Class RF:
fit_time: 0.3770
score_time: 0.0084
test_precision_weighted: 0.8581
test_recall_weighted: 0.8512
test_f1_weighted: 0.8496


## Logistic Regression

In [18]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg_cv_scores = cross_validate(log_reg, X, y, cv=10)

print("Logistic Regression:")
print(f"Cross-validation scores: {log_reg_cv_scores['test_score']}")
print(f"Mean accuracy: {log_reg_cv_scores['test_score'].mean()}\n")

scores = cross_validate(log_reg, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 3 Class LR:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression:
Cross-validation scores: [0.74626866 0.80597015 0.82089552 0.80597015 0.7761194  0.80597015
 0.84848485 0.81818182 0.72727273 0.77272727]
Mean accuracy: 0.7927860696517414


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores for 3 Class LR:
fit_time: 0.3067
score_time: 0.0035
test_precision_weighted: 0.7979
test_recall_weighted: 0.7928
test_f1_weighted: 0.7918


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Linear Discriminant Analysis

In [19]:
lda = LinearDiscriminantAnalysis()
lda_cv_scores = cross_validate(lda, X, y, cv=10)

print("Linear Discriminant Analysis:")
print(f"Cross-validation scores: {lda_cv_scores['test_score']}")
print(f"Mean accuracy: {lda_cv_scores['test_score'].mean()}\n")

scores = cross_validate(lda, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 3 Class LDA:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Linear Discriminant Analysis:
Cross-validation scores: [0.82089552 0.79104478 0.73134328 0.71641791 0.79104478 0.76119403
 0.81818182 0.84848485 0.6969697  0.81818182]
Mean accuracy: 0.7793758480325645
Cross-validation scores for 3 Class LDA:
fit_time: 0.0217
score_time: 0.0073
test_precision_weighted: 0.7869
test_recall_weighted: 0.7794
test_f1_weighted: 0.7771


## Multi-layer Perceptron

In [20]:
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)
mlp_cv_scores = cross_validate(mlp_classifier, X, y, cv=10)

print("Neural Network (MLPClassifier):")
print(f"Cross-validation scores: {mlp_cv_scores['test_score']}")
print(f"Mean accuracy: {mlp_cv_scores['test_score'].mean()}\n")

scores = cross_validate(mlp_classifier, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 3 Class MLP:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Neural Network (MLPClassifier):
Cross-validation scores: [0.86567164 0.95522388 0.89552239 0.86567164 0.8358209  0.86567164
 0.95454545 0.81818182 0.83333333 0.81818182]
Mean accuracy: 0.8707824513794662
Cross-validation scores for 3 Class MLP:
fit_time: 2.4773
score_time: 0.0077
test_precision_weighted: 0.8745
test_recall_weighted: 0.8708
test_f1_weighted: 0.8703


## Support Vector Machine with RBF Kernel

In [21]:
svm_rbf = SVC(kernel='rbf', gamma='scale', random_state=42)
svm_rbf_cv_scores = cross_validate(svm_rbf, X, y, cv=10)

print("Support Vector Machine with RBF Kernel:")
print(f"Cross-validation scores: {svm_rbf_cv_scores['test_score']}")
print(f"Mean accuracy: {svm_rbf_cv_scores['test_score'].mean()}\n")

scores = cross_validate(svm_rbf, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 3 Class SVM with RBF:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Support Vector Machine with RBF Kernel:
Cross-validation scores: [0.49253731 0.49253731 0.44776119 0.46268657 0.41791045 0.43283582
 0.43939394 0.48484848 0.45454545 0.45454545]
Mean accuracy: 0.4579601990049751


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores for 3 Class SVM with RBF:
fit_time: 0.0208
score_time: 0.0070
test_precision_weighted: 0.3200
test_recall_weighted: 0.4580
test_f1_weighted: 0.3447


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Support Vector Machine with Linear Kernel

In [22]:
svm_linear = SVC(kernel='linear', random_state=42)
svm_linear_cv_scores = cross_validate(svm_linear, X, y, cv=10)

print("Support Vector Machine with Linear Kernel:")
print(f"Cross-validation scores: {svm_linear_cv_scores['test_score']}")
print(f"Mean accuracy: {svm_linear_cv_scores['test_score'].mean()}\n")

scores = cross_validate(svm_linear, X, y, cv=10, scoring=scorers)
print("Cross-validation scores for 3 Class SVM with Linear Kernel:")
for metric_name, result in scores.items():
    print(f"{metric_name}: {result.mean():.4f}")

Support Vector Machine with Linear Kernel:
Cross-validation scores: [0.73134328 0.68656716 0.80597015 0.7761194  0.79104478 0.71641791
 0.84848485 0.87878788 0.6969697  0.77272727]
Mean accuracy: 0.7704432383536861
Cross-validation scores for 3 Class SVM with Linear Kernel:
fit_time: 1.7603
score_time: 0.0040
test_precision_weighted: 0.7791
test_recall_weighted: 0.7704
test_f1_weighted: 0.7703
