## ML IN BIOMED SIGNALS

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

## RANDOM FOREST

In [8]:
# Function to load data
def load_data(filepath):
    return pd.read_csv(filepath)

# Function to process data, train and evaluate model with Grid Search
def process_and_evaluate_with_grid_search(data):
    # Splitting data into features and target
    X = data.drop('label', axis=1)
    y = data['label']
    
    # Splitting the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Defining the parameter grid for Grid Search
    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],  # Maximum number of levels in each decision tree
        'min_samples_split': [2, 5, 10],  # Minimum number of data points placed in a node before the node is split
        'min_samples_leaf': [1, 2, 4]  # Minimum number of data points allowed in a leaf node
    }
    
    # Creating the Random Forest Classifier
    model = RandomForestClassifier(random_state=42)
    
    # Setting up the GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_weighted', verbose=1)
    
    # Running Grid Search to find the best parameters
    grid_search.fit(X_train, y_train)
    
    # Best estimator found by Grid Search
    best_model = grid_search.best_estimator_
    
    # Making predictions with the best model
    y_pred = best_model.predict(X_test)
    
    # Evaluating the model
    f1 = f1_score(y_test, y_pred, average='weighted')
    report = classification_report(y_test, y_pred)
    
    return f1, report, grid_search.best_params_

# Load both datasets
original_data = load_data('gender_audio_features_Final_greeshma.csv')
denoised_data = load_data('gender_audio_features_denoised_greeshma.csv')

# Evaluate the original dataset
f1_original, report_original, best_params_original = process_and_evaluate_with_grid_search(original_data)
print("F1 Score for Original Dataset:", f1_original)
print("Classification Report for Original Dataset:\n", report_original)
print("Best Parameters for Original Dataset:", best_params_original)

# Evaluate the denoised dataset
f1_denoised, report_denoised, best_params_denoised = process_and_evaluate_with_grid_search(denoised_data)
print("F1 Score for Denoised Dataset:", f1_denoised)
print("Classification Report for Denoised Dataset:\n", report_denoised)
print("Best Parameters for Denoised Dataset:", best_params_denoised)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
F1 Score for Original Dataset: 0.8777692895339955
Classification Report for Original Dataset:
               precision    recall  f1-score   support

      female       1.00      0.74      0.85        19
        male       0.82      1.00      0.90        23

    accuracy                           0.88        42
   macro avg       0.91      0.87      0.88        42
weighted avg       0.90      0.88      0.88        42

Best Parameters for Original Dataset: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Fitting 5 folds for each of 108 candidates, totalling 540 fits
F1 Score for Denoised Dataset: 0.8306595365418895
Classification Report for Denoised Dataset:
               precision    recall  f1-score   support

      female       0.87      0.72      0.79        18
        male       0.81      0.92      0.86        24

    accuracy                           0.83        42
   macro avg  

## SVM

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score

In [None]:
# Function to load data
def load_data(filepath):
    return pd.read_csv(filepath)

# Function to process data, train and evaluate model with Grid Search
def process_and_evaluate_with_grid_search(data):
    # Splitting data into features and target
    X = data.drop('label', axis=1)
    y = data['label']
    
    # Splitting the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Defining the parameter grid for Grid Search
    param_grid = {
        'C': [0.1, 1, 10, 100],  # Regularization parameter
        'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        'kernel': ['rbf', 'poly', 'sigmoid']  # Type of the kernel
    }
    
    # Creating the SVM Classifier
    model = SVC(random_state=42)
    
    # Setting up the GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_weighted', verbose=1)
    
    # Running Grid Search to find the best parameters
    grid_search.fit(X_train, y_train)
    
    # Best estimator found by Grid Search
    best_model = grid_search.best_estimator_
    
    # Making predictions with the best model
    y_pred = best_model.predict(X_test)
    
    # Evaluating the model
    f1 = f1_score(y_test, y_pred, average='weighted')
    report = classification_report(y_test, y_pred)
    
    return f1, report, grid_search.best_params_

# Load both datasets
original_data = load_data('gender_audio_features_Final_greeshma.csv')
denoised_data = load_data('gender_audio_features_denoised_greeshma.csv')


# Evaluate the original dataset
f1_original, report_original, best_params_original = process_and_evaluate_with_grid_search(original_data)
print("F1 Score for Original Dataset:", f1_original)
print("Classification Report for Original Dataset:\n", report_original)
print("Best Parameters for Original Dataset:", best_params_original)

# Evaluate the denoised dataset
f1_denoised, report_denoised, best_params_denoised = process_and_evaluate_with_grid_search(denoised_data)
print("F1 Score for Denoised Dataset:", f1_denoised)
print("Classification Report for Denoised Dataset:\n", report_denoised)
print("Best Parameters for Denoised Dataset:", best_params_denoised)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
F1 Score for Original Dataset: 0.8571428571428571
Classification Report for Original Dataset:
               precision    recall  f1-score   support

      female       0.78      0.95      0.86        19
        male       0.95      0.78      0.86        23

    accuracy                           0.86        42
   macro avg       0.86      0.86      0.86        42
weighted avg       0.87      0.86      0.86        42

Best Parameters for Original Dataset: {'C': 0.1, 'gamma': 'auto', 'kernel': 'poly'}
Fitting 5 folds for each of 60 candidates, totalling 300 fits
F1 Score for Denoised Dataset: 0.7868131868131869
Classification Report for Denoised Dataset:
               precision    recall  f1-score   support

      female       0.71      0.83      0.77        18
        male       0.86      0.75      0.80        24

    accuracy                           0.79        42
   macro avg       0.79      0.79      0.78        42
wei

## LINEAR SVM

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report

# Function to load data
def load_data(filepath):
    return pd.read_csv(filepath)

# Function to process data, train and evaluate model with Grid Search
def process_and_evaluate_with_grid_search(data):
    # Splitting data into features and target
    X = data.drop('label', axis=1)
    y = data['label']
    
    # Splitting the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Defining the parameter grid for Grid Search
    param_grid = {
        'C': [0.1, 1, 10, 100]  # Regularization parameter
    }
    
    # Creating the SVM Classifier with a linear kernel
    model = SVC(kernel='linear', random_state=42)
    
    # Setting up the GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_weighted', verbose=1)
    
    # Running Grid Search to find the best parameters
    grid_search.fit(X_train, y_train)
    
    # Best estimator found by Grid Search
    best_model = grid_search.best_estimator_
    
    # Making predictions with the best model
    y_pred = best_model.predict(X_test)
    
    # Evaluating the model
    f1 = f1_score(y_test, y_pred, average='weighted')
    report = classification_report(y_test, y_pred)
    
    return f1, report, grid_search.best_params_

# Load data
original_data = load_data('gender_audio_features_Final_greeshma.csv')
denoised_data = load_data('gender_audio_features_denoised_greeshma.csv')

# Evaluate the original dataset
f1_original, report_original, best_params_original = process_and_evaluate_with_grid_search(original_data)
print("F1 Score for Original Dataset:", f1_original)
print("Classification Report for Original Dataset:\n", report_original)
print("Best Parameters for Original Dataset:", best_params_original)

# Evaluate the denoised dataset
f1_denoised, report_denoised, best_params_denoised = process_and_evaluate_with_grid_search(denoised_data)
print("F1 Score for Denoised Dataset:", f1_denoised)
print("Classification Report for Denoised Dataset:\n", report_denoised)
print("Best Parameters for Denoised Dataset:", best_params_denoised)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
F1 Score for Original Dataset: 0.9196401141101602
Classification Report for Original Dataset:
               precision    recall  f1-score   support

      female       0.88      0.92      0.90        24
        male       0.95      0.92      0.93        38

    accuracy                           0.92        62
   macro avg       0.91      0.92      0.92        62
weighted avg       0.92      0.92      0.92        62

Best Parameters for Original Dataset: {'C': 1}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
F1 Score for Denoised Dataset: 0.9025980819529206
Classification Report for Denoised Dataset:
               precision    recall  f1-score   support

      female       0.92      0.85      0.88        26
        male       0.89      0.94      0.92        36

    accuracy                           0.90        62
   macro avg       0.91      0.90      0.90        62
weighted avg       0.90      0.90      0.90 

## Performing Statistical Testing


In [24]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import ttest_rel

# Load a dataset (example using Iris dataset)
data = load_iris()
X = data.data
y = data.target

# Define classifiers
rf_classifier = RandomForestClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Define cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
rf_scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='f1_macro')
svm_scores = cross_val_score(svm_classifier, X, y, cv=cv, scoring='f1_macro')

# Perform paired t-test
t_stat, p_value = ttest_rel(rf_scores, svm_scores)

print("Random Forest scores:", rf_scores)
print("SVM scores:", svm_scores)
print("Paired t-test t-statistic:", t_stat)
print("Paired t-test p-value:", p_value)

# Interpret the results
alpha = 0.05  # significance level
if p_value < alpha:
    print("There is a significant difference between the two models.")
else:
    print("There is no significant difference between the two models.")


Random Forest scores: [1.         1.         1.         0.93265993 0.86666667 0.93265993
 1.         1.         0.93265993 0.86111111]
SVM scores: [1.         1.         1.         0.93265993 0.93265993 0.86666667
 1.         1.         0.93265993 0.93265993]
Paired t-test t-statistic: -0.5881821660747185
Paired t-test p-value: 0.5708727981947372
There is no significant difference between the two models.
