# Classic Audio ML models

## Set up

### Install libraries

In [None]:
!pip install iterative-stratification
!pip install librosa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,f1_score,roc_auc_score,recall_score
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import resample
from sklearn.exceptions import ConvergenceWarning
import librosa
import warnings

from google.colab import drive

from tqdm import tqdm

# Keep cell output readable by hiding warning categories that are expected here.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
# Turn off pandas' chained-assignment check (SettingWithCopyWarning).
pd.options.mode.chained_assignment = None

In [None]:
# Mount Google Drive (Colab only); the data paths used below live under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Functions

### Utils

In [None]:
def inverse_frequency(df):
    """Compute inverse-frequency class weights for the binary ``PHQ8_Binary`` label.

    Prints the total number of examples and the share of positive examples,
    then returns one weight per class equal to ``1 / (class_count / total)``,
    so the rarer class receives the larger weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``PHQ8_Binary`` column holding the integer labels 0 and 1.

    Returns
    -------
    dict
        ``{0: weight_for_0, 1: weight_for_1}``.

    Raises
    ------
    ValueError
        If the column contains labels other than 0/1, or if either class has
        no examples (its weight would be undefined).
    """
    # minlength=2 guarantees a slot for class 1 even when no positive is present,
    # so the problem can be reported explicitly instead of as an unpacking error.
    counts = np.bincount(df["PHQ8_Binary"], minlength=2)
    if len(counts) != 2 or not counts.all():
        raise ValueError(
            "PHQ8_Binary must contain at least one example of each class 0 and 1 "
            "(and no other labels); got class counts {}".format(counts.tolist())
        )

    neg, pos = counts
    total = neg + pos
    print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total))

    weight_for_0 = (1 / (neg / total))
    weight_for_1 = (1 / (pos / total))

    return {0: weight_for_0, 1: weight_for_1}

def get_audio_waves(data, audio_path="/content/gdrive/MyDrive/daiwoz/audios/"):
    """Load the full recording of every patient referenced in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``patient_id`` column; one file is read per unique id.
    audio_path : str, optional
        Directory containing the ``<patient_id>_AUDIO.wav`` files. Defaults to
        the Google Drive location used by this notebook.

    Returns
    -------
    dict
        Maps each patient id to the ``(audio, sr)`` pair returned by
        ``librosa.load``.
    """
    import os  # local import keeps this cell self-contained

    waves = {}
    for id_ in tqdm(data.patient_id.unique()):
        # os.path.join accepts the directory with or without a trailing slash.
        wav_file = os.path.join(audio_path, f"{id_}_AUDIO.wav")
        # sr=None is passed so librosa does not resample to its default rate.
        audio, sr = librosa.load(wav_file, sr=None)
        waves[id_] = (audio, sr)
    return waves

def random_oversample(df,seed):
    """Oversample the positive class (``PHQ8_Binary == 1``) with replacement.

    Positive rows are resampled until there are ``int(0.95 * n_negative)`` of
    them, i.e. slightly fewer than the negative class, not an exact match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a binary ``PHQ8_Binary`` column.
    seed : int
        Random state forwarded to ``resample`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        All negative rows followed by the resampled positive rows; original
        index labels are kept, so labels of positive rows may repeat.
    """
    negatives = df[df['PHQ8_Binary'] == 0]
    positives = df[df['PHQ8_Binary'] == 1]

    # Target size for the positive class: 95% of the negative class.
    target_size = int(len(negatives) * 0.95)

    positives_resampled = resample(
        positives,
        replace=True,
        n_samples=target_size,
        random_state=seed,
    )

    return pd.concat([negatives, positives_resampled])

### Dataset

In [None]:
import pandas as pd
import numpy as np
import librosa

def extract_features(audio, sample_rate):
    """Compute loudness, MFCC and pitch descriptors for one audio segment.

    Parameters
    ----------
    audio : array-like
        Waveform samples of the segment.
    sample_rate : int
        Sampling rate of ``audio``, forwarded to librosa.

    Returns
    -------
    tuple
        ``(rms, mfccs, pitch)`` where ``rms`` is the mean of librosa's RMS
        energy, ``mfccs`` is the transposed 40-coefficient MFCC matrix, and
        ``pitch`` is the mean of the ``piptrack`` pitch matrix.
    """
    # MFCCs, transposed (presumably so rows are frames - the caller averages over axis 0).
    mfcc_frames = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T

    # Pitch: only the pitch matrix is used; the magnitudes are discarded.
    # NOTE(review): this averages every entry of the pitch matrix. If piptrack
    # leaves non-peak bins at zero, the mean is pulled towards 0 - confirm this
    # is the intended pitch summary.
    pitch_matrix, _ = librosa.piptrack(y=audio, sr=sample_rate)

    # Mean RMS energy as a loudness approximation.
    loudness = np.mean(librosa.feature.rms(y=audio))

    return loudness, mfcc_frames, np.mean(pitch_matrix)


def preprocess(dataset, waveform_dict):
    """Attach audio features to every answer segment listed in ``dataset``.

    For each row the patient's waveform is cut to ``[start_time, end_time]``
    (times are multiplied by the sampling rate to get sample offsets), the
    features are extracted with ``extract_features``, and the MFCC matrix is
    averaged over axis 0 into one value per coefficient.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``patient_id``, ``start_time`` and ``end_time`` columns.
        It is not modified.
    waveform_dict : dict
        Maps a patient id to a ``(waveform, sample_rate)`` pair.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with a fresh 0..n-1 index and the extra columns
        ``rms``, ``pitch`` and ``mfcc_0`` .. ``mfcc_{k-1}``. For an empty input
        only ``rms`` and ``pitch`` are added, because the number of MFCC
        columns is only known after extracting at least one segment.
    """
    # Work on a copy: callers pass a filtered slice of the master table, and
    # adding columns in place would write into (or warn about) that slice.
    dataset = dataset.copy()

    rms_values = []
    mfccs_values = []
    pitch_values = []

    segments = zip(dataset['patient_id'], dataset['start_time'], dataset['end_time'])
    for id_, start_time, end_time in segments:
        # Get the corresponding waveform and trim it to the answer interval
        waveform, sample_rate = waveform_dict[id_]
        start_sample = int(start_time * sample_rate)
        end_sample = int(end_time * sample_rate)
        trimmed_waveform = waveform[start_sample:end_sample]

        rms, mfccs, pitch = extract_features(trimmed_waveform, sample_rate)

        rms_values.append(rms)
        mfccs_values.append(np.mean(mfccs, axis=0))  # one mean per MFCC coefficient
        pitch_values.append(pitch)

    dataset['rms'] = rms_values
    dataset['pitch'] = pitch_values
    dataset = dataset.reset_index(drop=True)

    # Nothing was extracted, so there is no MFCC width to build columns from.
    if not mfccs_values:
        return dataset

    # Expand MFCCs into their own columns
    mfcc_columns = [f'mfcc_{i}' for i in range(mfccs_values[0].shape[0])]
    mfccs_df = pd.DataFrame(mfccs_values, columns=mfcc_columns)

    return pd.concat([dataset, mfccs_df], axis=1)



### Splitting function

In [None]:
def split_dataset(df, test_size=0.15, val_size=0.2, random_state=42):
    """
    Split a feature table into train and test sets, balancing Gender, PHQ8
    label and answer length.

    Rows are first bucketed by ``word_count`` (terciles, or a median split
    when terciles cannot be formed). Inside every bucket a single
    ``MultilabelStratifiedShuffleSplit`` stratified on ``Gender`` and
    ``PHQ8_Binary`` carves out the test rows. Rows containing NaN are dropped
    from both sides after the split.

    Params
    ---
      - df: The data to split. It is not modified.
      - test_size: Fraction of each bucket assigned to the test set.
      - val_size: Currently unused; kept so existing calls keep working.
      - random_state: The seed, for reproducibility.

    Returns
    ---
      - ``x_train, y_train, x_test, y_test``: the feature frames (metadata,
        label and helper columns removed) and the matching ``PHQ8_Binary``
        series.

    Raises
    ---
      - ValueError: if no bucket contains any row to split.
      - KeyError: if one of the expected metadata columns is missing.
    """
    # Work on a copy so the caller's frame keeps its original 'Gender' values
    # and does not gain the helper 'text_length_category' column.
    df = df.copy()

    # Ensure the 'Gender' column is numerical
    if df['Gender'].dtype == 'object':
        df['Gender'] = df['Gender'].astype('category').cat.codes

    try:
        # Three equally populated buckets: short, medium and long answers
        labels = ['short', 'medium', 'long']
        df['text_length_category'] = pd.qcut(df['word_count'], 3, labels=labels)
    except ValueError:
        # qcut raises ValueError when the tercile edges are not unique;
        # fall back to a 'short' / 'long' split around the median.
        labels = ['short', 'long']
        median = df['word_count'].median()
        df['text_length_category'] = np.where(df['word_count'] <= median, 'short', 'long')

    df_columns_to_drop = ['patient_id', 'intervention', 'question', 'start_time', 'end_time',
       'question_text', 'Gender', 'PHQ8_Score', 'PHQ8_Binary', 'split',
       'word_count','text_length_category']

    train_parts = []
    test_parts = []

    for category in labels:
        df_temp = df[df['text_length_category'] == category]

        # The median fallback can leave a bucket empty; there is nothing to split.
        if df_temp.empty:
            continue

        splitter = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
        strata = df_temp[['Gender', 'PHQ8_Binary']].values

        # n_splits=1, so the first (and only) split is taken
        other_index, test_index = next(splitter.split(df_temp, strata))

        train_parts.append(df_temp.iloc[other_index])
        test_parts.append(df_temp.iloc[test_index])

    if not train_parts:
        raise ValueError("split_dataset received no rows to split")

    # Concatenate once instead of growing a frame inside the loop.
    df_train = pd.concat(train_parts).dropna()
    df_test = pd.concat(test_parts).dropna()

    y_train, x_train = df_train["PHQ8_Binary"], df_train.drop(df_columns_to_drop, axis=1)
    y_test, x_test = df_test["PHQ8_Binary"], df_test.drop(df_columns_to_drop, axis=1)

    return x_train,y_train,x_test,y_test

#### Define the datasets

In [None]:
# Load the per-question table: the CSV's `id` column is exposed as `patient_id`
# and every question number is shifted up by one.
data = (
    pd.read_csv("/content/gdrive/MyDrive/daiwoz/best_questions.csv", index_col=0)
    .rename(columns={"id": "patient_id"})
    .assign(question=lambda frame: frame.question + 1)
)

In [None]:
# Load each patient's recording once up front (slow: one WAV file is read per patient).
waves = get_audio_waves(data)

100%|██████████| 186/186 [05:20<00:00,  1.72s/it]


#### Define the models, parameter grids and pipeline

In [None]:
# Candidate classifiers as (display name, estimator, GridSearchCV parameter grid).
# Grid keys carry the `classifier__` prefix because each estimator is plugged
# into the 'classifier' step of the pipeline defined below.
models = [
    ('Logistic Regression', LogisticRegression(class_weight='balanced'), {
        'classifier__C': [0.1, 1, 5, 10],
        # Unpenalised fit is spelled `None`; the string 'none' is rejected by
        # current scikit-learn releases.
        'classifier__penalty': [None, 'l2']
    }),
    ('Random Forest', RandomForestClassifier(class_weight='balanced'), {
        'classifier__n_estimators': [10, 100, 200, 300],
        'classifier__max_depth': [None, 5, 10],
        'classifier__criterion': ['gini', 'entropy']
    }),
    ('Support Vector Machine', SVC(class_weight='balanced', probability=True), {
        'classifier__C': [0.1, 1, 5, 10],
        'classifier__gamma': ['scale', 'auto'],
        'classifier__kernel': ['rbf', 'linear']
    })
]

# Pipeline applied to the numeric audio features: standardise, then classify.
pipeline = Pipeline([
    ('vectorizer', StandardScaler()),  # feature scaler (step name kept as-is for compatibility)
    ('classifier', None)  # Placeholder, swapped per model via pipeline.set_params
])


In [None]:
from tqdm import tqdm

# Blanket-silence warnings by replacing `warnings.warn` with a no-op.
# NOTE: this is a process-wide monkeypatch - after this cell runs, every call
# to `warnings.warn` in this kernel is swallowed, not only those from training.
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``; accepts and ignores any arguments."""
    pass
import warnings
warnings.warn = warn

def _positive_class_scores(fitted_search, features, hard_predictions):
    """Return continuous positive-class scores for ROC AUC, falling back to hard labels."""
    if hasattr(fitted_search, "predict_proba"):
        # Column 1 is the probability of the larger label, i.e. class 1 for 0/1 targets.
        return fitted_search.predict_proba(features)[:, 1]
    if hasattr(fitted_search, "decision_function"):
        return fitted_search.decision_function(features)
    return hard_predictions


def train_questions(data, waveforms,models, pipeline):
    """Grid-search every candidate model on each question and report the best one.

    For every unique value of ``data.question`` the matching rows are turned
    into audio features (``preprocess``), split into train/test
    (``split_dataset``), and each ``(name, estimator, param_grid)`` entry of
    ``models`` is tuned with a 3-fold stratified ``GridSearchCV`` refit on F1.

    Parameters
    ----------
    data : pandas.DataFrame
        Segment table with a ``question`` column plus whatever ``preprocess``
        and ``split_dataset`` require.
    waveforms : dict
        Patient id -> ``(waveform, sample_rate)``, forwarded to ``preprocess``.
    models : list of tuple
        ``(display_name, estimator, param_grid)`` triples.
    pipeline : sklearn.pipeline.Pipeline
        Pipeline with a ``classifier`` step; the step is replaced in place
        for every model.

    Returns
    -------
    pandas.DataFrame
        One row per question with columns ``Question``, ``Best Model``,
        ``Train F1``, ``Train Recall``, ``Train AUC`` (mean cross-validation
        scores of the winning grid point) and ``Test F1``, ``Test Recall``,
        ``Test AUC`` (scores on the held-out split). ``Best Model`` is
        ``None`` and all scores are 0.0 when no model reaches a test F1 above 0.
    """
    columns = ['Question','Best Model','Train F1','Train Recall','Train AUC','Test F1','Test Recall','Test AUC']

    # Rows are collected and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    records = []

    # The splitter is the same for every question and model, so build it once.
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    for question in tqdm(data.question.unique()):
        best_model = None

        best_f1 = 0.0
        best_recall = 0.0
        best_auc = 0.0

        best_f1_mean = 0.0
        best_recall_mean = 0.0
        best_auc_mean = 0.0

        question_data = preprocess(data.loc[data.question==question],waveforms)
        x_train, y_train, x_test, y_test = split_dataset(question_data)

        for model_name, model, param_grid in models:
            pipeline.set_params(classifier=model)
            grid_search = GridSearchCV(pipeline, param_grid, scoring=["f1","roc_auc","recall"], cv=kfold, n_jobs=-1, refit='f1')
            grid_search.fit(x_train, y_train)

            y_pred = grid_search.predict(x_test)
            # ROC AUC is a ranking metric: score it on probabilities / decision
            # values, not on the thresholded 0/1 predictions.
            y_score = _positive_class_scores(grid_search, x_test, y_pred)

            f1 = f1_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_score)

            # NOTE(review): the winning model is chosen by its *test* F1, so the
            # reported test scores are optimistically biased; consider selecting
            # on grid_search.best_score_ instead.
            if f1 > best_f1:
                best_model = model_name

                best_f1 = f1
                best_recall = recall
                best_auc = auc

                best_f1_mean = grid_search.best_score_
                best_recall_mean = grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]
                best_auc_mean = grid_search.cv_results_['mean_test_roc_auc'][grid_search.best_index_]

        records.append({
            'Question': question,
            'Best Model': best_model,
            'Train F1': best_f1_mean,
            'Train Recall': best_recall_mean,
            'Train AUC': best_auc_mean,
            'Test F1': best_f1,
            'Test Recall': best_recall,
            'Test AUC': best_auc
        })

    return pd.DataFrame(records, columns=columns)


In [None]:
# Run the per-question model search over all questions (several minutes in the recorded run).
results_df = train_questions(data,waves,models,pipeline)

100%|██████████| 13/13 [06:09<00:00, 28.45s/it]


In [None]:
# Display only: rename returns a new frame for the cell output; results_df itself keeps "Question".
results_df.rename(columns={"Question":"question"})

Unnamed: 0,question,Best Model,Train F1,Train Recall,Train AUC,Test F1,Test Recall,Test AUC
0,1,Support Vector Machine,0.447028,0.531944,0.568904,0.454545,0.625,0.598214
1,2,Support Vector Machine,0.390119,0.445833,0.52683,0.48,0.75,0.6
2,3,Support Vector Machine,0.392554,0.466667,0.536471,0.571429,0.666667,0.666667
3,5,Support Vector Machine,0.469952,0.585714,0.620542,0.5,0.714286,0.666667
4,6,Random Forest,0.274604,0.222222,0.58254,0.615385,0.666667,0.758333
5,7,Logistic Regression,0.451876,0.600733,0.624154,0.352941,0.428571,0.530075
6,8,Logistic Regression,0.383065,0.419048,0.487848,0.470588,0.5,0.618421
7,9,Logistic Regression,0.367468,0.410256,0.55354,0.4,0.428571,0.582707
8,11,Support Vector Machine,0.446951,1.0,0.472631,0.451613,1.0,0.5
9,12,Support Vector Machine,0.322876,0.3875,0.440357,0.526316,0.625,0.6625
