In [2]:
import tensorflow as tf

# Show the TensorFlow version in the notebook output so the environment is traceable.
tf_version = tf.__version__
print(tf_version)

2.9.1


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import pandas as pd
#pd.options.mode.chained_assignment = None
import numpy as np
from scipy import stats


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.applications import *


In [6]:
import sys

# Make the project's helper modules in ../utils importable from this notebook.
UTILS_DIR = '../utils'
if UTILS_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(UTILS_DIR)

# Cross-validation helper, imported from classification_cross_validation.py in the utils folder.
from classification_cross_validation import pipeline_cross_val


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.svm import SVC

In [9]:
import os

# Root folder of the (private) participant data; replace with the real location.
DATA_FOLDER = 'path_to_folder_containing_participant_data'
# Identifier of the Colive Voice extraction to read from.
EXTRACTION = "EXTRACT_2023_03_16_12_09_45"

# Folder holding the pre-computed embeddings, built with os.path.join so the
# notebook runs on any OS (the previous hard-coded '\\' only worked on Windows).
EMBEDDINGS_FOLDER = os.path.join('..', 'embeddings', 'for model training')

### Results for English-speaking male participants

##### Data reading 
(The explicit participant data is private; however, the embeddings extracted from the audio recordings are available.)

In [10]:
# Load the participant table of the selected extraction.
dataframe_path = os.path.join(DATA_FOLDER, EXTRACTION, "dataframe_name")
df = pd.read_csv(dataframe_path)

In [11]:
# Class balance of the smoking label.
df.loc[:, 'currently_smoking'].value_counts()

1    200
0    200
Name: currently_smoking, dtype: int64

In [12]:
# Smoking label as a NumPy array; this is the classification target used below.
# `.to_numpy()` is the pandas-recommended replacement for `.values`.
y_smk = df['currently_smoking'].to_numpy()
y_smk.shape

(400,)

##### Features

In [15]:
# Load the three pre-computed embedding matrices (one per voice task) from EMBEDDINGS_FOLDER.
reading_wav2vec_embeddings, counting_egemaps_embeddings, a_vowel_phonation_byola_embeddings = (
    np.load(os.path.join(EMBEDDINGS_FOLDER, filename))
    for filename in (
        'wav2vec_embeddings_reading_male_english.npy',
        'egemaps_embeddings_counting_male_english.npy',
        'byola_512_a_vowel_phonation_embeddings_male_english.npy',
    )
)

In [14]:
# Shared seed, feature scaler and cross-validation splitter reused by every cell below.
RANDOM_STATE = 42
scaler = StandardScaler()
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

#### Reading

In [23]:
# Use the wav2vec embeddings of the reading task as the feature matrix for this section.
embeddings_var = reading_wav2vec_embeddings
# Fail fast if the feature rows and the labels are misaligned (they come from separate files).
assert embeddings_var.shape[0] == y_smk.shape[0], "embeddings and labels must have the same number of rows"
embeddings_var.shape

(400, 1024)

##### Grid search with PCA

In [34]:
# Jointly tune the PCA size and the KNN hyper-parameters on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', KNeighborsClassifier()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning KNN settings for the cross-validation cell of this section.
knn_neighbours, knn_weights = (
    clf.best_params_[key]
    for key in ('classifier__n_neighbors', 'classifier__weights')
)

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__n_neighbors': 7, 'classifier__weights': 'uniform', 'pca__n_components': None}


In [35]:
# Jointly tune the PCA size and the logistic-regression regularisation strength C
# on the current embeddings, scored by accuracy on the shared `kfold` splitter.
search_space = {'pca__n_components': [50, 100, None],
                'classifier': [LogisticRegression(solver='liblinear')],
                'classifier__C': [0.01, 0.1, 1.0, 10]}

# The 'classifier' step is a placeholder; the grid supplies the actual estimator.
pca_model = Pipeline([('scaler', scaler), ('pca', PCA(random_state=42)), ('classifier', LogisticRegression())])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning C for the cross-validation cell of this section.
lr_c = clf.best_params_['classifier__C']

# Print once: the former nested print(print(...)) also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.1, solver='liblinear'), 'classifier__C': 0.1, 'pca__n_components': None}
None


In [36]:
# Jointly tune the PCA size and the ExtraTrees size/depth on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', ExtraTreesClassifier()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[ExtraTreesClassifier(random_state=RANDOM_STATE)],
    classifier__n_estimators=[10, 30, 50, 100, 200],
    classifier__max_depth=[8, 10, 12, 20],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning ExtraTrees settings for the cross-validation cell of this section.
etc_max_depth, etc_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=12, n_estimators=10, random_state=42), 'classifier__max_depth': 12, 'classifier__n_estimators': 10, 'pca__n_components': 50}


In [37]:
# Jointly tune the PCA size and the random-forest size/depth on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', RandomForestClassifier()),  # placeholder; the grid supplies the estimator
])

# NOTE(review): with max_leaf_nodes=5 a tree cannot grow deeper than 4 levels, so the
# max_depth values searched here (5, 10, None) presumably all give the same forest --
# confirm whether max_leaf_nodes was meant to be tuned instead.
search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    classifier__max_depth=[5, 10, None],
    classifier__n_estimators=[10, 20, 30, 40, 50, 100],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning random-forest settings for the cross-validation cell of this section.
rf_max_depth, rf_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=20,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 20, 'pca__n_components': 50}


In [38]:
# Jointly tune the PCA size and the LDA solver on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', LinearDiscriminantAnalysis()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[LinearDiscriminantAnalysis()],
    classifier__solver=['svd', 'lsqr'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver for the cross-validation cell of this section.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 50}


In [39]:
# Jointly tune the PCA size and the MLP architecture/activation/solver on the current
# embeddings, scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    # placeholder; the grid supplies the estimator (seeded with RANDOM_STATE)
    ('classifier', MLPClassifier(random_state=1, max_iter=1000)),
])

search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    classifier__hidden_layer_sizes=[(4,), (8,), (20,), (50,), (32,), (32, 6), (100,), (256, 128)],
    classifier__activation=['relu', 'tanh', 'logistic'],
    classifier__solver=['adam', 'lbfgs'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning MLP settings for the cross-validation cell of this section.
mlp_hidden_layer, mlp_activation, mlp_solver = (
    clf.best_params_[key]
    for key in ('classifier__hidden_layer_sizes', 'classifier__activation', 'classifier__solver')
)

print(clf.best_params_)

{'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(256, 128), max_iter=1000,
              random_state=42), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (256, 128), 'classifier__solver': 'adam', 'pca__n_components': 100}


In [40]:
# Jointly tune the PCA size and the RBF-kernel SVM (C, gamma) on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', SVC(kernel='rbf')),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[SVC(kernel='rbf')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning RBF-SVM settings for the cross-validation cell of this section.
svc_c, svc_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06, 'pca__n_components': None}


In [41]:
# Jointly tune the PCA size and the sigmoid-kernel SVM (C, gamma) on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', SVC(kernel='sigmoid')),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[50, 100, None],
    classifier=[SVC(kernel='sigmoid')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning sigmoid-SVM settings for the cross-validation cell of this section.
svc_sig_c, svc_sig_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 50}


##### Cross-validation

In [42]:
# Re-create every classifier with the hyper-parameters kept from the grid searches above.
# Each argument to zip() is a (label, estimator) pair, so zip() yields all labels first and
# all estimators second; `names[i]` therefore labels `Classifiers[i]`.
names, Classifiers = map(list, zip(
    ('K Nearest Neighbors',
     KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights)),
    ('Logistic Regression',
     LogisticRegression(solver='liblinear', C=lr_c)),
    ('ExtraTrees Classifier',
     ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=etc_max_depth,
                          n_estimators=etc_n_estimators)),
    ('Random Forest',
     RandomForestClassifier(max_leaf_nodes=5, random_state=RANDOM_STATE,
                            max_depth=rf_max_depth, n_estimators=rf_n_estimators)),
    ('LDA',
     LinearDiscriminantAnalysis(solver=lda_solver)),
    ('MLP Classifier',
     MLPClassifier(random_state=RANDOM_STATE, hidden_layer_sizes=mlp_hidden_layer,
                   activation=mlp_activation, solver=mlp_solver, max_iter=1000)),
    ('SVM rbf',
     svm.SVC(kernel='rbf', C=svc_c, gamma=svc_g)),
    ('SVM sigmoid',
     svm.SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g)),
))

In [45]:
# Evaluate the tuned classifiers with the project helper, requesting reduction='PCA'
# with n_components=200 and the shared `kfold` splitter.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=200,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.56 (0.03),0.56 (0.03),0.56 (0.04),0.47 (0.06),0.51 (0.05),0.55 (0.02)
1,Logistic Regression,0.62 (0.03),0.62 (0.03),0.63 (0.05),0.60 (0.06),0.61 (0.03),0.63 (0.05)
2,ExtraTrees Classifier,0.56 (0.06),0.56 (0.06),0.56 (0.05),0.53 (0.11),0.54 (0.08),0.56 (0.05)
3,Random Forest,0.49 (0.05),0.49 (0.05),0.50 (0.07),0.43 (0.10),0.46 (0.08),0.54 (0.06)
4,LDA,0.58 (0.03),0.58 (0.03),0.58 (0.03),0.57 (0.07),0.58 (0.04),0.60 (0.04)
5,MLP Classifier,0.61 (0.05),0.61 (0.05),0.61 (0.05),0.60 (0.09),0.60 (0.06),0.64 (0.06)
6,SVM rbf,0.61 (0.03),0.61 (0.03),0.62 (0.03),0.61 (0.05),0.61 (0.04),0.64 (0.05)
7,SVM sigmoid,0.60 (0.06),0.60 (0.05),0.60 (0.06),0.57 (0.09),0.58 (0.07),0.65 (0.06)


In [46]:
# Evaluate the tuned classifiers with the project helper, requesting reduction='PCA'
# with n_components=100 and the shared `kfold` splitter.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=100,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.54 (0.03),0.54 (0.03),0.55 (0.04),0.45 (0.05),0.50 (0.05),0.54 (0.02)
1,Logistic Regression,0.58 (0.05),0.58 (0.05),0.59 (0.06),0.56 (0.04),0.57 (0.05),0.62 (0.07)
2,ExtraTrees Classifier,0.56 (0.05),0.56 (0.05),0.56 (0.05),0.53 (0.10),0.54 (0.07),0.57 (0.05)
3,Random Forest,0.56 (0.06),0.56 (0.06),0.57 (0.07),0.53 (0.05),0.55 (0.05),0.55 (0.05)
4,LDA,0.59 (0.07),0.59 (0.07),0.60 (0.08),0.57 (0.07),0.58 (0.06),0.62 (0.08)
5,MLP Classifier,0.65 (0.06),0.65 (0.06),0.65 (0.05),0.61 (0.09),0.63 (0.07),0.64 (0.06)
6,SVM rbf,0.59 (0.05),0.59 (0.05),0.60 (0.05),0.56 (0.09),0.57 (0.06),0.62 (0.08)
7,SVM sigmoid,0.59 (0.06),0.59 (0.06),0.60 (0.06),0.57 (0.07),0.58 (0.06),0.64 (0.07)


In [47]:
# Evaluate the tuned classifiers with the project helper, requesting reduction='PCA'
# with n_components=50 and the shared `kfold` splitter.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=50,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.53 (0.02),0.53 (0.02),0.54 (0.02),0.43 (0.04),0.48 (0.03),0.53 (0.03)
1,Logistic Regression,0.60 (0.04),0.60 (0.04),0.60 (0.04),0.59 (0.08),0.60 (0.05),0.63 (0.06)
2,ExtraTrees Classifier,0.61 (0.04),0.61 (0.04),0.63 (0.04),0.57 (0.06),0.60 (0.05),0.64 (0.05)
3,Random Forest,0.56 (0.03),0.57 (0.03),0.58 (0.04),0.48 (0.08),0.52 (0.05),0.60 (0.05)
4,LDA,0.60 (0.05),0.60 (0.05),0.60 (0.05),0.60 (0.09),0.60 (0.06),0.64 (0.07)
5,MLP Classifier,0.58 (0.06),0.58 (0.06),0.59 (0.06),0.54 (0.10),0.56 (0.08),0.61 (0.06)
6,SVM rbf,0.59 (0.06),0.59 (0.06),0.60 (0.06),0.57 (0.08),0.58 (0.06),0.64 (0.07)
7,SVM sigmoid,0.60 (0.05),0.60 (0.05),0.61 (0.05),0.58 (0.05),0.59 (0.05),0.65 (0.06)


#### Counting

In [17]:
# Use the eGeMAPS embeddings of the counting task as the feature matrix for this section.
embeddings_var = counting_egemaps_embeddings
# Fail fast if the feature rows and the labels are misaligned (they come from separate files).
assert embeddings_var.shape[0] == y_smk.shape[0], "embeddings and labels must have the same number of rows"
embeddings_var.shape

(400, 89)

##### Grid search without PCA 

In [18]:
# Tune the KNN hyper-parameters on the scaled embeddings (no PCA step in this section),
# scored by accuracy on the shared `kfold` splitter.
# `pca_model` keeps the name used in the other sections although it holds no PCA here.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('classifier', KNeighborsClassifier()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning KNN settings for the cross-validation cell of this section.
knn_neighbours, knn_weights = (
    clf.best_params_[key]
    for key in ('classifier__n_neighbors', 'classifier__weights')
)

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__n_neighbors': 7, 'classifier__weights': 'uniform'}


In [19]:
# Tune the logistic-regression regularisation strength C on the scaled embeddings
# (no PCA step in this section), scored by accuracy on the shared `kfold` splitter.
search_space = {'classifier': [LogisticRegression(solver='liblinear')],
                'classifier__C': [0.01, 0.1, 1.0, 10]}

# The 'classifier' step is a placeholder; the grid supplies the actual estimator.
pca_model = Pipeline([('scaler', scaler), ('classifier', LogisticRegression())])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning C for the cross-validation cell of this section.
lr_c = clf.best_params_['classifier__C']

# Print once: the former nested print(print(...)) also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.01, solver='liblinear'), 'classifier__C': 0.01}
None


In [20]:
# Tune the ExtraTrees size/depth on the scaled embeddings (no PCA step in this section),
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('classifier', ExtraTreesClassifier()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    classifier=[ExtraTreesClassifier(random_state=RANDOM_STATE)],
    classifier__n_estimators=[10, 30, 50, 100, 200],
    classifier__max_depth=[8, 10, 12, 20],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning ExtraTrees settings for the cross-validation cell of this section.
etc_max_depth, etc_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=8, n_estimators=200, random_state=42), 'classifier__max_depth': 8, 'classifier__n_estimators': 200}


In [21]:
# Tune the random-forest size/depth on the scaled embeddings (no PCA step in this section),
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('classifier', RandomForestClassifier()),  # placeholder; the grid supplies the estimator
])

# NOTE(review): with max_leaf_nodes=5 a tree cannot grow deeper than 4 levels, so the
# max_depth values searched here (5, 10, None) presumably all give the same forest --
# confirm whether max_leaf_nodes was meant to be tuned instead.
search_space = dict(
    classifier=[RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    classifier__max_depth=[5, 10, None],
    classifier__n_estimators=[10, 20, 30, 40, 50, 100],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning random-forest settings for the cross-validation cell of this section.
rf_max_depth, rf_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=20,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 20}


In [22]:
# Tune the LDA solver on the scaled embeddings (no PCA step in this section),
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('classifier', LinearDiscriminantAnalysis()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    classifier=[LinearDiscriminantAnalysis()],
    classifier__solver=['svd', 'lsqr'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver for the cross-validation cell of this section.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd'}


In [23]:
# Tune the MLP architecture/activation/solver on the scaled embeddings (no PCA step in
# this section), scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # placeholder; the grid supplies the estimator (seeded with RANDOM_STATE)
    ('classifier', MLPClassifier(random_state=1, max_iter=1000)),
])

search_space = dict(
    classifier=[MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    classifier__hidden_layer_sizes=[(4,), (8,), (20,), (50,), (32,), (32, 6), (100,), (256, 128)],
    classifier__activation=['relu', 'tanh', 'logistic'],
    classifier__solver=['adam', 'lbfgs'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning MLP settings for the cross-validation cell of this section.
mlp_hidden_layer, mlp_activation, mlp_solver = (
    clf.best_params_[key]
    for key in ('classifier__hidden_layer_sizes', 'classifier__activation', 'classifier__solver')
)

print(clf.best_params_)

{'classifier': MLPClassifier(activation='logistic', max_iter=1000, random_state=42), 'classifier__activation': 'logistic', 'classifier__hidden_layer_sizes': (100,), 'classifier__solver': 'adam'}


In [24]:
# Tune the RBF-kernel SVM (C, gamma) on the scaled embeddings (no PCA step in this
# section), scored by accuracy on the shared `kfold` splitter.
search_space = {'classifier': [SVC(kernel='rbf')],
                'classifier__C': [1e4, 1e5, 1e6],
                'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3]}

# Every pipeline step must be a (name, estimator) pair. The previous nested tuple
# ('classifier', ('classifier', SVC(...))) was not a valid step and only worked because
# the grid overwrote it before fitting.
pca_model = Pipeline([('scaler', scaler), ('classifier', SVC(kernel='rbf'))])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning RBF-SVM settings for the cross-validation cell of this section.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']


print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06}


In [25]:
# Tune the sigmoid-kernel SVM (C, gamma) on the scaled embeddings (no PCA step in this
# section), scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('classifier', SVC(kernel='sigmoid')),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    classifier=[SVC(kernel='sigmoid')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning sigmoid-SVM settings for the cross-validation cell of this section.
svc_sig_c, svc_sig_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06}


##### Cross-validation

In [26]:
# Re-create every classifier with the hyper-parameters kept from the grid searches above.
# Each argument to zip() is a (label, estimator) pair, so zip() yields all labels first and
# all estimators second; `names[i]` therefore labels `Classifiers[i]`.
names, Classifiers = map(list, zip(
    ('K Nearest Neighbors',
     KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights)),
    ('Logistic Regression',
     LogisticRegression(solver='liblinear', C=lr_c)),
    ('ExtraTrees Classifier',
     ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=etc_max_depth,
                          n_estimators=etc_n_estimators)),
    ('Random Forest',
     RandomForestClassifier(max_leaf_nodes=5, random_state=RANDOM_STATE,
                            max_depth=rf_max_depth, n_estimators=rf_n_estimators)),
    ('LDA',
     LinearDiscriminantAnalysis(solver=lda_solver)),
    ('MLP Classifier',
     MLPClassifier(random_state=RANDOM_STATE, hidden_layer_sizes=mlp_hidden_layer,
                   activation=mlp_activation, solver=mlp_solver, max_iter=1000)),
    ('SVM rbf',
     svm.SVC(kernel='rbf', C=svc_c, gamma=svc_g)),
    ('SVM sigmoid',
     svm.SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g)),
))

In [27]:
# Evaluate the tuned classifiers with the project helper on the current embeddings,
# using the shared `kfold` splitter and the helper's default reduction settings.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.53 (0.02),0.53 (0.02),0.53 (0.02),0.42 (0.04),0.47 (0.03),0.53 (0.03)
1,Logistic Regression,0.59 (0.04),0.59 (0.04),0.59 (0.04),0.54 (0.11),0.56 (0.07),0.61 (0.04)
2,ExtraTrees Classifier,0.58 (0.05),0.58 (0.05),0.59 (0.05),0.54 (0.09),0.56 (0.07),0.59 (0.04)
3,Random Forest,0.56 (0.03),0.56 (0.03),0.57 (0.04),0.50 (0.04),0.53 (0.03),0.56 (0.03)
4,LDA,0.56 (0.04),0.56 (0.04),0.56 (0.04),0.54 (0.09),0.55 (0.06),0.58 (0.03)
5,MLP Classifier,0.62 (0.02),0.62 (0.02),0.62 (0.02),0.64 (0.08),0.63 (0.04),0.64 (0.04)
6,SVM rbf,0.57 (0.03),0.57 (0.03),0.58 (0.03),0.53 (0.06),0.55 (0.04),0.62 (0.05)
7,SVM sigmoid,0.58 (0.02),0.58 (0.02),0.59 (0.02),0.51 (0.06),0.54 (0.04),0.61 (0.04)


#### A-vowel phonation

In [73]:
# Use the BYOL-A embeddings of the a-vowel phonation task as the feature matrix for this section.
embeddings_var = a_vowel_phonation_byola_embeddings
# Fail fast if the feature rows and the labels are misaligned (they come from separate files).
assert embeddings_var.shape[0] == y_smk.shape[0], "embeddings and labels must have the same number of rows"
embeddings_var.shape

(400, 512)

##### Grid search with PCA

In [84]:
# Jointly tune the PCA size and the KNN hyper-parameters on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', KNeighborsClassifier()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100, None],
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning KNN settings for the cross-validation cell of this section.
knn_neighbours, knn_weights = (
    clf.best_params_[key]
    for key in ('classifier__n_neighbors', 'classifier__weights')
)

print(clf.best_params_)

{'classifier': KNeighborsClassifier(), 'classifier__n_neighbors': 5, 'classifier__weights': 'uniform', 'pca__n_components': 100}


In [85]:
# Jointly tune the PCA size and the logistic-regression regularisation strength C
# on the current embeddings, scored by accuracy on the shared `kfold` splitter.
search_space = {'pca__n_components': [12, 13, 14, 50, 100, None],
                'classifier': [LogisticRegression(solver='liblinear')],
                'classifier__C': [0.01, 0.1, 1.0, 10]}

# The 'classifier' step is a placeholder; the grid supplies the actual estimator.
pca_model = Pipeline([('scaler', scaler), ('pca', PCA(random_state=42)), ('classifier', LogisticRegression())])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning C for the cross-validation cell of this section.
lr_c = clf.best_params_['classifier__C']

# Print once: the former nested print(print(...)) also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(solver='liblinear'), 'classifier__C': 1.0, 'pca__n_components': 50}
None


In [86]:
# Jointly tune the PCA size and the ExtraTrees size/depth on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', ExtraTreesClassifier()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100, None],
    classifier=[ExtraTreesClassifier(random_state=RANDOM_STATE)],
    classifier__n_estimators=[10, 30, 50, 100, 200],
    classifier__max_depth=[8, 10, 12, 20],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning ExtraTrees settings for the cross-validation cell of this section.
etc_max_depth, etc_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=8, n_estimators=50, random_state=42), 'classifier__max_depth': 8, 'classifier__n_estimators': 50, 'pca__n_components': 50}


In [87]:
# Jointly tune the PCA size and the random-forest size/depth on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', RandomForestClassifier()),  # placeholder; the grid supplies the estimator
])

# NOTE(review): with max_leaf_nodes=5 a tree cannot grow deeper than 4 levels, so the
# max_depth values searched here (5, 10, None) presumably all give the same forest --
# confirm whether max_leaf_nodes was meant to be tuned instead.
search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100, None],
    classifier=[RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    classifier__max_depth=[5, 10, None],
    classifier__n_estimators=[10, 20, 30, 40, 50, 100],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning random-forest settings for the cross-validation cell of this section.
rf_max_depth, rf_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=50,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 50, 'pca__n_components': 50}


In [88]:
# Jointly tune the PCA size and the LDA solver on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', LinearDiscriminantAnalysis()),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100, None],
    classifier=[LinearDiscriminantAnalysis()],
    classifier__solver=['svd', 'lsqr'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver for the cross-validation cell of this section.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 50}


In [89]:
# Jointly tune the PCA size and the MLP architecture/activation/solver on the current
# embeddings, scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    # placeholder; the grid supplies the estimator (seeded with RANDOM_STATE)
    ('classifier', MLPClassifier(random_state=1, max_iter=1000)),
])

# NOTE(review): this is the only grid in the section that also tries 200 PCA components --
# confirm that the difference from the sibling cells is intended.
search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100, 200, None],
    classifier=[MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    classifier__hidden_layer_sizes=[(4,), (8,), (20,), (50,), (32,), (32, 6), (100,), (256, 128)],
    classifier__activation=['relu', 'tanh', 'logistic'],
    classifier__solver=['adam', 'lbfgs'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning MLP settings for the cross-validation cell of this section.
mlp_hidden_layer, mlp_activation, mlp_solver = (
    clf.best_params_[key]
    for key in ('classifier__hidden_layer_sizes', 'classifier__activation', 'classifier__solver')
)

print(clf.best_params_)

{'classifier': MLPClassifier(activation='logistic', hidden_layer_sizes=(8,), max_iter=1000,
              random_state=42, solver='lbfgs'), 'classifier__activation': 'logistic', 'classifier__hidden_layer_sizes': (8,), 'classifier__solver': 'lbfgs', 'pca__n_components': 13}


In [90]:
# Jointly tune the PCA size and the RBF-kernel SVM (C, gamma) on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', SVC(kernel='rbf')),  # placeholder; the grid supplies the estimator
])

search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100, None],
    classifier=[SVC(kernel='rbf')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning RBF-SVM settings for the cross-validation cell of this section.
svc_c, svc_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=1000000.0, gamma=1e-06), 'classifier__C': 1000000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 50}


In [91]:
# Jointly tune the PCA size and the sigmoid-kernel SVM (C, gamma) on the current embeddings,
# scored by accuracy on the shared `kfold` splitter.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=42)),
    ('classifier', SVC(kernel='sigmoid')),  # placeholder; the grid supplies the estimator
])

# NOTE(review): unlike the sibling grids, this one does not try n_components=None --
# confirm that the omission is intended.
search_space = dict(
    pca__n_components=[12, 13, 14, 50, 100],
    classifier=[SVC(kernel='sigmoid')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning sigmoid-SVM settings for the cross-validation cell of this section.
svc_sig_c, svc_sig_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=1000000.0, gamma=0.001, kernel='sigmoid'), 'classifier__C': 1000000.0, 'classifier__gamma': 0.001, 'pca__n_components': 100}


##### Cross-validation

In [92]:
# Re-create every classifier with the hyper-parameters kept from the grid searches above.
# Each argument to zip() is a (label, estimator) pair, so zip() yields all labels first and
# all estimators second; `names[i]` therefore labels `Classifiers[i]`.
names, Classifiers = map(list, zip(
    ('K Nearest Neighbors',
     KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights)),
    ('Logistic Regression',
     LogisticRegression(solver='liblinear', C=lr_c)),
    ('ExtraTrees Classifier',
     ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=etc_max_depth,
                          n_estimators=etc_n_estimators)),
    ('Random Forest',
     RandomForestClassifier(max_leaf_nodes=5, random_state=RANDOM_STATE,
                            max_depth=rf_max_depth, n_estimators=rf_n_estimators)),
    ('LDA',
     LinearDiscriminantAnalysis(solver=lda_solver)),
    ('MLP Classifier',
     MLPClassifier(random_state=RANDOM_STATE, hidden_layer_sizes=mlp_hidden_layer,
                   activation=mlp_activation, solver=mlp_solver, max_iter=1000)),
    ('SVM rbf',
     svm.SVC(kernel='rbf', C=svc_c, gamma=svc_g)),
    ('SVM sigmoid',
     svm.SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g)),
))

In [93]:
# Evaluate the tuned classifiers with the project helper, requesting reduction='PCA'
# with n_components=None and the shared `kfold` splitter.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=None,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.51 (0.03),0.51 (0.03),0.51 (0.03),0.53 (0.07),0.52 (0.04),0.52 (0.03)
1,Logistic Regression,0.52 (0.02),0.52 (0.02),0.52 (0.02),0.52 (0.05),0.52 (0.03),0.52 (0.03)
2,ExtraTrees Classifier,0.51 (0.05),0.51 (0.05),0.50 (0.05),0.55 (0.07),0.52 (0.06),0.48 (0.05)
3,Random Forest,0.50 (0.01),0.50 (0.01),0.50 (0.01),0.37 (0.15),0.41 (0.09),0.50 (0.03)
4,LDA,0.49 (0.06),0.49 (0.06),0.51 (0.05),0.51 (0.08),0.51 (0.06),0.50 (0.06)
5,MLP Classifier,0.53 (0.03),0.53 (0.03),0.55 (0.02),0.58 (0.05),0.56 (0.03),0.56 (0.05)
6,SVM rbf,0.51 (0.02),0.51 (0.02),0.50 (0.02),0.51 (0.05),0.50 (0.03),0.50 (0.03)
7,SVM sigmoid,0.56 (0.06),0.56 (0.06),0.55 (0.06),0.58 (0.07),0.57 (0.06),0.58 (0.08)


In [95]:
# Evaluate the tuned classifiers with the project helper, requesting reduction='PCA'
# with n_components=200 and the shared `kfold` splitter.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=200,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.52 (0.03),0.52 (0.03),0.52 (0.02),0.52 (0.07),0.52 (0.04),0.52 (0.03)
1,Logistic Regression,0.56 (0.02),0.56 (0.02),0.56 (0.02),0.55 (0.05),0.55 (0.03),0.55 (0.03)
2,ExtraTrees Classifier,0.55 (0.03),0.55 (0.03),0.55 (0.04),0.54 (0.06),0.54 (0.05),0.54 (0.04)
3,Random Forest,0.51 (0.05),0.51 (0.05),0.51 (0.05),0.45 (0.11),0.47 (0.08),0.52 (0.04)
4,LDA,0.53 (0.02),0.53 (0.02),0.53 (0.02),0.53 (0.04),0.53 (0.03),0.56 (0.03)
5,MLP Classifier,0.54 (0.05),0.54 (0.05),0.55 (0.05),0.57 (0.09),0.56 (0.06),0.56 (0.07)
6,SVM rbf,0.53 (0.04),0.53 (0.04),0.53 (0.05),0.52 (0.05),0.52 (0.05),0.55 (0.04)
7,SVM sigmoid,0.57 (0.06),0.57 (0.06),0.57 (0.06),0.58 (0.05),0.58 (0.05),0.59 (0.08)


In [97]:
# Evaluate the tuned classifiers with the project helper, requesting reduction='PCA'
# with n_components=50 and the shared `kfold` splitter.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=50,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.52 (0.03),0.52 (0.03),0.52 (0.03),0.51 (0.07),0.51 (0.04),0.51 (0.03)
1,Logistic Regression,0.60 (0.05),0.60 (0.05),0.60 (0.05),0.61 (0.10),0.60 (0.07),0.61 (0.07)
2,ExtraTrees Classifier,0.57 (0.05),0.57 (0.05),0.57 (0.06),0.53 (0.08),0.55 (0.06),0.56 (0.07)
3,Random Forest,0.57 (0.03),0.57 (0.03),0.57 (0.03),0.49 (0.08),0.52 (0.06),0.56 (0.04)
4,LDA,0.61 (0.04),0.61 (0.04),0.61 (0.04),0.61 (0.07),0.61 (0.04),0.61 (0.06)
5,MLP Classifier,0.57 (0.05),0.57 (0.05),0.56 (0.06),0.61 (0.08),0.58 (0.06),0.57 (0.10)
6,SVM rbf,0.60 (0.03),0.60 (0.03),0.61 (0.04),0.57 (0.11),0.58 (0.05),0.63 (0.07)
7,SVM sigmoid,0.58 (0.08),0.58 (0.08),0.58 (0.08),0.58 (0.07),0.58 (0.07),0.60 (0.10)
