In [5]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment is recorded in the cell output.
print(tf.__version__)

2.9.1


In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
import pandas as pd
#pd.options.mode.chained_assignment = None
import numpy as np
from scipy import stats


In [8]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.applications import *


In [9]:
import sys
sys.path.append('../utils')  # Make the sibling 'utils' folder importable (relative path, so it depends on the working directory)

from classification_cross_validation import pipeline_cross_val  # Cross-validation helper from the classification_cross_validation module (presumably located in ../utils)


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.svm import SVC

In [12]:
import os

# Root folder holding the private participant data (placeholder: set it to a local path).
DATA_FOLDER = 'path_to_folder_containing_participant_data'
# Identifier of the Colive Voice extraction to use; it is joined to DATA_FOLDER as a sub-folder.
EXTRACTION = "EXTRACT_2023_03_16_12_09_45"

# Folder with the pre-computed embeddings. Assembled with os.path.join so that the
# path is valid on Windows, Linux and macOS alike (no hard-coded backslashes).
EMBEDDINGS_FOLDER = os.path.join('..', 'embeddings', 'for model training')

### Results for French-speaking male participants

##### Data reading 
(The explicit participant data is private; the embeddings extracted from the audio recordings are, however, available.)

In [13]:
# Participant table for this subset, read from the selected extraction folder
# (the file name suggests matched smoker / control groups — confirm with the data owner).
df = pd.read_csv(
    os.path.join(
        DATA_FOLDER,
        EXTRACTION,
        "male_french_only_matched_sg_cg_16032023_manual_approach.csv",
    )
)

In [14]:
# Class balance of the smoking-status label (the recorded output shows 37 rows per class).
df['currently_smoking'].value_counts()

1    37
0    37
Name: currently_smoking, dtype: int64

In [15]:
# Target vector used by every grid search and cross-validation run below.
y_smk = df['currently_smoking'].values
# Display its shape as a sanity check (one label per participant row).
y_smk.shape

(74,)

##### Features

In [16]:
# Pre-computed embeddings, one .npy file per voice task / feature extractor.

# Reading task (wav2vec file).
reading_wav2vec_embeddings = np.load(
    os.path.join(EMBEDDINGS_FOLDER, 'wav2vec_embeddings_reading_male_french.npy')
)
# Counting task (YAMNet file).
counting_yamnet_embeddings = np.load(
    os.path.join(EMBEDDINGS_FOLDER, 'yamnet_counting_embeddings_male_french.npy')
)
# Sustained /a/ vowel phonation (eGeMAPS file).
a_vowel_phonation_egemaps_embeddings = np.load(
    os.path.join(EMBEDDINGS_FOLDER, 'egemaps_a_vowel_phonation_embeddings_male_french.npy')
)

In [17]:
# Single seed shared by the CV splitter and the seeded estimators further down.
RANDOM_STATE = 42

# Stratified 5-fold splitter reused by every grid search and cross-validation run.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Standardisation step placed at the front of each pipeline.
scaler = StandardScaler()

#### Reading

In [19]:
# Use the reading-task wav2vec embeddings as the feature matrix for this section.
embeddings_var = reading_wav2vec_embeddings
# Display the matrix shape (recorded output: (74, 1024)).
embeddings_var.shape

(74, 1024)

##### Grid search with PCA

In [20]:
# k-nearest neighbours: grid-search the neighbourhood size, the vote weighting
# and the number of principal components kept after standardisation.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', KNeighborsClassifier()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
knn_neighbours, knn_weights = (
    clf.best_params_[key]
    for key in ('classifier__n_neighbors', 'classifier__weights')
)

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__n_neighbors': 7, 'classifier__weights': 'uniform', 'pca__n_components': 20}


In [21]:
# Logistic regression (liblinear): grid-search the inverse regularisation
# strength C together with the number of retained principal components.
search_space = {
    'pca__n_components': [10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__C': [0.01, 0.1, 1.0, 10],
}

# The 'classifier' step is a placeholder: the grid's 'classifier' entry replaces
# it for every candidate, so it simply mirrors the searched estimator.
pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    ('classifier', LogisticRegression(solver='liblinear')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Best C, reused when the classifier is rebuilt for cross-validation.
lr_c = clf.best_params_['classifier__C']

# Print the dict once: wrapping it in a second print() also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(solver='liblinear'), 'classifier__C': 1.0, 'pca__n_components': None}
None


In [22]:
# Extra-trees: grid-search the ensemble size, the maximum tree depth and the
# number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', ExtraTreesClassifier()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[ExtraTreesClassifier(random_state=RANDOM_STATE)],
    classifier__n_estimators=[10, 30, 50, 100, 200],
    classifier__max_depth=[8, 10, 12, 20],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
etc_max_depth, etc_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=10, n_estimators=50, random_state=42), 'classifier__max_depth': 10, 'classifier__n_estimators': 50, 'pca__n_components': 40}


In [23]:
# Random forest: grid-search the ensemble size, the maximum tree depth and the
# number of principal components.
# NOTE(review): the searched estimator caps every tree at max_leaf_nodes=5, and a
# binary tree with 5 leaves is at most 4 levels deep, so the max_depth values
# tried here (5, 10, None) should never be the binding limit — confirm the grid.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', RandomForestClassifier()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    # max_depth=4 below is overridden by the classifier__max_depth grid values.
    classifier=[RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    classifier__max_depth=[5, 10, None],
    classifier__n_estimators=[10, 20, 30, 40, 50, 100],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
rf_max_depth, rf_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=50,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 50, 'pca__n_components': 20}


In [24]:
# Linear discriminant analysis: grid-search the solver ('svd' vs 'lsqr') and
# the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', LinearDiscriminantAnalysis()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[LinearDiscriminantAnalysis()],
    classifier__solver=['svd', 'lsqr'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver so the classifier can be rebuilt for cross-validation.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 20}


In [25]:
# Multi-layer perceptron: grid-search the hidden-layer layout, the activation,
# the optimiser and the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only (its random_state=1 is never used): the grid's
    # 'classifier' entry replaces this step for every candidate.
    ('classifier', MLPClassifier(random_state=1, max_iter=1000)),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    classifier__hidden_layer_sizes=[(4,), (8,), (20,), (50,), (32,), (32, 6,), (100,), (256, 128,)],
    classifier__activation=['relu', 'tanh', 'logistic'],
    classifier__solver=['adam', 'lbfgs'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
mlp_hidden_layer, mlp_activation, mlp_solver = (
    clf.best_params_[key]
    for key in (
        'classifier__hidden_layer_sizes',
        'classifier__activation',
        'classifier__solver',
    )
)

print(clf.best_params_)

{'classifier': MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, random_state=42,
              solver='lbfgs'), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (20,), 'classifier__solver': 'lbfgs', 'pca__n_components': 50}


In [26]:
# RBF-kernel SVM: grid-search C, gamma and the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', SVC(kernel='rbf')),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[SVC(kernel='rbf')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
svc_c, svc_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=100000.0, gamma=1e-05), 'classifier__C': 100000.0, 'classifier__gamma': 1e-05, 'pca__n_components': 10}


In [27]:
# Sigmoid-kernel SVM: grid-search C, gamma and the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', SVC(kernel='sigmoid')),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[SVC(kernel='sigmoid')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
svc_sig_c, svc_sig_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=1000000.0, gamma=0.001, kernel='sigmoid'), 'classifier__C': 1000000.0, 'classifier__gamma': 0.001, 'pca__n_components': 30}


##### Cross-validation

In [30]:
# Rebuild every classifier with the hyper-parameters picked by the grid searches
# of this section; `names` and `Classifiers` are listed in the same order.
# NOTE(review): those searches used the same `kfold` splitter that is handed to
# pipeline_cross_val afterwards, so the reported scores may be optimistic —
# confirm whether a nested cross-validation is needed.
names = [
    'K Nearest Neighbors',
    'Logistic Regression',
    'ExtraTrees Classifier',
    'Random Forest',
    'LDA',
    'MLP Classifier',
    'SVM rbf',
    'SVM sigmoid',
]
Classifiers = [
    KNeighborsClassifier(weights=knn_weights, n_neighbors=knn_neighbours),
    LogisticRegression(C=lr_c, solver='liblinear'),
    ExtraTreesClassifier(
        n_estimators=etc_n_estimators,
        max_depth=etc_max_depth,
        random_state=RANDOM_STATE,
    ),
    # max_leaf_nodes=5 mirrors the estimator used in the random-forest grid search.
    RandomForestClassifier(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        max_leaf_nodes=5,
        random_state=RANDOM_STATE,
    ),
    LinearDiscriminantAnalysis(solver=lda_solver),
    MLPClassifier(
        hidden_layer_sizes=mlp_hidden_layer,
        activation=mlp_activation,
        solver=mlp_solver,
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    SVC(C=svc_c, gamma=svc_g, kernel='rbf'),
    SVC(C=svc_sig_c, gamma=svc_sig_g, kernel='sigmoid'),
]

In [31]:
# Evaluate the tuned classifiers with the project helper, requesting PCA with
# n_components=None (how the helper applies these arguments is defined in ../utils).
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=None,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.60 (0.14),0.60 (0.13),0.58 (0.12),0.62 (0.21),0.59 (0.16),0.61 (0.15)
1,Logistic Regression,0.57 (0.15),0.57 (0.15),0.58 (0.18),0.57 (0.16),0.57 (0.15),0.54 (0.13)
2,ExtraTrees Classifier,0.61 (0.09),0.61 (0.08),0.62 (0.11),0.60 (0.16),0.59 (0.11),0.58 (0.12)
3,Random Forest,0.57 (0.12),0.57 (0.13),0.63 (0.23),0.51 (0.16),0.54 (0.13),0.54 (0.18)
4,LDA,0.42 (0.10),0.42 (0.10),0.39 (0.14),0.40 (0.15),0.39 (0.14),0.36 (0.14)
5,MLP Classifier,0.57 (0.08),0.57 (0.08),0.57 (0.08),0.54 (0.16),0.55 (0.11),0.58 (0.13)
6,SVM rbf,0.52 (0.16),0.52 (0.16),0.51 (0.19),0.51 (0.22),0.50 (0.19),0.57 (0.16)
7,SVM sigmoid,0.51 (0.11),0.52 (0.11),0.51 (0.13),0.52 (0.14),0.51 (0.13),0.46 (0.17)


In [32]:
# Same evaluation, this time requesting PCA with n_components=50.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=50,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.58 (0.14),0.58 (0.14),0.56 (0.14),0.59 (0.24),0.57 (0.19),0.60 (0.15)
1,Logistic Regression,0.50 (0.13),0.50 (0.14),0.52 (0.18),0.49 (0.10),0.50 (0.12),0.50 (0.12)
2,ExtraTrees Classifier,0.50 (0.12),0.50 (0.13),0.53 (0.25),0.40 (0.19),0.43 (0.16),0.57 (0.09)
3,Random Forest,0.53 (0.13),0.53 (0.13),0.57 (0.23),0.51 (0.15),0.52 (0.14),0.57 (0.21)
4,LDA,0.53 (0.17),0.53 (0.17),0.54 (0.21),0.51 (0.22),0.51 (0.19),0.54 (0.12)
5,MLP Classifier,0.65 (0.08),0.65 (0.08),0.65 (0.10),0.65 (0.11),0.65 (0.09),0.61 (0.15)
6,SVM rbf,0.56 (0.18),0.56 (0.19),0.57 (0.23),0.51 (0.20),0.53 (0.20),0.53 (0.15)
7,SVM sigmoid,0.51 (0.14),0.51 (0.14),0.51 (0.14),0.52 (0.14),0.51 (0.13),0.47 (0.17)


In [34]:
# Same evaluation, this time requesting PCA with n_components=30.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=30,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.56 (0.17),0.56 (0.17),0.54 (0.17),0.56 (0.25),0.54 (0.20),0.59 (0.18)
1,Logistic Regression,0.50 (0.08),0.50 (0.08),0.50 (0.09),0.54 (0.13),0.52 (0.09),0.45 (0.07)
2,ExtraTrees Classifier,0.53 (0.06),0.53 (0.07),0.52 (0.11),0.43 (0.14),0.47 (0.12),0.44 (0.07)
3,Random Forest,0.54 (0.12),0.54 (0.12),0.53 (0.17),0.51 (0.23),0.51 (0.18),0.50 (0.14)
4,LDA,0.51 (0.04),0.51 (0.04),0.51 (0.05),0.48 (0.12),0.49 (0.08),0.46 (0.06)
5,MLP Classifier,0.53 (0.14),0.53 (0.13),0.50 (0.14),0.53 (0.25),0.50 (0.20),0.54 (0.18)
6,SVM rbf,0.52 (0.12),0.52 (0.12),0.52 (0.10),0.63 (0.16),0.56 (0.11),0.48 (0.18)
7,SVM sigmoid,0.62 (0.07),0.62 (0.07),0.62 (0.09),0.59 (0.10),0.60 (0.10),0.62 (0.10)


In [36]:
# Same evaluation, this time requesting PCA with n_components=10.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=10,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.56 (0.06),0.56 (0.06),0.55 (0.07),0.62 (0.12),0.58 (0.08),0.59 (0.14)
1,Logistic Regression,0.53 (0.11),0.53 (0.11),0.51 (0.17),0.49 (0.20),0.49 (0.18),0.59 (0.12)
2,ExtraTrees Classifier,0.53 (0.11),0.53 (0.11),0.54 (0.12),0.52 (0.12),0.52 (0.11),0.57 (0.10)
3,Random Forest,0.53 (0.08),0.53 (0.08),0.52 (0.11),0.49 (0.20),0.49 (0.14),0.54 (0.12)
4,LDA,0.53 (0.11),0.53 (0.11),0.51 (0.17),0.51 (0.20),0.51 (0.18),0.60 (0.12)
5,MLP Classifier,0.55 (0.09),0.55 (0.10),0.55 (0.15),0.51 (0.13),0.52 (0.12),0.59 (0.09)
6,SVM rbf,0.68 (0.10),0.68 (0.10),0.67 (0.12),0.73 (0.07),0.70 (0.09),0.68 (0.08)
7,SVM sigmoid,0.54 (0.11),0.54 (0.11),0.54 (0.13),0.49 (0.14),0.51 (0.13),0.51 (0.15)


#### Counting

In [37]:
# Use the counting-task YAMNet embeddings as the feature matrix for this section.
embeddings_var = counting_yamnet_embeddings
# Display the matrix shape (recorded output: (74, 1024)).
embeddings_var.shape

(74, 1024)

##### Grid search with PCA

In [38]:
# k-nearest neighbours: grid-search the neighbourhood size, the vote weighting
# and the number of principal components kept after standardisation.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', KNeighborsClassifier()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
knn_neighbours, knn_weights = (
    clf.best_params_[key]
    for key in ('classifier__n_neighbors', 'classifier__weights')
)

print(clf.best_params_)

{'classifier': KNeighborsClassifier(), 'classifier__n_neighbors': 5, 'classifier__weights': 'uniform', 'pca__n_components': 40}


In [39]:
# Logistic regression (liblinear): grid-search the inverse regularisation
# strength C together with the number of retained principal components.
search_space = {
    'pca__n_components': [10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__C': [0.01, 0.1, 1.0, 10],
}

# The 'classifier' step is a placeholder: the grid's 'classifier' entry replaces
# it for every candidate, so it simply mirrors the searched estimator.
pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    ('classifier', LogisticRegression(solver='liblinear')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Best C, reused when the classifier is rebuilt for cross-validation.
lr_c = clf.best_params_['classifier__C']

# Print the dict once: wrapping it in a second print() also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=10, solver='liblinear'), 'classifier__C': 10, 'pca__n_components': 20}
None


In [40]:
# Extra-trees: grid-search the ensemble size, the maximum tree depth and the
# number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', ExtraTreesClassifier()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[ExtraTreesClassifier(random_state=RANDOM_STATE)],
    classifier__n_estimators=[10, 30, 50, 100, 200],
    classifier__max_depth=[8, 10, 12, 20],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
etc_max_depth, etc_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=20, n_estimators=10, random_state=42), 'classifier__max_depth': 20, 'classifier__n_estimators': 10, 'pca__n_components': 40}


In [41]:
# Random forest: grid-search the ensemble size, the maximum tree depth and the
# number of principal components.
# NOTE(review): the searched estimator caps every tree at max_leaf_nodes=5, and a
# binary tree with 5 leaves is at most 4 levels deep, so the max_depth values
# tried here (5, 10, None) should never be the binding limit — confirm the grid.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', RandomForestClassifier()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    # max_depth=4 below is overridden by the classifier__max_depth grid values.
    classifier=[RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    classifier__max_depth=[5, 10, None],
    classifier__n_estimators=[10, 20, 30, 40, 50, 100],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
rf_max_depth, rf_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=40,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 40, 'pca__n_components': None}


In [42]:
# Linear discriminant analysis: grid-search the solver ('svd' vs 'lsqr') and
# the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', LinearDiscriminantAnalysis()),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[LinearDiscriminantAnalysis()],
    classifier__solver=['svd', 'lsqr'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver so the classifier can be rebuilt for cross-validation.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 50}


In [43]:
# Multi-layer perceptron: grid-search the hidden-layer layout, the activation,
# the optimiser and the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only (its random_state=1 is never used): the grid's
    # 'classifier' entry replaces this step for every candidate.
    ('classifier', MLPClassifier(random_state=1, max_iter=1000)),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    classifier__hidden_layer_sizes=[(4,), (8,), (20,), (50,), (32,), (32, 6,), (100,), (256, 128,)],
    classifier__activation=['relu', 'tanh', 'logistic'],
    classifier__solver=['adam', 'lbfgs'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
mlp_hidden_layer, mlp_activation, mlp_solver = (
    clf.best_params_[key]
    for key in (
        'classifier__hidden_layer_sizes',
        'classifier__activation',
        'classifier__solver',
    )
)

print(clf.best_params_)

{'classifier': MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42,
              solver='lbfgs'), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (50,), 'classifier__solver': 'lbfgs', 'pca__n_components': 20}


In [44]:
# RBF-kernel SVM: grid-search C, gamma and the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', SVC(kernel='rbf')),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[SVC(kernel='rbf')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
svc_c, svc_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=1000000.0, gamma=1e-05), 'classifier__C': 1000000.0, 'classifier__gamma': 1e-05, 'pca__n_components': 20}


In [45]:
# Sigmoid-kernel SVM: grid-search C, gamma and the number of principal components.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', SVC(kernel='sigmoid')),
])

search_space = dict(
    pca__n_components=[10, 20, 30, 40, 50, None],  # None: PCA keeps all components
    classifier=[SVC(kernel='sigmoid')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
svc_sig_c, svc_sig_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=100000.0, gamma=1e-05, kernel='sigmoid'), 'classifier__C': 100000.0, 'classifier__gamma': 1e-05, 'pca__n_components': 20}


##### Cross-validation

In [46]:
# Rebuild every classifier with the hyper-parameters picked by the grid searches
# of this section; `names` and `Classifiers` are listed in the same order.
# NOTE(review): those searches used the same `kfold` splitter that is handed to
# pipeline_cross_val afterwards, so the reported scores may be optimistic —
# confirm whether a nested cross-validation is needed.
names = [
    'K Nearest Neighbors',
    'Logistic Regression',
    'ExtraTrees Classifier',
    'Random Forest',
    'LDA',
    'MLP Classifier',
    'SVM rbf',
    'SVM sigmoid',
]
Classifiers = [
    KNeighborsClassifier(weights=knn_weights, n_neighbors=knn_neighbours),
    LogisticRegression(C=lr_c, solver='liblinear'),
    ExtraTreesClassifier(
        n_estimators=etc_n_estimators,
        max_depth=etc_max_depth,
        random_state=RANDOM_STATE,
    ),
    # max_leaf_nodes=5 mirrors the estimator used in the random-forest grid search.
    RandomForestClassifier(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        max_leaf_nodes=5,
        random_state=RANDOM_STATE,
    ),
    LinearDiscriminantAnalysis(solver=lda_solver),
    MLPClassifier(
        hidden_layer_sizes=mlp_hidden_layer,
        activation=mlp_activation,
        solver=mlp_solver,
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    SVC(C=svc_c, gamma=svc_g, kernel='rbf'),
    SVC(C=svc_sig_c, gamma=svc_sig_g, kernel='sigmoid'),
]

In [47]:
# Evaluate the tuned classifiers with the project helper, requesting PCA with
# n_components=None (how the helper applies these arguments is defined in ../utils).
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=None,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.70 (0.07),0.71 (0.06),0.72 (0.04),0.68 (0.19),0.68 (0.11),0.71 (0.10)
1,Logistic Regression,0.62 (0.09),0.62 (0.09),0.61 (0.07),0.66 (0.24),0.62 (0.13),0.68 (0.08)
2,ExtraTrees Classifier,0.49 (0.09),0.49 (0.08),0.47 (0.12),0.42 (0.26),0.41 (0.19),0.54 (0.09)
3,Random Forest,0.68 (0.05),0.68 (0.05),0.74 (0.16),0.63 (0.16),0.65 (0.07),0.68 (0.08)
4,LDA,0.56 (0.11),0.56 (0.11),0.50 (0.09),0.62 (0.12),0.55 (0.07),0.55 (0.12)
5,MLP Classifier,0.64 (0.09),0.64 (0.09),0.62 (0.09),0.68 (0.18),0.64 (0.12),0.71 (0.08)
6,SVM rbf,0.66 (0.06),0.67 (0.07),0.67 (0.09),0.68 (0.22),0.65 (0.10),0.72 (0.04)
7,SVM sigmoid,0.65 (0.08),0.65 (0.08),0.65 (0.09),0.66 (0.24),0.63 (0.13),0.69 (0.06)


In [49]:
# Same evaluation, this time requesting PCA with n_components=40.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=40,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.72 (0.06),0.72 (0.06),0.74 (0.06),0.68 (0.17),0.70 (0.08),0.71 (0.12)
1,Logistic Regression,0.59 (0.11),0.60 (0.12),0.58 (0.13),0.62 (0.24),0.59 (0.16),0.62 (0.09)
2,ExtraTrees Classifier,0.65 (0.13),0.65 (0.13),0.65 (0.13),0.73 (0.12),0.68 (0.11),0.65 (0.12)
3,Random Forest,0.58 (0.13),0.59 (0.14),0.58 (0.16),0.57 (0.25),0.56 (0.17),0.66 (0.12)
4,LDA,0.60 (0.14),0.60 (0.14),0.58 (0.16),0.60 (0.25),0.58 (0.18),0.61 (0.13)
5,MLP Classifier,0.58 (0.10),0.59 (0.11),0.56 (0.08),0.66 (0.24),0.59 (0.14),0.67 (0.11)
6,SVM rbf,0.64 (0.12),0.64 (0.13),0.63 (0.17),0.65 (0.23),0.63 (0.16),0.65 (0.09)
7,SVM sigmoid,0.61 (0.11),0.62 (0.12),0.63 (0.16),0.57 (0.26),0.57 (0.16),0.63 (0.09)


#### A-vowel phonation

In [78]:
# Use the /a/ vowel phonation eGeMAPS features as the feature matrix for this section.
embeddings_var = a_vowel_phonation_egemaps_embeddings
# Display the matrix shape (recorded output: (74, 89)).
embeddings_var.shape

(74, 89)

##### Grid search without PCA 

In [79]:
# k-nearest neighbours on the standardised features (no PCA step in this
# section): grid-search the neighbourhood size and the vote weighting.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', KNeighborsClassifier()),
])

search_space = dict(
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
knn_neighbours, knn_weights = (
    clf.best_params_[key]
    for key in ('classifier__n_neighbors', 'classifier__weights')
)

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__n_neighbors': 7, 'classifier__weights': 'uniform'}


In [80]:
# Logistic regression (liblinear) on the standardised features (no PCA step in
# this section): grid-search the inverse regularisation strength C.
search_space = {
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__C': [0.01, 0.1, 1.0, 10],
}

# The 'classifier' step is a placeholder: the grid's 'classifier' entry replaces
# it for every candidate, so it simply mirrors the searched estimator.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline([
    ('scaler', scaler),
    ('classifier', LogisticRegression(solver='liblinear')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Best C, reused when the classifier is rebuilt for cross-validation.
lr_c = clf.best_params_['classifier__C']

# Print the dict once: wrapping it in a second print() also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.01, solver='liblinear'), 'classifier__C': 0.01}
None


In [81]:
# Extra-trees on the standardised features (no PCA step in this section):
# grid-search the ensemble size and the maximum tree depth.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', ExtraTreesClassifier()),
])

search_space = dict(
    classifier=[ExtraTreesClassifier(random_state=RANDOM_STATE)],
    classifier__n_estimators=[10, 30, 50, 100, 200],
    classifier__max_depth=[8, 10, 12, 20],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
etc_max_depth, etc_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=10, n_estimators=10, random_state=42), 'classifier__max_depth': 10, 'classifier__n_estimators': 10}


In [82]:
# Random forest on the standardised features (no PCA step in this section):
# grid-search the ensemble size and the maximum tree depth.
# NOTE(review): the searched estimator caps every tree at max_leaf_nodes=5, and a
# binary tree with 5 leaves is at most 4 levels deep, so the max_depth values
# tried here (5, 10, None) should never be the binding limit — confirm the grid.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', RandomForestClassifier()),
])

search_space = dict(
    # max_depth=4 below is overridden by the classifier__max_depth grid values.
    classifier=[RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    classifier__max_depth=[5, 10, None],
    classifier__n_estimators=[10, 20, 30, 40, 50, 100],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
rf_max_depth, rf_n_estimators = (
    clf.best_params_[key]
    for key in ('classifier__max_depth', 'classifier__n_estimators')
)

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=10,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 10}


In [83]:
# Linear discriminant analysis on the standardised features (no PCA step in
# this section): grid-search the solver ('svd' vs 'lsqr').
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', LinearDiscriminantAnalysis()),
])

search_space = dict(
    classifier=[LinearDiscriminantAnalysis()],
    classifier__solver=['svd', 'lsqr'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver so the classifier can be rebuilt for cross-validation.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd'}


In [84]:
# Multi-layer perceptron on the standardised features (no PCA step in this
# section): grid-search the hidden-layer layout, the activation and the optimiser.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # Placeholder only (its random_state=1 is never used): the grid's
    # 'classifier' entry replaces this step for every candidate.
    ('classifier', MLPClassifier(random_state=1, max_iter=1000)),
])

search_space = dict(
    classifier=[MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    classifier__hidden_layer_sizes=[(4,), (8,), (20,), (50,), (32,), (32, 6,), (100,), (256, 128,)],
    classifier__activation=['relu', 'tanh', 'logistic'],
    classifier__solver=['adam', 'lbfgs'],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
mlp_hidden_layer, mlp_activation, mlp_solver = (
    clf.best_params_[key]
    for key in (
        'classifier__hidden_layer_sizes',
        'classifier__activation',
        'classifier__solver',
    )
)

print(clf.best_params_)

{'classifier': MLPClassifier(hidden_layer_sizes=(8,), max_iter=1000, random_state=42,
              solver='lbfgs'), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (8,), 'classifier__solver': 'lbfgs'}


In [85]:
# RBF-kernel SVM on the standardised features (no PCA step in this section):
# grid-search C and gamma.
search_space = {
    'classifier': [SVC(kernel='rbf')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

# Every pipeline step must be a (name, estimator) pair. A nested tuple such as
# ('classifier', ('classifier', SVC(...))) is not a valid estimator; it only goes
# unnoticed while the grid's 'classifier' entry replaces the step before fitting.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline([
    ('scaler', scaler),
    ('classifier', SVC(kernel='rbf')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06}


In [86]:
# Sigmoid-kernel SVM on the standardised features (no PCA step in this section):
# grid-search C and gamma.
# The pipeline variable keeps the name `pca_model` used by the other sections.
pca_model = Pipeline(steps=[
    ('scaler', scaler),
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', SVC(kernel='sigmoid')),
])

search_space = dict(
    classifier=[SVC(kernel='sigmoid')],
    classifier__C=[1e4, 1e5, 1e6],
    classifier__gamma=[1e-6, 1e-5, 1e-4, 1e-3],
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings so the classifier can be rebuilt for cross-validation.
svc_sig_c, svc_sig_g = (
    clf.best_params_[key]
    for key in ('classifier__C', 'classifier__gamma')
)

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06}


##### Cross-validation

In [87]:
# Rebuild every classifier with the hyper-parameters picked by the grid searches
# of this section; `names` and `Classifiers` are listed in the same order.
# NOTE(review): those searches used the same `kfold` splitter that is handed to
# pipeline_cross_val afterwards, so the reported scores may be optimistic —
# confirm whether a nested cross-validation is needed.
names = [
    'K Nearest Neighbors',
    'Logistic Regression',
    'ExtraTrees Classifier',
    'Random Forest',
    'LDA',
    'MLP Classifier',
    'SVM rbf',
    'SVM sigmoid',
]
Classifiers = [
    KNeighborsClassifier(weights=knn_weights, n_neighbors=knn_neighbours),
    LogisticRegression(C=lr_c, solver='liblinear'),
    ExtraTreesClassifier(
        n_estimators=etc_n_estimators,
        max_depth=etc_max_depth,
        random_state=RANDOM_STATE,
    ),
    # max_leaf_nodes=5 mirrors the estimator used in the random-forest grid search.
    RandomForestClassifier(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        max_leaf_nodes=5,
        random_state=RANDOM_STATE,
    ),
    LinearDiscriminantAnalysis(solver=lda_solver),
    MLPClassifier(
        hidden_layer_sizes=mlp_hidden_layer,
        activation=mlp_activation,
        solver=mlp_solver,
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    SVC(C=svc_c, gamma=svc_g, kernel='rbf'),
    SVC(C=svc_sig_c, gamma=svc_sig_g, kernel='sigmoid'),
]

In [88]:
# Evaluate the tuned classifiers with the project helper; no `reduction` or
# `n_components` argument is passed here, so the helper's defaults apply.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.60 (0.17),0.60 (0.17),0.66 (0.22),0.62 (0.12),0.62 (0.12),0.67 (0.19)
1,Logistic Regression,0.51 (0.11),0.51 (0.12),0.55 (0.15),0.54 (0.11),0.53 (0.09),0.49 (0.18)
2,ExtraTrees Classifier,0.69 (0.07),0.69 (0.08),0.75 (0.14),0.56 (0.12),0.64 (0.11),0.69 (0.13)
3,Random Forest,0.54 (0.05),0.54 (0.06),0.53 (0.07),0.57 (0.16),0.54 (0.09),0.64 (0.06)
4,LDA,0.46 (0.11),0.46 (0.11),0.46 (0.11),0.46 (0.12),0.46 (0.11),0.40 (0.15)
5,MLP Classifier,0.59 (0.13),0.59 (0.13),0.60 (0.15),0.59 (0.10),0.59 (0.13),0.54 (0.16)
6,SVM rbf,0.59 (0.07),0.60 (0.08),0.64 (0.18),0.62 (0.15),0.60 (0.06),0.53 (0.16)
7,SVM sigmoid,0.54 (0.16),0.54 (0.15),0.56 (0.17),0.56 (0.14),0.55 (0.14),0.51 (0.19)
