In [1]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment of this run is on record.
print(tf.__version__)

2.9.1


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import pandas as pd
#pd.options.mode.chained_assignment = None
import numpy as np
from scipy import stats


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.applications import *

In [6]:
import sys
# Make the project's helper modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
# Note: the path is relative to the kernel's working directory.
UTILS_DIR = '../utils'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

# Cross-validation helper from the classification_cross_validation module.
from classification_cross_validation import pipeline_cross_val

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.svm import SVC

In [8]:
import os

# Build the folder paths with os.path.join so the notebook runs on any OS;
# hard-coded "\\" separators only resolve on Windows.
DATA_FOLDER = os.path.join("..", "data")
EMBEDDINGS_FOLDER = os.path.join("..", "embeddings", "for model training")

### Results for male participants without language stratification

##### Data reading 
(Participant-level clinical data are private; the embeddings extracted from the audio recordings are, however, available.)

In [11]:
# Read the clinical table for the male participants from DATA_FOLDER.
clinical_csv_path = os.path.join(DATA_FOLDER, "men_matched_clinical_data.csv")
df = pd.read_csv(clinical_csv_path)

In [12]:
# Class balance of the target: number of rows per value of currently_smoking.
df.loc[:, 'currently_smoking'].value_counts()

1    237
0    237
Name: currently_smoking, dtype: int64

In [13]:
# Target vector: the currently_smoking column as a NumPy array.
y_smk = df.loc[:, 'currently_smoking'].values
# Display its shape as a quick sanity check.
y_smk.shape

(474,)

##### Features

In [12]:
# Speech tasks (reading, counting): wav2vec embeddings.
reading_wav2vec_path = os.path.join(EMBEDDINGS_FOLDER, 'wav2vec_embeddings_reading_male.npy')
counting_wav2vec_path = os.path.join(EMBEDDINGS_FOLDER, 'wav2vec_embeddings_counting_male.npy')
reading_wav2vec_embeddings = np.load(reading_wav2vec_path)
counting_wav2vec_embeddings = np.load(counting_wav2vec_path)

# A-vowel phonation: byola embeddings.
a_vowel_byola_path = os.path.join(EMBEDDINGS_FOLDER, 'byola_2048_a_vowel_phonation_embeddings_male.npy')
a_vowel_phonation_byola_embeddings = np.load(a_vowel_byola_path)

In [13]:
# Single seed for every stochastic component in this notebook.
RANDOM_STATE = 42
# Shuffled, stratified 5-fold splitter shared by all grid searches and
# cross-validation runs; seeded from RANDOM_STATE rather than a second literal 42.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# Unfitted scaler used as the first step of every pipeline below.
scaler = StandardScaler()

#### Reading

In [43]:
# Features for this section: wav2vec embeddings of the reading task.
# NOTE: embeddings_var is re-bound in every section, so re-run this cell
# before running the grid-search / cross-validation cells below.
embeddings_var = reading_wav2vec_embeddings
# Embeddings and labels come from separate files; fail here with a clear
# message if their row counts differ, rather than inside a grid search.
assert embeddings_var.shape[0] == y_smk.shape[0], "embeddings and labels have different numbers of rows"
embeddings_var.shape

(474, 1024)

##### Grid search with PCA

In [44]:
# Grid search (5-fold, accuracy) for KNN on scaled, PCA-reduced embeddings.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', KNeighborsClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)  # fit() returns the estimator itself; no re-assignment needed

# Keep the winning classifier settings for the cross-validation section.
# The best pca__n_components is only printed, not stored.
knn_neighbours = clf.best_params_['classifier__n_neighbors']
knn_weights = clf.best_params_['classifier__weights']

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7, weights='distance'), 'classifier__n_neighbors': 7, 'classifier__weights': 'distance', 'pca__n_components': None}


In [45]:
# Grid search (5-fold, accuracy) for liblinear logistic regression: PCA size and C.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__C': [0.01, 0.1, 1.0, 10],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', LogisticRegression()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Best regularisation strength, reused in the cross-validation section.
lr_c = clf.best_params_['classifier__C']

# Was print(print(...)), which also emitted a stray "None" line.
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.01, solver='liblinear'), 'classifier__C': 0.01, 'pca__n_components': None}
None


In [46]:
# Grid search (5-fold, accuracy) for ExtraTrees: PCA size, forest size and tree depth.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [ExtraTreesClassifier(random_state=RANDOM_STATE)],
    'classifier__n_estimators': [10, 30, 50, 100, 200],
    'classifier__max_depth': [8, 10, 12, 20],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's seeded 'classifier' entry replaces this step.
    ('classifier', ExtraTreesClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning tree settings, reused in the cross-validation section.
etc_max_depth = clf.best_params_['classifier__max_depth']
etc_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=12, random_state=42), 'classifier__max_depth': 12, 'classifier__n_estimators': 100, 'pca__n_components': 100}


In [47]:
# Grid search (5-fold, accuracy) for Random Forest: PCA size, depth and forest size.
# NOTE(review): max_leaf_nodes=5 caps every tree at 5 leaves (at most 4 levels
# deep), so the max_depth candidates (5, 10, None) can never bind, and the
# max_depth=4 passed to the constructor is overridden by the grid anyway.
# Confirm which of max_leaf_nodes / max_depth is meant to be tuned.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    'classifier__max_depth': [5, 10, None],
    'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', RandomForestClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning forest settings, reused in the cross-validation section.
rf_max_depth = clf.best_params_['classifier__max_depth']
rf_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=10,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 10, 'pca__n_components': 50}


In [48]:
# Grid search (5-fold, accuracy) for LDA: PCA size and solver.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [LinearDiscriminantAnalysis()],
    'classifier__solver': ['svd', 'lsqr'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', LinearDiscriminantAnalysis()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning solver, reused in the cross-validation section.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 100}


In [49]:
# Grid search (5-fold, accuracy) for an MLP: PCA size, architecture, activation and solver.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    'classifier__hidden_layer_sizes': [(4,), (8,), (20,), (50,), (32,), (32, 6), (100,), (256, 128)],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'lbfgs'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder replaced by the grid's 'classifier' entry; it previously used
    # random_state=1, which disagreed with the seed used everywhere else.
    ('classifier', MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning network settings, reused in the cross-validation section.
mlp_hidden_layer = clf.best_params_['classifier__hidden_layer_sizes']
mlp_activation = clf.best_params_['classifier__activation']
mlp_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)

{'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(256, 128), max_iter=1000,
              random_state=42, solver='lbfgs'), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (256, 128), 'classifier__solver': 'lbfgs', 'pca__n_components': None}


In [50]:
# Grid search (5-fold, accuracy) for an RBF-kernel SVM: PCA size, C and gamma.
# NOTE(review): the recorded best gamma (1e-06) is the smallest candidate;
# consider extending the gamma grid downward.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [SVC(kernel='rbf')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', SVC(kernel='rbf')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning SVM settings, reused in the cross-validation section.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=100000.0, gamma=1e-06), 'classifier__C': 100000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 100}


In [51]:
# Grid search (5-fold, accuracy) for a sigmoid-kernel SVM: PCA size, C and gamma.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [SVC(kernel='sigmoid')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', SVC(kernel='sigmoid')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning SVM settings, reused in the cross-validation section.
svc_sig_c = clf.best_params_['classifier__C']
svc_sig_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 100}


##### Cross-validation

In [52]:
# Models to cross-validate, configured with the grid-search winners stored
# by the cells above. Each display name sits next to its estimator, and the
# two parallel lists expected by pipeline_cross_val are derived from the pairs.
tuned_models = [
    ('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights)),
    ('Logistic Regression', LogisticRegression(solver='liblinear', C=lr_c)),
    ('ExtraTrees Classifier', ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=etc_max_depth,
                                                   n_estimators=etc_n_estimators)),
    ('Random Forest', RandomForestClassifier(max_leaf_nodes=5, random_state=RANDOM_STATE,
                                             max_depth=rf_max_depth, n_estimators=rf_n_estimators)),
    ('LDA', LinearDiscriminantAnalysis(solver=lda_solver)),
    ('MLP Classifier', MLPClassifier(random_state=RANDOM_STATE, hidden_layer_sizes=mlp_hidden_layer,
                                     activation=mlp_activation, solver=mlp_solver, max_iter=1000)),
    ('SVM rbf', svm.SVC(kernel='rbf', C=svc_c, gamma=svc_g)),
    ('SVM sigmoid', svm.SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g)),
]
names = [label for label, _ in tuned_models]
Classifiers = [estimator for _, estimator in tuned_models]

In [53]:
# Cross-validate the tuned models with PCA and n_components=None.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=None,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.52 (0.04),0.52 (0.04),0.53 (0.04),0.44 (0.03),0.48 (0.03),0.51 (0.06)
1,Logistic Regression,0.63 (0.05),0.63 (0.05),0.64 (0.06),0.60 (0.05),0.62 (0.05),0.65 (0.06)
2,ExtraTrees Classifier,0.51 (0.05),0.51 (0.05),0.48 (0.08),0.31 (0.08),0.38 (0.08),0.52 (0.06)
3,Random Forest,0.52 (0.04),0.52 (0.04),0.52 (0.05),0.44 (0.11),0.47 (0.09),0.51 (0.06)
4,LDA,0.54 (0.04),0.54 (0.04),0.50 (0.05),0.49 (0.06),0.49 (0.05),0.50 (0.06)
5,MLP Classifier,0.62 (0.05),0.62 (0.05),0.62 (0.05),0.63 (0.05),0.62 (0.04),0.64 (0.06)
6,SVM rbf,0.57 (0.02),0.57 (0.02),0.57 (0.02),0.58 (0.06),0.57 (0.03),0.58 (0.02)
7,SVM sigmoid,0.62 (0.03),0.62 (0.03),0.62 (0.04),0.62 (0.03),0.62 (0.03),0.65 (0.05)


In [54]:
# Cross-validate the tuned models with PCA and n_components=300.
# Note: 300 is not one of the PCA sizes explored by the grid searches above (50, 100, None).
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=300,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.53 (0.04),0.53 (0.04),0.53 (0.05),0.45 (0.03),0.49 (0.04),0.51 (0.06)
1,Logistic Regression,0.64 (0.05),0.64 (0.05),0.65 (0.06),0.61 (0.05),0.62 (0.04),0.65 (0.06)
2,ExtraTrees Classifier,0.57 (0.02),0.57 (0.02),0.54 (0.02),0.51 (0.08),0.52 (0.05),0.59 (0.01)
3,Random Forest,0.56 (0.03),0.56 (0.03),0.56 (0.03),0.55 (0.07),0.55 (0.05),0.57 (0.04)
4,LDA,0.57 (0.03),0.57 (0.03),0.57 (0.03),0.58 (0.04),0.57 (0.03),0.58 (0.04)
5,MLP Classifier,0.59 (0.06),0.59 (0.06),0.60 (0.07),0.58 (0.05),0.59 (0.06),0.60 (0.05)
6,SVM rbf,0.57 (0.03),0.57 (0.03),0.57 (0.03),0.58 (0.02),0.57 (0.02),0.58 (0.03)
7,SVM sigmoid,0.61 (0.04),0.61 (0.04),0.62 (0.05),0.62 (0.03),0.62 (0.03),0.64 (0.05)


In [55]:
# Cross-validate the tuned models with PCA and n_components=100.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=100,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.51 (0.04),0.51 (0.04),0.52 (0.05),0.43 (0.03),0.47 (0.04),0.51 (0.06)
1,Logistic Regression,0.63 (0.05),0.63 (0.05),0.64 (0.06),0.60 (0.06),0.61 (0.05),0.65 (0.06)
2,ExtraTrees Classifier,0.59 (0.04),0.59 (0.04),0.60 (0.05),0.51 (0.03),0.55 (0.02),0.61 (0.02)
3,Random Forest,0.57 (0.05),0.57 (0.05),0.57 (0.05),0.48 (0.10),0.52 (0.07),0.57 (0.05)
4,LDA,0.62 (0.02),0.62 (0.02),0.63 (0.03),0.62 (0.04),0.62 (0.02),0.64 (0.04)
5,MLP Classifier,0.57 (0.05),0.57 (0.05),0.57 (0.05),0.57 (0.05),0.57 (0.05),0.62 (0.06)
6,SVM rbf,0.64 (0.04),0.64 (0.04),0.62 (0.05),0.65 (0.06),0.64 (0.05),0.63 (0.06)
7,SVM sigmoid,0.63 (0.04),0.63 (0.04),0.64 (0.05),0.62 (0.04),0.63 (0.03),0.65 (0.05)


In [56]:
# Cross-validate the tuned models with PCA and n_components=50.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=50,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.50 (0.03),0.50 (0.03),0.50 (0.04),0.41 (0.03),0.45 (0.03),0.50 (0.06)
1,Logistic Regression,0.61 (0.06),0.61 (0.06),0.63 (0.07),0.57 (0.07),0.60 (0.06),0.64 (0.08)
2,ExtraTrees Classifier,0.57 (0.05),0.57 (0.05),0.57 (0.06),0.53 (0.05),0.55 (0.05),0.59 (0.06)
3,Random Forest,0.57 (0.03),0.57 (0.03),0.58 (0.06),0.54 (0.06),0.55 (0.03),0.60 (0.04)
4,LDA,0.59 (0.07),0.60 (0.07),0.60 (0.08),0.60 (0.07),0.60 (0.06),0.63 (0.09)
5,MLP Classifier,0.53 (0.06),0.53 (0.06),0.53 (0.05),0.55 (0.05),0.54 (0.05),0.57 (0.07)
6,SVM rbf,0.58 (0.07),0.58 (0.07),0.60 (0.07),0.55 (0.04),0.57 (0.05),0.62 (0.08)
7,SVM sigmoid,0.58 (0.06),0.58 (0.06),0.59 (0.07),0.54 (0.04),0.56 (0.05),0.62 (0.08)


#### Counting

In [42]:
# Features for this section: wav2vec embeddings of the counting task.
# NOTE: embeddings_var is re-bound in every section, so re-run this cell
# before running the grid-search / cross-validation cells below.
embeddings_var = counting_wav2vec_embeddings
# Embeddings and labels come from separate files; fail here with a clear
# message if their row counts differ, rather than inside a grid search.
assert embeddings_var.shape[0] == y_smk.shape[0], "embeddings and labels have different numbers of rows"
embeddings_var.shape

(474, 1024)

##### Grid search with PCA

In [53]:
# Grid search (5-fold, accuracy) for KNN on scaled, PCA-reduced embeddings.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', KNeighborsClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)  # fit() returns the estimator itself; no re-assignment needed

# Keep the winning classifier settings for the cross-validation section.
# The best pca__n_components is only printed, not stored.
knn_neighbours = clf.best_params_['classifier__n_neighbors']
knn_weights = clf.best_params_['classifier__weights']

print(clf.best_params_)

{'classifier': KNeighborsClassifier(weights='distance'), 'classifier__n_neighbors': 5, 'classifier__weights': 'distance', 'pca__n_components': 100}


In [54]:
# Grid search (5-fold, accuracy) for liblinear logistic regression: PCA size and C.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__C': [0.01, 0.1, 1.0, 10],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', LogisticRegression()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Best regularisation strength, reused in the cross-validation section.
lr_c = clf.best_params_['classifier__C']

# Was print(print(...)), which also emitted a stray "None" line.
print(clf.best_params_)

{'classifier': LogisticRegression(C=10, solver='liblinear'), 'classifier__C': 10, 'pca__n_components': None}
None


In [55]:
# Grid search (5-fold, accuracy) for ExtraTrees: PCA size, forest size and tree depth.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [ExtraTreesClassifier(random_state=RANDOM_STATE)],
    'classifier__n_estimators': [10, 30, 50, 100, 200],
    'classifier__max_depth': [8, 10, 12, 20],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's seeded 'classifier' entry replaces this step.
    ('classifier', ExtraTreesClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning tree settings, reused in the cross-validation section.
etc_max_depth = clf.best_params_['classifier__max_depth']
etc_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=8, random_state=42), 'classifier__max_depth': 8, 'classifier__n_estimators': 100, 'pca__n_components': 50}


In [56]:
# Grid search (5-fold, accuracy) for Random Forest: PCA size, depth and forest size.
# NOTE(review): max_leaf_nodes=5 caps every tree at 5 leaves (at most 4 levels
# deep), so the max_depth candidates (5, 10, None) can never bind, and the
# max_depth=4 passed to the constructor is overridden by the grid anyway.
# Confirm which of max_leaf_nodes / max_depth is meant to be tuned.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    'classifier__max_depth': [5, 10, None],
    'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', RandomForestClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning forest settings, reused in the cross-validation section.
rf_max_depth = clf.best_params_['classifier__max_depth']
rf_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=10,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 10, 'pca__n_components': 50}


In [57]:
# Grid search (5-fold, accuracy) for LDA: PCA size and solver.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [LinearDiscriminantAnalysis()],
    'classifier__solver': ['svd', 'lsqr'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', LinearDiscriminantAnalysis()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning solver, reused in the cross-validation section.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 50}


In [58]:
# Grid search (5-fold, accuracy) for an MLP: PCA size, architecture, activation and solver.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    'classifier__hidden_layer_sizes': [(4,), (8,), (20,), (50,), (32,), (32, 6), (100,), (256, 128)],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'lbfgs'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder replaced by the grid's 'classifier' entry; it previously used
    # random_state=1, which disagreed with the seed used everywhere else.
    ('classifier', MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning network settings, reused in the cross-validation section.
mlp_hidden_layer = clf.best_params_['classifier__hidden_layer_sizes']
mlp_activation = clf.best_params_['classifier__activation']
mlp_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)

{'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(4,), max_iter=1000,
              random_state=42), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (4,), 'classifier__solver': 'adam', 'pca__n_components': 200}


In [59]:
# Grid search (5-fold, accuracy) for an RBF-kernel SVM: PCA size, C and gamma.
# NOTE(review): the recorded best C (1e4) and gamma (1e-06) are both the
# smallest candidates; consider extending the grid downward.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [SVC(kernel='rbf')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', SVC(kernel='rbf')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning SVM settings, reused in the cross-validation section.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 50}


In [60]:
# Grid search (5-fold, accuracy) for a sigmoid-kernel SVM: PCA size, C and gamma.
search_space = {
    'pca__n_components': [50, 100, 200, None],
    'classifier': [SVC(kernel='sigmoid')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', SVC(kernel='sigmoid')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning SVM settings, reused in the cross-validation section.
svc_sig_c = clf.best_params_['classifier__C']
svc_sig_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 50}


##### Cross-validation

In [61]:
# Models to cross-validate, configured with the grid-search winners stored
# by the cells above. Each display name sits next to its estimator, and the
# two parallel lists expected by pipeline_cross_val are derived from the pairs.
tuned_models = [
    ('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights)),
    ('Logistic Regression', LogisticRegression(solver='liblinear', C=lr_c)),
    ('ExtraTrees Classifier', ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=etc_max_depth,
                                                   n_estimators=etc_n_estimators)),
    ('Random Forest', RandomForestClassifier(max_leaf_nodes=5, random_state=RANDOM_STATE,
                                             max_depth=rf_max_depth, n_estimators=rf_n_estimators)),
    ('LDA', LinearDiscriminantAnalysis(solver=lda_solver)),
    ('MLP Classifier', MLPClassifier(random_state=RANDOM_STATE, hidden_layer_sizes=mlp_hidden_layer,
                                     activation=mlp_activation, solver=mlp_solver, max_iter=1000)),
    ('SVM rbf', svm.SVC(kernel='rbf', C=svc_c, gamma=svc_g)),
    ('SVM sigmoid', svm.SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g)),
]
names = [label for label, _ in tuned_models]
Classifiers = [estimator for _, estimator in tuned_models]

In [64]:
# Cross-validate the tuned models with PCA and n_components=200.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=200,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.57 (0.04),0.57 (0.04),0.58 (0.05),0.51 (0.04),0.54 (0.03),0.58 (0.05)
1,Logistic Regression,0.63 (0.04),0.63 (0.04),0.63 (0.04),0.67 (0.06),0.64 (0.04),0.66 (0.04)
2,ExtraTrees Classifier,0.56 (0.04),0.56 (0.04),0.55 (0.05),0.49 (0.06),0.52 (0.05),0.59 (0.04)
3,Random Forest,0.54 (0.06),0.54 (0.06),0.55 (0.08),0.50 (0.09),0.52 (0.06),0.56 (0.06)
4,LDA,0.59 (0.06),0.59 (0.06),0.59 (0.06),0.62 (0.04),0.60 (0.05),0.65 (0.07)
5,MLP Classifier,0.65 (0.05),0.65 (0.05),0.65 (0.04),0.68 (0.07),0.66 (0.05),0.68 (0.05)
6,SVM rbf,0.61 (0.04),0.61 (0.04),0.61 (0.05),0.62 (0.05),0.61 (0.04),0.66 (0.03)
7,SVM sigmoid,0.59 (0.03),0.59 (0.03),0.59 (0.03),0.59 (0.06),0.59 (0.04),0.65 (0.03)


In [65]:
# Cross-validate the tuned models with PCA and n_components=100.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=100,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.57 (0.05),0.57 (0.05),0.59 (0.06),0.50 (0.05),0.54 (0.05),0.58 (0.06)
1,Logistic Regression,0.59 (0.01),0.59 (0.01),0.59 (0.01),0.60 (0.04),0.59 (0.02),0.65 (0.02)
2,ExtraTrees Classifier,0.57 (0.02),0.57 (0.02),0.58 (0.02),0.50 (0.04),0.53 (0.03),0.61 (0.02)
3,Random Forest,0.52 (0.02),0.52 (0.02),0.52 (0.02),0.48 (0.05),0.50 (0.03),0.55 (0.04)
4,LDA,0.61 (0.03),0.61 (0.03),0.60 (0.02),0.61 (0.06),0.61 (0.04),0.66 (0.03)
5,MLP Classifier,0.59 (0.02),0.59 (0.02),0.59 (0.02),0.59 (0.04),0.59 (0.03),0.62 (0.04)
6,SVM rbf,0.59 (0.02),0.59 (0.02),0.59 (0.02),0.61 (0.02),0.60 (0.02),0.64 (0.03)
7,SVM sigmoid,0.60 (0.02),0.60 (0.02),0.60 (0.02),0.60 (0.05),0.60 (0.03),0.64 (0.03)


In [66]:
# Cross-validate the tuned models with PCA and n_components=50.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=50,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.57 (0.03),0.57 (0.03),0.58 (0.04),0.52 (0.03),0.55 (0.03),0.58 (0.05)
1,Logistic Regression,0.60 (0.01),0.60 (0.01),0.60 (0.01),0.59 (0.04),0.60 (0.02),0.64 (0.03)
2,ExtraTrees Classifier,0.61 (0.01),0.61 (0.01),0.63 (0.02),0.56 (0.03),0.59 (0.02),0.63 (0.02)
3,Random Forest,0.59 (0.02),0.59 (0.02),0.61 (0.02),0.51 (0.07),0.55 (0.05),0.62 (0.03)
4,LDA,0.61 (0.01),0.61 (0.01),0.62 (0.01),0.60 (0.05),0.61 (0.03),0.65 (0.03)
5,MLP Classifier,0.56 (0.06),0.56 (0.06),0.56 (0.07),0.50 (0.10),0.53 (0.08),0.59 (0.07)
6,SVM rbf,0.62 (0.01),0.62 (0.01),0.62 (0.01),0.62 (0.05),0.62 (0.02),0.64 (0.04)
7,SVM sigmoid,0.62 (0.02),0.62 (0.02),0.62 (0.02),0.62 (0.07),0.62 (0.04),0.65 (0.04)


#### A-vowel phonation

In [30]:
# Features for this section: byola embeddings of the a-vowel phonation task.
# NOTE: embeddings_var is re-bound in every section, so re-run this cell
# before running the grid-search / cross-validation cells below.
embeddings_var = a_vowel_phonation_byola_embeddings
# Embeddings and labels come from separate files; fail here with a clear
# message if their row counts differ, rather than inside a grid search.
assert embeddings_var.shape[0] == y_smk.shape[0], "embeddings and labels have different numbers of rows"
embeddings_var.shape

(474, 2048)

##### Grid search with PCA

In [31]:
# Grid search (5-fold, accuracy) for KNN on scaled, PCA-reduced embeddings.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', KNeighborsClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)  # fit() returns the estimator itself; no re-assignment needed

# Keep the winning classifier settings for the cross-validation section.
# The best pca__n_components is only printed, not stored.
knn_neighbours = clf.best_params_['classifier__n_neighbors']
knn_weights = clf.best_params_['classifier__weights']

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__n_neighbors': 7, 'classifier__weights': 'uniform', 'pca__n_components': None}


In [32]:
# Grid search (5-fold, accuracy) for liblinear logistic regression: PCA size and C.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__C': [0.01, 0.1, 1.0, 10],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', LogisticRegression()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Best regularisation strength, reused in the cross-validation section.
lr_c = clf.best_params_['classifier__C']

# Was print(print(...)), which also emitted a stray "None" line.
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.1, solver='liblinear'), 'classifier__C': 0.1, 'pca__n_components': 50}
None


In [33]:
# Grid search (5-fold, accuracy) for ExtraTrees: PCA size, forest size and tree depth.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [ExtraTreesClassifier(random_state=RANDOM_STATE)],
    'classifier__n_estimators': [10, 30, 50, 100, 200],
    'classifier__max_depth': [8, 10, 12, 20],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's seeded 'classifier' entry replaces this step.
    ('classifier', ExtraTreesClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning tree settings, reused in the cross-validation section.
etc_max_depth = clf.best_params_['classifier__max_depth']
etc_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=10, n_estimators=200, random_state=42), 'classifier__max_depth': 10, 'classifier__n_estimators': 200, 'pca__n_components': 50}


In [34]:
# Grid search (5-fold, accuracy) for Random Forest: PCA size, depth and forest size.
# NOTE(review): max_leaf_nodes=5 caps every tree at 5 leaves (at most 4 levels
# deep), so the max_depth candidates (5, 10, None) can never bind, and the
# max_depth=4 passed to the constructor is overridden by the grid anyway.
# Confirm which of max_leaf_nodes / max_depth is meant to be tuned.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)],
    'classifier__max_depth': [5, 10, None],
    'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder only: the grid's 'classifier' entry replaces this step.
    ('classifier', RandomForestClassifier()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning forest settings, reused in the cross-validation section.
rf_max_depth = clf.best_params_['classifier__max_depth']
rf_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=50,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 50, 'pca__n_components': 100}


In [35]:
# Grid search (5-fold, accuracy) for LDA: PCA size and solver.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [LinearDiscriminantAnalysis()],
    'classifier__solver': ['svd', 'lsqr'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', LinearDiscriminantAnalysis()),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning solver, reused in the cross-validation section.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd', 'pca__n_components': 50}


In [36]:
# Grid search (5-fold, accuracy) for an MLP: PCA size, architecture, activation and solver.
# The PCA grid previously also contained 200 (apparently carried over from the
# Counting section). It is aligned here with every other search in this
# section, which explores 50, 100 and None only.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)],
    'classifier__hidden_layer_sizes': [(4,), (8,), (20,), (50,), (32,), (32, 6), (100,), (256, 128)],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'lbfgs'],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    # Placeholder replaced by the grid's 'classifier' entry; it previously used
    # random_state=1, which disagreed with the seed used everywhere else.
    ('classifier', MLPClassifier(random_state=RANDOM_STATE, max_iter=1000)),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning network settings, reused in the cross-validation section.
mlp_hidden_layer = clf.best_params_['classifier__hidden_layer_sizes']
mlp_activation = clf.best_params_['classifier__activation']
mlp_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)

{'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(256, 128), max_iter=1000,
              random_state=42, solver='lbfgs'), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (256, 128), 'classifier__solver': 'lbfgs', 'pca__n_components': 50}


In [37]:
# Grid search (5-fold, accuracy) for an RBF-kernel SVM: PCA size, C and gamma.
# NOTE(review): the recorded best C (1e4) and gamma (1e-06) are both the
# smallest candidates; consider extending the grid downward.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [SVC(kernel='rbf')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', SVC(kernel='rbf')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning SVM settings, reused in the cross-validation section.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-06), 'classifier__C': 10000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 50}


In [38]:
# Grid search (5-fold, accuracy) for a sigmoid-kernel SVM: PCA size, C and gamma.
search_space = {
    'pca__n_components': [50, 100, None],
    'classifier': [SVC(kernel='sigmoid')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

pca_model = Pipeline([
    ('scaler', scaler),
    ('pca', PCA(random_state=RANDOM_STATE)),  # shared seed instead of a literal 42
    ('classifier', SVC(kernel='sigmoid')),
])

clf = GridSearchCV(pca_model, search_space, cv=kfold, scoring='accuracy', verbose=0, n_jobs=-1)
clf.fit(embeddings_var, y_smk)

# Winning SVM settings, reused in the cross-validation section.
svc_sig_c = clf.best_params_['classifier__C']
svc_sig_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=100000.0, gamma=1e-06, kernel='sigmoid'), 'classifier__C': 100000.0, 'classifier__gamma': 1e-06, 'pca__n_components': 50}


##### Cross-validation

In [39]:
# Models to cross-validate, configured with the grid-search winners stored
# by the cells above. Each display name sits next to its estimator, and the
# two parallel lists expected by pipeline_cross_val are derived from the pairs.
tuned_models = [
    ('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights)),
    ('Logistic Regression', LogisticRegression(solver='liblinear', C=lr_c)),
    ('ExtraTrees Classifier', ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=etc_max_depth,
                                                   n_estimators=etc_n_estimators)),
    ('Random Forest', RandomForestClassifier(max_leaf_nodes=5, random_state=RANDOM_STATE,
                                             max_depth=rf_max_depth, n_estimators=rf_n_estimators)),
    ('LDA', LinearDiscriminantAnalysis(solver=lda_solver)),
    ('MLP Classifier', MLPClassifier(random_state=RANDOM_STATE, hidden_layer_sizes=mlp_hidden_layer,
                                     activation=mlp_activation, solver=mlp_solver, max_iter=1000)),
    ('SVM rbf', svm.SVC(kernel='rbf', C=svc_c, gamma=svc_g)),
    ('SVM sigmoid', svm.SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g)),
]
names = [label for label, _ in tuned_models]
Classifiers = [estimator for _, estimator in tuned_models]

In [40]:
# Cross-validate the tuned models with PCA and n_components=None.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=None,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.53 (0.05),0.53 (0.05),0.53 (0.06),0.54 (0.06),0.53 (0.06),0.54 (0.07)
1,Logistic Regression,0.52 (0.06),0.52 (0.06),0.52 (0.06),0.52 (0.06),0.52 (0.06),0.55 (0.04)
2,ExtraTrees Classifier,0.53 (0.03),0.53 (0.03),0.55 (0.02),0.55 (0.07),0.55 (0.03),0.56 (0.05)
3,Random Forest,0.51 (0.05),0.51 (0.05),0.50 (0.04),0.46 (0.11),0.47 (0.05),0.52 (0.06)
4,LDA,0.50 (0.06),0.50 (0.06),0.54 (0.04),0.52 (0.06),0.53 (0.05),0.55 (0.06)
5,MLP Classifier,0.53 (0.04),0.53 (0.04),0.54 (0.04),0.56 (0.03),0.55 (0.01),0.57 (0.03)
6,SVM rbf,0.51 (0.05),0.51 (0.05),0.51 (0.05),0.53 (0.05),0.52 (0.05),0.53 (0.05)
7,SVM sigmoid,0.51 (0.05),0.52 (0.05),0.51 (0.05),0.53 (0.05),0.52 (0.05),0.53 (0.05)


In [41]:
# Cross-validate the tuned models with PCA and n_components=100.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=100,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.52 (0.05),0.52 (0.05),0.52 (0.06),0.49 (0.07),0.50 (0.06),0.53 (0.07)
1,Logistic Regression,0.56 (0.03),0.56 (0.03),0.55 (0.03),0.56 (0.05),0.56 (0.04),0.60 (0.02)
2,ExtraTrees Classifier,0.55 (0.05),0.55 (0.05),0.60 (0.05),0.49 (0.08),0.53 (0.04),0.57 (0.03)
3,Random Forest,0.56 (0.04),0.56 (0.04),0.58 (0.04),0.51 (0.07),0.54 (0.04),0.58 (0.05)
4,LDA,0.58 (0.01),0.58 (0.01),0.57 (0.01),0.59 (0.04),0.58 (0.02),0.61 (0.02)
5,MLP Classifier,0.57 (0.01),0.57 (0.01),0.57 (0.01),0.57 (0.06),0.57 (0.03),0.59 (0.03)
6,SVM rbf,0.56 (0.03),0.56 (0.03),0.56 (0.03),0.55 (0.07),0.55 (0.05),0.60 (0.04)
7,SVM sigmoid,0.54 (0.04),0.54 (0.04),0.54 (0.05),0.53 (0.06),0.54 (0.05),0.59 (0.04)


In [42]:
# Cross-validate the tuned models with PCA and n_components=50.
pipeline_cross_val(
    names,
    Classifiers,
    embeddings_var,
    y_smk,
    reduction='PCA',
    n_components=50,
    kfold=kfold,
)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.52 (0.06),0.52 (0.06),0.52 (0.07),0.52 (0.08),0.52 (0.07),0.52 (0.08)
1,Logistic Regression,0.59 (0.02),0.59 (0.02),0.60 (0.02),0.57 (0.06),0.58 (0.04),0.62 (0.04)
2,ExtraTrees Classifier,0.57 (0.03),0.57 (0.03),0.58 (0.03),0.52 (0.10),0.54 (0.06),0.61 (0.04)
3,Random Forest,0.52 (0.04),0.52 (0.04),0.53 (0.06),0.42 (0.07),0.47 (0.06),0.56 (0.04)
4,LDA,0.58 (0.02),0.58 (0.02),0.59 (0.02),0.55 (0.08),0.57 (0.05),0.62 (0.04)
5,MLP Classifier,0.61 (0.04),0.61 (0.04),0.63 (0.06),0.59 (0.10),0.60 (0.04),0.64 (0.04)
6,SVM rbf,0.62 (0.03),0.62 (0.03),0.64 (0.05),0.57 (0.08),0.59 (0.05),0.63 (0.05)
7,SVM sigmoid,0.62 (0.03),0.62 (0.03),0.64 (0.05),0.58 (0.08),0.61 (0.04),0.63 (0.05)
