In [2]:
# Print the TensorFlow version in use so the run environment is recorded with the outputs.
import tensorflow as tf
print(tf.__version__)

2.9.1


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import pandas as pd
#pd.options.mode.chained_assignment = None
import numpy as np
from scipy import stats

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.applications import *

In [7]:
import sys
sys.path.append('../utils')  # Add the 'utils' folder to the Python path

# pipeline_cross_val comes from the classification_cross_validation module,
# which is resolved through the '../utils' entry appended to sys.path above.
from classification_cross_validation import pipeline_cross_val  # Cross-validation helper used in the "Cross-validation" cells


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.svm import SVC

In [55]:
import os

# Root folder holding the (private) participant data; replace the placeholder with a real path.
DATA_FOLDER = 'path_to_folder_containing_participant_data'
# Name of the Colive Voice extraction (sub-folder of DATA_FOLDER) to read.
EXTRACTION = "EXTRACT_2023_03_16_12_09_45"

# Folder with the pre-computed audio embeddings. Built with os.path.join so the
# separators are correct on every OS: hard-coded backslashes resolve on Windows only,
# where this expression still yields the same string as before.
EMBEDDINGS_FOLDER = os.path.join('..', 'embeddings', 'for model training')

### Results for French-speaking female participants

##### Data reading 
(The explicit participant data is private; the embeddings extracted from the audio recordings are, however, available.)

In [56]:
# Load the participant table of the selected extraction.
df = pd.read_csv(
    os.path.join(DATA_FOLDER, EXTRACTION, "dataframe_name")
)
# Drop participants that did not provide all audios correctly
# (the exclusion list is empty here, so no row is removed).
list_of_participants_to_exclude = []
df = df.loc[~df["UniqueId"].isin(list_of_participants_to_exclude)]

In [12]:
# Class balance check: number of rows per value of the smoking label.
df['currently_smoking'].value_counts()

1    251
0    251
Name: currently_smoking, dtype: int64

In [57]:
# Target vector for every classifier below: the smoking label, one entry per dataframe row.
y_smk = df['currently_smoking'].values
# Displayed so the label count can be compared with the embedding matrices' first dimension.
y_smk.shape

(502,)

##### Features

In [28]:
# Pre-computed embeddings, one file per (feature extractor, speech task) pair.
# Assumes the rows of each array are in the same order as the rows of df / y_smk — TODO confirm.
# NOTE(review): only this first load passes allow_pickle=True, which lets np.load unpickle
# arbitrary Python objects. Confirm the file really needs it and comes from a trusted source.
reading_vggish_embeddings = np.load(os.path.join(EMBEDDINGS_FOLDER, 'vggish_reading_embeddings_female_french.npy'), allow_pickle=True)
counting_egemaps_embeddings = np.load(os.path.join(EMBEDDINGS_FOLDER, 'egemaps_counting_embeddings_female_french.npy'))
a_vowel_phonation_byola_embeddings = np.load(os.path.join(EMBEDDINGS_FOLDER, 'byola_512_a_vowel_phonation_embeddings_female_french.npy'))

In [29]:
# Single seed shared by the fold splitter and every seeded estimator in the notebook.
RANDOM_STATE = 42
# Shuffled, stratified 5-fold splitter. It is seeded from RANDOM_STATE rather than a
# second hard-coded 42, so changing the seed in one place changes it everywhere.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# Standardization step reused as the first stage of every pipeline below.
scaler = StandardScaler()

#### Reading

In [16]:
# Use the VGGish embeddings of the reading task as the feature matrix for the cells of this section.
embeddings_var = reading_vggish_embeddings
embeddings_var.shape

(502, 128)

##### Grid search without PCA 

In [17]:
# Tune k-nearest-neighbours: neighbourhood size x vote weighting.
search_space = {
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
}

# Despite its name this pipeline has no PCA step: it only standardizes, then classifies.
# The 'classifier' step is a placeholder that the grid replaces with the estimator above.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', KNeighborsClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
knn_neighbours = clf.best_params_['classifier__n_neighbors']
knn_weights = clf.best_params_['classifier__weights']

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=3), 'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}


In [18]:
# Tune the regularization strength C of a liblinear logistic regression.
search_space = {'classifier': [LogisticRegression(solver='liblinear')],
                'classifier__C': [0.01, 0.1, 1.0, 10]}

# Standardize, then classify. The 'classifier' step is a placeholder that the grid
# replaces with the liblinear estimator from search_space.
pca_model =  Pipeline([('scaler', scaler), ('classifier', LogisticRegression())])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning C for the final cross-validation comparison.
lr_c = clf.best_params_['classifier__C']

# Print once: the previous nested print(print(...)) also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.1, solver='liblinear'), 'classifier__C': 0.1}
None


In [19]:
# Tune the extra-trees ensemble: number of trees x maximum tree depth.
search_space = {
    'classifier': [ExtraTreesClassifier(random_state=RANDOM_STATE)],
    'classifier__n_estimators': [10, 30, 50, 100, 200],
    'classifier__max_depth': [8, 10, 12, 20],
}

# Standardize, then classify. The unseeded placeholder step is replaced by the
# seeded estimator from search_space for every candidate.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', ExtraTreesClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
etc_max_depth = clf.best_params_['classifier__max_depth']
etc_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=20, n_estimators=50, random_state=42), 'classifier__max_depth': 20, 'classifier__n_estimators': 50}


In [20]:
# Tune the random forest: maximum tree depth x number of trees.
# max_leaf_nodes=5 stays fixed; the max_depth=4 given to the constructor is overridden
# by the 'classifier__max_depth' grid values for every candidate.
search_space = {
    'classifier': [
        RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)
    ],
    'classifier__max_depth': [5, 10, None],
    'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
}

# Standardize, then classify. The placeholder step is replaced by the estimator above.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', RandomForestClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
rf_max_depth = clf.best_params_['classifier__max_depth']
rf_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=10,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 10}


In [21]:
# Tune linear discriminant analysis: only the solver is searched.
search_space = {
    'classifier': [LinearDiscriminantAnalysis()],
    'classifier__solver': ['svd', 'lsqr'],
}

# Standardize, then classify (no PCA step, despite the variable name).
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', LinearDiscriminantAnalysis())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver for the final cross-validation comparison.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd'}


In [22]:
# Tune the multi-layer perceptron: architecture x activation x optimizer.
search_space = {
    'classifier': [MLPClassifier(random_state=RANDOM_STATE, max_iter = 1000)],
    'classifier__hidden_layer_sizes': [
        (4,), (8,), (20,), (50,), (32,), (32,6,), (100,), (256, 128,),
    ],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'lbfgs'],
}

# Standardize, then classify. The placeholder's random_state=1 never takes effect:
# the grid replaces the step with the RANDOM_STATE-seeded estimator above.
pca_model = Pipeline(
    steps=[('scaler', scaler), ('classifier', MLPClassifier(random_state=1, max_iter=1000))]
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
mlp_hidden_layer = clf.best_params_['classifier__hidden_layer_sizes']
mlp_activation = clf.best_params_['classifier__activation']
mlp_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)

{'classifier': MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42,
              solver='lbfgs'), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (50,), 'classifier__solver': 'lbfgs'}


In [23]:
# Tune the RBF-kernel SVM: penalty C x kernel width gamma.
search_space = {'classifier': [SVC(kernel='rbf')],
                'classifier__C': [1e4, 1e5, 1e6],
                'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3]}

# Standardize, then classify. Each step must be a (name, estimator) pair; the previous
# ('classifier', ('classifier', SVC(...))) nested a tuple where the estimator belongs and
# only ran because the grid replaces the 'classifier' step before fitting.
pca_model =  Pipeline([('scaler', scaler), ('classifier', SVC(kernel='rbf'))])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']


print(clf.best_params_)

{'classifier': SVC(C=100000.0, gamma=1e-06), 'classifier__C': 100000.0, 'classifier__gamma': 1e-06}


In [24]:
# Tune the sigmoid-kernel SVM: penalty C x kernel coefficient gamma.
search_space = {
    'classifier': [SVC(kernel='sigmoid')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

# Standardize, then classify (no PCA step, despite the variable name).
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', SVC(kernel='sigmoid'))])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
svc_sig_c = clf.best_params_['classifier__C']
svc_sig_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-05, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-05}


##### Cross-validation

In [25]:
# Rebuild every classifier with the hyper-parameters selected by the grid searches above.
# `names` and `Classifiers` are parallel lists: entry i of one labels entry i of the other.
names = [
    'K Nearest Neighbors',
    'Logistic Regression',
    'ExtraTrees Classifier',
    'Random Forest',
    'LDA',
    'MLP Classifier',
    'SVM rbf',
    'SVM sigmoid',
]
Classifiers = [
    KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights),
    LogisticRegression(C=lr_c, solver='liblinear'),
    ExtraTreesClassifier(n_estimators=etc_n_estimators, max_depth=etc_max_depth, random_state=RANDOM_STATE),
    # max_leaf_nodes=5 mirrors the fixed setting used during the random-forest grid search.
    RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, max_leaf_nodes=5, random_state=RANDOM_STATE),
    LinearDiscriminantAnalysis(solver=lda_solver),
    MLPClassifier(hidden_layer_sizes=mlp_hidden_layer, activation=mlp_activation, solver=mlp_solver, max_iter=1000, random_state=RANDOM_STATE),
    SVC(kernel='rbf', C=svc_c, gamma=svc_g),
    SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g),
]

In [26]:
# Score every tuned classifier on the current embeddings using the shared stratified folds.
# NOTE(review): hyper-parameters were selected with the same `kfold` splitter, so these
# scores are presumably optimistic — confirm whether nested cross-validation is intended.
pipeline_cross_val(names, Classifiers, embeddings_var, y_smk, kfold=kfold)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.63 (0.03),0.63 (0.03),0.64 (0.03),0.61 (0.06),0.62 (0.04),0.64 (0.03)
1,Logistic Regression,0.66 (0.04),0.66 (0.04),0.66 (0.04),0.64 (0.06),0.65 (0.05),0.70 (0.05)
2,ExtraTrees Classifier,0.66 (0.03),0.66 (0.03),0.66 (0.03),0.64 (0.04),0.65 (0.03),0.70 (0.03)
3,Random Forest,0.64 (0.01),0.64 (0.01),0.67 (0.03),0.57 (0.06),0.61 (0.03),0.67 (0.04)
4,LDA,0.61 (0.05),0.61 (0.05),0.61 (0.05),0.61 (0.07),0.61 (0.06),0.65 (0.07)
5,MLP Classifier,0.67 (0.03),0.67 (0.03),0.69 (0.04),0.65 (0.06),0.66 (0.04),0.70 (0.05)
6,SVM rbf,0.66 (0.04),0.66 (0.04),0.67 (0.05),0.63 (0.04),0.65 (0.04),0.70 (0.04)
7,SVM sigmoid,0.65 (0.03),0.65 (0.03),0.66 (0.04),0.63 (0.03),0.64 (0.03),0.69 (0.05)


#### Counting

In [30]:
# Use the eGeMAPS features of the counting task as the feature matrix for the cells of this section.
embeddings_var = counting_egemaps_embeddings
embeddings_var.shape

(502, 89)

##### Grid search without PCA 

In [31]:
# Tune k-nearest-neighbours: neighbourhood size x vote weighting.
search_space = {
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
}

# Despite its name this pipeline has no PCA step: it only standardizes, then classifies.
# The 'classifier' step is a placeholder that the grid replaces with the estimator above.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', KNeighborsClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
knn_neighbours = clf.best_params_['classifier__n_neighbors']
knn_weights = clf.best_params_['classifier__weights']

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=3), 'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}


In [32]:
# Tune the regularization strength C of a liblinear logistic regression.
search_space = {'classifier': [LogisticRegression(solver='liblinear')],
                'classifier__C': [0.01, 0.1, 1.0, 10]}

# Standardize, then classify. The 'classifier' step is a placeholder that the grid
# replaces with the liblinear estimator from search_space.
pca_model =  Pipeline([('scaler', scaler), ('classifier', LogisticRegression())])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning C for the final cross-validation comparison.
lr_c = clf.best_params_['classifier__C']

# Print once: the previous nested print(print(...)) also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.1, solver='liblinear'), 'classifier__C': 0.1}
None


In [33]:
# Tune the extra-trees ensemble: number of trees x maximum tree depth.
search_space = {
    'classifier': [ExtraTreesClassifier(random_state=RANDOM_STATE)],
    'classifier__n_estimators': [10, 30, 50, 100, 200],
    'classifier__max_depth': [8, 10, 12, 20],
}

# Standardize, then classify. The unseeded placeholder step is replaced by the
# seeded estimator from search_space for every candidate.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', ExtraTreesClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
etc_max_depth = clf.best_params_['classifier__max_depth']
etc_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=10, n_estimators=200, random_state=42), 'classifier__max_depth': 10, 'classifier__n_estimators': 200}


In [34]:
# Tune the random forest: maximum tree depth x number of trees.
# max_leaf_nodes=5 stays fixed; the max_depth=4 given to the constructor is overridden
# by the 'classifier__max_depth' grid values for every candidate.
search_space = {
    'classifier': [
        RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)
    ],
    'classifier__max_depth': [5, 10, None],
    'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
}

# Standardize, then classify. The placeholder step is replaced by the estimator above.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', RandomForestClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
rf_max_depth = clf.best_params_['classifier__max_depth']
rf_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=50,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 50}


In [35]:
# Tune linear discriminant analysis: only the solver is searched.
search_space = {
    'classifier': [LinearDiscriminantAnalysis()],
    'classifier__solver': ['svd', 'lsqr'],
}

# Standardize, then classify (no PCA step, despite the variable name).
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', LinearDiscriminantAnalysis())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver for the final cross-validation comparison.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd'}


In [36]:
# Tune the multi-layer perceptron: architecture x activation x optimizer.
search_space = {
    'classifier': [MLPClassifier(random_state=RANDOM_STATE, max_iter = 1000)],
    'classifier__hidden_layer_sizes': [
        (4,), (8,), (20,), (50,), (32,), (32,6,), (100,), (256, 128,),
    ],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'lbfgs'],
}

# Standardize, then classify. The placeholder's random_state=1 never takes effect:
# the grid replaces the step with the RANDOM_STATE-seeded estimator above.
pca_model = Pipeline(
    steps=[('scaler', scaler), ('classifier', MLPClassifier(random_state=1, max_iter=1000))]
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
mlp_hidden_layer = clf.best_params_['classifier__hidden_layer_sizes']
mlp_activation = clf.best_params_['classifier__activation']
mlp_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)

{'classifier': MLPClassifier(activation='logistic', hidden_layer_sizes=(8,), max_iter=1000,
              random_state=42), 'classifier__activation': 'logistic', 'classifier__hidden_layer_sizes': (8,), 'classifier__solver': 'adam'}


In [37]:
# Tune the RBF-kernel SVM: penalty C x kernel width gamma.
search_space = {'classifier': [SVC(kernel='rbf')],
                'classifier__C': [1e4, 1e5, 1e6],
                'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3]}

# Standardize, then classify. Each step must be a (name, estimator) pair; the previous
# ('classifier', ('classifier', SVC(...))) nested a tuple where the estimator belongs and
# only ran because the grid replaces the 'classifier' step before fitting.
pca_model =  Pipeline([('scaler', scaler), ('classifier', SVC(kernel='rbf'))])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']


print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-05), 'classifier__C': 10000.0, 'classifier__gamma': 1e-05}


In [38]:
# Tune the sigmoid-kernel SVM: penalty C x kernel coefficient gamma.
search_space = {
    'classifier': [SVC(kernel='sigmoid')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

# Standardize, then classify (no PCA step, despite the variable name).
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', SVC(kernel='sigmoid'))])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
svc_sig_c = clf.best_params_['classifier__C']
svc_sig_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=1e-05, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 1e-05}


##### Cross-validation

In [39]:
# Rebuild every classifier with the hyper-parameters selected by the grid searches above.
# `names` and `Classifiers` are parallel lists: entry i of one labels entry i of the other.
names = [
    'K Nearest Neighbors',
    'Logistic Regression',
    'ExtraTrees Classifier',
    'Random Forest',
    'LDA',
    'MLP Classifier',
    'SVM rbf',
    'SVM sigmoid',
]
Classifiers = [
    KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights),
    LogisticRegression(C=lr_c, solver='liblinear'),
    ExtraTreesClassifier(n_estimators=etc_n_estimators, max_depth=etc_max_depth, random_state=RANDOM_STATE),
    # max_leaf_nodes=5 mirrors the fixed setting used during the random-forest grid search.
    RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, max_leaf_nodes=5, random_state=RANDOM_STATE),
    LinearDiscriminantAnalysis(solver=lda_solver),
    MLPClassifier(hidden_layer_sizes=mlp_hidden_layer, activation=mlp_activation, solver=mlp_solver, max_iter=1000, random_state=RANDOM_STATE),
    SVC(kernel='rbf', C=svc_c, gamma=svc_g),
    SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g),
]

In [40]:
# Score every tuned classifier on the current embeddings using the shared stratified folds.
# NOTE(review): hyper-parameters were selected with the same `kfold` splitter, so these
# scores are presumably optimistic — confirm whether nested cross-validation is intended.
pipeline_cross_val(names, Classifiers, embeddings_var, y_smk, kfold=kfold)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.57 (0.04),0.57 (0.04),0.59 (0.06),0.50 (0.04),0.54 (0.04),0.59 (0.05)
1,Logistic Regression,0.64 (0.02),0.64 (0.02),0.65 (0.02),0.63 (0.06),0.64 (0.04),0.68 (0.05)
2,ExtraTrees Classifier,0.63 (0.03),0.63 (0.03),0.64 (0.02),0.58 (0.07),0.61 (0.05),0.68 (0.03)
3,Random Forest,0.63 (0.05),0.63 (0.05),0.67 (0.07),0.54 (0.06),0.59 (0.06),0.68 (0.03)
4,LDA,0.63 (0.03),0.63 (0.03),0.63 (0.03),0.61 (0.06),0.62 (0.04),0.66 (0.04)
5,MLP Classifier,0.66 (0.01),0.66 (0.01),0.66 (0.02),0.64 (0.06),0.65 (0.03),0.70 (0.03)
6,SVM rbf,0.63 (0.03),0.63 (0.03),0.63 (0.03),0.61 (0.06),0.62 (0.04),0.67 (0.04)
7,SVM sigmoid,0.63 (0.02),0.63 (0.02),0.64 (0.02),0.62 (0.07),0.63 (0.04),0.67 (0.05)


#### A-vowel phonation

In [41]:
# Use the BYOL-A embeddings of the sustained /a/ vowel task as the feature matrix for the cells of this section.
embeddings_var = a_vowel_phonation_byola_embeddings
embeddings_var.shape

(502, 512)

##### Grid search without PCA 

In [42]:
# Tune k-nearest-neighbours: neighbourhood size x vote weighting.
search_space = {
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
}

# Despite its name this pipeline has no PCA step: it only standardizes, then classifies.
# The 'classifier' step is a placeholder that the grid replaces with the estimator above.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', KNeighborsClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
knn_neighbours = clf.best_params_['classifier__n_neighbors']
knn_weights = clf.best_params_['classifier__weights']

print(clf.best_params_)

{'classifier': KNeighborsClassifier(n_neighbors=7), 'classifier__n_neighbors': 7, 'classifier__weights': 'uniform'}


In [43]:
# Tune the regularization strength C of a liblinear logistic regression.
search_space = {'classifier': [LogisticRegression(solver='liblinear')],
                'classifier__C': [0.01, 0.1, 1.0, 10]}

# Standardize, then classify. The 'classifier' step is a placeholder that the grid
# replaces with the liblinear estimator from search_space.
pca_model =  Pipeline([('scaler', scaler), ('classifier', LogisticRegression())])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning C for the final cross-validation comparison.
lr_c = clf.best_params_['classifier__C']

# Print once: the previous nested print(print(...)) also emitted a stray "None".
print(clf.best_params_)

{'classifier': LogisticRegression(C=0.01, solver='liblinear'), 'classifier__C': 0.01}
None


In [44]:
# Tune the extra-trees ensemble: number of trees x maximum tree depth.
search_space = {
    'classifier': [ExtraTreesClassifier(random_state=RANDOM_STATE)],
    'classifier__n_estimators': [10, 30, 50, 100, 200],
    'classifier__max_depth': [8, 10, 12, 20],
}

# Standardize, then classify. The unseeded placeholder step is replaced by the
# seeded estimator from search_space for every candidate.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', ExtraTreesClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
etc_max_depth = clf.best_params_['classifier__max_depth']
etc_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': ExtraTreesClassifier(max_depth=10, random_state=42), 'classifier__max_depth': 10, 'classifier__n_estimators': 100}


In [45]:
# Tune the random forest: maximum tree depth x number of trees.
# max_leaf_nodes=5 stays fixed; the max_depth=4 given to the constructor is overridden
# by the 'classifier__max_depth' grid values for every candidate.
search_space = {
    'classifier': [
        RandomForestClassifier(max_depth=4, max_leaf_nodes=5, random_state=RANDOM_STATE)
    ],
    'classifier__max_depth': [5, 10, None],
    'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
}

# Standardize, then classify. The placeholder step is replaced by the estimator above.
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', RandomForestClassifier())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
rf_max_depth = clf.best_params_['classifier__max_depth']
rf_n_estimators = clf.best_params_['classifier__n_estimators']

print(clf.best_params_)

{'classifier': RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=40,
                       random_state=42), 'classifier__max_depth': 5, 'classifier__n_estimators': 40}


In [46]:
# Tune linear discriminant analysis: only the solver is searched.
search_space = {
    'classifier': [LinearDiscriminantAnalysis()],
    'classifier__solver': ['svd', 'lsqr'],
}

# Standardize, then classify (no PCA step, despite the variable name).
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', LinearDiscriminantAnalysis())])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning solver for the final cross-validation comparison.
lda_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)


{'classifier': LinearDiscriminantAnalysis(), 'classifier__solver': 'svd'}


In [47]:
# Tune the multi-layer perceptron: architecture x activation x optimizer.
search_space = {
    'classifier': [MLPClassifier(random_state=RANDOM_STATE, max_iter = 1000)],
    'classifier__hidden_layer_sizes': [
        (4,), (8,), (20,), (50,), (32,), (32,6,), (100,), (256, 128,),
    ],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'lbfgs'],
}

# Standardize, then classify. The placeholder's random_state=1 never takes effect:
# the grid replaces the step with the RANDOM_STATE-seeded estimator above.
pca_model = Pipeline(
    steps=[('scaler', scaler), ('classifier', MLPClassifier(random_state=1, max_iter=1000))]
)

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
mlp_hidden_layer = clf.best_params_['classifier__hidden_layer_sizes']
mlp_activation = clf.best_params_['classifier__activation']
mlp_solver = clf.best_params_['classifier__solver']

print(clf.best_params_)

{'classifier': MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=1000, random_state=42,
              solver='lbfgs'), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (256, 128), 'classifier__solver': 'lbfgs'}


In [48]:
# Tune the RBF-kernel SVM: penalty C x kernel width gamma.
search_space = {'classifier': [SVC(kernel='rbf')],
                'classifier__C': [1e4, 1e5, 1e6],
                'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3]}

# Standardize, then classify. Each step must be a (name, estimator) pair; the previous
# ('classifier', ('classifier', SVC(...))) nested a tuple where the estimator belongs and
# only ran because the grid replaces the 'classifier' step before fitting.
pca_model =  Pipeline([('scaler', scaler), ('classifier', SVC(kernel='rbf'))])

clf = GridSearchCV(pca_model, search_space, cv=kfold, verbose=False, scoring='accuracy', n_jobs=-1)
clf = clf.fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
svc_c = clf.best_params_['classifier__C']
svc_g = clf.best_params_['classifier__gamma']


print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=0.001), 'classifier__C': 10000.0, 'classifier__gamma': 0.001}


In [49]:
# Tune the sigmoid-kernel SVM: penalty C x kernel coefficient gamma.
search_space = {
    'classifier': [SVC(kernel='sigmoid')],
    'classifier__C': [1e4, 1e5, 1e6],
    'classifier__gamma': [1e-6, 1e-5, 1e-4, 1e-3],
}

# Standardize, then classify (no PCA step, despite the variable name).
pca_model = Pipeline(steps=[('scaler', scaler), ('classifier', SVC(kernel='sigmoid'))])

clf = GridSearchCV(
    estimator=pca_model,
    param_grid=search_space,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=False,
).fit(embeddings_var, y_smk)

# Keep the winning settings for the final cross-validation comparison.
svc_sig_c = clf.best_params_['classifier__C']
svc_sig_g = clf.best_params_['classifier__gamma']

print(clf.best_params_)

{'classifier': SVC(C=10000.0, gamma=0.001, kernel='sigmoid'), 'classifier__C': 10000.0, 'classifier__gamma': 0.001}


##### Cross-validation

In [50]:
# Rebuild every classifier with the hyper-parameters selected by the grid searches above.
# `names` and `Classifiers` are parallel lists: entry i of one labels entry i of the other.
names = [
    'K Nearest Neighbors',
    'Logistic Regression',
    'ExtraTrees Classifier',
    'Random Forest',
    'LDA',
    'MLP Classifier',
    'SVM rbf',
    'SVM sigmoid',
]
Classifiers = [
    KNeighborsClassifier(n_neighbors=knn_neighbours, weights=knn_weights),
    LogisticRegression(C=lr_c, solver='liblinear'),
    ExtraTreesClassifier(n_estimators=etc_n_estimators, max_depth=etc_max_depth, random_state=RANDOM_STATE),
    # max_leaf_nodes=5 mirrors the fixed setting used during the random-forest grid search.
    RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, max_leaf_nodes=5, random_state=RANDOM_STATE),
    LinearDiscriminantAnalysis(solver=lda_solver),
    MLPClassifier(hidden_layer_sizes=mlp_hidden_layer, activation=mlp_activation, solver=mlp_solver, max_iter=1000, random_state=RANDOM_STATE),
    SVC(kernel='rbf', C=svc_c, gamma=svc_g),
    SVC(kernel='sigmoid', C=svc_sig_c, gamma=svc_sig_g),
]

In [51]:
# Score every tuned classifier on the current embeddings using the shared stratified folds.
# NOTE(review): hyper-parameters were selected with the same `kfold` splitter, so these
# scores are presumably optimistic — confirm whether nested cross-validation is intended.
pipeline_cross_val(names, Classifiers, embeddings_var, y_smk, kfold=kfold)

Unnamed: 0,Classifier,accuracy,balanced_accuracy,Precision,Recall,F1,AUC
0,K Nearest Neighbors,0.56 (0.04),0.56 (0.04),0.57 (0.05),0.50 (0.06),0.53 (0.05),0.58 (0.04)
1,Logistic Regression,0.62 (0.03),0.62 (0.03),0.63 (0.05),0.60 (0.06),0.61 (0.04),0.67 (0.04)
2,ExtraTrees Classifier,0.62 (0.04),0.62 (0.04),0.64 (0.07),0.56 (0.04),0.60 (0.02),0.65 (0.03)
3,Random Forest,0.62 (0.04),0.62 (0.04),0.64 (0.05),0.57 (0.07),0.60 (0.04),0.66 (0.04)
4,LDA,0.55 (0.07),0.55 (0.07),0.55 (0.06),0.56 (0.12),0.55 (0.09),0.58 (0.09)
5,MLP Classifier,0.65 (0.04),0.65 (0.04),0.66 (0.04),0.63 (0.06),0.64 (0.04),0.70 (0.02)
6,SVM rbf,0.63 (0.06),0.63 (0.06),0.65 (0.08),0.61 (0.06),0.63 (0.05),0.69 (0.05)
7,SVM sigmoid,0.60 (0.03),0.60 (0.03),0.60 (0.03),0.62 (0.07),0.61 (0.04),0.63 (0.03)
