In [None]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing, svm
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

## Auxiliary functions

In [None]:
def prepare_data(X, Y, delete_equal, weighted):
    """Min-max scale the features and derive class weights for the labels.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``MinMaxScaler.fit_transform``.
    Y : array-like
        Labels; the weighting below is computed for the classes 0 and 1.
    delete_equal :
        Not used by this function. Kept only so existing calls keep working.
    weighted : bool or int
        If truthy, class 1 is weighted relative to class 0 using
        scikit-learn's 'balanced' heuristic. Otherwise both classes get
        weight 1.

    Returns
    -------
    tuple
        ``(X_norm, class_weights)`` where ``X_norm`` is a new DataFrame built
        from the scaled values and ``class_weights`` is ``{0: 1, 1: w}``.
    """
    #normalize data
    # NOTE(review): the scaler is fitted on every row passed in; callers in
    # this notebook split into train/test afterwards, so the test rows
    # influence the scaling -- confirm this is acceptable.
    min_max_scaler = preprocessing.MinMaxScaler()
    X_norm = pd.DataFrame(min_max_scaler.fit_transform(X))

    #define classweights to balance classes 50/50
    if weighted:
        # `classes` and `y` are passed by keyword: they are keyword-only in
        # current scikit-learn, and keywords also work on older releases.
        weights = class_weight.compute_class_weight('balanced', classes = np.array([0, 1]), y = Y)
        # Express the balance as a ratio so that class 0 keeps weight 1.
        class_weights = {0: 1, 1: weights[1] / weights[0]}
    else:
        class_weights = {0: 1, 1: 1}

    return (X_norm, class_weights)

In [None]:
#metrics reported for every model, returned as parallel lists of values and names
def evaluate(Y_test, pred):
    """Compute the reported metrics for one set of predictions.

    Parameters
    ----------
    Y_test : array-like
        True labels. The report is indexed with the keys '0' and '1', so the
        labels must stringify to exactly those values.
    pred : array-like
        Predictions compared against ``Y_test``. ``ml_models`` passes the
        output of ``model.predict``, so the AUC below is computed from hard
        class labels rather than from scores.

    Returns
    -------
    tuple
        ``(results, tag)``: ``results`` is the list of metric values and
        ``tag`` the list of metric names, in the same order.
    """
    cr = classification_report(Y_test, pred, output_dict = True)
    auc = round(roc_auc_score(Y_test, pred, average = 'weighted'), 2)

    # Name/value pairs are kept together so the two returned lists cannot
    # drift out of sync.
    scores = [
        ("Recall_weighted", cr['weighted avg']['recall']),
        ("Recall_0", cr['0']['recall']),
        ("Recall_1", cr['1']['recall']),
        ("Precision_weighted", cr['weighted avg']['precision']),
        ("Precision_0", cr['0']['precision']),
        ("Precision_1", cr['1']['precision']),
        ("f1_weighted", cr['weighted avg']['f1-score']),
        ("Acc", cr["accuracy"]),
        ("AUC", auc),
    ]

    results = [value for _, value in scores]
    tag = [label for label, _ in scores]

    return results, tag

In [None]:
#this function receives the data, classes and class weights, evaluates several learning algorithms with cross validation on the training split and then scores a final fit of each one on the held-out test split
def ml_models(X,Y, class_weights):
    """Cross-validate and test a fixed set of classifiers.

    The data is split 80/20 (stratified on ``Y``). Each classifier is scored
    with 10-fold cross-validated ROC AUC on the 80% part, then refitted on
    that part and evaluated on the 20% part with ``evaluate``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Labels, also used to stratify the split and the CV folds.
    class_weights : dict
        Passed as ``class_weight`` to the random forest and the SVMs.

    Returns
    -------
    tuple
        ``(names, result, metrics, tags)``:
        ``names`` lists the model names, ``result`` the per-fold CV AUC
        arrays, ``metrics`` one list of test metrics per model (with the mean
        CV AUC appended) and ``tags`` the metric names matching each entry of
        ``metrics``.
    """
    #split data into train and test
    X_fold, X_test, Y_fold, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42, stratify = Y)

    #(name, classifier) pairs. Parameters should be changed every time a new fine tuning is made so they match its output.
    models = [
        ('K Nearest Neighbors', KNeighborsClassifier(algorithm = 'auto', metric = 'manhattan', n_neighbors = 3, p = 1, weights = 'uniform')),
        ('Random Forest', RandomForestClassifier(class_weight = class_weights, random_state = 42, criterion = 'entropy', max_depth = 10, min_samples_split = 10, n_estimators = 25)),
        # This entry previously used kernel = 'sigmoid', duplicating the last model.
        # NOTE(review): the remaining hyperparameters are unchanged from that
        # sigmoid copy (C = 1000 is outside the linear tuning grid) -- replace
        # them with the linear grid-search result.
        ('SVM Linear', svm.SVC(max_iter = 10000, kernel = 'linear', class_weight = class_weights, random_state = 42, C = 1000, degree = 3, gamma = 'auto', tol = 0.001)),
        ('SVM RBF', svm.SVC(max_iter = 10000, kernel = 'rbf', class_weight = class_weights, random_state = 42, C = 10, degree = 3, gamma = 'scale', tol = 0.001)),
        ('SVM Sigmoid', svm.SVC(max_iter = 10000, kernel = 'sigmoid', class_weight = class_weights, random_state = 42, C = 1000, degree = 3, gamma = 'auto', tol = 0.001)),
    ]

    names = []
    result = []
    metrics = []
    tags = []

    # Stratified folds keep both classes in every fold, which ROC AUC needs,
    # and match the stratified split above.
    kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

    for name, model in models:
        #cross validation results (AUC)
        cv_results = cross_val_score(model, X_fold, Y_fold, cv = kfold, scoring = 'roc_auc')
        result.append(cv_results)
        names.append(name)
        print("{0}: {1} ({2})".format(name, cv_results.mean(), cv_results.std()))

        ##final model on test data
        model.fit(X_fold, Y_fold)
        Y_pred = model.predict(X_test)

        metric, tags = evaluate(Y_test, Y_pred)
        metric.append(cv_results.mean())
        tags.append("kfold_AUC")
        metrics.append(metric)

    # Return only after every model has been processed (the return used to
    # sit inside the loop and ended it after the first classifier).
    return names, result, metrics, tags
  

## openSMILE

### RFECV 3gp Type1 and Type2

In [None]:
# Labels: keep the rows whose "format" is 3gp and take the "lossTaste_daily" column.
Y = pd.read_csv("../Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
Y = Y.loc[Y["format"] == "3gp", "lossTaste_daily"]

# Features: read the 3gp feature table and drop the saved index column.
df = pd.read_csv("../openSMILE/3gp/type1_type2/rfe_cv_3gp_type1_type2.csv").drop(columns=["Unnamed: 0"])
X = np.array(df)

# Scale the features and compute the class weights used by the models.
X_norm, class_weights = prepare_data(X, Y, delete_equal=1, weighted=1)

In [None]:
# Cross-validate and test every classifier on the 3gp features.
names, result, metrics, tags = ml_models(X=X_norm, Y=Y, class_weights=class_weights)

In [None]:
# One row per model. Columns carry the metric names returned by ml_models and
# the first column identifies the model, so the CSV is readable on its own.
df = pd.DataFrame(metrics, columns = tags)
df.insert(0, "model", names)
df.to_csv("../openSMILE/results/rfecv_3gp_type1_type2_openSMILE.csv", index = False)

### RFECV m4a Type1 and Type2

In [None]:
# NOTE(review): both inputs are read from absolute /content/drive paths, while
# the 3gp cell and the results cell use relative ../ paths -- confirm the
# intended base directory.

# Labels: keep the rows whose "format" is m4a and take the "lossTaste_daily" column.
Y = pd.read_csv("/content/drive/MyDrive/LIH/Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
Y = Y.loc[Y["format"] == "m4a", "lossTaste_daily"]

# Features: read the m4a feature table and drop the saved index column.
df = pd.read_csv("/content/drive/MyDrive/LIH/Scripts/openSMILE/m4a/type1_type2/rfe_cv_m4a_type1_type2.csv").drop(columns=["Unnamed: 0"])
X = np.array(df)

# Scale the features and compute the class weights used by the models.
X_norm, class_weights = prepare_data(X, Y, delete_equal=1, weighted=1)

In [None]:
# Cross-validate and test every classifier on the m4a features.
names, result, metrics, tags = ml_models(X=X_norm, Y=Y, class_weights=class_weights)

In [None]:
# One row per model. Columns carry the metric names returned by ml_models and
# the first column identifies the model, so the CSV is readable on its own.
df = pd.DataFrame(metrics, columns = tags)
df.insert(0, "model", names)
df.to_csv("../openSMILE/results/rfecv_m4a_type1_type2_openSMILE.csv", index = False)

## Fine-tuning

This section provides a fine-tuner based on grid search: for each algorithm, it varies the hyperparameters and keeps the combination that maximises ROC AUC.

In [None]:
def fine_tune(classifier, parameters, X_norm, Y, scoring = 'roc_auc', cv = 10, n_jobs = -1, verbose = 2):
    """Run a grid search over ``parameters`` for ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Estimator handed to ``GridSearchCV``.
    parameters : dict or list of dict
        Parameter grid to explore.
    X_norm : array-like
        Training features.
    Y : array-like
        Training labels.
    scoring : str, default 'roc_auc'
        Metric used to rank the parameter combinations.
    cv : int or CV splitter, default 10
        Cross-validation strategy forwarded to ``GridSearchCV``.
    n_jobs : int, default -1
        Parallelism forwarded to ``GridSearchCV``.
    verbose : int, default 2
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object; read ``best_params_`` from it.
    """
    grid_search = GridSearchCV(estimator = classifier,
                               param_grid = parameters,
                               scoring = scoring,
                               n_jobs = n_jobs,
                               cv = cv,
                               verbose = verbose)

    # fit() returns the search object itself.
    return grid_search.fit(X_norm, Y)

In [None]:
#change formats based on which data to use

formats = "3gp"
#formats = "m4a"

# NOTE(review): the labels come from an absolute /content/drive path while the
# features below use a relative one -- confirm the intended base directory.
Y = pd.read_csv("/content/drive/MyDrive/LIH/Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
Y = Y[Y["format"]==formats]["lossTaste_daily"]

# Same file layout as the per-format cells above: rfe_cv_<format>_type1_type2.csv.
# The previous concatenation inserted a literal double quote and repeated the
# format in the file name.
features_path = "../openSMILE/{0}/type1_type2/rfe_cv_{0}_type1_type2.csv".format(formats)
df = pd.read_csv(features_path)
df = df.drop(columns = ["Unnamed: 0"])

X = np.array(df)

X_all, class_weights = prepare_data(X, Y, delete_equal = 1, weighted = 1)

# Only the training part (X_norm, Y_fold) is used for the grid searches below.
X_norm, X_test, Y_fold, Y_test = train_test_split(X_all, Y, test_size = 0.2, random_state = 42, stratify = Y)


In [None]:
# Grid search for k-nearest neighbours on the training part of the split.
classifier = KNeighborsClassifier()
parameters = {
    'n_neighbors': [3, 5, 10, 15, 20, 25, 30, 40, 50],
    'weights': ["uniform", "distance"],
    'algorithm': ["auto", "ball_tree", "kd_tree", "brute"],
    'p': [1, 2],
    'metric': ["euclidean", "manhattan", "chebyshev"],
}

grid = fine_tune(classifier=classifier, parameters=parameters, X_norm=X_norm, Y=Y_fold)
print(grid.best_params_)

In [None]:
# Grid search for the random forest on the training part of the split.
classifier = RandomForestClassifier(class_weight = class_weights, random_state = 42)
# max_depth must contain the None object (unlimited depth), not the string
# 'None', which is not a valid depth.
parameters = {'criterion': ["gini","entropy"],'n_estimators': [25, 100, 150, 200, 300], 'max_depth': [1,3,5,10, None], 'min_samples_split': [2, 4, 8, 10, 50, 100]}

grid = fine_tune(classifier, parameters, X_norm, Y_fold)
print(grid.best_params_)


In [None]:
# Grid search for the linear-kernel SVM on the training part of the split.
classifier = svm.SVC(class_weight=class_weights, max_iter=10000)
parameters = {
    'C': [0.1, 1, 10, 100],
    'kernel': ["linear"],
    'degree': [3, 4, 5],
    'tol': [1e-3, 1e-4],
}

grid = fine_tune(classifier=classifier, parameters=parameters, X_norm=X_norm, Y=Y_fold)
# Last expression of the cell: shows the best combination found.
grid.best_params_

In [None]:
# Grid search for the RBF-kernel SVM on the training part of the split.
# NOTE(review): max_iter is 1000 here, while ml_models builds the RBF SVM
# with max_iter = 10000 -- confirm the difference is intentional.
classifier = svm.SVC(kernel="rbf", class_weight=class_weights, max_iter=1000)
parameters = {
    'C': [1, 10, 100, 1000],
    'gamma': ["scale", "auto"],
    'degree': [3, 4, 5],
    'tol': [1e-3, 1e-4],
}

grid = fine_tune(classifier=classifier, parameters=parameters, X_norm=X_norm, Y=Y_fold)
# Last expression of the cell: shows the best combination found.
grid.best_params_

In [None]:
# Grid search for the sigmoid-kernel SVM on the training part of the split.
# NOTE(review): max_iter is 1000 here, while ml_models builds the sigmoid SVM
# with max_iter = 10000 -- confirm the difference is intentional.
classifier = svm.SVC(kernel="sigmoid", class_weight=class_weights, max_iter=1000)
parameters = {
    'C': [10, 100, 1000],
    'gamma': ["scale", "auto"],
    'degree': [3, 4],
    'tol': [1e-3, 1e-4],
}

grid = fine_tune(classifier=classifier, parameters=parameters, X_norm=X_norm, Y=Y_fold)
# Last expression of the cell: shows the best combination found.
grid.best_params_