In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing, svm, datasets, metrics, model_selection
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process.kernels import RBF
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB

#seed NumPy's global random state so NumPy-based randomness is repeatable between runs
np.random.seed(42)

In [None]:
def prepare_data(X, Y, delete_equal, weighted):
    """Min-max scale the features and build a class-weight dictionary.

    Parameters
    ----------
    X : array-like, 2-D
        Raw feature matrix; every column is rescaled to the [0, 1] range.
    Y : array-like
        Labels, expected to contain the two classes 0 and 1.
    delete_equal : any
        Currently unused; kept so that existing call sites keep working.
    weighted : bool-like
        If truthy, class 1 is weighted by the ratio of the 'balanced'
        weights (weight of class 1 / weight of class 0) while class 0 keeps
        weight 1. If falsy, both classes get weight 1.

    Returns
    -------
    tuple
        ``(X_norm, class_weights)`` where ``X_norm`` is a DataFrame holding
        the scaled features (default integer index and column labels) and
        ``class_weights`` is a ``{0: ..., 1: ...}`` dict.
    """
    #normalize data
    # NOTE: the scaler is fitted on every row of X; if the caller splits into
    # train/test afterwards, the test rows have influenced the scaling.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(X)
    X_norm = pd.DataFrame(x_scaled)

    #define classweights to balance classes 50/50
    if weighted:
        # `classes` and `y` are passed by keyword: recent scikit-learn
        # releases reject them as positional arguments.
        weights = class_weight.compute_class_weight(
            class_weight = 'balanced',
            classes = np.array([0, 1]),
            y = Y,
        )
        class_weights = {0: 1, 1: weights[1] / weights[0]}
    else:
        class_weights = {0: 1, 1: 1}

    return (X_norm, class_weights)

In [None]:
def evaluate(X_test, Y_test, Y_pred, model_name, model = None):
    """Print and plot test-set performance of a fitted binary classifier.

    Prints the classification report, draws the confusion matrix (titled
    with ``model_name``), draws the ROC curve of the estimator on the test
    set and prints the ROC AUC of ``Y_pred`` rounded to two decimals.

    Parameters
    ----------
    X_test : array-like
        Test features, used only to draw the ROC curve from the estimator.
    Y_test : array-like
        True test labels.
    Y_pred : array-like
        Predictions compared against ``Y_test`` in the report, the confusion
        matrix and the printed AUC.
    model_name : str
        Title placed on the confusion-matrix plot.
    model : fitted estimator, optional
        Estimator used for the ROC curve. When omitted, the notebook-global
        variable ``model`` is used, which is what this function always did
        implicitly; passing it explicitly avoids that hidden dependency.

    Raises
    ------
    NameError
        If no estimator is passed and no global ``model`` is defined.
    """
    if model is None:
        # Backward compatibility: older call sites rely on a global `model`.
        model = globals().get("model")
        if model is None:
            raise NameError("evaluate() needs a fitted estimator: pass model=... or define a global `model`")

    print(classification_report(Y_test, Y_pred))

    cm = confusion_matrix(Y_test, Y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = np.array([0, 1]))
    disp = disp.plot(include_values = True, cmap = plt.cm.Blues, xticks_rotation = 'vertical', values_format = '.5g')
    disp.ax_.set_title(model_name)

    #important: the auc on graph is not the weighted one
    # (the graph is computed from the estimator's scores on X_test, the
    # printed value below from Y_pred)
    roc_display = getattr(metrics, "RocCurveDisplay", None)
    if roc_display is not None and hasattr(roc_display, "from_estimator"):
        # plot_roc_curve was removed from recent scikit-learn releases;
        # from_estimator is its replacement.
        roc_display.from_estimator(model, X_test, Y_test)
    else:
        # Older scikit-learn without RocCurveDisplay.from_estimator.
        metrics.plot_roc_curve(model, X_test, Y_test)

    # NOTE(review): when Y_pred holds hard 0/1 labels instead of scores, this
    # AUC is computed from a single operating point — confirm that is intended.
    print(round(roc_auc_score(Y_test, Y_pred, average = "weighted"), 2))

# 3gp

## Type1 and Type2

In [None]:
#labels: taste-loss column of the rows whose recording format is 3gp
Y = (
    pd.read_csv("../Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
    .loc[lambda frame: frame["format"] == "3gp", "lossTaste_daily"]
)

#features: the rfe_cv csv for 3gp, without the "Unnamed: 0" column
df = pd.read_csv("../openSMILE/3gp/type1_type2/rfe_cv_3gp_type1_type2.csv").drop(columns = ["Unnamed: 0"])
X = df.to_numpy()

#normalize data and get the class weights
# NOTE(review): X and Y come from two different files and are matched by row
# position only — confirm both files list the 3gp samples in the same order.
X_norm, class_weights = prepare_data(X, Y, delete_equal = 1, weighted = 1)

In [None]:
#hold out 20% of the samples as test set, stratified on the label
X_fold, X_test, Y_fold, Y_test = train_test_split(
    X_norm,
    Y,
    test_size = 0.2,
    random_state = 42,
    stratify = Y,
)

#best model for 3gp
# NOTE(review): p = 1 is presumably ignored because the metric is given
# explicitly as 'euclidean' — confirm against the scikit-learn version in use.
model = KNeighborsClassifier(
    algorithm = 'auto',
    metric = 'euclidean',
    n_neighbors = 3,
    p = 1,
    weights = 'uniform',
).fit(X_fold, Y_fold)

#per-class probabilities for the held-out samples
Y_proba = model.predict_proba(X_test)

In [None]:
#turn the class probabilities into hard class labels to calculate performance
# argmax gives the index of the most probable column, not the label itself;
# mapping it through model.classes_ yields the real label whatever the label
# values are (for classes [0, 1] the result is unchanged).
Y_hot = model.classes_[np.argmax(Y_proba, axis = 1)]

#important: the auc on graph is not the weighted one
evaluate(X_test, Y_test, Y_hot, "3gp Type1 and Type2")

In [None]:
#second column of the predicted probabilities
# (presumably the probability of label 1 — assumes model.classes_ is [0, 1])
proba = Y_proba[:, 1].tolist()

#true test label next to its predicted probability, indexed like Y_test
df_proba = pd.DataFrame(
    {
        "Taste and Smell loss": Y_test,
        "Predicted Probability": proba,
    }
)

In [None]:
import seaborn as sns
from scipy import stats

#box plot of the predicted probability for each true label
# Drawn on its own figure so the saved image holds only this plot, and with
# x passed by keyword: recent seaborn accepts only `data` positionally.
fig, ax = plt.subplots()
sns.boxplot(
    x = df_proba["Taste and Smell loss"],
    y = df_proba["Predicted Probability"],
    palette = {0: "lightskyblue", 1: "cornflowerblue"},
    ax = ax,
)
fig.savefig("../openSMILE/3gp_biomarkers.png")

#t-test of the predicted probability between the two true label groups
# The groups are defined by the true label (the same grouping as the box
# plot). Splitting the sample on the predicted probability itself and then
# testing that same probability would be significant by construction.
proba_without_loss = df_proba.loc[df_proba["Taste and Smell loss"] == 0, "Predicted Probability"]
proba_with_loss = df_proba.loc[df_proba["Taste and Smell loss"] == 1, "Predicted Probability"]
# prob_F holds the t statistic returned by ttest_ind (name kept as is).
prob_F, prob_p = stats.ttest_ind(proba_without_loss, proba_with_loss)
print(prob_p)

df_proba.to_csv("../openSMILE/probas/3gp_rfecv_type1_type2.csv", index = False)

# m4a

## Type1 and Type2

In [None]:
#labels: taste-loss column of the rows whose recording format is m4a
Y = (
    pd.read_csv("../Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
    .loc[lambda frame: frame["format"] == "m4a", "lossTaste_daily"]
)

#features: the rfe_cv csv for m4a, without the "Unnamed: 0" column
df = pd.read_csv("../openSMILE/m4a/type1_type2/rfe_cv_m4a_type1_type2.csv").drop(columns = ["Unnamed: 0"])
X = df.to_numpy()

#normalize data and get the class weights
# NOTE(review): X and Y come from two different files and are matched by row
# position only — confirm both files list the m4a samples in the same order.
X_norm, class_weights = prepare_data(X, Y, delete_equal = 1, weighted = 1)

In [None]:
#hold out 20% of the samples as test set, stratified on the label
X_fold, X_test, Y_fold, Y_test = train_test_split(
    X_norm,
    Y,
    test_size = 0.2,
    random_state = 42,
    stratify = Y,
)

#best model for m4a
model = KNeighborsClassifier(
    algorithm = 'auto',
    metric = 'manhattan',
    n_neighbors = 3,
    p = 1,
    weights = 'uniform',
).fit(X_fold, Y_fold)

#per-class probabilities for the held-out samples
Y_proba = model.predict_proba(X_test)

In [None]:
#turn the class probabilities into hard class labels to calculate performance
# argmax gives the index of the most probable column, not the label itself;
# mapping it through model.classes_ yields the real label whatever the label
# values are (for classes [0, 1] the result is unchanged).
Y_hot = model.classes_[np.argmax(Y_proba, axis = 1)]

#important: the auc on graph is not the weighted one
evaluate(X_test, Y_test, Y_hot, "m4a Type1 and Type2")

In [None]:
#second column of the predicted probabilities
# (presumably the probability of label 1 — assumes model.classes_ is [0, 1])
proba = Y_proba[:, 1].tolist()

#true test label next to its predicted probability, indexed like Y_test
df_proba = pd.DataFrame(
    {
        "Taste and Smell loss": Y_test,
        "Predicted Probability": proba,
    }
)

In [None]:
import seaborn as sns
from scipy import stats

#box plot of the predicted probability for each true label
# Drawn on its own figure so the saved image holds only this plot, and with
# x passed by keyword: recent seaborn accepts only `data` positionally.
fig, ax = plt.subplots()
sns.boxplot(
    x = df_proba["Taste and Smell loss"],
    y = df_proba["Predicted Probability"],
    palette = {0: "lightskyblue", 1: "cornflowerblue"},
    ax = ax,
)
fig.savefig("../openSMILE/m4a_biomarkers.png")

#t-test of the predicted probability between the two true label groups
# The groups are defined by the true label (the same grouping as the box
# plot). Splitting the sample on the predicted probability itself and then
# testing that same probability would be significant by construction.
proba_without_loss = df_proba.loc[df_proba["Taste and Smell loss"] == 0, "Predicted Probability"]
proba_with_loss = df_proba.loc[df_proba["Taste and Smell loss"] == 1, "Predicted Probability"]
# prob_F holds the t statistic returned by ttest_ind (name kept as is).
prob_F, prob_p = stats.ttest_ind(proba_without_loss, proba_with_loss)
print(prob_p)

df_proba.to_csv("../openSMILE/probas/m4a_rfecv_type1_type2.csv", index = False)