In [1]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.utils import class_weight
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that anything drawing from it
# is repeatable; the scikit-learn calls below also pass random_state=42 explicitly.
np.random.seed(42)

In [None]:
def prepare_data(X, Y, delete_equal, weighted):
    """Min-max scale the features and build the class-weight mapping.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. A ``MinMaxScaler`` is fitted on ``X`` itself and used
        to rescale every column.
    Y : array-like of shape (n_samples,)
        Binary labels coded as 0 / 1; only read when ``weighted`` is truthy.
    delete_equal : object
        Unused. Kept in the signature so existing calls keep working.
    weighted : bool or int
        Truthy: weight class 1 relative to class 0 so both classes contribute
        equally. Falsy: both classes get weight 1.

    Returns
    -------
    tuple
        ``(X_norm, class_weights)`` where ``X_norm`` is a ``pandas.DataFrame``
        holding the scaled features (default integer index and column labels)
        and ``class_weights`` is a ``{0: ..., 1: ...}`` dict suitable for an
        estimator's ``class_weight`` argument.
    """
    # Normalize data.
    # NOTE: the fitted scaler is local and not returned, so the exact same
    # transform cannot be re-applied to held-out data from outside this function.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(X)
    X_norm = pd.DataFrame(x_scaled)

    # Define class weights to balance the classes 50/50.
    if weighted:
        # `classes` and `y` are keyword-only in recent scikit-learn releases;
        # passing them by name works on old and new versions alike.
        weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.array([0, 1]),
            y=Y,
        )
        # Express the weights relative to class 0 (class 0 stays at 1).
        class_weights = {0: 1, 1: weights[1] / weights[0]}
    else:
        class_weights = {0: 1, 1: 1}

    return (X_norm, class_weights)

## Format: m4a

In [None]:
# Outcome table: keep only the taste-loss label of the m4a recordings.
outcome_table = pd.read_csv("../Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
is_m4a = outcome_table["format"] == "m4a"
Y = outcome_table.loc[is_m4a, "lossTaste_daily"]

# OpenSMILE features for the m4a recordings, without the identifier column.
# NOTE(review): read_pickle unpickles arbitrary objects - only load trusted files.
df = pd.read_pickle("../Data/m4a_type1_type2_openSMILE.pkl")
df = df.drop(columns=["ID_short"])

# Keep the feature names: X below is rebuilt from the raw values and therefore
# carries positional column labels instead of the original names.
features = df.columns
X = pd.DataFrame(np.array(df))

# Stratified 80/20 split into a cross-validation part and a held-out test part.
# NOTE(review): assumes the rows of the pickle are in the same order as the
# filtered outcome rows - TODO confirm.
X_fold, X_test, Y_fold, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# Scale the cross-validation part and derive the balancing class weights.
X_norm, class_weights = prepare_data(X_fold, Y_fold, delete_equal=1, weighted=1)

In [None]:
# Recursive feature elimination with 10-fold cross-validation, scored by ROC AUC.
# 25 features are dropped per iteration until only `min_features_to_select` remain.
min_features_to_select = 10

estimator = LogisticRegression(
    class_weight=class_weights,
    max_iter=10000,
    random_state=42,
)
rfecv = RFECV(
    estimator=estimator,
    step=25,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    scoring="roc_auc",
    min_features_to_select=min_features_to_select,
    verbose=1,
)
selector = rfecv.fit(X_norm, Y_fold)

In [None]:
# Report how many features the cross-validated elimination retained.
print("Optimal number of features : {:d}".format(int(rfecv.n_features_)))

In [None]:
# Plot number of features VS. cross-validation scores

# Mean CV score per evaluated feature subset. `cv_results_` replaced
# `grid_scores_` in scikit-learn 1.0 (the latter was removed in 1.2), so read
# whichever attribute the installed version provides.
if hasattr(selector, "cv_results_"):
    mean_scores = np.asarray(selector.cv_results_["mean_test_score"])
else:
    mean_scores = np.asarray(selector.grid_scores_)

# RFECV starts from all features and drops `step` of them per iteration until
# `min_features_to_select` remain; scores are stored in ascending order of
# feature count. Rebuild that grid so each score is plotted at the number of
# features it was actually computed with.
n_total_features = selector.support_.shape[0]
rfe_step = int(selector.step)  # integer step, as configured when RFECV was built
steps_from_full = np.arange(len(mean_scores) - 1, -1, -1)
n_features_grid = np.maximum(
    n_total_features - rfe_step * steps_from_full,
    selector.min_features_to_select,
)

# Marker position taken from the fitted selector instead of hard-coded values,
# so the dashed lines still hit the optimum after a re-run.
best_n_features = selector.n_features_
best_score = mean_scores.max()

plt.figure()
plt.xlabel("Number of features")
plt.ylabel("Cross validation ROC_AUC score")
plt.plot(n_features_grid, mean_scores)
plt.vlines(best_n_features, mean_scores.min(), best_score, color = "r", linestyle = "--")
plt.hlines(best_score, n_features_grid[0], best_n_features, color = "r", linestyle = "--")

plt.savefig("../openSMILE/m4a_rfecv.png")
plt.show()


In [None]:
# Names of the columns RFECV kept, looked up through its boolean support mask.
rfe = features[selector.support_]

# Write the selected columns of the full (unsplit, unscaled) feature table to disk.
rfe_selected = df.loc[:, rfe.to_list()]
rfe_selected.to_csv("../openSMILE/m4a/type1_type2/rfe_cv_m4a_type1_type2.csv")

## Format: 3gp

In [None]:
# Outcome table: keep only the taste-loss label of the 3gp recordings.
outcome_table = pd.read_csv("../Data/prediCovid_taste_loss_dataset_B_trim_june.csv")
is_3gp = outcome_table["format"] == "3gp"
Y = outcome_table.loc[is_3gp, "lossTaste_daily"]

# OpenSMILE features for the 3gp recordings, without the identifier column.
# NOTE(review): read_pickle unpickles arbitrary objects - only load trusted files.
df = pd.read_pickle("../Data/3gp_type1_type2_openSMILE.pkl")
df = df.drop(columns=["ID_short"])

# Keep the feature names: X below is rebuilt from the raw values and therefore
# carries positional column labels instead of the original names.
features = df.columns
X = pd.DataFrame(np.array(df))

# Stratified 80/20 split into a cross-validation part and a held-out test part.
# NOTE(review): assumes the rows of the pickle are in the same order as the
# filtered outcome rows - TODO confirm.
X_fold, X_test, Y_fold, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# Scale the cross-validation part and derive the balancing class weights.
X_norm, class_weights = prepare_data(X_fold, Y_fold, delete_equal=1, weighted=1)

In [None]:
# Recursive feature elimination with 10-fold cross-validation, scored by ROC AUC.
# 25 features are dropped per iteration until only `min_features_to_select` remain.
min_features_to_select = 10

estimator = LogisticRegression(
    class_weight=class_weights,
    max_iter=10000,
    random_state=42,
)
rfecv = RFECV(
    estimator=estimator,
    step=25,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    scoring="roc_auc",
    min_features_to_select=min_features_to_select,
)
selector = rfecv.fit(X_norm, Y_fold)

In [None]:
# Report how many features the cross-validated elimination retained.
print("Optimal number of features : {:d}".format(int(rfecv.n_features_)))

In [None]:
# Plot number of features versus cross-validation scores

# Mean CV score per evaluated feature subset. `cv_results_` replaced
# `grid_scores_` in scikit-learn 1.0 (the latter was removed in 1.2), so read
# whichever attribute the installed version provides.
if hasattr(selector, "cv_results_"):
    mean_scores = np.asarray(selector.cv_results_["mean_test_score"])
else:
    mean_scores = np.asarray(selector.grid_scores_)

# RFECV starts from all features and drops `step` of them per iteration until
# `min_features_to_select` remain; scores are stored in ascending order of
# feature count. Rebuild that grid so each score is plotted at the number of
# features it was actually computed with.
n_total_features = selector.support_.shape[0]
rfe_step = int(selector.step)  # integer step, as configured when RFECV was built
steps_from_full = np.arange(len(mean_scores) - 1, -1, -1)
n_features_grid = np.maximum(
    n_total_features - rfe_step * steps_from_full,
    selector.min_features_to_select,
)

# Marker position taken from the fitted selector instead of hard-coded values,
# so the dashed lines still hit the optimum after a re-run.
best_n_features = selector.n_features_
best_score = mean_scores.max()

plt.figure()
plt.xlabel("Number of features")
plt.ylabel("Cross validation ROC_AUC score")
plt.plot(n_features_grid, mean_scores)
plt.vlines(best_n_features, mean_scores.min(), best_score, color = "r", linestyle = "--")
plt.hlines(best_score, n_features_grid[0], best_n_features, color = "r", linestyle = "--")

plt.savefig("../openSMILE/3gp_rfecv.png")

plt.show()

In [None]:
# Names of the columns RFECV kept, looked up through its boolean support mask.
rfe = features[selector.support_]

# Write the selected columns of the full (unsplit, unscaled) feature table to disk.
rfe_selected = df.loc[:, rfe.to_list()]
rfe_selected.to_csv("../openSMILE/3gp/type1_type2/rfe_cv_3gp_type1_type2.csv")