# Classification

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import StratifiedGroupKFold

from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, matthews_corrcoef
from scipy.stats import mode
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from scikeras.wrappers import KerasClassifier
from keras.metrics import AUC

import random
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

In [3]:
#Upload CSV files
scaled_data = pd.read_csv("Data/scaled_data.csv")
extracted_data = pd.read_csv("Data/df_extracted_features.csv")

# Concatenate all data in a single dataframe:
all_data = pd.concat([scaled_data, extracted_data], axis=1)

In [4]:
# Import generated and selected variables: 
import sys
sys.path.append("Data/") 
from feature_extraction import pca_features, mfa_features, tsne_features
from feature_selection import mrmr_features, rfe_features, ga_features

In [5]:
# Define sets of variables to be evaluated
all_features = [mrmr_features, rfe_features, ga_features, pca_features, mfa_features, tsne_features]
all_names = ["mRMR_Features", "RFE_Features","GA_Features" ,"PCA_Features", "MFA_Features", "tSNE_Features"]
Class = scaled_data.columns[754]

In [6]:
def classification(data, features, features_names, classifier, param_grid=None):
    '''
    Function to train and evaluate different datasets using a specified classifier.
    Evaluation uses Leave-One-Group-Out cross-validation method.
    If a param_grid is supplied, it performs hyperparameter tuning to maximize the MCC score.
    Returns a dataframe with the best results for each dataset, providing accuracy, F1-score, Kappa and MCC values.
    '''

    grup = data["id"]
    Class = data.iloc[:, 754]
    results_dict = {}
    
    # Leave-One-Group-Out
    logo = LeaveOneGroupOut()

    # Evaluate each set of features
    for num,feature_set in enumerate(features):
        sel_features = data[feature_set]
        results = []        
        
        
        # List of predictions and labels
        pred = []
        labels = []

 
        # Cross-validation LOGO
        for train_idx, test_idx in logo.split(sel_features, Class, groups=grup):
            X_train, X_test = sel_features.iloc[train_idx], sel_features.iloc[test_idx]
            y_train, y_test = Class.iloc[train_idx], Class.iloc[test_idx]

            # Training and optimizing with grid_search
            if param_grid is not None:               
                mcc_score = make_scorer(matthews_corrcoef)
                grid_search = GridSearchCV(classifier, param_grid, scoring= mcc_score,  n_jobs=-1)
                grid_search.fit(X_train, y_train)            

                # Predictions
                predictions = grid_search.best_estimator_.predict(X_test)

            # Training when no param_grid defined    
            else:
                classifier.fit(X_train, y_train)
                predictions = classifier.predict(X_test)

                
            pred.extend(predictions)
            labels.extend(y_test)

        # Voting final predictions
        final_pred = []
        for i in range(0, len(labels), 3):  # 3 observations per subject
            patient_preds = pred[i:i+3]
            final = mode(patient_preds).mode.item()
            final_pred.append(final)

        # Metrics calculation
        accuracy = accuracy_score(labels[::3], final_pred)
        f1 = f1_score(labels[::3], final_pred, average='weighted')
        kappa = cohen_kappa_score(labels[::3], final_pred)
        mcc = matthews_corrcoef(labels[::3], final_pred)

        if param_grid is not None:
            results.append({
                'accuracy': accuracy,
                'F1': f1,
                'Kappa': kappa,
                'MCC': mcc,
                'Best_Param': grid_search.best_params_ 
            })
        
        else:
            results.append({
                'accuracy': accuracy,
                'F1': f1,
                'Kappa': kappa,
                'MCC': mcc
            })

        df = pd.DataFrame(results)
        results_dict[features_names[num]] = df
        
        results_df = pd.concat(results_dict.values(), keys=results_dict.keys(), names=['Feature_Type', "Index"])
        results_df.reset_index(level='Feature_Type', inplace=False)
    return(results_df)

## k-NN

In [7]:
knn = KNeighborsClassifier()

param_grid_knn = {
            'n_neighbors': [1,3,5,7,11,15]
        }
knn_results = classification(all_data , all_features, all_names, knn, param_grid_knn)

In [8]:
knn_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mRMR_Features,0,0.857143,0.847523,0.579689,0.59617,{'n_neighbors': 5}
RFE_Features,0,0.825397,0.812103,0.480315,0.4974,{'n_neighbors': 1}
GA_Features,0,0.813492,0.786338,0.399513,0.446659,{'n_neighbors': 15}
PCA_Features,0,0.81746,0.806695,0.469036,0.4795,{'n_neighbors': 11}
MFA_Features,0,0.793651,0.771913,0.363636,0.387069,{'n_neighbors': 15}
tSNE_Features,0,0.75,0.743468,0.308175,0.309691,{'n_neighbors': 11}


## Naive Bayes

In [9]:
nb = GaussianNB()

nb_results = classification(all_data , all_features, all_names, nb)

In [10]:
nb_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mRMR_Features,0,0.837302,0.835993,0.563903,0.564191
RFE_Features,0,0.765873,0.776077,0.447615,0.458368
GA_Features,0,0.797619,0.770633,0.356499,0.392107
PCA_Features,0,0.805556,0.794871,0.437602,0.446178
MFA_Features,0,0.769841,0.74795,0.298656,0.31461
tSNE_Features,0,0.765873,0.746979,0.299076,0.310907


Si intento utiltizar priors (indicant els pesos de cada classe inicialment obtinc pitjors resultats. També si intento utiltizar el var_smoothing.

### Logistic regression

In [11]:
lr = LogisticRegression(max_iter=200) #increase max number of iterations

lr_results = classification(all_data , all_features, all_names, lr)

In [12]:
lr_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mRMR_Features,0,0.876984,0.868166,0.635974,0.65624
RFE_Features,0,0.876984,0.868166,0.635974,0.65624
GA_Features,0,0.825397,0.807003,0.461538,0.49128
PCA_Features,0,0.84127,0.833171,0.543478,0.552773
MFA_Features,0,0.789683,0.77271,0.370356,0.385008
tSNE_Features,0,0.761905,0.725723,0.228571,0.257855


### SVM

In [13]:
param_grid_svm = {
    'C': [0.01, 0.1, 0.5, 1,10,100],
    'gamma': ['scale', 'auto']
}

svm_l = SVC(kernel = "linear", class_weight="balanced")
svm_r = SVC(kernel = "rbf", class_weight="balanced")

In [14]:
# Linear
svm_l_results = classification(all_data , all_features, all_names, svm_l, param_grid_svm)

In [15]:
svm_l_results 

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mRMR_Features,0,0.801587,0.808201,0.516278,0.522576,"{'C': 100, 'gamma': 'scale'}"
RFE_Features,0,0.880952,0.882652,0.69526,0.696555,"{'C': 10, 'gamma': 'scale'}"
GA_Features,0,0.75,0.764011,0.441065,0.464002,"{'C': 1, 'gamma': 'scale'}"
PCA_Features,0,0.789683,0.797678,0.494474,0.5028,"{'C': 0.1, 'gamma': 'scale'}"
MFA_Features,0,0.710317,0.72773,0.368876,0.394491,"{'C': 0.5, 'gamma': 'scale'}"
tSNE_Features,0,0.662698,0.68333,0.271329,0.291884,"{'C': 100, 'gamma': 'scale'}"


In [16]:
#RBF
svm_r_results = classification(all_data , all_features, all_names, svm_r, param_grid_svm)

In [17]:
svm_r_results 

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mRMR_Features,0,0.853175,0.856197,0.629706,0.632276,"{'C': 10, 'gamma': 'scale'}"
RFE_Features,0,0.876984,0.879516,0.689754,0.692569,"{'C': 1, 'gamma': 'scale'}"
GA_Features,0,0.72619,0.741535,0.387833,0.408001,"{'C': 10, 'gamma': 'auto'}"
PCA_Features,0,0.805556,0.801708,0.467667,0.469025,"{'C': 10, 'gamma': 'scale'}"
MFA_Features,0,0.690476,0.706443,0.292542,0.304013,"{'C': 0.1, 'gamma': 'scale'}"
tSNE_Features,0,0.710317,0.722942,0.31654,0.324145,"{'C': 0.1, 'gamma': 'scale'}"


### Random Forest

In [18]:
param_grid_rf = {'n_estimators': [20,50,100,200]}

rf = RandomForestClassifier(class_weight="balanced", n_jobs=-1) 

rf_results = classification(all_data , all_features, all_names, rf, param_grid_rf)

In [19]:
rf_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mRMR_Features,0,0.837302,0.819297,0.495212,0.530156,{'n_estimators': 200}
RFE_Features,0,0.845238,0.828112,0.519836,0.556518,{'n_estimators': 200}
GA_Features,0,0.829365,0.804522,0.450619,0.503795,{'n_estimators': 200}
PCA_Features,0,0.813492,0.781392,0.384023,0.446797,{'n_estimators': 50}
MFA_Features,0,0.789683,0.764085,0.339466,0.368044,{'n_estimators': 200}
tSNE_Features,0,0.797619,0.783112,0.401118,0.413901,{'n_estimators': 200}


### Artificial Neural Network

In [20]:
# Define neural network architecture:

def neural_network(neurons):
    tf.random.set_seed(123)
    nn = Sequential()   
    nn.add(Dense(neurons, activation="relu", input_shape=(sel_features.shape[1],)))  
    nn.add(Dropout(0.2))
    nn.add(Dense(1, activation="sigmoid"))  # Binary output
    nn.compile(loss="binary_crossentropy", metrics=["AUC"])
    return nn


# Compile model
nn_model = KerasClassifier(model=neural_network, verbose=0, random_state=123)


# Param grid
param_grid = {"model__neurons": [8,16,32,64,128],
             "epochs":[25,50,75,100],
             "optimizer": ["adam", "sdg"]}  

In [28]:
grup = all_data["id"]
Class = all_data.iloc[:, 754]
results_dict = {}

# Stratified Group K Fold
sgkf = StratifiedGroupKFold(n_splits=10)

# Evaluate each set of features
for num,feature_set in enumerate(all_features):
    sel_features = all_data[feature_set]
    results = []        


    # List of predictions and labels
    pred = []
    labels = []


    # Cross-validation SGKF
    for train_idx, test_idx in sgkf.split(sel_features, Class, groups=grup):
        X_train, X_test = sel_features.iloc[train_idx], sel_features.iloc[test_idx]
        y_train, y_test = Class.iloc[train_idx], Class.iloc[test_idx]

               
        # Search for best params
        mcc_score = make_scorer(matthews_corrcoef)
        grid_search = GridSearchCV(nn_model, param_grid, scoring= mcc_score,  n_jobs=-1)
        grid_search.fit(X_train, y_train)         
        

        # Predictions
        predictions = grid_search.best_estimator_.predict(X_test)

        pred.extend(predictions)
        labels.extend(y_test)

    # Voting final predictions
    final_pred = []
    for i in range(0, len(labels), 3):  # 3 observations per subject
        patient_preds = pred[i:i+3]
        final = mode(patient_preds).mode.item()
        final_pred.append(final)

    # Metrics calculation
    accuracy = accuracy_score(labels[::3], final_pred)
    f1 = f1_score(labels[::3], final_pred, average='weighted')
    kappa = cohen_kappa_score(labels[::3], final_pred)
    mcc = matthews_corrcoef(labels[::3], final_pred)


    results.append({
        'accuracy': accuracy,
        'F1': f1,
        'Kappa': kappa,
        'MCC': mcc,
        'Best_Neurons': grid_search.best_params_['model__neurons'], 
        'Best_Optimizer': grid_search.best_params_['optimizer'],
        'Best_Epochs': grid_search.best_params_['epochs']
    })

    df = pd.DataFrame(results)
    results_dict[all_names[num]] = df

    nn_results = pd.concat(results_dict.values(), keys=results_dict.keys(), names=['Feature_Type', "Index"])
    nn_results.reset_index(level='Feature_Type', inplace=False)

In [29]:
nn_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Neurons,Best_Optimizer,Best_Epochs
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mRMR_Features,0,0.873016,0.866537,0.634783,0.645639,128,adam,100
RFE_Features,0,0.884921,0.881914,0.681541,0.684894,128,adam,100
GA_Features,0,0.837302,0.830824,0.539818,0.545658,64,adam,75
PCA_Features,0,0.849206,0.839052,0.556338,0.572155,16,adam,25
MFA_Features,0,0.777778,0.76644,0.36087,0.367041,64,adam,100
tSNE_Features,0,0.77381,0.755556,0.322836,0.335608,8,adam,50


In [30]:
knn_results.to_csv("Results/knn_results.csv", sep=";")
nb_results.to_csv("Results/nb_results.csv", sep=";")
lr_results.to_csv("Results/lr_results.csv", sep=";")
svm_l_results.to_csv("Results/svm_l_results.csv", sep=";")
svm_r_results.to_csv("Results/svm_r_results.csv", sep=";")
rf_results.to_csv("Results/rf_results.csv", sep=";")
nn_results.to_csv("Results/nn_results.csv", sep=";")