# Classification

In [1]:
import pandas as pd
import numpy as np
np.random.seed(123)

#Upload CSV files
scaled_data = pd.read_csv("Data/scaled_data.csv")
extracted_data = pd.read_csv("Data/df_extracted_features.csv")

# Concatenate all data in a single dataframe:
all_data = pd.concat([scaled_data, extracted_data], axis=1)

In [2]:
# Import generated and selected variables: 
import sys
sys.path.append("Data/") 
from feature_extraction import pca_features, mfa_features, tsne_features
from feature_selection import mrmr_features, rfe_features, ga_features

In [3]:
# Define sets of variables to be evaluated
all_features = [mrmr_features, rfe_features, ga_features, pca_features, mfa_features, tsne_features]
all_names = ["mRMR_Features", "RFE_Features","GA_Features" ,"PCA_Features", "MFA_Features", "tSNE_Features"]
Class = scaled_data.columns[754]

In [4]:
# Define variables for testing the code
feat1 = [pca_features, mfa_features]
names1 = ["PCA_Features", "MFA_Features"]

In [5]:
def classification(data, features, features_names, classifier, param_grid=None):
    '''
    Function to train and evaluate different datasets using a specified classifier.
    Evaluation uses Leave-One-Group-Out cross-validation method.
    If a param_grid is supplied, it performs hyperparameter tuning to maximize the MCC score.
    Returns a dataframe with the best results for each dataset, providing accuracy, F1-score, Kappa and MCC values.
    '''

    grup = data["id"]
    Class = data.iloc[:, 754]
    results_dict = {}
    
    # Leave-One-Group-Out
    logo = LeaveOneGroupOut()

    # Evaluate each set of features
    for num,feature_set in enumerate(features):
        sel_features = data[feature_set]
        results = []        
        
        
        # List of predictions and labels
        pred = []
        labels = []

 
        # Cross-validation LOGO
        for train_idx, test_idx in logo.split(sel_features, Class, groups=grup):
            X_train, X_test = sel_features.iloc[train_idx], sel_features.iloc[test_idx]
            y_train, y_test = Class.iloc[train_idx], Class.iloc[test_idx]

            # Training and optimizing with grid_search
            if param_grid is not None:               
                mcc_score = make_scorer(matthews_corrcoef)
                grid_search = GridSearchCV(classifier, param_grid, scoring= mcc_score,  n_jobs=-1)
                grid_search.fit(X_train, y_train)            

                # Predictions
                predictions = grid_search.best_estimator_.predict(X_test)

            # Training when no param_grid defined    
            else:
                classifier.fit(X_train, y_train)
                predictions = classifier.predict(X_test)

                
            pred.extend(predictions)
            labels.extend(y_test)

        # Voting final predictions
        final_pred = []
        for i in range(0, len(labels), 3):  # 3 observations per subject
            patient_preds = pred[i:i+3]
            final = mode(patient_preds).mode.item()
            final_pred.append(final)

        # Metrics calculation
        accuracy = accuracy_score(labels[::3], final_pred)
        f1 = f1_score(labels[::3], final_pred, average='weighted')
        kappa = cohen_kappa_score(labels[::3], final_pred)
        mcc = matthews_corrcoef(labels[::3], final_pred)

        if param_grid is not None:
            results.append({
                'accuracy': accuracy,
                'F1': f1,
                'Kappa': kappa,
                'MCC': mcc,
                'Best_Param': grid_search.best_params_ 
            })
        
        else:
            results.append({
                'accuracy': accuracy,
                'F1': f1,
                'Kappa': kappa,
                'MCC': mcc
            })

        df = pd.DataFrame(results)
        results_dict[features_names[num]] = df
        
        results_df = pd.concat(results_dict.values(), keys=results_dict.keys(), names=['Feature_Type', "Index"])
        results_df.reset_index(level='Feature_Type', inplace=False)
    return(results_df)

## k-NN

In [6]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, matthews_corrcoef
from scipy.stats import mode
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [7]:
knn = KNeighborsClassifier()

param_grid_knn = {
            'n_neighbors': [1,3]
        }
classification(all_data , feat1, names1, knn, param_grid_knn)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PCA_Features,0,0.77381,0.766391,0.367225,0.369994,{'n_neighbors': 1}
MFA_Features,0,0.793651,0.779755,0.392884,0.404054,{'n_neighbors': 3}


## Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

gnb = GaussianNB()

classification(all_data , feat1, names1, gnb)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PCA_Features,0,0.805556,0.794871,0.437602,0.446178
MFA_Features,0,0.769841,0.74795,0.298656,0.31461


Si intento utiltizar priors (indicant els pesos de cada classe inicialment obtinc pitjors resultats. També si intento utiltizar el var_smoothing.

### Logistic regression

In [9]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=200) #increase max number of iterations

classification(all_data , feat1, names1, lr)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PCA_Features,0,0.84127,0.833171,0.543478,0.552773
MFA_Features,0,0.789683,0.77271,0.370356,0.385008


### SVM

In [10]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

param_grid_svm = {
    'C': [0.1, 0.5, 1,2,5,10],
    'gamma': ['scale', 'auto']
}

svm_l = SVC(kernel = "linear", class_weight="balanced")
svm_r = SVC(kernel = "rbf", class_weight="balanced")

In [11]:
# Linear
classification(all_data , feat1, names1, svm_l, param_grid_svm)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PCA_Features,0,0.789683,0.797678,0.494474,0.5028,"{'C': 0.1, 'gamma': 'scale'}"
MFA_Features,0,0.710317,0.72773,0.368876,0.394491,"{'C': 0.5, 'gamma': 'scale'}"


In [12]:
#RBF
classification(all_data , feat1, names1, svm_r, param_grid_svm)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PCA_Features,0,0.809524,0.807454,0.486762,0.487208,"{'C': 10, 'gamma': 'scale'}"
MFA_Features,0,0.68254,0.698916,0.274403,0.285162,"{'C': 0.1, 'gamma': 'scale'}"


### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

param_grid_rf = {'n_estimators': [20,50,100,200]}

rf = RandomForestClassifier(class_weight="balanced", n_jobs=-1) 

In [14]:
classification(all_data , feat1, names1, rf, param_grid_rf)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,F1,Kappa,MCC,Best_Param
Feature_Type,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PCA_Features,0,0.829365,0.804522,0.450619,0.503795,{'n_estimators': 200}
MFA_Features,0,0.805556,0.784038,0.396717,0.424711,{'n_estimators': 200}
