# Import libraries

In [1]:
# Import main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import library to plot trees
from sklearn import tree

# Import oversampling libraries
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import KMeansSMOTE

# Import undersampling libraries
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import OneSidedSelection

# Import cross-validation libraries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# Import confusion matrix libraries
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Import train/test split library
from sklearn.model_selection import train_test_split

# Import libraries to plot and quantify distributions
import seaborn as sns
from collections import Counter

# Activate seaborn's default plotting theme for the whole notebook
sns.set()

# Encoding data

In [2]:
def encodeData(X):
    """
    Encode the categorical columns of a DataFrame as numbers.

    Steps, in order:
      1. In "DIAG", "DIAGPROBA" and "CAUSEDCD_SLA" (when present), every value
         different from the reference category is replaced by "Autre".
      2. "SEX", "DIAG" and "CAUSEDCD_SLA" are mapped to 0/1 with DataFrame.replace.
      3. pd.get_dummies(drop_first=True) is applied to the result, which takes
         care of the columns that are still categorical (e.g. "DIAGPROBA").

    X: pandas DataFrame to encode; it is not modified, the work is done on a copy
    return: a new, encoded DataFrame
    """

    # Work on a copy: the .loc assignments below would otherwise overwrite
    # the caller's data, making the cell non re-runnable on the same frame
    X = X.copy()

    # Collapse each multi-valued column to "reference category" vs "Autre"
    reference_categories = {
        "DIAG": "SLA",
        "DIAGPROBA": "Forme certaine",
        "CAUSEDCD_SLA": "Insuffisance respiratoire",
    }
    for column, reference in reference_categories.items():
        if column in X.columns:
            X.loc[X[column] != reference, column] = "Autre"

    # Map binary variables to 0 or 1 (columns absent from X are ignored by replace)
    cleanup_nums = {
        "SEX": {"Masculin": 0, "Féminin": 1},
        "DIAG": {"SLA": 0, "Autre": 1},
        "CAUSEDCD_SLA": {"Insuffisance respiratoire":0, "Autre":1}
    }
    X = X.replace(cleanup_nums)

    # One-hot encode the remaining categorical columns
    X = pd.get_dummies(X, drop_first=True)

    return X

# Label categorical data

In [3]:
def labelData(X):
    """
    Relabel the values of a categorical column according to the column's name.

    Rules (selected by X.name):
      - "DIAG": everything except 'SLA' becomes "Autre"
      - "DIAGPROBA": everything except 'Forme certaine' becomes "Forme probable"
      - "CAUSEDCD_SLA": everything except 'Insuffisance respiratoire' becomes "Autre"
      - "DIAG_DCD_M": everything except 'M18' and 'M30' becomes None
      - "FIRSTSYMPTOM_DCD_M": everything except 'M18' and 'M42' becomes None
    Any other name leaves the values unchanged.

    X: pandas Series (its .name attribute selects the rule)
    return: relabelled copy of X; X itself is not modified
    """

    X = X.copy()

    # The rules are mutually exclusive because they are keyed on the series name
    if X.name == "DIAG":
        X[X != 'SLA'] = "Autre"
    elif X.name == "DIAGPROBA":
        X[X != 'Forme certaine'] = "Forme probable"
    elif X.name == "CAUSEDCD_SLA":
        X[X != 'Insuffisance respiratoire'] = "Autre"
    elif X.name == "DIAG_DCD_M":
        X[(X != 'M18') & (X != 'M30')] = None
    elif X.name == "FIRSTSYMPTOM_DCD_M":
        X[(X != 'M18') & (X != 'M42')] = None

    return X

# Binary Classifier Metrics

In [4]:
def computeRecall(cm):
    """
    Recall of the class stored in the second row of a 2x2 confusion matrix.

    cm: confusion matrix indexable as cm[row][col]; only row 1 is read
    return: cm[1][1] divided by the sum of row 1
    """
    missed, found = cm[1][0], cm[1][1]
    return found / (missed + found)

def computeFmeasure(accuracy, recall):
    """
    Harmonic mean of the two arguments: 2 * (accuracy * recall) / (accuracy + recall).
    The closer to 1, the better the model is.

    accuracy: value used in the precision slot of the F-measure formula
    recall: recall
    return: the combined score
    """
    product = accuracy * recall
    total = accuracy + recall
    return 2 * (product / total)

# Binary Classifier

In [5]:
def binaryClassifier(clf, X, y, test_size, n_repeats, plot_res):
    """
    Train and evaluate a binary classifier on repeated random train/test splits.

    For each repetition the data are split with a stratified train_test_split,
    the training part is resampled with oversampling() then undersampling(),
    clf is re-fitted on it and evaluated on the untouched test part.

    clf: estimator exposing fit / predict / score
    X: features
    y: binary target (also used to stratify the split)
    test_size: forwarded to train_test_split
    n_repeats: number of train/test repetitions
    plot_res: when truthy, plot the results of the last repetition
              (only for DecisionTreeClassifier and SVC estimators)
    return: seven lists holding one value per repetition:
            accuracies, recalls, Fmeasures,
            left_class_nb, right_class_nb   (row totals of the confusion matrix),
            left_class_fit, right_class_fit (diagonal / row total, per row)
    """

    # Metrics computed from the model, one entry per repetition
    accuracies = []
    recalls = []
    Fmeasures = []

    # Metrics computed from the confusion matrix, one entry per repetition
    left_class_nb = []
    right_class_nb = []
    left_class_fit = []
    right_class_fit = []

    # range(n_repeats), not range(1, n_repeats): the latter ran one repetition too few
    for _ in range(n_repeats):

        # Split dataset into train and test
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size)

        # Resample the training part only, so the test part keeps the original distribution
        X_train, y_train = oversampling(X_train, y_train)
        X_train, y_train = undersampling(X_train, y_train)

        # Train model and predict the test targets
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = clf.score(X_test, y_test)
        recall = computeRecall(cm)

        # NOTE(review): the F-measure is fed the accuracy, not the precision, so it
        # differs from the 'f1' score used in the cross-validation variant - confirm intent
        Fmeasure = computeFmeasure(accuracy, recall)

        accuracies.append(accuracy)
        recalls.append(recall)
        Fmeasures.append(Fmeasure)

        row0_total = cm[0][0] + cm[0][1]
        row1_total = cm[1][0] + cm[1][1]
        left_class_nb.append(row0_total)
        right_class_nb.append(row1_total)
        left_class_fit.append(cm[0][0] / row0_total)
        right_class_fit.append(cm[1][1] / row1_total)

    # Plot the last repetition; skipped when the loop never ran (nothing to plot).
    # `and` instead of `&`: a truthy non-bool plot_res (e.g. 2) made `True & 2` falsy.
    if plot_res and accuracies:
        # Local import: SVC is only needed here to recognise the estimator type
        from sklearn.svm import SVC

        # isinstance instead of comparing str(type(clf)) with a private module path,
        # which changes between scikit-learn releases
        if isinstance(clf, tree.DecisionTreeClassifier):
            plotDecisionTreeResults(cm, clf, X_train, y_train, X_test, y_test, accuracy, recall, Fmeasure)
        if isinstance(clf, SVC):
            plotSVCResults(cm, clf, X_train, y_train, X_test, y_test, accuracy, recall, Fmeasure)

    return accuracies, recalls, Fmeasures, left_class_nb, right_class_nb, left_class_fit, right_class_fit

# Cross-Validation Binary Classifier

In [6]:
def crossValidationBinaryClassifier(clf, X, y, test_size, n_repeats):
    """
    Evaluate a binary classifier with repeated K-fold cross-validation.

    The data are first resampled with oversampling() then undersampling(), then
    accuracy, recall and F1 are computed on every fold of a RepeatedKFold.

    clf: estimator to evaluate
    X: features
    y: binary target
    test_size: unused; kept so the signature matches binaryClassifier
    n_repeats: number of repetitions of the K-fold procedure
    return: (accuracies, recalls, Fmeasures), three arrays with one score per fold

    The number of folds is read from the notebook-level variable `n_splits`.
    """

    # Local import: cross_validate is not part of the notebook's import cell
    from sklearn.model_selection import cross_validate

    # NOTE(review): resampling is done before the folds are created, so resampled
    # rows can end up in the validation folds; consider resampling inside each
    # training fold (e.g. with an imblearn Pipeline) to avoid optimistic scores
    X, y = oversampling(X, y)
    X, y = undersampling(X, y)

    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

    # A single cross_validate call scores the three metrics on the SAME folds.
    # Three separate cross_val_score calls re-drew the random folds each time
    # (cv has no fixed random_state) and fitted the model three times as often.
    scores = cross_validate(
        clf, X, y,
        scoring=("accuracy", "recall", "f1"),
        cv=cv,
        error_score="raise",
    )

    return scores["test_accuracy"], scores["test_recall"], scores["test_f1"]

# Oversampling methods

In [7]:
def oversampling(X, y):
    """
    Apply every enabled oversampling method to (X, y), one after the other.

    The methods are switched on by the notebook-level flags useSmote,
    useBorderlineSMOTE, useSVMSMOTE and useKMeansSMOTE; all of them are
    configured with the notebook-level n_neighbors and sampling_strategy.

    X: features
    y: target
    return: (X, y) after resampling; unchanged when no flag is set
    """

    if useSmote:
        sampler = SMOTE(k_neighbors=n_neighbors, sampling_strategy=sampling_strategy)
        X, y = sampler.fit_resample(X, y)

    if useBorderlineSMOTE:
        sampler = BorderlineSMOTE(k_neighbors=n_neighbors, sampling_strategy=sampling_strategy)
        X, y = sampler.fit_resample(X, y)

    if useSVMSMOTE:
        sampler = SVMSMOTE(k_neighbors=n_neighbors, sampling_strategy=sampling_strategy)
        X, y = sampler.fit_resample(X, y)

    if useKMeansSMOTE:
        # Was BorderlineSMOTE (copy-paste slip): the flag never ran KMeansSMOTE
        sampler = KMeansSMOTE(k_neighbors=n_neighbors, sampling_strategy=sampling_strategy)
        X, y = sampler.fit_resample(X, y)

    return X, y

# Undersampling methods

In [8]:
def undersampling(X, y):
    """
    Apply every enabled undersampling method to (X, y), one after the other.

    The methods are switched on by the notebook-level flags useNearMiss,
    useCondensedNearestNeighbour, useTomekLinks, useEditedNearestNeighbours and
    useOneSidedSelection. They read the notebook-level n_neighbors,
    sampling_strategy, NearMiss_version and n_seeds_S.

    X: features
    y: target
    return: (X, y) after resampling; unchanged when no flag is set
    """

    if useNearMiss:
        # `NearMiss_version == 1 | NearMiss_version == 2` parsed as a chained
        # comparison against `1 | NearMiss_version` and was never true, so
        # versions 1 and 2 silently ignored n_neighbors
        if NearMiss_version in (1, 2):
            sampler = NearMiss(version=NearMiss_version, n_neighbors=n_neighbors, sampling_strategy=sampling_strategy)
        else:
            sampler = NearMiss(version=NearMiss_version, n_neighbors_ver3=n_neighbors, sampling_strategy=sampling_strategy)
        X, y = sampler.fit_resample(X, y)

    if useCondensedNearestNeighbour:
        sampler = CondensedNearestNeighbour(n_neighbors=n_neighbors)
        X, y = sampler.fit_resample(X, y)

    if useTomekLinks:
        sampler = TomekLinks()
        X, y = sampler.fit_resample(X, y)

    if useEditedNearestNeighbours:
        sampler = EditedNearestNeighbours(n_neighbors=n_neighbors)
        X, y = sampler.fit_resample(X, y)

    if useOneSidedSelection:
        sampler = OneSidedSelection(n_neighbors=n_neighbors, n_seeds_S=n_seeds_S)
        X, y = sampler.fit_resample(X, y)

    return X, y

# Plot decision tree classifier results

In [9]:
def _plotLabelledScatter(X, y, title):
    """Scatter-plot a one- or two-column feature frame, coloured by its labels."""

    columns = list(X.columns)
    if len(columns) not in (1, 2):
        return

    points = X.copy()
    x_name = columns[0]
    if len(columns) == 2:
        y_name = columns[1]
    else:
        # A single feature is drawn along the horizontal line y = 0
        y_name = "y"
        points[y_name] = np.zeros(len(points))
    points["label"] = y

    plt.title(title, fontsize=20)
    sns.scatterplot(data=points, x=x_name, y=y_name, hue="label")
    plt.show()


def plotDecisionTreeResults(cm, clf, X_train, y_train, X_test, y_test, accuracy, recall, Fmeasure):
    """
    Display the results of a fitted decision tree classifier.

    Shows the confusion matrix, prints the three metrics, draws the tree and,
    when there are fewer than three features, scatter-plots the test data and
    then the train data coloured by their labels.

    cm: confusion matrix to display
    clf: fitted decision tree (its classes_ provide the class labels)
    X_train, X_test: feature DataFrames (their columns provide the feature names)
    y_train, y_test: matching targets
    accuracy, recall, Fmeasure: metrics to print
    """

    # Plot confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=clf.classes_)
    disp.plot()
    plt.title("Confusion Matrix", fontsize=20)
    plt.show()
    print("accuracy :", accuracy)
    print("recall :", recall)
    print("Fmeasure :", Fmeasure)

    # Plot decision tree. Names are passed as plain lists of str: a pandas Index
    # or numeric class labels are not accepted by every scikit-learn release.
    plt.figure(figsize=(25,10))
    tree.plot_tree(clf,
                   feature_names=list(X_train.columns),
                   class_names=[str(label) for label in clf.classes_],
                   fontsize=25,
                   filled=True)
    plt.title("Decision Tree", fontsize=35)
    plt.show()

    # The data themselves can only be drawn in the plane for one or two features
    if len(X_test.columns.values) < 3:
        _plotLabelledScatter(X_test, y_test, "Test data")
        _plotLabelledScatter(X_train, y_train, "Train data")