# Import libraries

In [1]:
# Import main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import library to plot trees
from sklearn import tree

# Import Cross Validation libraries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# Import Confusion matrix libraries
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Import Train Test Split library
from sklearn.model_selection import train_test_split

# Binary Classifier Metrics

In [2]:
def computeRecall(cm):

    """
    Compute the recall of the class stored in the second row of the matrix.
    cm: Confusion matrix, indexable as cm[row][column]
    return: cm[1][1] divided by the sum of the second row's first two cells
    """

    # Second row of the matrix: misses first, hits second
    missed = cm[1][0]
    found = cm[1][1]

    return found / (missed + found)

def computeFmeasure(accuracy, recall):

    """
    Compute the F measure as the harmonic mean of the two given scores.
    accuracy: first score (takes the place of precision in the F measure formula)
    recall: second score
    return: 2 * (accuracy * recall) / (accuracy + recall).
            The closer to 1, the better the model is.
    """

    product = accuracy * recall
    total = accuracy + recall

    return 2 * (product / total)

# Binary Classifier

In [3]:
def binaryClassifier(clf, X, y, test_size, n_repeats, plot_res):

    """
    Repeatedly split the data, fit the classifier and collect test metrics.

    clf: classifier exposing fit, predict and score
    X: features
    y: binary target, also used to stratify every split
    test_size: forwarded to train_test_split
    n_repeats: number of independent train/test splits to evaluate
    plot_res: when truthy, plot the results of the LAST split
              (only for DecisionTreeClassifier and SVC estimators)
    return: accuracies, recalls, Fmeasures, left_class_nb, right_class_nb,
            left_class_fit, right_class_fit -- seven lists, one entry per split.
            "left" refers to the first row of the confusion matrix and "right"
            to the second row; *_nb is the row total and *_fit the fraction of
            that row lying on the diagonal.
    """

    # Arrays that contain the metrics computed from the model, one per split
    accuracies = []
    recalls = []
    Fmeasures = []

    # Arrays that contain the metrics computed from the confusion matrix, one per split
    left_class_nb = []
    right_class_nb = []
    left_class_fit = []
    right_class_fit = []

    # range(n_repeats), not range(1, n_repeats): the latter silently ran one split too few
    for _ in range(n_repeats):

        # Split dataset into train and test
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size)

        # Train model
        clf.fit(X_train, y_train)

        # Predict target based on test data
        y_pred = clf.predict(X_test)

        # Create confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Compute accuracy
        accuracy = clf.score(X_test, y_test)

        # Compute recall
        recall = computeRecall(cm)

        # Compute precision: hits of the second class over everything predicted
        # as the second class. Defined as 0 when nothing was predicted as such.
        predicted_positive = cm[0][1] + cm[1][1]
        precision = cm[1][1] / predicted_positive if predicted_positive else 0.0

        # Compute F score from precision and recall (not from accuracy), so that
        # it is the same quantity as the 'f1' scorer. Guard the 0/0 case.
        Fmeasure = computeFmeasure(precision, recall) if (precision + recall) > 0 else 0.0

        # Append computed metrics from the model
        accuracies.append(accuracy)
        recalls.append(recall)
        Fmeasures.append(Fmeasure)

        # Append computed metrics from the confusion matrix
        left_class_nb.append(cm[0][0] + cm[0][1])
        right_class_nb.append(cm[1][0] + cm[1][1])
        left_class_fit.append(cm[0][0]/(cm[0][0] + cm[0][1]))
        right_class_fit.append(cm[1][1]/(cm[1][0] + cm[1][1]))

    # Plot classifier results of the last split.
    # Skipped when no split was run, because there is nothing to plot then.
    if plot_res and accuracies:
        # Compare the class name rather than the full repr of the type: the
        # private module path (sklearn.tree._classes, ...) changes between releases.
        clf_name = type(clf).__name__
        if clf_name == "DecisionTreeClassifier":
            plotDecisionTreeResults(cm, clf, X_train, y_train, X_test, y_test, accuracy, recall, Fmeasure)
        elif clf_name == "SVC":
            plotSVCResults(cm, clf, X_train, y_train, X_test, y_test, accuracy, recall, Fmeasure)

    return accuracies, recalls, Fmeasures, left_class_nb, right_class_nb, left_class_fit, right_class_fit

# Cross-Validation Binary Classifier

In [4]:
def crossValidationBinaryClassifier(clf, X, y, test_size, n_repeats, n_splits=None):

    """
    Evaluate a binary classifier with repeated K-fold cross-validation.

    clf: classifier to evaluate
    X: features
    y: binary target
    test_size: only used to derive the number of folds when n_splits is not
               available (folds = round(1 / test_size), at least 2)
    n_repeats: number of times the K-fold procedure is repeated
    n_splits: number of folds. When omitted, a notebook-level variable named
              n_splits is used if it exists (previous behaviour), otherwise
              the value is derived from test_size.
    return: accuracies, recalls, Fmeasures -- three arrays with one score per
            fold and repetition, all computed on the same folds
    raise: ValueError when the number of folds cannot be determined
    """

    # Imported here because the notebook's import cell does not provide it
    from sklearn.model_selection import cross_validate

    # Resolve the number of folds without depending on hidden notebook state
    if n_splits is None:
        n_splits = globals().get("n_splits")
    if n_splits is None:
        if not (isinstance(test_size, float) and 0 < test_size < 1):
            raise ValueError(
                "n_splits was not provided and cannot be derived from test_size=%r" % (test_size,)
            )
        n_splits = max(2, round(1 / test_size))

    # NOTE(review): resampling happens before the folds are drawn. If the
    # helpers duplicate or synthesize rows, related rows may end up in both the
    # train and the test part of a fold -- confirm this is intended.

    # Oversample data
    X, y = oversampling(X, y)

    # Undersample data
    X, y = undersampling(X, y)

    # Execute Cross-Validation
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

    # Score all metrics in a single pass. Separate cross_val_score calls would
    # each draw new random folds from the unseeded splitter (and refit the model
    # every time), so the three metrics would not describe the same partitions.
    scores = cross_validate(
        clf, X, y,
        scoring=["accuracy", "recall", "f1"],
        cv=cv,
        error_score="raise",
    )

    return scores["test_accuracy"], scores["test_recall"], scores["test_f1"]

# Plot decision tree classifier results

In [5]:
def plotDecisionTreeResults(cm, clf, X_train, y_train, X_test, y_test, accuracy, recall, Fmeasure):

    """
    Display the results of a fitted decision tree classifier.

    Shows the confusion matrix, prints the three metrics, draws the tree and
    finally passes the train and the test data to labelling().

    cm: confusion matrix to display
    clf: fitted decision tree classifier (clf.classes_ is read)
    X_train, y_train: training data; X_train.columns, when present, provides
                      the feature names shown in the tree
    X_test, y_test: test data
    accuracy, recall, Fmeasure: metrics to print
    """

    # Plot confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot()
    plt.title("Confusion Matrix", fontsize=20)
    plt.show()
    print("accuracy :", accuracy)
    print("recall :", recall)
    print("Fmeasure :", Fmeasure)

    # plot_tree builds its node text from strings: numeric class labels
    # (e.g. 0/1) must be converted, otherwise drawing the tree fails.
    class_names = [str(label) for label in clf.classes_]

    # Hand over a plain list of names; fall back to the default node labels
    # when X_train has no column names (e.g. a numpy array).
    feature_names = getattr(X_train, "columns", None)
    if feature_names is not None:
        feature_names = [str(name) for name in feature_names]

    # Plot decision tree
    _, ax = plt.subplots(figsize=(25, 10))
    tree.plot_tree(clf,
                   feature_names=feature_names,
                   class_names=class_names,
                   fontsize=25,
                   filled=True,
                   ax=ax)
    ax.set_title("Decision Tree", fontsize=35)

    plt.show()

    # Plot train data
    labelling(X_train, y_train)

    # Plot test data
    labelling(X_test, y_test)