# Import packages

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# Exercise 2 - 1): Choosing the right metrics when dealing with unbalanced data

In [None]:
# Single seed reused by the data generator and the train/test split, so that
# every run of this cell produces the same datasets and the same split:
seed = 666
# Apply seaborn's default theme (used for the heatmaps):
sns.set()

### Fit a K-NN classifier (K=10) on randomly generated binary datasets with
### increasingly unbalanced classes, and record the test accuracy, the test
### F1 score and the test confusion matrix obtained for each class ratio:
ratios = [0.6, 0.75, 0.9, 0.95, 0.98, 0.99]
test_accuracies = []
test_f1_scores = []
test_confusion_matrices = []
for ratio in ratios:
    # `ratio` is the proportion of samples assigned to the first class; the
    # second class receives the remainder. flip_y=0 disables label noise.
    X, Y = make_classification(
        n_samples=10000,
        n_classes=2,
        n_features=2,
        n_redundant=0,
        n_repeated=0,
        weights=[ratio],
        flip_y=0,
        random_state=seed,
    )
    # Hold out 30% of the samples for testing:
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, shuffle=True, random_state=seed
    )
    model = KNeighborsClassifier(n_neighbors=10).fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)

    # Store the three quality measures for this ratio; the F1 score is
    # computed with label 1 as the positive class:
    test_accuracies.append(accuracy_score(Y_test, Y_test_pred))
    test_f1_scores.append(f1_score(Y_test, Y_test_pred, pos_label=1))
    test_confusion_matrices.append(confusion_matrix(Y_test, Y_test_pred))

# Create the figure once, before the loop: calling plt.figure(1, figsize=...)
# again on an already existing figure does not resize it (recent matplotlib
# versions warn that the extra arguments are ignored).
plt.figure(1, figsize=(15, 12))
# One heatmap per class ratio, placed in the successive cells of a 3x3 grid:
for idx, (test_confusion_matrix, ratio) in enumerate(zip(test_confusion_matrices, ratios)):
    plt.subplot(3, 3, idx + 1)
    plt.title("Confusion matrix, 1st class ratio = " + str(ratio))
    # The matrix holds integer counts, so it is annotated with the 'd' format
    # and needs no rounding.
    sns.heatmap(data=test_confusion_matrix, annot=True, fmt='d', cmap=sns.color_palette("RdBu_r", 1000))
plt.suptitle("Assessment of a K-NN model (K=10) on randomly generated binary datasets, with different ratios between the two classes")
# Cell 8 of the grid (middle of the bottom row) shows both metrics as
# functions of the class ratio:
plt.subplot(3, 3, 8)
plt.title("Test accuracies + test F1-scores of minority class as functions of the 1st class ratio")
plt.plot(ratios, test_accuracies, c='g')
plt.plot(ratios, test_f1_scores, c='r')
plt.legend(["Accuracy", "F1-score"], loc='best')
plt.xlabel('1st class ratio')
plt.ylabel('Quality measures')
plt.show()

Let us focus on the confusion matrices first. With a binary dataset, the confusion matrix contains the true negatives in the upper-left square, the false positives in the upper-right square, the false negatives in the bottom-left square and the true positives in the bottom-right square, where the "positive" class is attributed to the minority class. Equivalently, the rows of the confusion matrix represent the actual class of each sample whereas the columns represent their predicted class.

Now if we look at the different confusion matrices, we notice that when the data is balanced, there is a symmetry between the true positives and the true negatives, and the same can be said of the false positives with respect to the false negatives. But the more unbalanced the data, the more this symmetry collapses in the confusion matrix: the true negatives (the correctly classified samples from the dominant class) converge to the total number of samples, the false positives converge to 0, and the true positives become rarer and rarer until there are more false negatives than true positives. This shows that for the very unbalanced datasets, our $K$-NN model totally failed to capture the underlying structure of the minority class.

If we consider the accuracy metric, it only captures the fact that true negatives massively dominate all other categories when the data imbalance is strong, which results in an accuracy that increases overall as the imbalance in the data grows. In other words, the accuracy metric is not suitable for correctly assessing the performance of a model when the data is unbalanced.

# Exercise 2 - 2): Model selection with Kfold cross-validation for classification on unbalanced data

In [None]:
### Split+shuffle X and Y into k=num_folds different folds:
def KFold_split(X, Y, num_folds, seed):
    """Shuffle X and Y and cut them into `num_folds` train/validation folds.

    Args:
        X: Samples. Must support indexing with an array of integer positions
            (assumed to be a NumPy array -- confirm against callers).
        Y: Labels, indexed the same way as X.
        num_folds: Number of folds to produce.
        seed: Random state given to the shuffling splitter.

    Returns:
        Four lists of length `num_folds`, in this order: training samples,
        validation samples, training labels, validation labels. The k-th
        entries of the four lists belong to the same fold.
    """
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    # Materialise the (train indices, validation indices) pairs once so the
    # same pairs are reused for the samples and for the labels.
    fold_indices = list(splitter.split(X, Y))
    X_train_folds = [X[train_idxs] for train_idxs, _ in fold_indices]
    X_val_folds = [X[val_idxs] for _, val_idxs in fold_indices]
    Y_train_folds = [Y[train_idxs] for train_idxs, _ in fold_indices]
    Y_val_folds = [Y[val_idxs] for _, val_idxs in fold_indices]
    return X_train_folds, X_val_folds, Y_train_folds, Y_val_folds

### Select a model via Kfold cross-validation:
def KFold_model_selection(X, Y, models, num_folds, seed):
    """Select a model by K-fold cross-validation, then assess it on a test set.

    A test set (30% of the data) is set aside first. Every model listed in
    `models` is cross-validated on the remaining data, the model with the
    highest mean validation F1 score is retrained on all the remaining data
    and evaluated on the test set. Progress, the test F1 score and a heatmap
    of the test confusion matrix are printed / displayed.

    Args:
        X: Samples.
        Y: Labels.
        models: Mapping from a model index (as understood by `assess_model`)
            to a human-readable description of that model.
        num_folds: Number of cross-validation folds.
        seed: Random state used for the test split and the fold split.

    Raises:
        ValueError: If `models` is empty.
    """
    # Fail early, before any splitting or fitting, when there is nothing to select:
    if not models:
        raise ValueError("`models` must contain at least one model to cross-validate")
    # Extract a test set:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.3, shuffle=True, random_state=seed)
    # Extract train and validation folds:
    X_train_folds, X_val_folds, Y_train_folds, Y_val_folds = KFold_split(X_train_val, Y_train_val, num_folds, seed)
    # For each model, do KFold cross validation. The keys are frozen in a list
    # so that the position of a score can be mapped back to its model key:
    model_keys = list(models)
    mean_val_F1_scores = []
    for key in model_keys:
        print("\nNow preprocessing model", models[key])
        mean_val_F1_score = perform_KFold_CV(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds, model_idx=key)
        print("Mean validation F1 score:", mean_val_F1_score)
        mean_val_F1_scores.append(mean_val_F1_score)
    # The model with the highest mean validation F1 score is our model of choice.
    # np.argmax yields a position in the score list, not a key of `models`,
    # so translate it back to the corresponding key:
    best_model_idx = model_keys[int(np.argmax(np.array(mean_val_F1_scores)))]
    best_model = models[best_model_idx]
    print("\n\nBest model:", best_model)
    # Train the best model on the whole train set then evaluate it on the test set:
    best_model_test_F1_score, best_model_test_confusion_matrix = assess_model(X_train=X_train_val,
                                                                              X_eval=X_test,
                                                                              Y_train=Y_train_val,
                                                                              Y_eval=Y_test,
                                                                              model_idx=best_model_idx)
    print("Test F1 score:", best_model_test_F1_score)
    plt.figure(2, figsize=(7, 5))
    # str() keeps the title working even if a description is not a string:
    plt.title("Test confusion matrix of the best model " + str(best_model))
    # The matrix holds integer counts, so it is annotated with the 'd' format
    # and needs no rounding.
    sns.heatmap(data=best_model_test_confusion_matrix, annot=True, fmt='d', cmap=sns.color_palette("RdBu_r", 1000))
    plt.show()

### KFold cross-validation of a model:
def perform_KFold_CV(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds, model_idx):
    """Cross-validate one model and return its mean validation F1 score.

    A fresh (surrogate) model is fitted and evaluated on every fold through
    `assess_model`; the score of each fold is printed as it is obtained.

    Args:
        X_train_folds: Training samples, one entry per fold.
        X_val_folds: Validation samples, one entry per fold.
        Y_train_folds: Training labels, one entry per fold.
        Y_val_folds: Validation labels, one entry per fold.
        model_idx: Index of the model, forwarded to `assess_model`.

    Returns:
        The mean of the validation F1 scores over all folds.
    """
    val_fold_F1_scores = []
    folds = zip(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds)
    # Folds are numbered from 1 in the progress messages:
    for fold_number, (X_fit, X_val, Y_fit, Y_val) in enumerate(folds, start=1):
        # Only the F1 score is needed here; the confusion matrix is discarded.
        fold_score, _ = assess_model(
            X_train=X_fit,
            X_eval=X_val,
            Y_train=Y_fit,
            Y_eval=Y_val,
            model_idx=model_idx,
        )
        print("Surrogate model", f"{fold_number}/{len(X_val_folds)}", "validation F1 score:", fold_score)
        val_fold_F1_scores.append(fold_score)
    # Average the validation F1 scores of all the folds:
    return np.mean(np.array(val_fold_F1_scores))

### Fit and evaluate a model:
def assess_model(X_train, X_eval, Y_train, Y_eval, model_idx):
    """Fit the model designated by `model_idx` and score it on an evaluation set.

    Args:
        X_train: Samples used to fit the model.
        X_eval: Samples of the evaluation set.
        Y_train: Labels used to fit the model.
        Y_eval: True labels of the evaluation set.
        model_idx: Which model to build: 0 for K-NN with K=20, 1 for a
            logistic regression (lbfgs solver), 2 for a decision tree.

    Returns:
        A tuple `(eval_F1_score, eval_confusion_matrix)`; the F1 score is
        computed with label 1 as the positive class.

    Raises:
        ValueError: If `model_idx` is not 0, 1 or 2.
    """
    # Build the model:
    if model_idx == 0:
        model = KNeighborsClassifier(n_neighbors=20)
    elif model_idx == 1:
        model = LogisticRegression(solver='lbfgs')
    elif model_idx == 2:
        model = DecisionTreeClassifier()
    else:
        # Without this branch `model` would be left unbound and the failure
        # would surface later as an obscure UnboundLocalError.
        raise ValueError(f"Unknown model_idx {model_idx!r}: expected 0, 1 or 2")
    # Fit the model:
    model.fit(X_train, Y_train)
    # Evaluate the model on the evaluation set:
    Y_eval_pred = model.predict(X_eval)
    eval_F1_score = f1_score(Y_eval, Y_eval_pred, pos_label=1)
    eval_confusion_matrix = confusion_matrix(Y_eval, Y_eval_pred)
    return eval_F1_score, eval_confusion_matrix

In [None]:
### Model selection of a classification model on unbalanced data with KFold cross-validation:
# Load an unbalanced binary dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('custom_unbalanced_dataset.pickle', 'rb') as unbalanced_dataset:
    X, Y = pickle.load(unbalanced_dataset)
# The file is only needed for the load above, so it is closed before the
# cross-validation (and its blocking plot window) starts.
# Models to be cross-validated (the keys are the indices handled by assess_model):
models = {0: "K-NN, K=20",
          1: "Logistic regression",
          2: "Decision Tree"}
# Select model with KFold cross-validation (use 10 folds):
KFold_model_selection(X, Y, models, num_folds=10, seed=seed)

To perform cross-validation, we can choose the F1-score metric, which takes both precision and recall into account. Precision is the proportion of true positives among all the samples predicted as positive (true positives plus false positives). Recall is the proportion of true positives among all the samples that are actually positive (true positives plus false negatives). In the $K$-NN example above, we can see that this metric decreases significantly as the imbalance in the data increases, so it is more suitable than the accuracy metric in our case, since we are once again dealing with a very unbalanced dataset (imbalance ratio of 0.9/0.1).