Jupyter notebook used to train the models.

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import joblib

import numpy as np
from numpy.typing import NDArray
from typing import List, Tuple

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the proofs from a CSV file and split them into features and labels for training.
def load_data(features_kept, file_name, n_rows = None):
    """Read a CSV file and split the selected columns into features and labels.

    Args:
        features_kept: Regular expression; only the columns whose name matches
            it are kept (passed to ``DataFrame.filter(regex=...)``).
        file_name: Path of the CSV file to read.
        n_rows: Maximum number of rows to read; ``None`` reads the whole file.

    Returns:
        A ``(features, labels)`` tuple. ``labels`` is the first kept column and
        ``features`` is a 2-D array holding every remaining kept column, one
        row per CSV row.
    """
    selected = (
        pd.read_csv(file_name, nrows=n_rows)
        .filter(regex=features_kept)
        .to_numpy()
    )

    # Column 0 of the filtered table is the label; everything after it is a feature.
    labels = selected[:, 0]
    features = selected[:, 1:]

    return features, labels

In [None]:
# Train a model on the given features and labels, evaluate it and save it to disk.
def classify(train_features, train_labels, test_features, test_labels, name_model, type_model):
    """Fit one classifier, evaluate it on the test split and persist it.

    Args:
        train_features: Features used to fit the classifier.
        train_labels: Labels used to fit the classifier.
        test_features: Features the fitted classifier is evaluated on.
        test_labels: Expected labels for ``test_features``.
        name_model: Path the fitted classifier is dumped to with joblib.
        type_model: ``"hist"`` selects a HistGradientBoostingClassifier,
            ``"neuronal"`` an MLPClassifier; any other value selects a
            RandomForestClassifier.

    Returns:
        A ``(predictions, proba, score, clf)`` tuple: the predicted labels, the
        output of ``predict_proba``, the value returned by the classifier's
        ``score`` method on the test split, and the fitted classifier itself.
    """
    def _make_estimator():
        # Only the requested estimator is ever instantiated.
        if type_model == "hist":
            return HistGradientBoostingClassifier(verbose=2)
        if type_model == "neuronal":
            return MLPClassifier(hidden_layer_sizes=(1000, 100), verbose=True)
        return RandomForestClassifier(verbose=2, n_jobs=-1, max_depth=100)

    estimator = _make_estimator()

    # Fit on the training split, then evaluate on the held-out split.
    estimator.fit(train_features, train_labels)
    predicted_labels = estimator.predict(test_features)
    class_probabilities = estimator.predict_proba(test_features)
    accuracy = estimator.score(test_features, test_labels)

    # Keep the fitted model on disk so it can be re-evaluated without retraining.
    joblib.dump(estimator, name_model, compress=9)

    return predicted_labels, class_probabilities, accuracy, estimator

In [None]:
def perform_crossval(features, labels, run, folds=10, model= "hist", regex = "", name_proofs = "", output_folder = ""):
    """Train and evaluate one model per stratified fold and save a summary CSV.

    For every fold a model is trained through ``classify`` (which also dumps it
    to ``<output_folder>/model_<fold>_run<run>``), its accuracy, top-2 and
    top-10 accuracies are computed, and one summary row is recorded. The
    summary is written to ``<output_folder>/model_run<run>_simplified.csv`` and
    the averages over all folds are printed.

    Args:
        features: Feature matrix, one row per sample.
        labels: Label of each sample.
        run: Identifier inserted in the model and CSV file names.
        folds: Number of stratified folds.
        model: Model type forwarded to ``classify`` as ``type_model``.
        regex: Column-selection regex, only recorded in the summary.
        name_proofs: Name of the data file, only recorded in the summary.
        output_folder: Folder receiving the models and the summary CSV. It is
            created if missing; an empty string means the current directory.

    Returns:
        None.
    """
    # Local import: ``os`` is not part of the notebook's import cell.
    import os

    labels = np.array(labels)
    features = np.array(features)

    # Create the destination before training so that a missing folder is not
    # discovered only when the first trained model is dumped.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Columns of the summary used to store the evaluation of the models for future analysis.
    columns = [
        "name_model",
        "type_model",
        "regex",
        "trial_name",
        "params_model",
        "proofs_name",
        "total_number_proofs",
        "len_test_index",
        "test_index",
        "test_labels",
        "predictions",
        # "proba", # Too large to save
        "score",
        "score2",
        "score10",
    ]
    rows = []

    total_top_2_accuracy = 0.0
    total_top_10_accuracy = 0.0
    total_score = 0.0
    # StratifiedKFold yields n_splits train/test partitions, each keeping the
    # label proportions of the whole set. ``shuffle`` is left at its default,
    # so the partitions are the same on every call with the same data.
    kf = StratifiedKFold(n_splits=folds)
    for idx, (train_index, test_index) in enumerate(kf.split(features, labels)):
        print("==== New Fold ====")
        name_model = f"model_{idx}_run{run}"
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # os.path.join keeps the path relative when output_folder is empty
        # (a plain "/" concatenation would point at the filesystem root).
        predictions, proba, score, model_full = classify(
            X_train, y_train, X_test, y_test,
            os.path.join(output_folder, name_model), model,
        )
        total_score += score

        # Column j of ``proba`` belongs to ``model_full.classes_[j]``, whereas
        # calculate_accuracy numbers the columns 1, 2, 3, ... Translate every
        # true label into its column number so the comparison is right for any
        # label values; 0 marks a label the model never saw (never a match).
        column_of_label = {label: position + 1 for position, label in enumerate(model_full.classes_)}
        y_test_columns = [column_of_label.get(label, 0) for label in y_test]
        score2 = calculate_accuracy(proba, y_test_columns, 2)
        score10 = calculate_accuracy(proba, y_test_columns, 10)
        total_top_2_accuracy += score2
        total_top_10_accuracy += score10

        # Store the evaluation of the newly trained model.
        rows.append([
            name_model,
            str(type(model_full)),
            regex,
            output_folder,
            str(model_full.get_params()),
            name_proofs,
            str(len(labels)),
            str(len(test_index)),
            str(test_index.tolist()),
            str(y_test.tolist()),
            str(predictions.tolist()),
            # str(proba.tolist()),
            str(score),
            str(score2),
            str(score10),
        ])
        del model_full

    # Build the frame once; rows are written newest fold first.
    df = pd.DataFrame(rows[::-1], columns=columns)
    df.to_csv(os.path.join(output_folder, f"model_run{run}_simplified.csv"))

    print("Total aveage prediction rate: {}".format(total_score / folds))
    print("Average prediction rate for top 2: {}".format(total_top_2_accuracy / folds))
    print("Average prediction rate for top 10: {}".format(total_top_10_accuracy / folds))

# Functions used to calculate score2 and score10

def sort_predictions(predictions_proba: NDArray, y_test: NDArray, classes=None) -> List[Tuple[int, List[Tuple[int, float]]]]:
    """Rank, for every sample, the candidate labels by decreasing probability.

    Args:
        predictions_proba: One row of class probabilities per sample.
        y_test: True label of each sample, in the same order.
        classes: Label associated with each probability column (typically the
            ``classes_`` attribute of the fitted classifier). When ``None``,
            column ``i`` is given the label ``i + 1``.

    Returns:
        One ``(true_label, ranking)`` tuple per sample, where ``ranking`` is the
        list of ``(label, probability)`` pairs with a strictly positive
        probability, most probable first (ties keep column order).

    Raises:
        ValueError: If ``classes`` is given and its length differs from the
            number of probability columns.
    """
    ordered_predictions = []
    for y, pred_prob in zip(y_test, predictions_proba):
        if classes is None:
            column_labels = range(1, len(pred_prob) + 1)
        elif len(classes) != len(pred_prob):
            raise ValueError(
                f"classes has {len(classes)} entries but there are "
                f"{len(pred_prob)} probability columns"
            )
        else:
            column_labels = classes
        ranking = [(label, prob) for label, prob in zip(column_labels, pred_prob) if prob > 0]
        # list.sort is stable, so equal probabilities keep their column order.
        ranking.sort(key=lambda pair: pair[1], reverse=True)
        ordered_predictions.append((y, ranking))
    return ordered_predictions

def calculate_accuracy(predictions_proba: NDArray, y_test: NDArray, N: int = 100, classes=None) -> float:
    """Return the fraction of samples whose true label is among the N most probable ones.

    Args:
        predictions_proba: One row of class probabilities per sample.
        y_test: True label of each sample, in the same order.
        N: Number of top-ranked labels considered for each sample.
        classes: Label associated with each probability column; forwarded to
            ``sort_predictions``. When ``None``, column ``i`` stands for the
            label ``i + 1``.

    Returns:
        The top-N accuracy as a float, or ``0.0`` when there is no sample.
    """
    nb_samples = len(predictions_proba)
    if nb_samples == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0
    sorted_predictions = sort_predictions(predictions_proba, y_test, classes)
    nb_in_top_n = sum(
        y in [label for label, _ in ranking[:N]]
        for y, ranking in sorted_predictions
    )
    return float(nb_in_top_n) / nb_samples


In [None]:
# Function used to re-evaluate previously saved models if necessary.
def perform_crossval_evaluation(features, labels, folder_model, run, folds=10, regex = "", name_proofs = "", model_suffix = "_half"):
    """Re-evaluate saved per-fold models and write a summary CSV.

    The folds are rebuilt with ``StratifiedKFold(n_splits=folds)``; for fold
    ``idx`` the model stored in
    ``<folder_model>/model_<idx>_run<run><model_suffix>`` is loaded and scored
    on that fold's test split. The summary is written to
    ``<folder_model>/model_run<run>_simplified.csv`` and the averages over all
    folds are printed.

    Args:
        features: Feature matrix, one row per sample.
        labels: Label of each sample.
        folder_model: Folder holding the saved models; also receives the CSV.
        run: Identifier used in the model and CSV file names.
        folds: Number of stratified folds.
        regex: Column-selection regex, only recorded in the summary.
        name_proofs: Name of the data file, only recorded in the summary.
        model_suffix: Suffix appended to each model file name. The default
            ``"_half"`` keeps the historical file naming; pass ``""`` to load
            files named ``model_<idx>_run<run>``.

    Returns:
        None.
    """
    # Local import: ``os`` is not part of the notebook's import cell.
    import os

    labels = np.array(labels)
    features = np.array(features)

    columns = [
        "name_model",
        "type_model",
        "regex",
        "trial_name",
        "params_model",
        "proofs_name",
        "total_number_proofs",
        "len_test_index",
        "test_index",
        "test_labels",
        "predictions",
        # "proba", # Too large to save
        "score",
        "score2",
        "score10",
    ]
    rows = []

    total_top_2_accuracy = 0.0
    total_top_10_accuracy = 0.0
    total_score = 0.0
    # NOTE(review): the test splits only match the ones used for training if
    # the same features, labels and number of folds are given - confirm.
    kf = StratifiedKFold(n_splits=folds)
    for idx, (_, test_index) in enumerate(kf.split(features, labels)):
        print("==== New Fold ====")
        X_test = features[test_index]
        y_test = labels[test_index]
        # joblib files are pickle-based: loading one executes arbitrary code,
        # so only load models that come from a trusted source.
        model = joblib.load(os.path.join(folder_model, f"model_{idx}_run{run}{model_suffix}"))
        print("model loaded")
        predictions = model.predict(X_test)
        proba = model.predict_proba(X_test)
        score = model.score(X_test, y_test)
        total_score += score

        # Column j of ``proba`` belongs to ``model.classes_[j]``, whereas
        # calculate_accuracy numbers the columns 1, 2, 3, ... Translate every
        # true label into its column number so the comparison is right for any
        # label values; 0 marks a label the model never saw (never a match).
        column_of_label = {label: position + 1 for position, label in enumerate(model.classes_)}
        y_test_columns = [column_of_label.get(label, 0) for label in y_test]
        score2 = calculate_accuracy(proba, y_test_columns, 2)
        score10 = calculate_accuracy(proba, y_test_columns, 10)
        total_top_2_accuracy += score2
        total_top_10_accuracy += score10

        rows.append([
            f"model_{idx}_run{run}",
            str(type(model)),
            regex,
            folder_model,
            str(model.get_params()),
            name_proofs,
            str(len(labels)),
            str(len(test_index)),
            str(test_index.tolist()),
            str(y_test.tolist()),
            str(predictions.tolist()),
            # str(proba.tolist()),
            str(score),
            str(score2),
            str(score10),
        ])
        del model

    # Build the frame once; rows are written newest fold first.
    df = pd.DataFrame(rows[::-1], columns=columns)
    df.to_csv(os.path.join(folder_model, f"model_run{run}_simplified.csv"))

    print("Total aveage prediction rate: {}".format(total_score / folds))
    print("Average prediction rate for top 2: {}".format(total_top_2_accuracy / folds))
    print("Average prediction rate for top 10: {}".format(total_top_10_accuracy / folds))

In [None]:
# Example of how to train the models.
# Raw string: "\d" is not a valid escape sequence in a regular string literal.
# NOTE(review): load_data uses the first matching column (in CSV column order)
# as the label - presumably "Personal Number"; verify against the data file.
regex = r"Personal Number|number_attributes|Size|proof_array_\d+"
name_training_data = "formatted_run.csv"
folder_models = "models_run"
# Trying to train models with more than 500'000 proofs has led to crashes or to runs lasting multiple days,
# thus 500k is the maximum recommended for the training.
n_rows = 10000
features, labels = load_data(regex, name_training_data, n_rows)
perform_crossval(features, labels, 1, folds=10, model="hist", regex=regex, name_proofs=name_training_data, output_folder=folder_models)