In [1]:
%load_ext autoreload
%autoreload 2
import os
import time
from typing import List, Tuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.typing import NDArray
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neural_network import MLPClassifier

In [2]:
def load_data(features_kept, file_name, n_rows = None):

    """Read a CSV file and split the selected columns into features and labels.

    Args:
        features_kept (str): regular expression; only the columns whose name
            matches it (via ``DataFrame.filter(regex=...)``) are kept.
        file_name: path of the CSV file, handed to ``pd.read_csv``.
        n_rows (int, optional): read at most this many rows; ``None`` reads
            the whole file.
    Returns:
        features (numpy array): one row per CSV row, holding every kept
            column except the first one.
        labels (numpy array): the first kept column, one value per CSV row.

    Note: the label is whichever kept column comes first in the file, so the
    column order of the CSV matters.
    """

    selected = (
        pd.read_csv(file_name, nrows=n_rows)
        .filter(regex=features_kept)
        .to_numpy()
    )

    # Column 0 is the label, everything to its right is a feature.
    labels = selected[:, 0]
    features = selected[:, 1:]

    return features, labels

In [3]:
def classify(train_features, train_labels, test_features, test_labels, name_model, type_model):

    """Train one classifier, evaluate it on the test split and save it to disk.

    References:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    Args:
        train_features (numpy array): features used to train the classifier
        train_labels (numpy array): labels used to train the classifier
        test_features (numpy array): features used to test the classifier
        test_labels (numpy array): true labels of ``test_features``; only used
            to compute the accuracy
        name_model (str): path the fitted classifier is dumped to with joblib
        type_model (str): ``"hist"`` selects HistGradientBoostingClassifier,
            ``"neuronal"`` selects MLPClassifier, any other value selects
            RandomForestClassifier

    Returns:
        predictions: labels predicted by the classifier for test_features
        proba: class probabilities predicted for test_features
        score: mean accuracy of ``predictions`` against ``test_labels``
        clf: the fitted classifier

    Note: You are free to change the parameters of the classifiers.
    """

    # Pick the classifier. Change parameters if desired.
    if type_model == "hist":
        clf = HistGradientBoostingClassifier(verbose=2)
    elif type_model == "neuronal":
        clf = MLPClassifier(hidden_layer_sizes=(1000,100),verbose= True)
    else:
        # Any unrecognised selector (e.g. "random") falls back to a random forest.
        clf = RandomForestClassifier(verbose= 2, n_jobs=-1)
    # Train the classifier using the training features and labels.
    clf.fit(train_features, train_labels)
    # Use the classifier to make predictions on the test features.
    predictions = clf.predict(test_features)
    proba = clf.predict_proba(test_features)
    # clf.score() would run predict() on the whole test set a second time;
    # the mean accuracy of the predictions we already have is the same value.
    score = metrics.accuracy_score(test_labels, predictions)
    joblib.dump(clf, name_model, compress=9)

    return predictions, proba, score, clf

In [4]:
def perform_crossval(features, labels, run, folds=10, model= "hist", regex = "", name_proofs = "", output_folder = ""):

    """Train and evaluate one classifier per stratified fold and log the results.

    For every fold the data is split with ``StratifiedKFold`` and handed to
    ``classify()`` (which also saves the fitted model). Each fold is scored
    with the plain accuracy plus the top-2 and top-10 accuracy from
    ``calculate_accuracy()``. One row per fold is written to
    ``model_run{run}_simplified.csv`` and the fold averages are printed.

    Args:
        features (list): list of features, one entry per sample
        labels (list): list of labels, one entry per sample
        run (int): number of the run; used in the model and CSV file names
        folds (int): number of folds for cross-validation (default=10)
        model (str): classifier selector, forwarded to ``classify()`` as
            ``type_model``
        regex (str): column-selection pattern; only recorded in the CSV
        name_proofs (str): name of the data file; only recorded in the CSV
        output_folder (str): directory receiving the models and the CSV. It is
            created when missing; an empty string means the current working
            directory.
    Returns:
        None
    """

    # Resolve and create the output directory *before* any training, so a
    # missing folder cannot throw away an expensive fit. An empty folder name
    # used to expand to "/model_..." (filesystem root); use the cwd instead.
    target_dir = output_folder or "."
    os.makedirs(target_dir, exist_ok=True)

    columns = [
        "name_model",
        "type_model",
        "regex",
        "trial_name",
        "params_model",
        "proofs_name",
        "total_number_proofs",
        "len_test_index",
        "test_index",
        "test_labels",
        "predictions",
        # "proba", # Too large to save
        "score",
        "score2",
        "score10",
    ]
    labels = np.array(labels)
    features = np.array(features)
    kf = StratifiedKFold(n_splits=folds)

    fold_rows = []
    total_top_2_accuracy = 0.0
    total_top_10_accuracy = 0.0
    total_score = 0.0
    for idx, (train_index, test_index) in enumerate(kf.split(features, labels)):
        print("==== New Fold ====")
        name_model = f"model_{idx}_run{run}"
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        print(f"training samples: {len(X_train)}")
        predictions, proba, score, model_full = classify(
            X_train, y_train, X_test, y_test, os.path.join(target_dir, name_model), model
        )
        total_score += score
        # NOTE: the top-k helpers label probability column i as class i + 1,
        # so the labels are expected to be the integers 1..n_classes.
        score2 = calculate_accuracy(proba, y_test, 2)
        score10 = calculate_accuracy(proba, y_test, 10)
        total_top_2_accuracy += score2
        total_top_10_accuracy += score10
        fold_rows.append([
            name_model,
            str(type(model_full)),
            regex,
            output_folder,
            str(model_full.get_params()),
            name_proofs,
            str(len(labels)),
            str(len(test_index)),
            str(test_index.tolist()),
            str(y_test.tolist()),
            str(predictions.tolist()),
            # str(proba.tolist()),
            str(score),
            str(score2),
            str(score10),
        ])
        # Drop the reference so the fitted model can be freed before the next fold.
        del model_full

    # Build the frame once instead of concatenating inside the loop. The CSV
    # has always listed the most recent fold first, so keep that row order.
    df = pd.DataFrame(fold_rows[::-1], columns=columns)
    df.to_csv(os.path.join(target_dir, f"model_run{run}_simplified.csv"))

    print("Total average prediction rate: {}".format(total_score / folds))
    print("Average prediction rate for top 2: {}".format(total_top_2_accuracy / folds))
    print("Average prediction rate for top 10: {}".format(total_top_10_accuracy / folds))


def sort_predictions(predictions_proba: NDArray, y_test: NDArray, classes=None) -> List[Tuple[int, List[Tuple[int, float]]]]:
    """Rank the candidate classes of every sample by predicted probability.

    Args:
        predictions_proba: one row of class probabilities per sample.
        y_test: true label of each sample, in the same order.
        classes: optional label of each probability column (for a fitted
            scikit-learn classifier this is ``clf.classes_``). When omitted,
            column ``i`` is labelled ``i + 1``, i.e. the labels are assumed to
            be ``1..n_classes``.
    Returns:
        One ``(true_label, ranking)`` tuple per sample, where ``ranking`` is a
        list of ``(label, probability)`` pairs sorted by decreasing
        probability. Classes with a probability that is not strictly positive
        are left out; ties keep their column order.
    Raises:
        ValueError: if ``classes`` is given and its length differs from the
            number of probability columns.
    """
    ordered_predictions = []
    for true_label, row in zip(y_test, predictions_proba):
        if classes is None:
            column_labels = range(1, len(row) + 1)
        else:
            if len(classes) != len(row):
                raise ValueError("classes must have one entry per probability column")
            column_labels = classes
        # sorted() is stable even with reverse=True, so equal probabilities
        # stay in column order.
        ranking = sorted(
            ((label, prob) for label, prob in zip(column_labels, row) if prob > 0),
            key=lambda pair: pair[1],
            reverse=True,
        )
        ordered_predictions.append((true_label, ranking))
    return ordered_predictions

def calculate_accuracy(predictions_proba: NDArray, y_test: NDArray, N: int = 100, classes=None) -> float:
    """Top-N accuracy: share of samples whose true label is among the N most probable classes.

    A class only counts as a candidate when its probability is strictly
    positive. Classes with equal probability are ranked in column order.

    Args:
        predictions_proba: 2-D array, one row of class probabilities per sample.
        y_test: true label of each sample, in the same order.
        N: number of top-ranked classes that count as a hit (default 100).
        classes: optional label of each probability column (for a fitted
            scikit-learn classifier this is ``clf.classes_``). When omitted,
            column ``i`` is labelled ``i + 1``, i.e. the labels are assumed to
            be ``1..n_classes``.
    Returns:
        Fraction of samples in ``[0, 1]``; ``0.0`` when there are no samples
        or no probability columns.
    Raises:
        ValueError: if ``y_test`` and ``predictions_proba`` differ in length,
            or ``classes`` does not have one entry per probability column.
    """
    proba = np.asarray(predictions_proba)
    truth = np.asarray(y_test)
    n_samples = len(proba)
    if n_samples == 0 or proba.shape[1] == 0:
        return 0.0
    if len(truth) != n_samples:
        raise ValueError("y_test and predictions_proba must have the same length")

    n_classes = proba.shape[1]
    if classes is None:
        column_labels = range(1, n_classes + 1)
    else:
        column_labels = np.asarray(classes).tolist()
        if len(column_labels) != n_classes:
            raise ValueError("classes must have one entry per probability column")

    # Column index of each sample's true label; -1 when the label has no column.
    # Dict lookup treats 3 and 3.0 as the same key, matching ``==`` semantics.
    column_of = {label: col for col, label in enumerate(column_labels)}
    true_col = np.array([column_of.get(label, -1) for label in truth.tolist()])
    known = true_col >= 0
    safe_col = np.where(known, true_col, 0)  # placeholder index, masked out below

    p_true = proba[np.arange(n_samples), safe_col]
    # Rank of the true class in a stable descending sort: every strictly more
    # probable class comes first, then equally probable classes in earlier columns.
    better = (proba > p_true[:, None]).sum(axis=1)
    tied_ahead = (
        (proba == p_true[:, None]) & (np.arange(n_classes) < safe_col[:, None])
    ).sum(axis=1)
    hits = known & (p_true > 0) & (better + tied_ahead < N)
    return float(np.count_nonzero(hits)) / n_samples



In [5]:
def perform_crossval_evaluation(features, labels, folder_model, run, folds=10, regex = "", name_proofs = "", model_suffix = "_half"):

    """Re-evaluate previously saved per-fold models without retraining.

    The data is split with the same ``StratifiedKFold(n_splits=folds)`` call
    used for training. For every fold the model file
    ``model_{idx}_run{run}{model_suffix}`` is loaded from ``folder_model`` and
    scored on that fold's test split (accuracy, top-2 and top-10 accuracy).
    One row per fold is written to ``model_run{run}_simplified.csv`` inside
    ``folder_model`` and the fold averages are printed.

    Args:
        features (list): list of features, one entry per sample
        labels (list): list of labels, one entry per sample
        folder_model (str): directory holding the saved models; the CSV is
            written there as well
        run (int): number of the run; used in the model and CSV file names
        folds (int): number of folds for cross-validation (default=10)
        regex (str): column-selection pattern; only recorded in the CSV
        name_proofs (str): name of the data file; only recorded in the CSV
        model_suffix (str): suffix appended to each model file name
            (default ``"_half"``)
    Returns:
        None
    """

    columns = [
        "name_model",
        "type_model",
        "regex",
        "trial_name",
        "params_model",
        "proofs_name",
        "total_number_proofs",
        "len_test_index",
        "test_index",
        "test_labels",
        "predictions",
        # "proba", # Too large to save
        "score",
        "score2",
        "score10",
    ]
    labels = np.array(labels)
    features = np.array(features)
    kf = StratifiedKFold(n_splits=folds)

    fold_rows = []
    total_top_2_accuracy = 0.0
    total_top_10_accuracy = 0.0
    total_score = 0.0
    # Only the test half of each split is needed here; nothing is trained.
    for idx, (_, test_index) in enumerate(kf.split(features, labels)):
        print("==== New Fold ====")
        X_test, y_test = features[test_index], labels[test_index]
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code - only load model files you created yourself.
        model = joblib.load(os.path.join(folder_model, f"model_{idx}_run{run}{model_suffix}"))
        print("model loaded")
        predictions = model.predict(X_test)
        proba = model.predict_proba(X_test)
        score = model.score(X_test, y_test)
        total_score += score
        # NOTE: the top-k helpers label probability column i as class i + 1,
        # so the labels are expected to be the integers 1..n_classes.
        score2 = calculate_accuracy(proba, y_test, 2)
        score10 = calculate_accuracy(proba, y_test, 10)
        total_top_2_accuracy += score2
        total_top_10_accuracy += score10
        fold_rows.append([
            f"model_{idx}_run{run}",
            str(type(model)),
            regex,
            folder_model,  # was the undefined name `folder_models` (NameError)
            str(model.get_params()),
            name_proofs,
            str(len(labels)),
            str(len(test_index)),
            str(test_index.tolist()),
            str(y_test.tolist()),
            str(predictions.tolist()),
            # str(proba.tolist()),
            str(score),
            str(score2),
            str(score10),
        ])
        # Drop the reference so the model can be freed before the next fold.
        del model

    # Build the frame once instead of concatenating inside the loop. Rows were
    # always meant to be prepended, so keep the most recent fold first.
    df = pd.DataFrame(fold_rows[::-1], columns=columns)
    df.to_csv(os.path.join(folder_model, f"model_run{run}_simplified.csv"))

    print("Total average prediction rate: {}".format(total_score / folds))
    print("Average prediction rate for top 2: {}".format(total_top_2_accuracy / folds))
    print("Average prediction rate for top 10: {}".format(total_top_10_accuracy / folds))

In [6]:
# Trial 4, random 1000: random-forest cross-validation, run 1.
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# and makes Python emit a warning; the pattern itself is unchanged.
# load_data() uses the first matching column as the label - presumably
# "Personal Number"; verify against the CSV column order.
regex = r"Personal Number|number_attributes|Size|proof_array_\d+"
name_training_data = "formatted_run1_trial4_random_1000.csv"
folder_model = "model trial 4 1000 redo"
n_rows = 500000  # upper bound on the number of CSV rows read
features, labels = load_data(regex, name_training_data, n_rows) #:500000
perform_crossval(features, labels, 1, folds=10, model= "random", regex = regex, name_proofs=name_training_data, output_folder=folder_model)

  regex = "Personal Number|number_attributes|Size|proof_array_\d+"


==== New Fold ====
here
450000
here


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 6 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 8 of 100

building tree 7 of 100
building tree 1 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100


: 

In [None]:
# Trial 4 random 1000 - repeat of the random-forest cross-validation (run 1).
# This cell does not define its inputs: `features`, `labels`, `regex`,
# `name_training_data` and `folder_model` must already exist in the kernel.
# Values they are expected to hold:
#   regex = "Personal Number|number_attributes|Size|proof_array_\d+"
#   name_training_data = "formatted_run1_trial4_random_1000.csv"
#   folder_model = "model trial 4 1000 redo"
#   n_rows = 500000
#   features, labels = load_data(regex, name_training_data, n_rows) #:500000
perform_crossval(
    features,
    labels,
    1,
    folds=10,
    model="random",
    regex=regex,
    name_proofs=name_training_data,
    output_folder=folder_model,
)

In [None]:
# Run 2: cross-validation with the "neuronal" (MLP) classifier on the features
# and labels already loaded in the kernel.
# NOTE(review): the output folder says "100" while the training data and the
# other folder say "1000" - confirm this is intended.
perform_crossval(
    features,
    labels,
    2,
    folds=10,
    model="neuronal",
    regex=regex,
    name_proofs=name_training_data,
    output_folder="model trial 4 100 redo",
)