In [None]:
import os
import sys

# Put the project's `src/` directory (relative to the kernel's working directory) on the
# import path so the project-local modules imported below can be resolved.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if 'src/' not in sys.path:
    sys.path.append('src/')

import logging
import sys

import hydra
import mlflow
import numpy as np
import pandas as pd
import torch
from joblib import Memory
from omegaconf import DictConfig, OmegaConf

from evaluators import torchFastTextEvaluator
from framework_classes import (
    DATA_GETTER,
    DATASETS,
    LOSSES,
    MODELS,
    MODULES,
    OPTIMIZERS,
    PREPROCESSORS,
    SCHEDULERS,
    TOKENIZERS,
    TRAINERS,
)
from utils.data import get_df_naf, get_file_system, get_processed_data, get_test_data, get_Y
from utils.mappings import mappings
from utils.mlflow import create_or_restore_experiment
from utils.validation_viz import calibration_curve, confidence_histogram, sort_and_get_pred

%load_ext autoreload
%autoreload 2

In [None]:
# Notebook parameters: every value below is read by the configuration cell that follows.
revision, model_class = "NAF2008", "torchFastText"
start_year, start_month = 2018, 1

# Main text column, then the additional textual columns.
text_feature = "libelle"
textual_features_1, textual_features_2 = "NAT_LIB", "AGRI"

# Categorical columns, numbered in the order they are listed in the config.
(
    categorical_features_1,
    categorical_features_2,
    categorical_features_3,
    categorical_features_4,
    categorical_features_5,
) = ("TYP", "NAT", "SRF", "CJ", "CRT")

In [None]:
# Configuration dictionary assembled from the notebook parameters.
# - "data": passed (as keyword arguments) to the data helpers further down.
# - "model": the model name comes from the `model_class` parameter so the parameter cell
#   stays the single source of truth; "preprocessor" is the key used to pick an entry
#   in PREPROCESSORS.
cfg_dict = {
    "data": {
        "sirene": "sirene_4",
        "start_month": start_month,
        "start_year": start_year,
        "revision": revision,
        "text_feature": text_feature,
        "textual_features": [textual_features_1, textual_features_2],
        "categorical_features": [
            categorical_features_1,
            categorical_features_2,
            categorical_features_3,
            categorical_features_4,
            categorical_features_5,
        ],
    },
    "model": {
        "name": model_class,
        "preprocessor": "PyTorch",
    },
}
cfg_dict_data = cfg_dict["data"]

# Both revision-dependent lookups read the revision from the config, so they cannot drift apart.
df_naf = get_df_naf(revision=cfg_dict_data["revision"])
Y = get_Y(revision=cfg_dict_data["revision"])

In [None]:
# Pick the preprocessor registered under the configured key and build it from the full config.
preprocessor_key = cfg_dict["model"]["preprocessor"]
preprocessor = PREPROCESSORS[preprocessor_key](cfg_dict)
preprocessor

In [None]:

df_test_ls= get_test_data(**cfg_dict["data"], y=Y)


In [None]:

df_test_ls = pd.concat(preprocessor.preprocess(df_test_ls,
            df_naf=df_naf,
            y=Y,
            text_feature=cfg_dict_data["text_feature"],
            textual_features=cfg_dict_data["textual_features"],
            categorical_features=cfg_dict_data["categorical_features"],
            test_size=0.1,), axis=0)

In [None]:
# Keep only the third of the three values returned by get_processed_data().
# Throwaway names are used instead of `_`: assigning to `_` in IPython stops it from
# tracking the last cell output for the rest of the session.
_unused_first, _unused_second, df_test = get_processed_data()


In [None]:
import mlflow

# Point the MLflow client at the tracking server and select the experiment before
# resolving the run-based model URI below.
mlflow.set_tracking_uri("https://projet-ape-mlflow.user.lab.sspcloud.fr/")
mlflow.set_experiment('model_comparison_s4')
# `runs:/<run_id>/model` URI of the logged model to evaluate.
logged_model = 'runs:/45afc22a961a4cdcb282aad93693326d/model'

# Load the model through the `mlflow.pytorch` flavor (not `mlflow.pyfunc`), so `module`
# is the PyTorch model itself rather than a PyFuncModel wrapper.
module = mlflow.pytorch.load_model(logged_model)

In [None]:
# Wrap the loaded model in the project evaluator and run its test routine on the
# preprocessed test frame; feature column names are read from the "data" config section.
evaluator = torchFastTextEvaluator(module)
loader_opts = {"batch_size": 256, "num_workers": 72}
test_res = evaluator.launch_test(
    df_test_ls,
    text_feature=cfg_dict_data["text_feature"],
    categorical_features=cfg_dict_data["categorical_features"],
    Y=Y,
    **loader_opts,
)

test_res

In [None]:
# Get predictions for the test frame. With return_inference_time=True the call result is
# unpacked into a (predictions, inference_time) pair.
pred_outputs = evaluator.get_preds(
    df_test_ls,
    **cfg_dict_data,
    Y=Y,
    return_inference_time=True,
    batch_size=256,
    num_workers=72,
)
predictions, inference_time = pred_outputs
print(inference_time)

In [None]:
# Combine the test frame with the model predictions via the evaluator, then display the result.
df_res = evaluator.get_aggregated_preds(
    df=df_test_ls,
    predictions=predictions,
    Y=Y,
)
df_res

In [None]:
from sklearn.metrics import RocCurveDisplay, roc_auc_score

# One-vs-rest ROC AUC; average=None keeps one score per class instead of a single average.
# `.cpu()` moves the prediction tensor to host memory before the NumPy conversion: it is a
# no-op for CPU tensors, and without it `.numpy()` raises for tensors on another device.
# NOTE(review): the 'ovr' multiclass mode expects per-class probability scores - confirm
# that `predictions` holds probabilities rather than raw logits.
auc = roc_auc_score(
    df_test_ls[Y].values,
    predictions.detach().cpu().numpy(),
    multi_class='ovr',
    average=None,
)

# Position and value of the lowest per-class AUC (argmin computed once).
worst_auc_idx = auc.argmin()
worst_auc_idx, auc[worst_auc_idx]

In [None]:
# Derive the confidence / correctness arrays from the predictions, then build the two
# validation figures from them.
pred_summary = sort_and_get_pred(predictions=predictions, df=df_test_ls, Y=Y)
(
    sorted_confidence,
    well_predicted,
    predicted_confidence,
    predicted_class,
    true_values,
) = pred_summary

fig1 = confidence_histogram(sorted_confidence, well_predicted, df=df_test_ls)
fig2 = calibration_curve(
    n_bins=40,
    confidences=predicted_confidence,
    predicted_classes=predicted_class,
    true_labels=true_values,
)