In [None]:
import os
import sys

sys.path.append(os.path.abspath("src"))

import logging
import sys

import hydra
import mlflow
import numpy as np
import pandas as pd
import torch
from joblib import Memory
from omegaconf import DictConfig, OmegaConf

from evaluators import torchFastTextEvaluator
from framework_classes import (
    DATA_GETTER,
    DATASETS,
    LOSSES,
    MODELS,
    MODULES,
    OPTIMIZERS,
    PREPROCESSORS,
    SCHEDULERS,
    TOKENIZERS,
    TRAINERS,
)
from models import FastTextWrapper
from utils.data import get_df_naf, get_file_system, get_processed_data, get_test_data, get_Y
from utils.mappings import mappings
from utils.mlflow import create_or_restore_experiment
from utils.validation_viz import calibration_curve, confidence_histogram, sort_and_get_pred

%load_ext autoreload
%autoreload 2

In [None]:
# Run parameters. These names are read by the configuration cell below,
# which packs them into `cfg_dict["data"]`.
revision = "NAF2008"
# NOTE(review): `model_class` is not referenced anywhere else in this notebook.
model_class = "torchFastText"
start_month = 1
start_year = 2018
# Column names, grouped by role: main text, extra text columns, categorical columns.
text_feature = "libelle"
textual_features_1 = "NAT_LIB"
textual_features_2 = "AGRI"
categorical_features_1 = "TYP"
categorical_features_2 = "NAT"
categorical_features_3 = "SRF"
categorical_features_4 = "CJ"
categorical_features_5 = "CRT"

In [None]:
# Data-side configuration, built from the run parameters defined above.
cfg_dict_data = {
    "sirene": "sirene_4",
    "start_month": start_month,
    "start_year": start_year,
    "revision": revision,
    "text_feature": text_feature,
    "textual_features": [textual_features_1, textual_features_2],
    "categorical_features": [
        categorical_features_1,
        categorical_features_2,
        categorical_features_3,
        categorical_features_4,
        categorical_features_5,
    ],
}

# Full configuration: the data section is the very same dict object as
# `cfg_dict_data`, the model section carries the evaluation settings.
cfg_dict = {
    "data": cfg_dict_data,
    "model": {
        "name": "fastText",
        "preprocessor": "fastText",
        "test_params": {
            "test_batch_size": 256,
            "run_id": "runs:/45afc22a961a4cdcb282aad93693326d/model",
        },
    },
}

# Project helpers: NAF nomenclature, label set, and the test data for this revision.
df_naf = get_df_naf(revision=cfg_dict_data["revision"])
Y = get_Y(revision=revision)
df_test_ls = get_test_data(**cfg_dict_data, y=Y)

In [None]:
# Label set for the configured revision (same revision value as the earlier get_Y call).
Y = get_Y(revision=cfg_dict["data"]["revision"])
# Processed splits for that revision; only `df_test` is used in the cells below.
df_train, df_val, df_test = get_processed_data(revision=cfg_dict["data"]["revision"])

In [None]:
# Exploratory leftover: lists the attributes exposed by `mlflow.pyfunc`.
# Nothing later in the notebook uses this output.
dir(mlflow.pyfunc)

In [None]:
import os

import mlflow

# S3 endpoint MLflow should use when fetching model artifacts.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = 'https://minio.lab.sspcloud.fr'
# URI of the logged model inside an MLflow run.
# NOTE(review): a `runs:/` URI is resolved through the MLflow tracking server,
# but `mlflow.set_tracking_uri(...)` is only called in a later cell — presumably
# MLFLOW_TRACKING_URI is already set in the environment; confirm.
logged_model = 'runs:/d22c02c2df384c549c315b49af338988/default'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [None]:
# Predict on the test split with the fastText baseline.
# The name `fasttext` is never defined or imported in this notebook, so the
# original call failed on a fresh kernel; the pyfunc model loaded as
# `loaded_model` is the fastText baseline object that is actually in scope.
# NOTE(review): downstream cells index the result as [0] -> labels and
# [1] -> scores — confirm the pyfunc wrapper returns that pair.
fasttext_preds = loaded_model.predict(df_test)

In [None]:
# Select the MLflow tracking server used to resolve `runs:/` model URIs.
mlflow.set_tracking_uri("https://projet-ape-mlflow.user.lab.sspcloud.fr/")
# Despite its name, this config entry holds a full model URI
# ("runs:/<run id>/model"), not a bare run id.
run_id = cfg_dict["model"]["test_params"]["run_id"]
# Load the logged PyTorch model that is evaluated below.
module = mlflow.pytorch.load_model(run_id)

In [None]:
# Keep one CPU free for the main process. `os.cpu_count()` may return None
# when the count cannot be determined, so fall back to a single CPU
# (i.e. zero extra workers) instead of raising a TypeError on `None - 1`.
n_workers = max((os.cpu_count() or 1) - 1, 0)

# Run the torchFastText model on the test split; the data configuration is
# forwarded as keyword arguments to the project evaluator.
torch_preds = torchFastTextEvaluator(module).get_preds(
    df=df_test,
    Y=Y,
    **cfg_dict["data"],
    batch_size=cfg_dict["model"]["test_params"]["test_batch_size"],
    num_workers=n_workers,
)

In [None]:
# Aggregate the raw predictions into a results frame; the next cell reads its
# "APE_NIV5" (ground truth) and "APE_NIV5_pred_k1" (prediction) columns.
df_res = torchFastTextEvaluator(module).get_aggregated_preds(df=df_test, predictions=torch_preds, Y=Y)

In [None]:
# Reference labels and the torchFastText prediction column, both taken from
# the aggregated results frame.
# NOTE(review): "_pred_k1" presumably denotes the top-1 prediction — confirm.
ground_truth = df_res["APE_NIV5"]
torchft_preds = df_res["APE_NIV5_pred_k1"]

In [None]:
# `fasttext_preds[0]` holds the predicted labels: for each sample keep the
# first label and only its last five characters.
# NOTE(review): presumably strips a fastText-style label prefix to leave the
# 5-character APE code — confirm.
fasttext_preds_labels = [label[0][-5:] for label in fasttext_preds[0]]

# Preview a few labels instead of dumping the whole test set into the output.
fasttext_preds_labels[:10]

In [None]:
# Top-1 agreement rates. Comparisons against `fasttext_preds_labels` are
# positional, so they assume the results frame keeps the row order of the
# test split — NOTE(review): confirm.
print(f"torchFastText vs fastText agreement: {(torchft_preds == fasttext_preds_labels).mean()}")
print(f"torchFastText accuracy: {(torchft_preds == ground_truth).mean()}")
print(f"fastText accuracy: {(ground_truth == fasttext_preds_labels).mean()}")

In [None]:
# Confidence diagnostics for the torchFastText predictions: the project helper
# returns the sorted confidences, a correctness indicator, and the per-sample
# confidence / predicted class / true value used by the threshold sweep below.
sorted_confidence, well_predicted, predicted_confidence, predicted_class, true_values = (
    sort_and_get_pred(predictions=torch_preds, df=df_test, Y=Y)
)
# Histogram of confidences for the torchFastText model.
fig1 = confidence_histogram(sorted_confidence, well_predicted, df=df_test)

In [None]:
def get_automatic_accuracy(thresholds, predicted_confidence, predicted_class, true_values):
    """
    Coding rate and accuracy of automatic coding for a range of confidence thresholds.

    A sample is coded automatically when its confidence is strictly greater
    than the threshold. For every threshold, compute the share of samples
    coded automatically and the accuracy restricted to those samples.

    Args:
        thresholds (array-like of float, n_thresholds values): Candidate
            thresholds for automatic coding.
        predicted_confidence (array-like of float, n_samples values):
            Confidence of each prediction.
        predicted_class (array-like, n_samples values): Predicted class of
            each sample (any dtype that supports ``==``, e.g. ints or strings).
        true_values (array-like, n_samples values): True class of each sample.

        Every argument is converted with ``np.asarray`` and flattened, so
        lists and column vectors of shape (n_samples, 1) are accepted too.

    Returns:
        automatic_coding_rate (np.ndarray, shape (n_thresholds,)): Fraction of
            samples whose confidence exceeds each threshold.
        accuracy_automatic (np.ma.MaskedArray, shape (n_thresholds,)): Accuracy
            on the automatically coded samples; entries are masked for
            thresholds that no sample exceeds.

    """
    thresholds = np.asarray(thresholds).reshape(-1)
    predicted_confidence = np.asarray(predicted_confidence).reshape(-1)
    predicted_class = np.asarray(predicted_class).reshape(-1)
    true_values = np.asarray(true_values).reshape(-1)

    # (n_samples, n_thresholds): True where the sample is coded automatically.
    automatic_coding_mask = predicted_confidence[:, None] > thresholds[None, :]
    automatic_coding_rate = automatic_coding_mask.mean(axis=0)

    # Correctness does not depend on the threshold: compute it once per sample
    # instead of repeating both class arrays n_thresholds times.
    is_correct = predicted_class == true_values
    n_automatic = automatic_coding_mask.sum(axis=0)
    n_correct = (automatic_coding_mask & is_correct[:, None]).sum(axis=0)

    # Guard the division for thresholds with no coded sample; those entries
    # are masked in the result, so the placeholder value is never exposed.
    accuracy_automatic = np.ma.masked_array(
        n_correct / np.maximum(n_automatic, 1),
        mask=n_automatic == 0,
    )
    return automatic_coding_rate, accuracy_automatic




In [None]:
# 100 evenly spaced confidence thresholds between 0 and 1.
thresholds = np.linspace(0, 1, 100)
# (coding rate, accuracy) per threshold for the torchFastText model.
# NOTE(review): `true_values` is passed as-is while the other two inputs are
# converted with `.numpy()` — confirm it is already array-like.
torchft_plot =  get_automatic_accuracy(thresholds, predicted_confidence.numpy(), predicted_class.numpy(), true_values)
# `fasttext_preds[1]` holds the fastText scores; both scores and labels are
# turned into arrays (this rebinds `fasttext_preds_labels` from list to array).
fasttext_preds_scores = np.array(fasttext_preds[1])
fasttext_preds_labels = np.array(fasttext_preds_labels)
# Same sweep for fastText; scores are flattened and clipped into [0, 1] first.
ft_plot =  get_automatic_accuracy(thresholds, np.clip(fasttext_preds_scores.reshape(-1), 0, 1), fasttext_preds_labels.reshape(-1), ground_truth.values)


In [None]:
import numpy as np
import plotly.graph_objects as go

# Relies on `thresholds`, `torchft_plot` and `ft_plot` computed in the previous cell.

# Keep only thresholds at which at least one sample is coded automatically.
mask_torchft = torchft_plot[0] > 0
mask_ft = ft_plot[0] > 0

# Accuracy against automatic-coding rate, one marker per threshold and one
# trace per model; hovering a marker shows the threshold it corresponds to.
fig = go.Figure()
for trace_name, (coding_rate, accuracy), keep in (
    ("torchft", torchft_plot, mask_torchft),
    ("ft", ft_plot, mask_ft),
):
    fig.add_trace(
        go.Scatter(
            x=coding_rate[keep],
            y=accuracy[keep],
            mode="markers",
            hoverinfo="text",
            text=[f"Threshold: {thresh}" for thresh in thresholds[keep]],
            name=trace_name,
        )
    )

# Axis titles, legend styling and figure size.
legend_style = {
    "x": 1,
    "y": 1,
    "traceorder": "normal",
    "font": {"family": "sans-serif", "size": 12, "color": "black"},
    "bgcolor": "LightSteelBlue",
    "bordercolor": "Black",
    "borderwidth": 2,
}
fig.update_layout(
    xaxis_title="Pourcentage de codif automatique",
    yaxis_title="Accuracy",
    legend=legend_style,
    width=800,
    height=600,
)

# Render the figure.
fig.show()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Per-sample correctness of the fastText predictions.
# Note: this rebinds `well_predicted`, `df` and `fig` from earlier cells.
well_predicted = (ground_truth == fasttext_preds_labels)
print(well_predicted.shape)
df = pd.DataFrame(
    {
        "confidence_score": fasttext_preds_scores.reshape((-1,)),
        "well_predicted": well_predicted,
    }
)

# Confidence distribution split by correctness. The axes object is passed
# explicitly so the plot does not depend on pyplot's "current axes" state,
# and the figure is titled and labelled so it can be read on its own.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x="confidence_score",
    bins=100,
    hue="well_predicted",
    stat="percent",
    ax=ax,
)
ax.set(
    title="fastText confidence distribution by prediction correctness",
    xlabel="Confidence score",
    ylabel="Percent",
);