In [None]:
import os
import sys

# Add the absolute path of `src/` to the import search path before the
# project-local imports below are executed (presumably those modules live
# under `src/` - confirm). The path is resolved against the current working
# directory, so the kernel has to be started from the directory containing `src/`.
sys.path.append(os.path.abspath("src/"))

import logging
import sys

import hydra
import mlflow
import numpy as np
import pandas as pd
import torch
from joblib import Memory
from omegaconf import DictConfig, OmegaConf

from evaluators import Evaluator
from framework_classes import (
    DATA_GETTER,
    DATASETS,
    LOSSES,
    MODELS,
    MODULES,
    OPTIMIZERS,
    PREPROCESSORS,
    SCHEDULERS,
    TOKENIZERS,
    TRAINERS,
)
from models import FastTextWrapper
from src.datasets import SoftClassifDataset
from utils.data import PATHS, get_df_naf, get_file_system, get_processed_data, get_test_data, get_Y
from utils.mappings import mappings
from utils.mlflow import create_or_restore_experiment
from utils.validation_viz import (
    calibration_curve,
    confidence_histogram,
    get_automatic_accuracy,
    sort_and_get_pred,
)

# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell execution, so edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

# Pairwise similarity between the rows of the NAF table: two rows score the
# sum of the coefficients of every hierarchy level on which they agree.
df = get_df_naf(revision="NAF2008")
display(df)

# One weight per level column, in the order APE_NIV1 ... APE_NIV5.
similarity_coefficients = np.array([0.1, 0.1, 0.1, 0.1, 0.5])

# 2-D array holding the five level columns of `df`, one row per row of `df`.
level_columns = ['APE_NIV1', 'APE_NIV2', 'APE_NIV3', 'APE_NIV4', 'APE_NIV5']
levels_matrix = df[level_columns].values

# Broadcasting compares every row with every other row, level by level:
# levels_match[i, j, k] is True when rows i and j hold the same value in level k.
levels_match = levels_matrix[:, None, :] == levels_matrix[None, :, :]

# Weighted sum over the level axis -> square (n_rows, n_rows) score matrix.
similarity_matrix = np.dot(levels_match, similarity_coefficients)

# Label both axes with the APE_NIV5 column so scores can be looked up by code.
niv5_labels = df['APE_NIV5']
similarity_df = pd.DataFrame(similarity_matrix, index=niv5_labels, columns=niv5_labels)

similarity_df


In [None]:
# Number of rows of the NAF table per distinct APE_NIV3 value.
df["APE_NIV3"].value_counts()

In [None]:
from utils.mappings import mappings

In [None]:
# Re-index the similarity table (rows and columns) following the key order of
# mappings['apet_finale'].
apet_codes = mappings['apet_finale'].keys()
ordered_similarity_df = similarity_df.loc[apet_codes, apet_codes]

# Attach to each row the value associated with its APE_NIV5 code in the mapping.
# NOTE: this adds an 'order' column to `df` itself; only `ordered_df` drops it.
df['order'] = df['APE_NIV5'].map(mappings['apet_finale'])

# Same table sorted by that value, without the helper column.
ordered_df = df.sort_values(by='order').drop(columns=['order'])
display(ordered_df)
ordered_similarity_df


In [None]:
# ---------------------------------------------------------------------------
# Experiment settings. Everything below is plain data; nothing is loaded here.
# ---------------------------------------------------------------------------
revision = "NAF2008"
model_class = "torchFastText"
start_month, start_year = 1, 2018
text_feature = "libelle"
textual_features_1, textual_features_2 = "NAT_LIB", "AGRI"
(
    categorical_features_1,
    categorical_features_2,
    categorical_features_3,
    categorical_features_4,
    categorical_features_5,
) = ("TYP", "NAT", "SRF", "CJ", "CRT")

categorical_features = [
    categorical_features_1,
    categorical_features_2,
    categorical_features_3,
    categorical_features_4,
    categorical_features_5,
]

# Data section: which extract to use and which columns feed the model.
data_section = {
    "sirene": "sirene_4",
    "start_month": start_month,
    "start_year": start_year,
    "revision": revision,
    "text_feature": text_feature,
    "textual_features": [textual_features_1, textual_features_2],
    # Independent list with the same contents as `categorical_features`.
    "categorical_features": list(categorical_features),
}

# Training section: trainer, optimisation and loss settings.
train_section = {
    "trainer_name": "Lightning",
    "num_epochs": 35,
    "patience_early_stopping": 5,
    "batch_size": 256,
    "optimizer_name": "Adam",
    "optimizer_params": {"lr": 1e-3},
    "scheduler_name": "ReduceLROnPlateau",
    "scheduler_params": {"factor": 0.5, "patience": 2, "min_lr": 1e-6},
    "loss_name": "CrossEntropyLoss",
}

# Model section: architecture parameters plus train/test settings.
model_section = {
    "model_name": "torchFastText",
    "preprocessor": "PyTorch",
    "model_params": {
        "embedding_dim": 80,
        "categorical_embedding_dims": 5,
        "sparse": False,
        "direct_bagging": False,
    },
    "train_params": train_section,
    "test_params": {
        "test_batch_size": 256,
        "run_id": 'runs:/45afc22a961a4cdcb282aad93693326d/model',
    },
}

# Tokenizer section: n-gram tokenizer settings.
tokenizer_section = {
    "tokenizer_name": "NGramTokenizer",
    "min_count": 100,
    "min_n": 3,
    "max_n": 6,
    "len_word_ngrams": 3,
    "num_tokens": 10000,
}

cfg_dict = {
    "data": data_section,
    "model": model_section,
    "tokenizer": tokenizer_section,
}
# Load the reference table, the target name and the three data splits, all
# keyed on the revision declared in the config.
cfg_dict_data = cfg_dict["data"]
naf_revision = cfg_dict_data["revision"]

df_naf = get_df_naf(revision=naf_revision)
# `Y` is used further down as a column label of the splits and as a key of `mappings`.
Y = get_Y(revision=naf_revision)
df_train, df_val, df_test = get_processed_data(revision=naf_revision)

In [None]:
# Point MLflow at the tracking server through the environment, then load the
# "latest" version of the registered PyTorch model.
tracking_uri = "https://projet-ape-mlflow.user.lab.sspcloud.fr/"
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri

model_name = "FastText-pytorch"
registered_model_uri = "models:/{}/latest".format(model_name)
module = mlflow.pytorch.load_model(registered_model_uri)

In [None]:
# Inputs taken from the test split: the text column and the categorical columns.
text = df_test[text_feature].values
categorical_variables = df_test[categorical_features].values

# Dataset built with the tokenizer carried by the current `module`.
# NOTE(review): the first coefficient here is 0.01, whereas the module-level
# `similarity_coefficients` array starts with 0.1 - confirm which is intended.
dataset_kwargs = dict(
    texts=text,
    categorical_variables=categorical_variables,
    tokenizer=module.model.tokenizer,
    outputs=df_test[Y].values,
    similarity_coefficients=[0.01, 0.1, 0.1, 0.1, 0.5],
    revision=cfg_dict["data"]["revision"],
)
dataset = SoftClassifDataset(**dataset_kwargs)

# Small, unshuffled batches.
dataloader = dataset.create_dataloader(batch_size=10, shuffle=False, num_workers=12)

In [None]:

# Work on a small random subset of each split so the following cells run quickly.
# A fixed seed makes the drawn rows reproducible across kernel restarts;
# without it every run evaluates a different subset.
# NOTE: the full splits are overwritten, so re-running this cell samples again
# from the already reduced frames - reload the data first if the cell must be re-run.
SAMPLING_SEED = 0
df_train = df_train.sample(frac=0.001, random_state=SAMPLING_SEED)
df_val = df_val.sample(frac=0.01, random_state=SAMPLING_SEED)
df_test = df_test.sample(frac=0.01, random_state=SAMPLING_SEED)

# Column selection is identical for the three splits: resolve it once.
text_column = cfg_dict["data"]["text_feature"]
categorical_columns = cfg_dict["data"]["categorical_features"]

train_text, train_categorical_variables = (
    df_train[text_column].values,
    df_train[categorical_columns].values,
)
val_text, val_categorical_variables = (
    df_val[text_column].values,
    df_val[categorical_columns].values,
)
test_text, test_categorical_variables = (
    df_test[text_column].values,
    df_test[categorical_columns].values,
)

In [None]:

# --- Tokenizer -------------------------------------------------------------
# The whole tokenizer section of the config is forwarded as keyword arguments,
# together with the training texts.
tokenizer_cls = TOKENIZERS[cfg_dict["tokenizer"]["tokenizer_name"]]
tokenizer = tokenizer_cls(**cfg_dict["tokenizer"], training_text=train_text)

# --- Sizes -----------------------------------------------------------------
# One extra row on top of the tokenizer's token and word counts; the index of
# that last row is what gets passed as `padding_idx`.
num_rows = tokenizer.num_tokens + tokenizer.get_nwords() + 1
padding_idx = num_rows - 1
num_classes = max(mappings[Y].values()) + 1

# Vocabulary size per categorical feature: largest mapped value + 1, except
# "SRF" which is hard-coded to 5.
categorical_vocab_sizes = [
    5 if feature == "SRF" else max(mappings[feature].values()) + 1
    for feature in cfg_dict["data"]["categorical_features"]
]

# --- Model -----------------------------------------------------------------
model_cls = MODELS[cfg_dict["model"]["model_name"]]
model = model_cls(
    **cfg_dict["model"]["model_params"],
    tokenizer=tokenizer,
    num_rows=num_rows,
    num_classes=num_classes,
    categorical_vocabulary_sizes=categorical_vocab_sizes,
    padding_idx=padding_idx,
)

# --- Training components -----------------------------------------------------
# The loss is instantiated here; the optimizer and scheduler entries are only
# looked up (not called) and handed over as they are.
loss = LOSSES[cfg_dict["model"]["train_params"]["loss_name"]]()
optimizer = OPTIMIZERS[cfg_dict["model"]["train_params"]["optimizer_name"]]
scheduler = SCHEDULERS[cfg_dict["model"]["train_params"]["scheduler_name"]]

# NOTE(review): this rebinds `module`, which an earlier cell assigned from
# mlflow.pytorch.load_model(...); later cells use whichever assignment ran last.
module_cls = MODULES[cfg_dict["model"]["model_name"]]
module = module_cls(
    model=model,
    loss=loss,
    optimizer=optimizer,
    scheduler=scheduler,
    **cfg_dict["model"]["train_params"],
)

In [None]:
# Look up the trainer factory named in the config and call it with the whole
# train_params section as keyword arguments.
trainer_factory = TRAINERS[cfg_dict["model"]["train_params"]["trainer_name"]]
trainer = trainer_factory(**cfg_dict["model"]["train_params"])


In [None]:
dataset_class = SoftClassifDataset

# Arguments shared by the three splits: the freshly built tokenizer, the
# configured revision and the module-level `similarity_coefficients`.
shared_dataset_kwargs = {
    "tokenizer": tokenizer,
    "revision": cfg_dict["data"]["revision"],
    "similarity_coefficients": similarity_coefficients,
}

# One dataset per split; only the texts, categorical inputs and targets differ.
train_dataset = dataset_class(
    texts=train_text,
    categorical_variables=train_categorical_variables,
    outputs=df_train[Y].values,
    **shared_dataset_kwargs,
)
val_dataset = dataset_class(
    texts=val_text,
    categorical_variables=val_categorical_variables,
    outputs=df_val[Y].values,
    **shared_dataset_kwargs,
)
test_dataset = dataset_class(
    texts=test_text,
    categorical_variables=test_categorical_variables,
    outputs=df_test[Y].values,
    **shared_dataset_kwargs,
)

In [None]:
# The three dataloaders are created with the same keyword arguments: the whole
# train_params section of the config.
# NOTE(review): that section also holds keys such as "trainer_name" or
# "num_epochs"; presumably create_dataloader ignores the ones it does not use - confirm.
dataloader_kwargs = cfg_dict["model"]["train_params"]

train_dataloader = train_dataset.create_dataloader(**dataloader_kwargs)
val_dataloader = val_dataset.create_dataloader(**dataloader_kwargs)
test_dataloader = test_dataset.create_dataloader(**dataloader_kwargs)

In [None]:
def run_eval(df, dataloader, suffix='val'):
    """
    Predict on `dataloader`, aggregate the predictions against `df` and
    display both the aggregated predictions and the accuracies.

    Relies on the notebook-level names `trainer`, `module` and `Y`.
    NOTE(review): `module` is whichever object was bound last; in the cells
    visible in this notebook no fit call precedes this evaluation - confirm
    the intended model is the one being evaluated.

    Args:
        df: frame handed to `Evaluator.get_aggregated_preds` together with the
            predictions. Assumed to be row-aligned with `dataloader`
            (i.e. the loader is not shuffled) - TODO confirm.
        dataloader: batches passed to `trainer.predict`.
        suffix: label forwarded to `Evaluator.compute_accuracies`
            (e.g. 'val' or 'test').

    Returns:
        The object returned by `Evaluator.get_aggregated_preds` (top_k=1).
    """
    # `trainer.predict` yields one tensor per batch; stack them along the first
    # axis and convert to NumPy - expected shape (num_samples, num_classes).
    batch_predictions = trainer.predict(module, dataloader)
    predictions_array = torch.cat(batch_predictions).cpu().numpy()

    aggregated_results = Evaluator.get_aggregated_preds(
        df=df,
        Y=Y,
        predictions=predictions_array,
        top_k=1,
    )
    display(aggregated_results)

    # The accuracies used to be computed and then dropped; show them so the
    # evaluation actually reports its metric.
    accuracy = Evaluator.compute_accuracies(aggregated_preds=aggregated_results, suffix=suffix)
    display(accuracy)

    return aggregated_results

# run_eval already displays what it computes. Binding the return values keeps
# the aggregated predictions of both splits available for later cells and
# avoids rendering the last result a second time as the cell output.
val_results = run_eval(df_val, val_dataloader, suffix='val')
test_results = run_eval(df_test, test_dataloader, suffix='test')