In [None]:
import os
import sys

sys.path.append('src/')

import logging
import sys

import hydra
import mlflow
import numpy as np
import pandas as pd
import torch
from joblib import Memory
from omegaconf import DictConfig, OmegaConf

from framework_classes import (
    DATA_GETTER,
    DATASETS,
    LOSSES,
    MODELS,
    MODULES,
    OPTIMIZERS,
    PREPROCESSORS,
    SCHEDULERS,
    TOKENIZERS,
    TRAINERS,
)
from utils.data import get_df_naf, get_file_system, get_Y
from utils.mappings import mappings
from utils.mlflow import create_or_restore_experiment

%load_ext autoreload
%autoreload 2

In [None]:
# Run parameters. These are plain notebook-level values; the configuration
# dictionary built in the next cell reads most of them.
revision, model_class = "NAF2008", "torchFastText"

# Presumably the earliest period of data to load -- confirm against the data getter.
start_year, start_month = 2018, 1

# Column names, grouped by the role they are given in the configuration.
text_feature = "libelle"
textual_features_1, textual_features_2 = "NAT_LIB", "AGRI"
(
    categorical_features_1,
    categorical_features_2,
    categorical_features_3,
    categorical_features_4,
    categorical_features_5,
) = ("TYP", "NAT", "SRF", "CJ", "CRT")

In [None]:
# Nested configuration built from the run parameters defined in the previous
# cell. The "data" section is forwarded to the data getter and preprocessor;
# the "model" section selects the preprocessor implementation.
cfg_dict = {
    "data": {
        "sirene": "sirene_4",
        "start_month": start_month,
        "start_year": start_year,
        "revision": revision,
        "text_feature": text_feature,
        "textual_features": [textual_features_1, textual_features_2],
        "categorical_features": [
            categorical_features_1,
            categorical_features_2,
            categorical_features_3,
            categorical_features_4,
            categorical_features_5,
        ],
    },
    "model": {
        # Reuse the `model_class` run parameter instead of repeating the
        # literal, so changing the parameter actually changes the config.
        "name": model_class,
        "preprocessor": "PyTorch",
    },
}

In [None]:
def load_or_preprocess_data(cfg_dict_data, cfg_dict_model_preprocessor, test_size=0.1):
    """
    Fetch the raw Sirene data and split it into train / validation / test sets.

    Sirene 4 data is mandatory and is the only source of the validation and
    test sets. When the data getter also returns Sirene 3 data, every
    preprocessed Sirene 3 row (its train, validation and test parts alike) is
    merged into the training set.

    Nothing is cached here: each call fetches and preprocesses the data again.

    Args:
        cfg_dict_data: Mapping with the data configuration. The keys read
            here are "sirene", "revision", "text_feature",
            "textual_features" and "categorical_features"; the whole mapping
            is also forwarded as keyword arguments to the data getter.
        cfg_dict_model_preprocessor: Key selecting the preprocessor class in
            ``PREPROCESSORS``.
        test_size: Value forwarded as ``test_size`` to the preprocessor
            (defaults to 0.1, the value previously hard-coded).

    Returns:
        Tuple ``(df_train, df_val, df_test, Y)``.

    Raises:
        ValueError: If the data getter returns no Sirene 4 data.
        AssertionError: If rows are lost while merging Sirene 3 into the
            training set.
    """
    # Fetch data
    df_s3, df_s4 = DATA_GETTER[cfg_dict_data["sirene"]](**cfg_dict_data)

    # Fail fast: without Sirene 4 there is nothing to build val/test from,
    # so do not bother fetching labels or building a preprocessor.
    if df_s4 is None:
        raise ValueError("Sirene 4 data should be provided.")

    Y = get_Y(revision=cfg_dict_data["revision"])
    df_naf = get_df_naf(revision=cfg_dict_data["revision"])

    # Preprocess data
    preprocessor = PREPROCESSORS[cfg_dict_model_preprocessor]()

    # Arguments shared by the Sirene 4 and Sirene 3 preprocessing calls.
    shared_kwargs = {
        "df_naf": df_naf,
        "y": Y,
        "text_feature": cfg_dict_data["text_feature"],
        "textual_features": cfg_dict_data["textual_features"],
        "categorical_features": cfg_dict_data["categorical_features"],
        "test_size": test_size,
    }

    df_train_s4, df_val, df_test = preprocessor.preprocess(df=df_s4, **shared_kwargs)

    if df_s3 is None:
        return df_train_s4, df_val, df_test, Y

    df_train_s3, df_val_s3, df_test_s3 = preprocessor.preprocess(
        df=df_s3, s3=True, **shared_kwargs
    )

    # Merge Sirene 3 into the training set
    df_s3_processed = pd.concat([df_train_s3, df_val_s3, df_test_s3])
    df_train = pd.concat([df_s3_processed, df_train_s4]).reset_index(drop=True)

    # Assert no data was lost
    assert len(df_s3) == len(df_s3_processed), (
        f"Sirene 3 preprocessing changed the row count: "
        f"{len(df_s3)} -> {len(df_s3_processed)}"
    )
    assert len(df_train_s4) + len(df_s3) == len(df_train), (
        f"Merged training set has {len(df_train)} rows, "
        f"expected {len(df_train_s4) + len(df_s3)}"
    )

    return df_train, df_val, df_test, Y

# ---- Data ----
# Build the train / validation / test splits and the labels from the
# "data" section of the config and the configured preprocessor key.
df_train, df_val, df_test, Y = load_or_preprocess_data(
    cfg_dict_data=cfg_dict["data"],
    cfg_dict_model_preprocessor=cfg_dict["model"]["preprocessor"],
)



In [None]:

# Filesystem handle passed to the parquet writer and reader in the cells below.
fs = get_file_system()

# Destination of each split; all three live in the same directory.
file_path_train, file_path_val, file_path_test = (
    "projet-ape/model_comparison_splits/sirene4_20230101_20250211/df_train.parquet",
    "projet-ape/model_comparison_splits/sirene4_20230101_20250211/df_val.parquet",
    "projet-ape/model_comparison_splits/sirene4_20230101_20250211/df_test.parquet",
)

In [None]:
print('Start saving data')

# Name of the free-text column, taken from the config so that this cell stays
# consistent with the feature extraction further down.
text_column = cfg_dict["data"]["text_feature"]

dfs = [df_train, df_val, df_test]
file_paths = [file_path_train, file_path_val, file_path_test]

for split_df, file_path in zip(dfs, file_paths):
    # Cast the text column to str before writing. This mutates the split in
    # place, so the arrays extracted below see the casted values as well.
    split_df[text_column] = split_df[text_column].astype(str)
    split_df.to_parquet(file_path, filesystem=fs, index=False, engine="pyarrow")

print('Data saved')

# NOTE(review): no MLflow run is started in the visible part of this notebook;
# the fluent API will implicitly create one here -- confirm this is intended.
mlflow.log_param("number_of_training_observations", df_train.shape[0])

# Raw numpy views of the model inputs for the training and validation splits.
categorical_columns = cfg_dict["data"]["categorical_features"]
train_text = df_train[text_column].values
train_categorical_variables = df_train[categorical_columns].values
val_text = df_val[text_column].values
val_categorical_variables = df_val[categorical_columns].values

In [None]:
# Print a summary of the training split (columns, dtypes, non-null counts).
df_train.info()

In [None]:
# Read the training split back from storage to check that the write
# round-trips. The path must match the variable name: this cell previously
# loaded `file_path_val` into `df_train_read`.
df_train_read = pd.read_parquet(file_path_train, filesystem=fs)

# Preview a few rows only rather than rendering the whole frame.
df_train_read.head()