In [None]:
import os

import pandas as pd

from constants import FRAMEWORK_CLASSES
from utils.data import get_all_data, get_df_naf, get_Y

%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Resolve the sibling torch-fastText checkout relative to the notebook's working directory.
project_root = os.getcwd()  # adjust if the notebook lives in a subdirectory

# Register both the torch-fastText root and its inner package directory.
# Paths are normalised (no "..") and only appended when missing, so re-running this cell
# does not keep growing sys.path with duplicate entries.
for _extra_path in (
    os.path.join(project_root, "../torch-fastText"),
    os.path.join(project_root, "../torch-fastText", "torchFastText"),
):
    _extra_path = os.path.normpath(_extra_path)
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)
# Drop the loop variable: a later cell scans the namespace via locals(), keep it clean.
del _extra_path

# For debugging: show the effective import search path.
print("Python path:", sys.path)

In [None]:
# Run configuration. These are plain module-level names on purpose: a later cell
# scans the namespace with locals() and groups variables by name prefix, so renaming
# or restructuring them changes what that cell collects.

# Passed to get_Y, get_all_data and get_df_naf.
revision = "NAF2008"
# Key used to look up the entry in FRAMEWORK_CLASSES.
model_class = "torchFastText"
# Forwarded to get_all_data together with the revision.
start_month = 1
start_year = 2018
# Forwarded to the preprocessor as `text_feature`.
text_feature = "libelle"
# Every `textual_features_<n>` value is gathered, in definition order, into `textual_features`.
textual_features_1 = "NAT_LIB"
textual_features_2 = "AGRI"
# Every `categorical_features_<n>` value is gathered, in definition order, into
# `categorical_features`.
categorical_features_1 = "TYP"
categorical_features_2 = "NAT"
categorical_features_3 = "SRF"
categorical_features_4 = "CJ"
categorical_features_5 = "CRT"

In [None]:
# Target for the selected revision.
Y = get_Y(revision=revision)

# Everything in the namespace that is not an explicitly excluded name and does not belong
# to one of the prefixed feature groups ends up in `params`.
# NOTE(review): at notebook top level locals() is the whole kernel namespace, so this also
# captures modules, imported helpers and IPython internals — confirm what consumes `params`.
params = {
    key: value
    for key, value in locals().items()
    if key
    not in (
        "remote_server_uri",
        "experiment_name",
        "run_name",
        "revision",
        "Y",
        "model_class",
        "text_feature",
        "pre_training_weights",
        "start_month",
        "start_year",
        # Skip our own previous value so re-running the cell does not nest params in params.
        "params",
    )
    and not key.startswith(("textual_features", "categorical_features", "embedding_dim"))
}
params["thread"] = os.cpu_count()

# Gather the numbered configuration variables into ordered lists (namespace insertion order).
textual_features = [value for key, value in locals().items() if key.startswith("textual_features_")]
categorical_features = [
    value for key, value in locals().items() if key.startswith("categorical_features_")
]
# "embedding_dims" itself matches the "embedding_dim" prefix; exclude it so that re-running
# the cell does not wrap the previous list inside the new one.
embedding_dims = [
    value
    for key, value in locals().items()
    if key.startswith("embedding_dim") and key != "embedding_dims"
]

In [None]:
# Display the values gathered from the `textual_features_*` variables.
textual_features

In [None]:
# Look up the entry registered for the selected model class.
framework_classes = FRAMEWORK_CLASSES[model_class]

# Instantiate whatever is registered under the "preprocessor" key (called with no arguments).
preprocessor = framework_classes["preprocessor"]()

# Load data
# Sirene 3 and Sirene 4 (one call returns both; revision and start month/year are passed through)
df_s3, df_s4 = get_all_data(revision=revision, start_month=start_month, start_year=start_year)
# Detailed NAF for the same revision
df_naf = get_df_naf(revision=revision)

In [None]:
# Preprocess data
# Sirene 4: the preprocessor returns three frames (train, validation, test).
# NOTE(review): test_size=0.1 presumably controls the held-out share — confirm in the preprocessor.
df_train_s4, df_val_s4, df_test = preprocessor.preprocess(
    df=df_s4,
    df_naf=df_naf,
    y=Y,
    text_feature=text_feature,
    textual_features=textual_features,
    categorical_features=categorical_features,
    test_size=0.1,
)
# Disabled: test data from LabelStudio (get_test_data is not imported in this notebook).
# # Get test_data from LabelStudio
# df_test_ls = pd.concat(
#     preprocessor.preprocess(
#         get_test_data(revision=revision, y=Y),
#         df_naf,
#         Y,
#         text_feature,
#         textual_features,
#         categorical_features,
#         add_codes=False,
#     ),
#     axis=0,
# )
# Sirene 3: optional extra training data, only used when the frame is not empty.
if df_s3.empty:
    df_train = df_train_s4
else:
    # Same keyword arguments as the Sirene 4 call; without test_size the returned frames
    # are all stacked into a single training frame.
    df_train_s3 = pd.concat(
        preprocessor.preprocess(
            df=df_s3,
            df_naf=df_naf,
            y=Y,
            text_feature=text_feature,
            textual_features=textual_features,
            categorical_features=categorical_features,
            recase=True,
        ),
        axis=0,
    )
    # All train data together
    df_train = pd.concat([df_train_s3, df_train_s4], axis=0).reset_index(drop=True)
# Shuffle in case it is not done later; the fixed seed makes the row order reproducible
# across kernel restarts (the index keeps the pre-shuffle labels, as before).
df_train = df_train.sample(frac=1, random_state=0)

In [None]:
# Display the combined, shuffled training frame.
df_train