In [None]:
import os
import sys

import numpy as np
import pandas as pd

from framework_classes import PREPROCESSORS
from utils.data import get_all_data, get_df_naf, get_Y

%load_ext autoreload
%autoreload 2

In [None]:
import json

# Read one JSON file per listed key and collect the parsed contents in
# `mappings`, keyed by that name.
# NOTE(review): this cell reads from "../../data/mappings/" whereas the cell
# further down that writes the mapping files uses "../data/mappings/" —
# confirm which relative path is the intended one.
# NOTE(review): "apet_finale" is loaded here but the writer cell only iterates
# over `categorical_features` (which includes "SRF", not loaded here) — verify
# the two key sets are meant to differ.
# The loop names (`key`, `file`, `f`) stay in the notebook namespace after this
# cell runs; a later cell builds `params` by scanning `locals()`, so they are
# deliberately left unchanged.
mappings = {}
for key in ["apet_finale", "CJ", "CRT", "NAT", "TYP"]:

    file = f"../../data/mappings/{key}_mapping.json"
    with open(file, "r") as f:
        mappings[key] = json.load(f)

In [None]:
# Run configuration.
# A later cell gathers every variable whose name starts with
# `textual_features_` / `categorical_features_` by scanning `locals()`, in the
# order the variables were first defined. The names AND the order of the
# assignments below are therefore significant — do not rename or reorder them
# without updating that cell.
revision = "NAF2008"  # passed to get_Y, get_all_data and get_df_naf
model_class = "torchFastText"  # not referenced by any other visible cell
start_month = 1  # forwarded to get_all_data
start_year = 2018  # forwarded to get_all_data
text_feature = "libelle"  # main text column handed to the preprocessor
# Additional text columns (collected into `textual_features`).
textual_features_1 = "NAT_LIB"
textual_features_2 = "AGRI"
# Categorical columns (collected into `categorical_features`, in this order).
categorical_features_1 = "TYP"
categorical_features_2 = "NAT"
categorical_features_3 = "SRF"
categorical_features_4 = "CJ"
categorical_features_5 = "CRT"

In [None]:
# Target label selector for the chosen revision; used below as `y=Y` in the
# preprocessing call and to index the training frame (`df_train[Y]`).
Y = get_Y(revision=revision)
# Build the run parameters and the feature lists from the variables defined so
# far in the notebook.
#
# The namespace is snapshotted once so that all four collections are derived
# from the same consistent view instead of three separate live `locals()` scans.
# NOTE(review): at notebook top level `locals()` is the whole interactive
# namespace, so `params` also contains imported modules, helper objects and any
# data already loaded — confirm this is intended before logging `params`.
_namespace = dict(locals())

params = {
    key: value
    for key, value in _namespace.items()
    if key
    not in [
        "remote_server_uri",
        "experiment_name",
        "run_name",
        "revision",
        "Y",
        "model_class",
        "text_feature",
        "pre_training_weights",
        "start_month",
        "start_year",
        # This cell's own output: without this entry a re-run would nest the
        # previous `params` dict inside the new one.
        "params",
    ]
    and not key.startswith(("textual_features", "categorical_features", "embedding_dim"))
}
params["thread"] = os.cpu_count()

# Values are collected in the order the variables were first defined.
textual_features = [
    value for key, value in _namespace.items() if key.startswith("textual_features_")
]
categorical_features = [
    value for key, value in _namespace.items() if key.startswith("categorical_features_")
]
# `embedding_dims` itself matches the "embedding_dim" prefix; skip it so that
# re-running the cell does not put the previous list inside the new one.
embedding_dims = [
    value
    for key, value in _namespace.items()
    if key.startswith("embedding_dim") and key != "embedding_dims"
]

# Remove the temporary snapshot so it does not linger in the namespace and get
# picked up by the next run of this cell.
del _namespace

In [None]:
# Load the raw inputs for the configured revision.
# `get_all_data` yields two frames (bound to df_s3 and df_s4), restricted by the
# configured start month/year.
df_s3, df_s4 = get_all_data(
    revision=revision,
    start_month=start_month,
    start_year=start_year,
)
# Detailed NAF reference table for the same revision.
df_naf = get_df_naf(revision=revision)

In [None]:
# Work on a 1% random subset of each source frame to keep the notebook fast.
# A fixed `random_state` makes the subset — and everything derived from it,
# including the mapping files written further down — reproducible across
# Restart & Run All.
df_s4_samp = df_s4.sample(frac=0.01, random_state=0)
df_s3_samp = df_s3.sample(frac=0.01, random_state=0)

In [None]:
# Look up the entry registered under "PyTorch" in PREPROCESSORS and instantiate
# it with no arguments.
preprocessor = PREPROCESSORS["PyTorch"]()

In [None]:
# Preprocess data
# Only the Sirene 4 sample is processed by the live code in this cell; the
# commented-out blocks are earlier variants kept for reference.
# Sirene 4
# Earlier variant that unpacked a (train, val, test) split:
# df_train_s4, df_val_s4, df_test = preprocessor.preprocess(
#     df=df_s4_samp,
#     df_naf=df_naf,
#     y=Y,
#     text_feature=text_feature,
#     textual_features=textual_features,
#     categorical_features=categorical_features,
#     test_size=0.1,
# )
# Current variant: the result is bound to a single name and `mapping=True` is
# passed.
# NOTE(review): `test_size=0.1` is still supplied although no split is
# unpacked — confirm what `preprocess` returns with `mapping=True`.
df_train_s4  = preprocessor.preprocess(
    df=df_s4_samp,
    df_naf=df_naf,
    y=Y,
    text_feature=text_feature,
    textual_features=textual_features,
    categorical_features=categorical_features,
    test_size=0.1,
    mapping=True
)
# # Get test_data from LabelStudio
# df_test_ls = pd.concat(
#     preprocessor.preprocess(
#         get_test_data(revision=revision, y=Y),
#         df_naf,
#         Y,
#         text_feature,
#         textual_features,
#         categorical_features,
#         add_codes=False,
#     ),
#     axis=0,
# )
# Sirene 3
# if df_s3.empty:
#     df_train = df_train_s4
# else:
#     df_train_s3, df_val_s3, df_test_s3 = preprocessor.preprocess(
#         df_s3_samp, df_naf, Y, text_feature, textual_features, categorical_features, recase=True, s3=True
#     )
#     # All train data together
#     df_s3_processed = pd.concat([df_train_s3, df_val_s3, df_test_s3])
#     df_train = pd.concat([df_s3_processed, df_train_s4], axis=0).reset_index(drop=True)

# The training frame is simply the preprocessed Sirene 4 sample.
df_train = df_train_s4
# NOTE(review): the validation assignment below is disabled, so `df_val` is
# never defined in the visible notebook, yet a later cell reads `df_val` —
# that cell raises NameError on a fresh kernel.
#df_val = df_val_s4

In [None]:
# Number of missing values in each categorical column of the training frame.
df_train_s4.loc[:, categorical_features].isna().sum()

In [None]:
import json

# Rebuild and persist one mapping file per categorical feature: each distinct
# value is assigned its rank (0, 1, 2, ...) in the `value_counts()` ordering of
# the training frame.
# NOTE(review): files are written under "../data/mappings/" while the loading
# cell near the top reads from "../../data/mappings/" — confirm the intended
# location.
for variable in categorical_features:
    sorted_by_count_unique_values = df_train_s4[variable].value_counts()
    mapping = dict(
        zip(
            sorted_by_count_unique_values.index,
            range(len(sorted_by_count_unique_values)),
        )
    )

    # Persist the mapping as JSON.
    with open(f"../data/mappings/{variable}_mapping.json", "w") as f:
        json.dump(mapping, f, indent=3)

In [None]:
# Inspect the training rows whose "CJ" value is missing.
df_train_s4.loc[df_train_s4["CJ"].isna()]

In [None]:
# Pull the raw arrays for the text column and the categorical columns out of
# the training and validation frames.
train_text = df_train[text_feature].values
train_categorical_variables = df_train[categorical_features].values

# NOTE(review): `df_val` is not assigned by any visible cell (its assignment is
# commented out in the preprocessing cell), so these two lines raise NameError
# on a fresh kernel.
val_text = df_val[text_feature].values
val_categorical_variables = df_val[categorical_features].values

In [None]:
# Count the missing values in the label selection `Y` of the training frame.
df_train[Y].isna().sum()

In [None]:
# Display the categorical array extracted from the training frame.
train_categorical_variables

In [None]:
# Count missing entries in the categorical array.
# `array == np.nan` is always False (NaN never compares equal, not even to
# itself), so the equality test reported 0 regardless of the data. `pd.isna`
# performs a real element-wise missing-value check and also works on
# object-dtype arrays.
pd.isna(train_categorical_variables).sum()

In [None]:
# Vocabulary size per categorical column: largest value seen in the training
# array for that column, plus one.
categorical_vocab_sizes = train_categorical_variables.max(axis=0) + 1
print("cat ", categorical_vocab_sizes)