In [None]:
import os
import sys

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import s3fs
from sklearn.preprocessing import LabelEncoder

from torchFastText import torchFastText
from torchFastText.preprocess import (
    clean_and_tokenize_df,
    clean_text_feature,
    stratified_split_rare_labels,
)

%load_ext autoreload
%autoreload 2

In [None]:

def categorize_surface(
    df: pd.DataFrame, surface_feature_name: str, like_sirene_3: bool = True
) -> pd.DataFrame:
    """
    Replace a numeric surface column by an integer surface category.

    The input DataFrame is not modified; a modified copy is returned and no
    temporary column is added to it.

    Args:
        df (pd.DataFrame): DataFrame holding the surface column.
        surface_feature_name (str): Name of the surface column. Its values must be
            convertible to float; the string "nan" is treated as a missing value.
        like_sirene_3 (bool): If True, bin the raw surface with the edges
            (0, 120, 400, 2500, inf). Otherwise bin the natural logarithm of the
            surface with the edges (0, 3, 4, 5, 12). Bins are right-inclusive.

    Returns:
        pd.DataFrame: Copy of ``df`` whose surface column holds the bin number
        (1 to 4), or 0 when the surface is missing or falls outside every bin.

    Raises:
        ValueError: If the column is absent from ``df``, or if the float
            conversion rejects one of its values.
    """
    # Validate before touching the column, so that a missing column yields the
    # intended ValueError rather than a KeyError from the lookup below.
    if surface_feature_name not in df.columns:
        raise ValueError(f"Surface feature {surface_feature_name} not found in DataFrame.")

    df_copy = df.copy()
    surface = df_copy[surface_feature_name].replace("nan", np.nan).astype(float)

    if like_sirene_3:
        values, bins = surface, [0, 120, 400, 2500, np.inf]
    else:
        # The logarithm is taken on the converted float values (not on the raw
        # input column, which may still contain "nan" strings).
        values, bins = np.log(surface), [0, 3, 4, 5, 12]

    # labels=False returns the 0-based bin index, NaN for values outside the bins;
    # shifting by one and mapping NaN to 0 gives the categories 0..4 directly.
    bin_index = pd.cut(values, bins=bins, labels=False)
    df_copy[surface_feature_name] = (bin_index + 1).fillna(0).astype(int)
    return df_copy


def clean_and_tokenize_df(
    df,
    categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"],
    text_feature="libelle_processed",
    label_col="apet_finale",
):
    """
    Rename, integer-encode and reorder the columns of the raw DataFrame.

    NOTE: this definition has the same name as the function imported from
    ``torchFastText.preprocess`` at the top of the notebook and therefore
    replaces it once this cell has run.

    Args:
        df (pd.DataFrame): Raw data. It must contain the columns renamed below
            ("evenement_type", "cj", "activ_nat_et", "liasse_type",
            "activ_surf_et", "activ_perm_et"), ``text_feature`` and ``label_col``.
            It is not modified.
        categorical_features (list): Names, after renaming, of the columns to
            encode with a ``LabelEncoder``. The default list is only read, never
            mutated.
        text_feature (str): Name of the text column to keep.
        label_col (str): Name of the label column to keep.

    Returns:
        tuple: ``(df, les)`` where ``df`` holds the columns
        ``[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]`` and
        ``les`` is the list of fitted encoders, in the order of
        ``categorical_features``.
    """
    # fillna without inplace returns a new frame, so the caller's DataFrame is
    # left untouched. Missing values become the string "nan".
    df = df.fillna("nan").rename(
        columns={
            "evenement_type": "EVT",
            "cj": "CJ",
            "activ_nat_et": "NAT",
            "liasse_type": "TYP",
            "activ_surf_et": "SRF",
            "activ_perm_et": "CRT",
        }
    )

    # One encoder per categorical column, kept so the encoding can be reused.
    les = []
    for col in categorical_features:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        les.append(le)

    # The surface is binned into integer categories instead of label-encoded.
    df = categorize_surface(df, "SRF", like_sirene_3=True)
    df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]]

    return df, les

In [None]:
pip show -f torchFastText

In [None]:
pip install git+https://github.com/inseefrlab/torch-fasttext@package

# Load and preprocess data

In [None]:
# S3 file system on the MinIO endpoint; credentials are read from the environment.
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
)
# Read the parquet extraction and keep a 1% random sample of the rows.
# random_state makes the sample, and everything computed from it, reproducible.
df = (
    pq.ParquetDataset(
        "projet-ape/extractions/20241027_sirene4.parquet",
        filesystem=fs,
    )
    .read_pandas()
    .to_pandas()
    .sample(frac=0.01, random_state=0)
    .fillna(np.nan)  # presumably normalises missing markers to NaN — TODO confirm
)

In [None]:
# Read the semicolon-separated naf2008.csv table from the bucket and display it.
with fs.open("projet-ape/data/naf2008.csv") as naf_file:
    naf2008 = pd.read_csv(naf_file, sep=";")
naf2008

In [None]:
def add_libelles(
    df: pd.DataFrame,
    df_naf: pd.DataFrame,
    y: str,
    text_feature: str,
    textual_features: list,
    categorical_features: list,
):
    """
    Append one synthetic observation per row of ``df_naf`` to ``df``.

    Each synthetic row uses the "code" of ``df_naf`` as its label and the text
    of ``df_naf[text_feature]`` prefixed with ``"[<text_feature>] "`` as its text.
    Neither input DataFrame is modified.

    Args:
        df (pd.DataFrame): Observations to extend.
        df_naf (pd.DataFrame): Table with a "code" column and a column named
            ``text_feature``.
        y (str): Name of the label column of ``df``.
        text_feature (str): Name of the text column, present in both frames.
        textual_features (list): Columns whose missing values are replaced by "";
            may be None.
        categorical_features (list): Columns whose missing values are replaced by
            "NaN"; may be None.

    Returns:
        pd.DataFrame: ``df`` followed by the synthetic rows, restricted to the
        columns of ``df``. The original index labels are kept, so the result may
        contain duplicated index labels.
    """
    # NOTE(review): despite its name this set holds every code of df_naf, not only
    # the codes absent from df — confirm that adding all of them is intended.
    missing_codes = set(df_naf["code"])
    # Explicit copy: the rows are modified below, which must neither touch df_naf
    # nor trigger pandas' chained-assignment warning.
    fake_obs = df_naf[df_naf["code"].isin(missing_codes)].copy()
    fake_obs[y] = fake_obs["code"]
    # Prefix the text with its column name; an empty string stays empty.
    fake_obs[text_feature] = fake_obs[text_feature].map(
        lambda val: f"[{text_feature}] {val}" if val != "" else ""
    )
    shared_columns = [col for col in fake_obs.columns if col in df.columns]
    df = pd.concat([df, fake_obs[shared_columns]])

    if textual_features is not None:
        for feature in textual_features:
            df[feature] = df[feature].fillna(value="")
    if categorical_features is not None:
        for feature in categorical_features:
            df[feature] = df[feature].fillna(value="NaN")

    print(f"\t*** {len(missing_codes)} missing codes have been added in the database...\n")
    return df

In [None]:
# Column names as they appear in the raw extraction.
categorical_features = [
    "evenement_type",
    "cj",
    "activ_nat_et",
    "liasse_type",
    "activ_surf_et",
    "activ_perm_et",
]
text_feature = "libelle"
y = "apet_finale"
textual_features = None

# Extend the sample with the rows built from the naf2008 table.
df = add_libelles(
    df=df,
    df_naf=naf2008,
    y=y,
    text_feature=text_feature,
    textual_features=textual_features,
    categorical_features=categorical_features,
)

## Preprocess text and target

The package provides a ready-made text-processing function, `clean_text_feature`, which we apply to the text column.

In [None]:
# Clean the raw "libelle" text with the package helper and store the result in a new column.
df["libelle_processed"] = clean_text_feature(df["libelle"])

In [None]:
# Replace the target labels by integer codes; the fitted encoder stays available as `encoder`.
encoder = LabelEncoder()
df["apet_finale"] = encoder.fit_transform(df["apet_finale"])

Put the columns in the right format:
 - First column contains the processed text (str)
 - Next ones contain the "tokenized" categorical variables in int format

In [None]:
# Encode the categorical columns; the fitted encoders (second return value) are discarded here.
df, _ = clean_and_tokenize_df(df, text_feature="libelle_processed") # Reminder: do not forget to switch back to the "processed" text column
# Feature matrix: processed text first, then the integer-encoded categorical columns.
X = df[["libelle_processed", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values
# NOTE: `y` previously held the label column name; from here on it holds the label array.
y = df["apet_finale"].values
print(X)
print(y)

We split the data into train and test sets. We especially take care that:  
- classes with only one instance appear in the train set (instead of the test set)
- all classes are represented in the train set

The `stratified_split_rare_labels` function from the `preprocess` subpackage is used to carefully split the data.

In [None]:
# Split features and labels into train and test sets with the package helper.
X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y)
# Sanity check: the labels present in the train set are exactly 0 .. len(naf2008) - 1
# (assumes the codes of naf2008 are unique and that the data holds no other code — TODO confirm).
assert set(range(len(naf2008["code"]))) == set(np.unique(y_train))

# (Optional) Build the torch-fastText model (without training it)

In [None]:
# Hyper-parameters used to build the model

NUM_BUCKETS = 100_000  # Number of rows in the embedding matrix
EMBED_DIM = 50  # Embedding dimension, i.e. number of columns in the embedding matrix
MIN_COUNT = 1  # Minimum number of occurrences of a word in the corpus to enter the vocabulary
MIN_N = 3  # Minimum length of char n-grams
MAX_N = 6  # Maximum length of char n-grams
LEN_WORD_NGRAMS = 3  # Length of word n-grams
SPARSE = False  # Whether to use a sparse Embedding layer for fast computation (see PyTorch documentation)

In [None]:
# Instantiate the model wrapper from the hyper-parameters defined in the previous cell.
model = torchFastText(
    num_buckets=NUM_BUCKETS,
    embedding_dim=EMBED_DIM,
    min_count=MIN_COUNT,
    min_n=MIN_N,
    max_n=MAX_N,
    len_word_ngrams=LEN_WORD_NGRAMS,
    sparse=SPARSE,
)

We build the model using the training data. We now have access to the PyTorch model and a tokenizer.

In [None]:
LR = 4e-3  # Learning rate, reused for the training call below
model.build(X_train, y_train, lightning=True, lr=LR)

In [None]:
# Inspect the three attributes read from the model after the build step.
print(model.pytorch_model)
print(model.tokenizer)
print(model.lightning_module)

This step is useful to initialize the full torchFastText model without training it, if needed for some reason. However, it is not required: we could have launched the training directly (the model is then built automatically if necessary).

# Train a torchFastText model

In [None]:
NUM_EPOCHS = 1  # Passed as num_epochs to model.train
BATCH_SIZE = 256  # Passed as batch_size to model.train
PATIENCE = 3  # Passed as both patience_scheduler and patience_train to model.train

In [None]:
# Train on the train split; the test split is passed as the second data set.
model.train(
    X_train,
    y_train,
    X_test,
    y_test,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    patience_scheduler=PATIENCE,
    patience_train=PATIENCE,
    lr=LR,
    verbose=True,
)

# Load a trained model from a Lightning checkpoint

In [None]:
# Load the checkpoint referenced by model.best_model_path.
# NOTE(review): the return value is discarded — confirm that load_from_checkpoint updates `model` in place.
model.load_from_checkpoint(model.best_model_path)