In [1]:
import os
import sys

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import s3fs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer

%load_ext autoreload
%autoreload 2

# Some utils functions that will help us format our dataset

In [2]:
def categorize_surface(
    df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True
) -> pd.DataFrame:
    """
    Categorize the surface of the activity.

    Args:
        df (pd.DataFrame): DataFrame to categorize.
        surface_feature_name (str): Name of the surface feature.
        like_sirene_3 (bool): If True, categorize like Sirene 3.

    Returns:
        pd.DataFrame: DataFrame with a new column "surf_cat".
    """
    df_copy = df.copy()
    df_copy[surface_feature_name] = df_copy[surface_feature_name].replace("nan", np.nan)
    df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float)
    # Check surface feature exists
    if surface_feature_name not in df.columns:
        raise ValueError(f"Surface feature {surface_feature_name} not found in DataFrame.")
    # Check surface feature is a float variable
    if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])):
        raise ValueError(f"Surface feature {surface_feature_name} must be a float variable.")

    if like_sirene_3:
        # Categorize the surface
        df_copy["surf_cat"] = pd.cut(
            df_copy[surface_feature_name],
            bins=[0, 120, 400, 2500, np.inf],
            labels=["1", "2", "3", "4"],
        ).astype(str)
    else:
        # Log transform the surface
        df_copy["surf_log"] = np.log(df[surface_feature_name])

        # Categorize the surface
        df_copy["surf_cat"] = pd.cut(
            df_copy.surf_log,
            bins=[0, 3, 4, 5, 12],
            labels=["1", "2", "3", "4"],
        ).astype(str)

    df_copy[surface_feature_name] = df_copy["surf_cat"].replace("nan", "0")
    df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int)
    df_copy = df_copy.drop(columns=["surf_log", "surf_cat"], errors="ignore")
    return df_copy


def clean_and_tokenize_df(
    df,
    categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"],
    text_feature="libelle_processed",
    label_col="apet_finale",
):
    df.fillna("nan", inplace=True)

    df = df.rename(
        columns={
            "evenement_type": "EVT",
            "cj": "CJ",
            "activ_nat_et": "NAT",
            "liasse_type": "TYP",
            "activ_surf_et": "SRF",
            "activ_perm_et": "CRT",
        }
    )

    les = []
    for col in categorical_features:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        les.append(le)

    df = categorize_surface(df, "SRF", like_sirene_3=True)
    df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]]

    return df, les


def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1):
    # Get unique labels and their frequencies
    unique_labels, label_counts = np.unique(y, return_counts=True)

    # Separate rare and common labels
    rare_labels = unique_labels[label_counts == 1]

    # Create initial mask for rare labels to go into training set
    rare_label_mask = np.isin(y, rare_labels)

    # Separate data into rare and common label datasets
    X_rare = X[rare_label_mask]
    y_rare = y[rare_label_mask]
    X_common = X[~rare_label_mask]
    y_common = y[~rare_label_mask]

    # Split common labels stratified
    X_common_train, X_common_test, y_common_train, y_common_test = train_test_split(
        X_common, y_common, test_size=test_size, stratify=y_common
    )

    # Combine rare labels with common labels split
    X_train = np.concatenate([X_rare, X_common_train])
    y_train = np.concatenate([y_rare, y_common_train])
    X_test = X_common_test
    y_test = y_common_test

    return X_train, X_test, y_train, y_test

# Load and preprocess data

In [3]:
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    anon=True,
)
df = (
    (
        pq.ParquetDataset(
            "projet-ape/extractions/20241027_sirene4.parquet",
            filesystem=fs,
        )
        .read_pandas()
        .to_pandas()
    )
    .sample(frac=0.001)
    .fillna(np.nan)
)

2025-02-24 18:21:41 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-24 18:21:41 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-24 18:21:42 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-24 18:21:48 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-24 18:21:48 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


In [4]:
with fs.open("projet-ape/data/naf2008.csv") as file:
    naf2008 = pd.read_csv(file, sep=";")
naf2008

2025-02-24 18:21:54 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


Unnamed: 0,code,libelle
0,0111Z,"Culture de céréales (à l'exception du riz), de..."
1,0112Z,Culture du riz
2,0113Z,"Culture de légumes, de melons, de racines et d..."
3,0114Z,Culture de la canne à sucre
4,0115Z,Culture du tabac
...,...,...
727,9609Z,Autres services personnels n.c.a.
728,9700Z,Activités des ménages en tant qu'employeurs de...
729,9810Z,Activités indifférenciées des ménages en tant ...
730,9820Z,Activités indifférenciées des ménages en tant ...


In [5]:
def add_libelles(
    df: pd.DataFrame,
    df_naf: pd.DataFrame,
    y: str,
    text_feature: str,
    textual_features: list,
    categorical_features: list,
):
    missing_codes = set(df_naf["code"])
    fake_obs = df_naf[df_naf["code"].isin(missing_codes)]
    fake_obs[y] = fake_obs["code"]
    fake_obs[text_feature] = fake_obs[[text_feature]].apply(
        lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1
    )
    df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]])

    if textual_features is not None:
        for feature in textual_features:
            df[feature] = df[feature].fillna(value="")
    if categorical_features is not None:
        for feature in categorical_features:
            df[feature] = df[feature].fillna(value="NaN")

    print(f"\t*** {len(missing_codes)} codes have been added in the database...\n")
    return df

In [6]:
categorical_features = [
    "evenement_type",
    "cj",
    "activ_nat_et",
    "liasse_type",
    "activ_surf_et",
    "activ_perm_et",
]
text_feature = "libelle"
y = "apet_finale"
textual_features = None

df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features)

	*** 732 codes have been added in the database...



## Preprocess text and target

We make available our processing function clean_text_feature for the text.

In [7]:
df["libelle_processed"] = clean_text_feature(df["libelle"])

In [8]:
encoder = LabelEncoder()
df["apet_finale"] = encoder.fit_transform(df["apet_finale"])

Put the columns in the right format:
 - First column contains the processed text (str)
 - Next ones contain the "tokenized" categorical (discrete) variables in int format

In [9]:
df, _ = clean_and_tokenize_df(df, text_feature="libelle_processed")
X = df[["libelle_processed", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values
y = df["apet_finale"].values
print(X)
print(y)

[['format' 1 13 ... 1 1 0]
 ["l'acquisition, gestion, l'administration, mis valeur, transformation, construction, location, cession tous immeubl"
  1 19 ... 3 0 0]
 ["distribu materiel medical, l'assist administr operationnel aupr professionnel sante, conseil managem"
  1 13 ... 1 1 0]
 ...
 ['[libelle] activit indifferencie menag tant producteur bien usag propr'
  22 26 ... 7 0 0]
 ['[libelle] activit indifferencie menag tant producteur servic usag propr'
  22 26 ... 7 0 0]
 ['[libelle] activit organis organ extraterritorial' 22 26 ... 7 0 0]]
[660 579 409 ... 729 730 731]


  df.fillna("nan", inplace=True)


We split the data into train and test sets. We especially take care that:  
- classes with only one instance appear in the train set (instead of the test set)
- all classes are represented in the train set

The `stratified_split_rare_labels` function from the `preprocess` subpackage is used to carefully split the data.

In [10]:
X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y)
assert set(range(len(naf2008["code"]))) == set(np.unique(y_train))

# Build the torch-fastText model (without training it)

We first initialize the model (without building it).

In [11]:
# Parameters for model building
NUM_TOKENS = int(1e5)  # Number of rows in the embedding matrix
EMBED_DIM = 50  # Dimension of the embedding = number of columns in the embedding matrix
SPARSE = (
    False  # Whether to use sparse Embedding layer for fast computation (see PyTorch documentation)
)
CAT_EMBED_DIM = 10  # Dimension of the embedding for categorical features

# Parameters for tokenizer
MIN_COUNT = (
    1  # Minimum number of occurrences of a word in the corpus to be included in the vocabulary
)
MIN_N = 3  # Minimum length of char n-grams
MAX_N = 6  # Maximum length of char n-grams
LEN_WORD_NGRAMS = 3  # Length of word n-grams

# Parameters for training - not useful immediately
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3

In [12]:
model = torchFastText(
    num_tokens=NUM_TOKENS,
    embedding_dim=EMBED_DIM,
    categorical_embedding_dims=CAT_EMBED_DIM,
    min_count=MIN_COUNT,
    min_n=MIN_N,
    max_n=MAX_N,
    len_word_ngrams=LEN_WORD_NGRAMS,
    sparse=SPARSE,
)

We can save these parameters to a JSON file. Initialization can also be done providing a JSON file path.

In [13]:
model.to_json("torchFastText_config.json")

In [14]:
model = torchFastText.from_json("torchFastText_config.json")

We build the model using the training data. We have now access to the tokenizer, the PyTorch model as well as a PyTorch Lightning module ready to be trained.

In [15]:
LR = 4e-3
model.build(X_train, y_train, lightning=True, lr=LR)

2025-02-24 18:21:56 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.
2025-02-24 18:21:56 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).


In [16]:
print(model.pytorch_model)
print(model.tokenizer)
print(model.lightning_module)

FastTextModel(
  (embeddings): EmbeddingBag(103910, 50, mode='mean', padding_idx=103909)
  (emb_0): Embedding(23, 10)
  (emb_1): Embedding(27, 10)
  (emb_2): Embedding(8, 10)
  (emb_3): Embedding(11, 10)
  (emb_4): Embedding(3, 10)
  (emb_5): Embedding(4, 10)
  (fc): Linear(in_features=60, out_features=732, bias=True)
)
<NGramTokenizer(min_n=3, max_n=6, num_tokens=100000, word_ngrams=3, nwords=3909)>
FastTextModule(
  (model): FastTextModel(
    (embeddings): EmbeddingBag(103910, 50, mode='mean', padding_idx=103909)
    (emb_0): Embedding(23, 10)
    (emb_1): Embedding(27, 10)
    (emb_2): Embedding(8, 10)
    (emb_3): Embedding(11, 10)
    (emb_4): Embedding(3, 10)
    (emb_5): Embedding(4, 10)
    (fc): Linear(in_features=60, out_features=732, bias=True)
  )
  (loss): CrossEntropyLoss()
  (accuracy_fn): MulticlassAccuracy()
)


This step is useful to initialize the full torchFastText model without training it, if needed for some reason. But if it is not necessary, and we could have directly launched the training (building is then handled automatically if necessary).

You can play with the tokenizer.

In [17]:
sentence = ["lorem ipsum dolor sit amet"]
print(model.tokenizer.tokenize(sentence)[2])

[{102131: '<lo', 72511: 'lor', 56440: 'ore', 68076: 'rem', 63344: 'em>', 17273: '<lor', 19762: 'lore', 63903: 'orem', 78336: 'rem>', 36328: '<lore', 46245: 'lorem', 44849: 'orem>', 33327: '<lorem', 11039: 'lorem>', 88727: '<ip', 18306: 'ips', 44696: 'psu', 95469: 'sum', 49600: 'um>', 40312: '<ips', 41789: 'ipsu', 64447: 'psum', 58727: 'sum>', 92983: '<ipsu', 73940: 'ipsum', 88689: 'psum>', 33298: '<ipsum', 69736: 'ipsum>', 98251: '<do', 15821: 'dol', 99008: 'olo', 29299: 'or>', 4867: '<dol', 22698: 'dolo', 63696: 'olor', 56433: 'lor>', 73512: '<dolo', 98554: 'dolor', 53908: 'olor>', 45704: '<dolor', 39382: 'dolor>', 23656: '<si', 38756: 'sit', 53263: 'it>', 4090: '<sit', 41656: 'sit>', 65238: '<sit>', 14846: '<am', 13287: 'ame', 61390: 'met', 40939: 'et>', 83545: '<ame', 26983: 'amet', 27898: 'met>', 61477: '<amet', 16569: 'amet>', 5215: '<amet>', 0: '</s>', 25396: 'lorem ipsum', 31759: 'ipsum dolor', 70555: 'dolor sit', 39020: 'sit amet', 85844: 'amet </s>', 4015: 'lorem ipsum dolor',

Saving parameters to JSON can also be done after building, but the model needs to be rebuilt after loading.

In [18]:
model.to_json("torchFastText_config.json")
model = torchFastText.from_json("torchFastText_config.json")
model.build(X_train, y_train, lightning=True, lr=LR)

2025-02-24 18:21:57 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.
2025-02-24 18:21:57 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).


### Alternative way to build torchFastText

The training data is only useful to initialize the tokenizer, but X_train and y_train are not needed to initialize the PyTorch model, provided we give the right parameters to construct layer. 

To highlight this, we provide a lower-level process to build the model where one can first build the tokenizer, and then build the model with custom architecture parameters. 

The tokenizer can be loaded **from the same JSON file** as the model parameters, or initialized using the right arguments.

In [19]:
del model

In [22]:
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

# Before: this was inferred during the build method ; now required
CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()
NUM_CLASSES = len(np.unique(y_train))
NUM_CAT_VAR = categorical_variables.shape[1]

# Tokenizer needs training text to build the vocabulary
tokenizer = NGramTokenizer.from_json(
    "torchFastText_config.json", training_text
)  # alternative 1 - see that it is the same JSON file as before
tokenizer = NGramTokenizer(
    min_n=MIN_N,
    max_n=MAX_N,
    num_tokens=NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS,
    min_count=MIN_COUNT,
    training_text=training_text,
)  # alternative 2

# This model constructor is now independent from training data
model = torchFastText.build_from_tokenizer(
    tokenizer,
    embedding_dim=EMBED_DIM,
    categorical_embedding_dims=CAT_EMBED_DIM,
    sparse=SPARSE,
    lr=LR,
    num_classes=NUM_CLASSES,
    num_categorical_features=NUM_CAT_VAR,
    categorical_vocabulary_sizes=CAT_VOCAB_SIZE,
)

2025-02-24 18:23:48 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.


2025-02-24 18:23:48 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).


Note that the PyTorch model and the Lightning module are now directly built.

In [23]:
print(model.pytorch_model)
print(model.tokenizer)
print(model.lightning_module)

FastTextModel(
  (embeddings): EmbeddingBag(103910, 50, mode='mean', padding_idx=103909)
  (emb_0): Embedding(23, 10)
  (emb_1): Embedding(27, 10)
  (emb_2): Embedding(8, 10)
  (emb_3): Embedding(11, 10)
  (emb_4): Embedding(3, 10)
  (emb_5): Embedding(4, 10)
  (fc): Linear(in_features=60, out_features=732, bias=True)
)
<NGramTokenizer(min_n=3, max_n=6, num_tokens=100000, word_ngrams=3, nwords=3909)>
FastTextModule(
  (model): FastTextModel(
    (embeddings): EmbeddingBag(103910, 50, mode='mean', padding_idx=103909)
    (emb_0): Embedding(23, 10)
    (emb_1): Embedding(27, 10)
    (emb_2): Embedding(8, 10)
    (emb_3): Embedding(11, 10)
    (emb_4): Embedding(3, 10)
    (emb_5): Embedding(4, 10)
    (fc): Linear(in_features=60, out_features=732, bias=True)
  )
  (loss): CrossEntropyLoss()
  (accuracy_fn): MulticlassAccuracy()
)


If the PyTorch model building did not use the training data, please keep in mind that its architecture (that you customize here) should match the vocabulary size of the categorical variables and the total number of class, otherwise the model will raise an error during training.

# Train a torchFastText model

In [24]:
model.train(
    X_train,
    y_train,
    X_test,
    y_test,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    patience_scheduler=PATIENCE,
    patience_train=PATIENCE,
    lr=LR,
    verbose=True,
)

## The library uses lightning library to train the model. It is possible to add some specific parameters to the training method to use it :
##
## trainer_params = {'profiler': 'simple', 'enable_progress_bar': False}
##
## model.train(
##    X_train,
##    y_train,
##    X_test,
##    y_test,
##    num_epochs=NUM_EPOCHS,
##    batch_size=BATCH_SIZE,
##    patience_scheduler=PATIENCE,
##    patience_train=PATIENCE,
##    lr=LR,
##    verbose = True,
##    trainer_params = trainer_params
##)

2025-02-24 18:24:11 - torchFastText.torchFastText - Checking inputs...
2025-02-24 18:24:11 - torchFastText.torchFastText - Inputs successfully checked. Starting the training process..
2025-02-24 18:24:11 - torchFastText.torchFastText - Running on: cpu
2025-02-24 18:24:11 - torchFastText.datasets.dataset - Creating DataLoader with 12 workers.
2025-02-24 18:24:11 - torchFastText.datasets.dataset - Creating DataLoader with 12 workers.
2025-02-24 18:24:11 - torchFastText.torchFastText - Lightning module successfully created.


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
2025-02-24 18:24:11 - torchFastText.torchFastText - Launching training...

  | Name        | Type               | Params | Mode 
-----------------------------------------------------------
0 | model       | FastTextModel      | 5.2 M  | train
1 | loss        | CrossEntropyLoss   | 0      | train
2 | accuracy_fn | MulticlassAccuracy | 0      | train
--------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
2025-02-24 18:24:46 - torchFastText.torchFastText - Training done in 35.16 seconds.


# Load a trained model from a Lightning checkpoint

In [None]:
del model
model.load_from_checkpoint(model.best_model_path)  # or any other checkpoint path (string)

In [37]:
import torch
torch.load(path, weights_only=False)

{'epoch': 0,
 'global_step': 11,
 'pytorch-lightning_version': '2.5.0.post0',
 'state_dict': OrderedDict([('model.embeddings.weight',
               tensor([[-0.4155,  0.5001, -0.0511,  ..., -0.2551, -1.0067,  0.4193],
                       [-0.6866, -0.5464,  0.3444,  ..., -0.3271,  1.8750, -1.1317],
                       [-0.8242, -0.1431, -3.5056,  ...,  1.0447, -0.4933, -1.1999],
                       ...,
                       [ 2.1735, -0.8185, -0.1860,  ...,  0.8569,  0.8747, -0.8153],
                       [ 0.1416, -0.4247,  0.0305,  ...,  1.0649, -0.9892,  0.3626],
                       [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])),
              ('model.emb_0.weight',
               tensor([[ 0.2547, -2.8003,  1.4013,  1.5881, -0.7033,  0.8540, -0.5783,  0.2256,
                         0.7499,  2.9711],
                       [-0.2905,  0.4540,  0.7088, -2.1960, -0.0837,  0.2569,  0.0917, -0.4995,
                        -0.7544, -0.8420],
          

In [34]:
model = torchFastText.from_json("torchFastText_config.json")
path = "lightning_logs/version_0/checkpoints/epoch=0-step=11.ckpt"
model.load_from_checkpoint_test(path, "torchFastText_config.json")

AttributeError: 'NoneType' object has no attribute 'num_classes'

# Make predictions

In [None]:
text = ["coiffeur, boulangerie, pâtisserie"]
X = np.array([[text[0], 0, 0, 0, 0, 0, 0]])  # our new entry
TOP_K = 5

pred, conf = model.predict(X, top_k=TOP_K)
pred_naf = encoder.inverse_transform(pred.reshape(-1))
subset = naf2008.set_index("code").loc[np.flip(pred_naf)]

for i in range(TOP_K - 1, -1, -1):
    print(
        f"Prediction: {pred_naf[i]}, confidence:  {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}"
    )

# Explainability

In [None]:
from torchFastText.explainability.visualisation import (
    visualize_letter_scores,
    visualize_word_scores,
)

pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)
visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1))
visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1))