# Environment 

In [1]:
!pip install -r ../requirements.txt



In [2]:
import os
import sys

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import s3fs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


Temporary: torchFastText is in active development, so it is imported from the local sources rather than from an installed release.

In [4]:
# Make the parent directory importable so the in-development torchFastText
# sources are picked up.
# NOTE(review): "../" is resolved against the kernel's working directory,
# presumably the notebooks/ folder — confirm before running from elsewhere.
sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer

# Automatically reload edited modules before each cell is executed, so changes
# to the local sources are visible without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Alternative to the local import: install the released package instead.
# Kept commented out because this notebook imports torchFastText from the
# local sources.
# !pip install torchFastText

Collecting torchFastText
  Downloading torchfasttext-0.0.2-py3-none-any.whl.metadata (5.3 kB)
Downloading torchfasttext-0.0.2-py3-none-any.whl (28 kB)
Installing collected packages: torchFastText
Successfully installed torchFastText-0.0.2


Some useful functions that will help us format our dataset

In [9]:
sys.path.append("notebooks/")
from utils import categorize_surface, clean_and_tokenize_df, stratified_split_rare_labels, add_libelles

# Load and preprocess data

In [7]:
# Anonymous (credential-free) access to the MinIO endpoint hosting the data.
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    anon=True,
)

# Work on a small random subset to keep the notebook fast. The seed is fixed so
# that Restart & Run All always draws the same rows and reproduces the outputs.
SAMPLE_FRAC = 0.001
SAMPLE_SEED = 42

df = (
    pq.ParquetDataset(
        "projet-ape/extractions/20241027_sirene4.parquet",
        filesystem=fs,
    )
    .read_pandas()
    .to_pandas()
    .sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
    # Normalise missing values (e.g. None) to NaN.
    .fillna(np.nan)
)

2025-02-25 16:09:05 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-25 16:09:05 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-25 16:09:07 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-25 16:09:09 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-02-25 16:09:09 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


In [8]:
# Read the semicolon-separated NAF 2008 nomenclature (code + label per row)
# from the same object store.
with fs.open("projet-ape/data/naf2008.csv") as naf_file:
    naf2008 = pd.read_csv(naf_file, delimiter=";")

# Bare last expression so the notebook renders the table.
naf2008

2025-02-25 16:09:22 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


Unnamed: 0,code,libelle
0,0111Z,"Culture de céréales (à l'exception du riz), de..."
1,0112Z,Culture du riz
2,0113Z,"Culture de légumes, de melons, de racines et d..."
3,0114Z,Culture de la canne à sucre
4,0115Z,Culture du tabac
...,...,...
727,9609Z,Autres services personnels n.c.a.
728,9700Z,Activités des ménages en tant qu'employeurs de...
729,9810Z,Activités indifférenciées des ménages en tant ...
730,9820Z,Activités indifférenciées des ménages en tant ...


In [10]:
# Names of the columns handed to `add_libelles`.
categorical_features = [
    "evenement_type",
    "cj",
    "activ_nat_et",
    "liasse_type",
    "activ_surf_et",
    "activ_perm_et",
]
text_feature = "libelle"
# NOTE(review): `y` is the target *column name* here; a later cell rebinds `y`
# to the array of encoded labels.
y = "apet_finale"
textual_features = None

# Enrich the sample using the NAF nomenclature (the helper reports how many
# codes it added).
df = add_libelles(
    df,
    naf2008,
    y,
    text_feature,
    textual_features,
    categorical_features,
)

	*** 732 codes have been added in the database...



## Preprocess text and target

We provide the `clean_text_feature` function to preprocess the text.

In [11]:
# Store the cleaned text in a new column; the raw `libelle` column is kept as is.
df["libelle_processed"] = clean_text_feature(df["libelle"])

In [12]:
# Learn the mapping NAF code -> integer id, then replace the target column by
# the integer ids. `encoder` is kept to map predictions back to NAF codes.
# NOTE(review): the column is overwritten in place, so re-running this cell
# would fit the encoder on the integer ids instead of the NAF codes.
encoder = LabelEncoder()
encoder.fit(df["apet_finale"])
df["apet_finale"] = encoder.transform(df["apet_finale"])

Put the columns in the right format:
 - First column contains the processed text (str)
 - Next ones contain the "tokenized" categorical (discrete) variables in int format

In [None]:
# Column order expected downstream: processed text first, then the
# integer-encoded categorical variables.
feature_columns = ["libelle_processed", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]

# The second value returned by the helper is not needed here.
df, _ = clean_and_tokenize_df(df, text_feature="libelle_processed")

X = df[feature_columns].to_numpy()
y = df["apet_finale"].to_numpy()


[['loueur meubl non professionnel' 2 28 ... 5 1 0]
 ['realis artist' 2 28 ... 6 1 0]
 ['locat echafaudag montag demontag' 2 28 ... 7 1 0]
 ...
 ['[libelle] activit indifferencie menag tant producteur bien usag propr'
  22 28 ... 8 0 0]
 ['[libelle] activit indifferencie menag tant producteur servic usag propr'
  22 28 ... 8 0 0]
 ['[libelle] activit organis organ extraterritorial' 22 28 ... 8 0 0]]
[578 693 612 ... 729 730 731]


  "cj": "CJ",


In [21]:
# Show the first three rows of the feature matrix and of the label vector.
# A single print with a newline separator yields exactly the same text as
# printing each piece on its own.
print(
    "Features for the 3 first obs\n",
    X[:3],
    "\n",
    "NAF codes (labels) for the 3 first obs\n",
    y[:3],
    sep="\n",
)

Features for the 3 first obs

[['loueur meubl non professionnel' 2 28 7 5 1 0]
 ['realis artist' 2 28 7 6 1 0]
 ['locat echafaudag montag demontag' 2 28 3 7 1 0]]


NAF codes (labels) for the 3 first obs

[578 693 612]


We split the data into train and test sets. We especially take care that:  
- classes with only one instance appear in the train set (instead of the test set)
- all classes are represented in the train set

The `stratified_split_rare_labels` function (imported above from the local `utils` module) is used to carefully split the data.

In [11]:
X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y)

# Sanity check: every encoded class id (0 .. number of NAF codes - 1) must be
# present in the training labels.
expected_labels = set(range(len(naf2008["code"])))
train_labels = set(np.unique(y_train))
assert expected_labels == train_labels

# Build the torch-fastText model (without training it)

We first initialize the model (without building it).

In [59]:
# --- Model architecture -----------------------------------------------------
# Number of hashing buckets for the n-gram embeddings.
# NOTE(review): the printed model shows more embedding rows than this value,
# presumably because vocabulary words get rows of their own — confirm.
NUM_TOKENS = 100_000
# Dimension of the text embedding (columns of the embedding matrix).
EMBED_DIM = 50
# Whether to use a sparse Embedding layer (see the PyTorch documentation).
SPARSE = False
# Dimension of the embedding of each categorical feature.
CAT_EMBED_DIM = 10

# --- Tokenizer --------------------------------------------------------------
# Minimum number of occurrences for a word to enter the vocabulary.
MIN_COUNT = 1
# Shortest and longest character n-grams.
MIN_N = 3
MAX_N = 6
# Length of word n-grams.
LEN_WORD_NGRAMS = 3

# --- Training (only used further down) --------------------------------------
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3


In [13]:
# Gather the constructor arguments first, then instantiate the (unbuilt) model.
model_params = {
    "num_tokens": NUM_TOKENS,
    "embedding_dim": EMBED_DIM,
    "categorical_embedding_dims": CAT_EMBED_DIM,
    "min_count": MIN_COUNT,
    "min_n": MIN_N,
    "max_n": MAX_N,
    "len_word_ngrams": LEN_WORD_NGRAMS,
    "sparse": SPARSE,
}
model = torchFastText(**model_params)

2025-01-27 15:25:00 - torchFastText.torchFastText - categorical_embedding_dims provided but not categorical_vocabulary_sizes. It will be inferred later
2025-01-27 15:25:00 - torchFastText.torchFastText - categorical_embedding_dims provided as int but not num_categorical_features. It will be inferred later


We can save these parameters to a JSON file. Initialization can also be done providing a JSON file path.

In [14]:
# Write the model parameters to a JSON file (relative path: current working directory).
model.to_json('torchFastText_config.json')

In [15]:
# Re-create a model from the JSON file written by the previous cell; it still has to be built.
model = torchFastText.from_json('torchFastText_config.json')

2025-01-27 15:25:05 - torchFastText.torchFastText - categorical_embedding_dims provided but not categorical_vocabulary_sizes. It will be inferred later
2025-01-27 15:25:05 - torchFastText.torchFastText - categorical_embedding_dims provided as int but not num_categorical_features. It will be inferred later


We build the model using the training data. We have now access to the tokenizer, the PyTorch model as well as a PyTorch Lightning module ready to be trained.

In [43]:
# Learning rate; later cells reuse `LR` when rebuilding and training the model.
LR = 4e-3

# Build the tokenizer, the PyTorch model and the Lightning module from the
# training data.
model.build(
    X_train,
    y_train,
    lightning=True,
    lr=LR,
)

2025-01-27 15:47:12 - torchFastText.torchFastText - num_categorical_features: old value is 2. New value is 6.


ValueError: Categorical vocabulary sizes and their embedding dimensions must have the same length

In [17]:
# Display the three objects created by `build`, one after the other.
print(
    model.pytorch_model,
    model.tokenizer,
    model.lightning_module,
    sep="\n",
)

FastTextModel(
  (embeddings): EmbeddingBag(103992, 50, mode='mean')
  (emb_0): Embedding(21, 10)
  (emb_1): Embedding(26, 10)
  (emb_2): Embedding(8, 10)
  (emb_3): Embedding(12, 10)
  (emb_4): Embedding(3, 10)
  (emb_5): Embedding(4, 10)
  (fc): Linear(in_features=50, out_features=732, bias=True)
)
<NGramTokenizer(min_n=3, max_n=6, num_buckets=100000, word_ngrams=3, nwords=3991)>
FastTextModule(
  (model): FastTextModel(
    (embeddings): EmbeddingBag(103992, 50, mode='mean')
    (emb_0): Embedding(21, 10)
    (emb_1): Embedding(26, 10)
    (emb_2): Embedding(8, 10)
    (emb_3): Embedding(12, 10)
    (emb_4): Embedding(3, 10)
    (emb_5): Embedding(4, 10)
    (fc): Linear(in_features=50, out_features=732, bias=True)
  )
  (loss): CrossEntropyLoss()
  (accuracy_fn): MulticlassAccuracy()
)


This step is useful if you need to initialize the full torchFastText model without training it. It is not mandatory, though: we could have launched the training directly, in which case building is handled automatically.

You can play with the tokenizer.

In [18]:
sentence = ["lorem ipsum dolor sit amet"]

# The tokenizer returns several objects; the one at index 2 is printed here
# (per the saved output, a mapping from token id to n-gram for each sentence).
tokenized = model.tokenizer.tokenize(sentence)
print(tokenized[2])

[{102213: '<lo', 72593: 'lor', 56522: 'ore', 68158: 'rem', 63426: 'em>', 17355: '<lor', 19844: 'lore', 63985: 'orem', 78418: 'rem>', 36410: '<lore', 46327: 'lorem', 44931: 'orem>', 33409: '<lorem', 11121: 'lorem>', 88809: '<ip', 18388: 'ips', 44778: 'psu', 95551: 'sum', 49682: 'um>', 40394: '<ips', 41871: 'ipsu', 64529: 'psum', 58809: 'sum>', 93065: '<ipsu', 74022: 'ipsum', 88771: 'psum>', 33380: '<ipsum', 69818: 'ipsum>', 98333: '<do', 15903: 'dol', 99090: 'olo', 29381: 'or>', 4949: '<dol', 22780: 'dolo', 63778: 'olor', 56515: 'lor>', 73594: '<dolo', 98636: 'dolor', 53990: 'olor>', 45786: '<dolor', 39464: 'dolor>', 23738: '<si', 38838: 'sit', 53345: 'it>', 4172: '<sit', 41738: 'sit>', 65320: '<sit>', 14928: '<am', 13369: 'ame', 61472: 'met', 41021: 'et>', 83627: '<ame', 27065: 'amet', 27980: 'met>', 61559: '<amet', 16651: 'amet>', 5297: '<amet>', 0: '</s>', 66049: 'lorem ipsum', 65182: 'ipsum dolor', 75764: 'dolor sit', 88112: 'sit amet', 74161: 'amet </s>', 38837: 'lorem ipsum dolor'

Saving parameters to JSON can also be done after building, but the model needs to be rebuilt after loading.

In [19]:
# Round-trip through the JSON configuration, then rebuild: a model loaded from
# JSON has to be built again before use.
config_path = 'torchFastText_config.json'

model.to_json(config_path)
model = torchFastText.from_json(config_path)
model.build(
    X_train,
    y_train,
    lightning=True,
    lr=LR,
)

2025-01-27 15:25:13 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).


### Alternative way to build torchFastText

The training data is only needed to initialize the tokenizer: X_train and y_train are not required to initialize the PyTorch model, provided we give the right parameters to construct its layers. 

To highlight this, we provide a lower-level process to build the model where one can first build the tokenizer, and then build the model with custom architecture parameters. 

The tokenizer can be loaded **from the same JSON file** as the model parameters, or initialized using the right arguments.

In [None]:
# Discard the model built above so the alternative construction starts from a clean slate.
del model

In [65]:
# Column 0 of the training matrix holds the processed text; the remaining
# columns hold the integer-encoded categorical variables.
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

# Architecture parameters. They used to be inferred by `build`; with
# `build_from_tokenizer` they must be given explicitly, so derive them here
# instead of relying on names left over from a previous kernel session.
NUM_CAT_VAR = categorical_variables.shape[1]
# Labels are contiguous integer ids produced by the LabelEncoder, hence the +1.
NUM_CLASSES = int(np.max(y_train)) + 1
# Before: this was inferred during the build method ; now required
CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()

# Tokenizer needs training text to build the vocabulary.
# Both alternatives are shown for illustration; the second assignment
# overrides the first.
# Alternative 1 - note that it is the same JSON file as the model configuration.
tokenizer = NGramTokenizer.from_json('torchFastText_config.json', training_text)
# Alternative 2 - explicit arguments.
tokenizer = NGramTokenizer(
    min_n=MIN_N,
    max_n=MAX_N,
    num_tokens=NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS,
    min_count=MIN_COUNT,
    training_text=training_text,
)

# This model constructor is now independent from training data
model = torchFastText.build_from_tokenizer(
    tokenizer,
    embedding_dim=EMBED_DIM,
    categorical_embedding_dims=CAT_EMBED_DIM,
    sparse=SPARSE,
    lr=LR,
    num_classes=NUM_CLASSES,
    num_categorical_features=NUM_CAT_VAR,
    categorical_vocabulary_sizes=CAT_VOCAB_SIZE,
)

2025-01-27 16:49:17 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).


Note that the PyTorch model and the Lightning module are now directly built.

In [66]:
# Print, in order, the PyTorch model, the tokenizer and the Lightning module
# produced by `build_from_tokenizer`.
for component in (model.pytorch_model, model.tokenizer, model.lightning_module):
    print(component)

FastTextModel(
  (embeddings): EmbeddingBag(103992, 50, mode='mean')
  (emb_0): Embedding(21, 10)
  (emb_1): Embedding(26, 10)
  (emb_2): Embedding(8, 10)
  (emb_3): Embedding(12, 10)
  (emb_4): Embedding(3, 10)
  (emb_5): Embedding(4, 10)
  (fc): Linear(in_features=60, out_features=732, bias=True)
)
<NGramTokenizer(min_n=3, max_n=6, num_buckets=100000, word_ngrams=3, nwords=3991)>
FastTextModule(
  (model): FastTextModel(
    (embeddings): EmbeddingBag(103992, 50, mode='mean')
    (emb_0): Embedding(21, 10)
    (emb_1): Embedding(26, 10)
    (emb_2): Embedding(8, 10)
    (emb_3): Embedding(12, 10)
    (emb_4): Embedding(3, 10)
    (emb_5): Embedding(4, 10)
    (fc): Linear(in_features=60, out_features=732, bias=True)
  )
  (loss): CrossEntropyLoss()
  (accuracy_fn): MulticlassAccuracy()
)


Since the PyTorch model was built here without the training data, keep in mind that its architecture (which you customize here) must match the vocabulary sizes of the categorical variables and the total number of classes; otherwise the model will raise an error during training.

# Train a torchFastText model

In [None]:
# Keyword options forwarded to `train`.
training_options = {
    "num_epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "patience_scheduler": PATIENCE,
    "patience_train": PATIENCE,
    "lr": LR,
    "verbose": True,
}

# NOTE(review): X_test / y_test are passed right after the training data,
# presumably as the validation set — confirm against the `train` signature.
model.train(X_train, y_train, X_test, y_test, **training_options)

# The library relies on Lightning to train the model. Lightning-specific
# options can be forwarded through the `trainer_params` argument, e.g.:
#
# trainer_params = {'profiler': 'simple', 'enable_progress_bar': False}
#
# model.train(
#     X_train,
#     y_train,
#     X_test,
#     y_test,
#     num_epochs=NUM_EPOCHS,
#     batch_size=BATCH_SIZE,
#     patience_scheduler=PATIENCE,
#     patience_train=PATIENCE,
#     lr=LR,
#     verbose=True,
#     trainer_params=trainer_params,
# )

2025-01-27 16:49:23 - torchFastText.torchFastText - Checking inputs...
2025-01-27 16:49:23 - torchFastText.torchFastText - Inputs successfully checked. Starting the training process..
2025-01-27 16:49:23 - torchFastText.torchFastText - Running on: cpu
2025-01-27 16:49:23 - torchFastText.torchFastText - Lightning module successfully created.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
2025-01-27 16:49:23 - torchFastText.torchFastText - Launching training...

  | Name        | Type               | Params | Mode 
-----------------------------------------------------------
0 | model       | FastTextModel      | 5.2 M  | train
1 | loss        | CrossEntropyLoss   | 0      | train
2 | accuracy_fn | MulticlassAccuracy | 0      | train
-----------------------------------------------------------
5.2 M     Trainable params
0         Non-trainable params
5.2 M     Total params
20.980    Total estimated model params size (MB)
11   

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
2025-01-27 16:49:26 - torchFastText.torchFastText - Training done in 3.69 seconds.


# Load a trained model from a Lightning checkpoint

In [None]:
# Reload the checkpoint referenced by `model.best_model_path`.
# NOTE(review): the return value is discarded, so this presumably updates
# `model` in place — confirm against the torchFastText API.
model.load_from_checkpoint(model.best_model_path) # or any other checkpoint path (string)

# Make predictions

In [None]:
# New observation: one free-text description followed by six categorical
# values, all set to 0.
# NOTE: this rebinds `X`, which previously held the full feature matrix.
text = ["coiffeur, boulangerie, pâtisserie"]
X = np.array([[text[0]] + [0] * 6])  # our new entry
TOP_K = 5

pred, conf = model.predict(X, top_k=TOP_K)

# Map the encoded class ids back to NAF codes and fetch the matching rows of
# the nomenclature.
pred_naf = encoder.inverse_transform(pred.reshape(-1))
subset = naf2008.set_index("code").loc[np.flip(pred_naf)]

# Walk through the predictions from the last index down to the first.
for rank in reversed(range(TOP_K)):
    code = pred_naf[rank]
    description = subset["libelle"][code]
    print(f"Prediction: {code}, confidence:  {conf[0, rank]}, description: {description}")


# Explainability

In [None]:
from torchFastText.explainability.visualisation import (
    visualize_letter_scores,
    visualize_word_scores,
)

pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)

# `pred_naf` still holds the NAF codes computed in the prediction cell; it is
# reshaped to a single row before being handed to both plotting helpers.
predicted_codes = pred_naf.reshape(1, -1)
visualize_word_scores(all_scores, text, predicted_codes)
visualize_letter_scores(all_scores_letters, text, predicted_codes)