In [None]:
import os
import sys

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import s3fs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sys.path.append("../src/")
from config.preprocess import clean_and_tokenize_df, clean_text_feature
from torchFastText import torchFastText

%load_ext autoreload
%autoreload 2

# Load and preprocess data

In [None]:
# Open an S3-compatible filesystem on the endpoint below. Credentials are read
# from environment variables so that no secret is stored in the notebook; a
# missing variable raises KeyError here.
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
)
# Read the whole parquet dataset from the bucket into a pandas DataFrame.
# NOTE(review): `.fillna(np.nan)` presumably normalises missing values
# (e.g. None) to NaN — confirm that it is still needed.
df = (
    pq.ParquetDataset(
        "projet-ape/extractions/20241027_sirene4.parquet",
        filesystem=fs,
    )
    .read_pandas()
    .to_pandas()
).fillna(np.nan)

In [None]:
# Load the semicolon-separated `naf2008.csv` table from the same bucket.
# It is passed to `add_missing_codes` below, which reads its "code" column
# and its text column.
with fs.open("projet-ape/data/naf2008.csv") as file:
    naf2008 = pd.read_csv(file, sep=";")
# Bare expression: display the table for a visual check.
naf2008

In [None]:
def add_missing_codes(
    df: pd.DataFrame,
    df_naf: pd.DataFrame,
    y: str,
    text_feature: str,
    textual_features: list,
    categorical_features: list,
) -> pd.DataFrame:
    """Append one synthetic observation per nomenclature code to ``df``.

    For every row of ``df_naf`` a fake observation is built whose target
    ``y`` is the row's ``"code"`` and whose text is the row's label prefixed
    with ``"[<text_feature>] "`` (an empty label stays empty). Only the
    columns that already exist in ``df`` are appended.

    NOTE(review): despite the function name, *every* code of ``df_naf`` is
    appended, including codes that are already present in ``df[y]``.

    Args:
        df: Observations to extend. Not modified; a new frame is returned.
        df_naf: Nomenclature table with a ``"code"`` column and a
            ``text_feature`` column. Not modified.
        y: Name of the target column in ``df``.
        text_feature: Name of the text column, present in both frames.
        textual_features: Columns whose missing values are replaced by
            ``""``, or ``None`` to skip this step.
        categorical_features: Columns whose missing values are replaced by
            the string ``"NaN"``, or ``None`` to skip this step.

    Returns:
        The concatenation of ``df`` and the synthetic observations. The
        original index values are kept, so index labels may be duplicated.
    """
    missing_codes = set(df_naf["code"])
    # Explicit copy: the columns added below must be written to an independent
    # frame, not to a slice of `df_naf` (avoids SettingWithCopyWarning and any
    # ambiguity about whether the caller's table is modified).
    fake_obs = df_naf[df_naf["code"].isin(missing_codes)].copy()
    fake_obs[y] = fake_obs["code"]
    # Same strings as a row-wise apply over the single text column, without
    # building one Series per row.
    fake_obs[text_feature] = [
        f"[{text_feature}] {val}" if val != "" else "" for val in fake_obs[text_feature]
    ]

    # Keep only the columns `df` knows about so that no new column is created.
    shared_columns = [col for col in fake_obs.columns if col in df.columns]
    df = pd.concat([df, fake_obs[shared_columns]])

    # The synthetic rows have no value for the other features: fill them with
    # the placeholders expected downstream.
    if textual_features is not None:
        for feature in textual_features:
            df[feature] = df[feature].fillna(value="")
    if categorical_features is not None:
        for feature in categorical_features:
            df[feature] = df[feature].fillna(value="NaN")

    print(f"\t*** {len(missing_codes)} missing codes have been added in the database...\n")
    return df

In [None]:
# Column names handed to `add_missing_codes`: the categorical columns, the
# text column, the target column, and (here none) extra text columns.
categorical_features = ["evenement_type", "cj",  "activ_nat_et", "liasse_type", "activ_surf_et", "activ_perm_et"]
text_feature = "libelle"
y = "apet_finale"
textual_features = None

# NOTE(review): this rebinds `df` to the extended frame, so re-running the cell
# appends the synthetic rows a second time; reload `df` before re-running.
df= add_missing_codes(df, naf2008, y, text_feature, textual_features, categorical_features)

## Preprocess text and target

We provide a text-processing function, `clean_text_feature`, to clean the text feature.

In [None]:
# Apply `clean_text_feature` to the raw text and store the result in a separate
# column, leaving the original "libelle" column untouched.
# NOTE(review): the tokenization cell further down still reads "libelle", not
# "libelle_processed".
df["libelle_processed"] = clean_text_feature(df["libelle"])

In [None]:
# Replace the target codes by integer class ids; the fitted `encoder` keeps the
# mapping back to the original codes in `encoder.classes_`.
# NOTE(review): the column is overwritten in place, so re-running this cell
# refits the encoder on the already-encoded integers and loses that mapping.
encoder = LabelEncoder()
df["apet_finale"] = encoder.fit_transform(df["apet_finale"])

Put the columns in the right format:
 - First column contains the processed text (str)
 - Next ones contain the "tokenized" categorical variables in int format

In [None]:
# TODO: do not forget to switch back to the processed text ("libelle_processed").
# The second return value of `clean_and_tokenize_df` is discarded.
df, _ = clean_and_tokenize_df(df, text_feature="libelle")
# Feature matrix: text first, then the categorical columns (presumably created
# by `clean_and_tokenize_df` — confirm against its implementation).
X = df[["libelle", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values
# NOTE(review): `y` previously held the target column *name*; from here on it
# is the array of encoded labels.
y = df["apet_finale"].values
print(X)
print(y)

We split the data into train and test sets. We take particular care that classes with only one instance end up in the train set.

In [None]:
# Count the number of samples available for each encoded class.
unique, counts = np.unique(y, return_counts=True)
class_counts = dict(zip(unique, counts))

# Samples whose class occurs exactly once are kept out of the random split so
# that they can be forced into the training set afterwards.
single_instance_classes = {label for label in unique if class_counts[label] == 1}
is_single_instance = np.isin(y, list(single_instance_classes))
is_remaining = np.logical_not(is_single_instance)

# Random 80/20 split of all the other samples, with a fixed seed.
X_remaining, X_test, y_remaining, y_test = train_test_split(
    X[is_remaining],
    y[is_remaining],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Training set = training part of the split + all single-instance samples.
X_train = np.concatenate([X_remaining, X[is_single_instance]], axis=0)
y_train = np.concatenate([y_remaining, y[is_single_instance]])


# (Optional) Build the torch-fastText model (without training it)

In [None]:
# Parameters for model building (passed to the `torchFastText` constructor in
# the next cell).

NUM_BUCKETS = int(2e6) # Number of rows in the embedding matrix
EMBED_DIM = 180 # Dimension of the embedding = number of columns in the embedding matrix
MIN_COUNT = 1 # Minimum number of occurrences of a word in the corpus to be included in the vocabulary
MIN_N = 3 # Minimum length of char n-grams
MAX_N = 6 # Maximum length of char n-grams
LEN_WORD_NGRAMS = 3 # Length of word n-grams
SPARSE = False # Whether to use sparse Embedding layer for fast computation (see PyTorch documentation)

In [None]:
# Instantiate the torchFastText wrapper with the constants defined above.
# According to the surrounding notebook text, the underlying PyTorch model and
# tokenizer only become available once `build` (or `train`) has been called.
model = torchFastText(
    num_buckets=NUM_BUCKETS,
    embedding_dim=EMBED_DIM,
    min_count=MIN_COUNT,
    min_n=MIN_N,
    max_n=MAX_N,
    len_word_ngrams=LEN_WORD_NGRAMS,
    sparse = SPARSE
)

We build the model using the training data. We now have access to the PyTorch model and a tokenizer.

In [None]:
# Build the model from the training split only (no training happens here).
model.build(X_train, y_train)

In [None]:
# Inspect the two objects made available by `build`.
print(model.pytorch_model)
print(model.tokenizer)

This step is useful to initialize the full torchFastText model without training it, if needed for some reason. But it is not necessary: we could have launched the training directly (building is then handled automatically if needed).

# Train a torchFastText model

In [None]:
# Training hyper-parameters, passed to `model.train` in the next cell.
# PATIENCE is used there for both `patience_scheduler` and `patience_train`.
NUM_EPOCHS = 5
BATCH_SIZE = 32
PATIENCE = 3
LR = 4e-3

In [None]:
# Train on the training split; the test split is passed as the second
# (X, y) pair — presumably used as validation data for the scheduler and the
# patience-based stopping (confirm against `torchFastText.train`).
# NOTE(review): if so, the test set doubles as the validation set, so it is no
# longer an independent hold-out.
model.train(
    X_train,
    y_train,
    X_test,
    y_test,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    patience_scheduler=PATIENCE,
    patience_train=PATIENCE,
    lr=LR,
    verbose = True
)

In [None]:
# NOTE(review): this cell looks like leftover scratch code that builds the
# tokenizer and datasets by hand. `NGramTokenizer`, `FastTextModelDataset`,
# `min_count`, `min_n`, `max_n`, `buckets`, `word_ngrams`, `X_val` and `y_val`
# are not defined or imported in any cell visible in this notebook, and
# `X_train` / `y_train` are built earlier as NumPy arrays, which do not support
# the column access and `.to_list()` calls used here — confirm whether the cell
# should be removed or repaired.
training_text = X_train[text_feature].to_list()
tokenizer = NGramTokenizer(min_count, min_n, max_n, buckets, word_ngrams, training_text)

# One list of integer codes per categorical column, plus the texts and targets.
train_dataset = FastTextModelDataset(
    categorical_variables=[
        X_train[column].astype(int).to_list() for column in X_train[categorical_features]
    ],
    texts=training_text,
    outputs=y_train.to_list(),
    tokenizer=tokenizer,
)
# Same construction for the validation data, sharing the tokenizer.
val_dataset = FastTextModelDataset(
    categorical_variables=[
        X_val[column].astype(int).to_list() for column in X_val[categorical_features]
    ],
    texts=X_val[text_feature].to_list(),
    outputs=y_val.to_list(),
    tokenizer=tokenizer,
)

In [None]:
# Wrap both datasets in dataloaders.
# NOTE(review): lowercase `batch_size` is not defined in any visible cell (the
# notebook defines `BATCH_SIZE`) — confirm.
train_dataloader = train_dataset.create_dataloader(batch_size=batch_size, num_workers=4)
val_dataloader = val_dataset.create_dataloader(batch_size=batch_size, num_workers=4)

In [None]:
# Pull a single batch from the training dataloader for manual inspection.
x = next(iter(train_dataloader))

In [None]:
# Number of elements in the batch object.
len(x)

In [None]:
# Number of target classes and, per categorical feature, max value + 1.
# NOTE(review): at this point `y` has been rebound to the label array and
# `X_train` is a NumPy array, so `df[y]` and `X_train[feature]` presumably do
# not do what this cell intends — confirm before re-using it.
num_classes = df[y].nunique()
categorical_vocabulary_sizes = [np.max(X_train[feature]) + 1 for feature in categorical_features]

In [None]:
# Display the computed sizes.
categorical_vocabulary_sizes

In [None]:
# Number of distinct values in the "CJ" column, for comparison by eye.
df.CJ.nunique()

In [None]:
# NOTE(review): `FastTextModel`, `embedding_dim`, `buckets` and `sparse` are not
# defined or imported in any visible cell. This assignment also rebinds `model`,
# replacing the `torchFastText` instance trained earlier — confirm whether this
# cell should be kept.
# The last embedding row (index `buckets + tokenizer.get_nwords()`) is used as
# the padding index, hence the `+ 1` in `vocab_size`.
model = FastTextModel(
    tokenizer=tokenizer,
    nace_encoder=encoder,
    embedding_dim=embedding_dim,
    vocab_size=buckets + tokenizer.get_nwords() + 1,
    num_classes=num_classes,
    categorical_vocabulary_sizes=categorical_vocabulary_sizes,
    padding_idx=buckets + tokenizer.get_nwords(),
    sparse=sparse,
    direct_bagging=True,
)

In [None]:
# Forward pass on the first two elements of the sample batch; show the output shape.
model(x[0], x[1]).shape

In [None]:
# Define optimizer & scheduler
# NOTE(review): `SGD`, `Adam`, `torch`, `nn`, `FastTextModule`, `sparse`, `lr`
# and `patience` are not defined or imported in any visible cell — confirm
# whether this cell should be removed or repaired.
# The optimizer and scheduler are passed as classes together with their keyword
# arguments, not as instances.
if sparse:
    optimizer = SGD
else:
    optimizer = Adam
optimizer_params = {"lr": lr}
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau
scheduler_params = {
    "mode": "min",
    "patience": patience,
}
loss = nn.CrossEntropyLoss()
# Lightning module
module = FastTextModule(
    model=model,
    loss=loss,
    optimizer=optimizer,
    optimizer_params=optimizer_params,
    scheduler=scheduler,
    scheduler_params=scheduler_params,
    scheduler_interval="epoch",
)

In [None]:
# Manually run one training step on the sample batch (batch index 0).
module.training_step(x, 0)