In [None]:
import os
import sys

import mlflow
import numpy as np
import pyarrow.parquet as pq
import s3fs
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import SGD, Adam

sys.path.append("../src/")

# Submodules are not loaded by importing the package alone; the visualisation
# helpers are called as `explainability.explainability_viz.*` further down.
import explainability.explainability_viz
import explainability.utils
from config.dataset import FastTextModelDataset
from config.preprocess import clean_and_tokenize_df
from pytorch_model import FastTextModel, FastTextModule
from tokenizer import NGramTokenizer

# Automatic discovery: when MLflow was launched before Jupyter/VSCode, the tracking URI
# is available in the environment and is echoed; otherwise a reminder is printed.
print(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "MLflow was not automatically discovered, a tracking URI must be provided manually.",
    )
)

%load_ext autoreload
%autoreload 2

In [None]:
# Fetch a registered PyTorch model from the MLflow registry, then keep both the loaded
# object (`module`) and the network it exposes through its `model` attribute.
model_name, version = "fasttext-pytorch", 8
module = mlflow.pytorch.load_model(model_uri="models:/{}/{}".format(model_name, version))
model = module.model

In [None]:
# Fetch the registered baseline through MLflow's generic pyfunc flavour; it is bound
# to `fasttext` and used as the reference in the timing comparison.
model_name, version = "fasttext", 1
fasttext = mlflow.pyfunc.load_model(model_uri="models:/{}/{}".format(model_name, version))

In [None]:
# Load a locally saved model from "model.pth". This rebinds `model`, replacing the
# object obtained from the MLflow registry in an earlier cell.
# NOTE(review): torch.load deserializes with pickle — only load checkpoints from a
# trusted source.
model = torch.load("model.pth")
# Turn off the `direct_bagging` flag (its exact effect is defined in the model class,
# which is not visible from this notebook).
model.direct_bagging = False
# Switch to evaluation mode; as the last expression of the cell, the returned object
# is also displayed.
model.eval()

In [None]:
# Sample free-text inputs (French) available for the explainability demo.
example_texts = [
    "Rénovation bâtiments dont: électricité, plomberie, serrurerie, menuiserie",
    "Gestion de portefeuille pour le compte de tiers et gestion de fonds d'investissement",
    "L'acquisition, l'apport, la propriété, la mise en valeur, la transformation, la construction, l'aménagement, l'administration, la location",
    "Saisie de documents et extraction de donnée pour le compte d'entreprises",
    "L'investissement immobilier, l'achat et vente de biens immobiliers en qualité de marchand de Biens, la promotion immobilière, la gestion loc",
    "La Société a pour objet en France et à l'étranger : - l'activité de conseil au profit de toute personne physique ou morale ; - l'enseignement et la formation de toutes matières en cours collectifs et particuliers à domicile ou dans des établissements scolaires",
]

# Only the first example is scored (previously the full list was assigned to `text`
# and immediately overwritten). Widen the slice to explain more inputs at once.
text = example_texts[:1]
topk = 4
# One value of the additional variable per input text.
params = {"additional_var": [1] * len(text)}
# Alternative parameter layout, kept for reference (relies on a batch `x` that is only
# built later in the notebook):
# params = {f"feature_{i}": x[1 + i][0].numpy().tolist() for i in range(len(x) - 1)}
pred, confidence, all_scores, all_scores_letters = model.predict_and_explain(
    text, params, top_k=topk
)
print(pred)
print(confidence)
# Render the word-level scores, then the letter-level scores, for the predictions.
explainability.explainability_viz.visualize_word_scores(all_scores, text, pred)
explainability.explainability_viz.visualize_letter_scores(all_scores_letters, text, pred)

In [None]:
import time

from tqdm import tqdm

# Number of timed repetitions per predictor.
N_RUNS = 1000


def _elapsed(fn, *args, **kwargs):
    """Return the seconds taken by a single call ``fn(*args, **kwargs)``.

    Uses ``time.perf_counter``, the clock intended for measuring short durations,
    instead of ``time.time``, which follows the (adjustable) system clock.
    The call's return value is discarded.
    """
    start = time.perf_counter()
    fn(*args, **kwargs)
    return time.perf_counter() - start


# Per-call latencies (in seconds) for each of the three prediction paths.
fasttext_times = []
pytorch_times = []
pytorch_explain_times = []
for _ in tqdm(range(N_RUNS)):
    # The three calls are interleaved so each path is sampled across the whole run.
    fasttext_times.append(_elapsed(fasttext.predict, text))
    pytorch_times.append(_elapsed(model.predict, text, params, explain=False))
    pytorch_explain_times.append(_elapsed(model.predict, text, params, explain=True))

In [None]:
# Report the mean per-call latency (seconds) of each prediction path.
for label, timings in (
    ("FastText", fasttext_times),
    ("Pytorch", pytorch_times),
    ("Pytorch Explain", pytorch_explain_times),
):
    print(f"{label}: {np.mean(timings)}")

In [None]:
# S3-compatible filesystem handle; the credentials are read from the environment.
fs = s3fs.S3FileSystem(
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
)

# Read the parquet extraction into a pandas DataFrame.
sirene_dataset = pq.ParquetDataset(
    "projet-ape/extractions/20241027_sirene4.parquet",
    filesystem=fs,
)
# fillna(np.nan): presumably normalises missing-value markers to NaN — TODO confirm.
df = sirene_dataset.read_pandas().to_pandas().fillna(np.nan)

In [None]:
# df = clean_text_feature(df, text_feature="libelle")
# Pass the frame through `clean_and_tokenize_df` (cleaning/tokenisation, per its name);
# its second return value is discarded.
df, _ = clean_and_tokenize_df(df)
# Replace the "apet_finale" labels by integer codes; `encoder` keeps the fitted mapping
# and is later handed to the model.
# NOTE(review): `df` is overwritten in place, so re-running this cell refits the encoder
# on already-encoded integers and loses the original label mapping.
encoder = LabelEncoder()
df["apet_finale"] = encoder.fit_transform(df["apet_finale"])

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Display the `model` attribute of the object currently bound to `model`.
model.model

In [None]:
# Column roles: target, free-text input and categorical inputs.
y = "apet_finale"
text_feature = "libelle"
categorical_features = ["EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]

# Model inputs: the text column first, followed by the categorical columns (if any).
features = [text_feature] + (
    categorical_features if categorical_features is not None else []
)

# Shuffled hold-out split with a fixed seed; the test share is written as 1 - 0.8.
split_options = {"test_size": 1 - 0.8, "random_state": 0, "shuffle": True}
X_train, X_val, y_train, y_val = train_test_split(df[features], df[y], **split_options)

In [None]:
# Shape of the training inputs once the first column (the text feature, given the
# order of `features`) is dropped and the remaining columns are cast to int.
X_train.values[:, 1:].astype(int).shape

In [None]:
# Check that `.values` of the training frame is a NumPy array.
isinstance(X_train.values, np.ndarray)

In [None]:
# Hyperparameters for the training run.
# NOTE(review): this rebinds `params`, which an earlier cell uses for the prediction
# inputs (`additional_var`); re-running those cells afterwards would pick up this dict.
params = dict(
    max_epochs=1,
    patience=3,
    train_proportion=0.8,
    batch_size=256,
    lr=0.004,
    buckets=2000000,
    dim=180,
    minCount=1,
    minn=3,
    maxn=6,
    wordNgrams=3,
    sparse=False,
)

# Unpack into the module-level names used by the following cells.
# Training loop settings.
max_epochs, patience, train_proportion = (
    params["max_epochs"],
    params["patience"],
    params["train_proportion"],
)
batch_size, lr = params["batch_size"], params["lr"]
# Embedding table settings.
buckets, embedding_dim, sparse = params["buckets"], params["dim"], params["sparse"]
# Tokenizer settings.
min_count, min_n, max_n = params["minCount"], params["minn"], params["maxn"]
word_ngrams = params["wordNgrams"]

In [None]:
from torchFastText import torchFastText

# Instantiate the packaged torchFastText wrapper with the hyperparameters unpacked above.
# NOTE(review): num_classes is hard-coded to 21 here, whereas a later cell derives the
# class count from df[y].nunique() — confirm which value is intended.
model = torchFastText(
    num_buckets=buckets,
    embedding_dim=embedding_dim,
    num_classes=21,
    min_count=min_count,
    min_n=min_n,
    max_n=max_n,
    len_word_ngrams=word_ngrams,
)

# Reuse the column names declared alongside the train/validation split instead of
# repeating the "libelle" / ["EVT", "CJ", "NAT", "TYP", "CRT", "SRF"] literals.
model.build(df[text_feature], df[categorical_features])

In [None]:
# The tokenizer is constructed with the training texts only.
training_text = X_train[text_feature].to_list()
tokenizer = NGramTokenizer(min_count, min_n, max_n, buckets, word_ngrams, training_text)

# Training dataset: one integer list per categorical column, plus texts and targets.
train_categoricals = [X_train[name].astype(int).to_list() for name in categorical_features]
train_dataset = FastTextModelDataset(
    categorical_variables=train_categoricals,
    texts=training_text,
    outputs=y_train.to_list(),
    tokenizer=tokenizer,
)

# Validation dataset, built the same way and sharing the tokenizer.
val_categoricals = [X_val[name].astype(int).to_list() for name in categorical_features]
val_dataset = FastTextModelDataset(
    categorical_variables=val_categoricals,
    texts=X_val[text_feature].to_list(),
    outputs=y_val.to_list(),
    tokenizer=tokenizer,
)

In [None]:
# Both loaders are created with the same batch size and worker count.
loader_options = {"batch_size": batch_size, "num_workers": 4}
train_dataloader = train_dataset.create_dataloader(**loader_options)
val_dataloader = val_dataset.create_dataloader(**loader_options)

In [None]:
# Pull the first batch from the training dataloader; `x` is reused by the checks below.
x = next(iter(train_dataloader))

In [None]:
# Number of elements making up the sampled batch.
len(x)

In [None]:
# Number of target classes, counted over the whole dataset.
num_classes = df[y].nunique()
# One vocabulary size per categorical feature: largest integer code + 1. Computed over
# the full frame rather than X_train only, so that a code appearing solely in the
# validation split still fits; cast to built-in ints.
categorical_vocabulary_sizes = [
    int(df[feature].astype(int).max()) + 1 for feature in categorical_features
]

In [None]:
# Show the vocabulary size computed for each categorical feature.
categorical_vocabulary_sizes

In [None]:
# Number of distinct values in the "CJ" column.
df.CJ.nunique()

In [None]:
# The padding index is `buckets + nwords`, i.e. the last row of a vocabulary holding
# `buckets + nwords + 1` entries.
padding_index = buckets + tokenizer.get_nwords()

# Rebinds `model` to a freshly initialised FastTextModel.
model = FastTextModel(
    tokenizer=tokenizer,
    nace_encoder=encoder,
    embedding_dim=embedding_dim,
    vocab_size=padding_index + 1,
    padding_idx=padding_index,
    num_classes=num_classes,
    categorical_vocabulary_sizes=categorical_vocabulary_sizes,
    sparse=sparse,
    direct_bagging=True,
)

In [None]:
# Feed the first two elements of the sampled batch through the model and show the
# shape of the output.
model(x[0], x[1]).shape

In [None]:
# Optimizer class: SGD when `sparse` is set, Adam otherwise.
optimizer = SGD if sparse else Adam
optimizer_params = dict(lr=lr)

# Scheduler class and its constructor arguments.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau
scheduler_params = dict(mode="min", patience=patience)

loss = nn.CrossEntropyLoss()

# Lightning module bundling the model with its loss, optimizer and scheduler settings;
# this rebinds `module`.
module = FastTextModule(
    model=model,
    loss=loss,
    optimizer=optimizer,
    optimizer_params=optimizer_params,
    scheduler=scheduler,
    scheduler_params=scheduler_params,
    scheduler_interval="epoch",
)

In [None]:
# Run one training step by hand on the sampled batch (second argument 0, presumably
# the batch index) and display what it returns.
module.training_step(x, 0)