In [None]:
import os
from getpass import getpass

# Select the `projet-ape-sa` access key for the S3 client used below.
# The secret key is deliberately NOT stored in the notebook: it is read from the
# PROJET_APE_SA_SECRET_KEY environment variable when set, otherwise prompted for.
# NOTE(review): the secret that used to be committed in this cell must be rotated.
os.environ["AWS_ACCESS_KEY_ID"] = "projet-ape-sa"
os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get(
    "PROJET_APE_SA_SECRET_KEY"
) or getpass("Secret access key for projet-ape-sa: ")

In [None]:
import os

# Remove the session token from the environment if one is present.
# `pop` with a default keeps the cell re-runnable: `del` raises KeyError when the
# variable is missing (fresh environment, or second execution of this cell).
os.environ.pop("AWS_SESSION_TOKEN", None)

In [None]:
!pip install nltk
!pip install sentencepiece

In [None]:
import nltk
# Download the NLTK "stopwords" resource. It is not referenced directly in this
# notebook; presumably the project preprocessor needs it — TODO confirm.
nltk.download('stopwords')

In [None]:
import sys

# Make the repository root and its `src/` directory importable from this notebook.
sys.path.extend(["../", "../src/"])
from src.camembert.camembert_preprocessor import CamembertPreprocessor
from src.camembert.camembert_trainer import CamembertTrainer

In [None]:
# Instantiate the project's preprocessing helper (used below to build df_train / df_test).
preprocessor = CamembertPreprocessor()
# NOTE(review): `trainer` is rebound to a transformers `Trainer` further down in
# this notebook, and this CamembertTrainer instance is not used before that.
trainer = CamembertTrainer()

In [None]:
import pyarrow.parquet as pq
import s3fs


# S3-compatible filesystem pointed at the MinIO endpoint; credentials are taken
# from the environment variables configured at the top of the notebook.
minio_endpoint = "https://" + "minio.lab.sspcloud.fr"
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": minio_endpoint})

# Read the parquet extract through that filesystem and convert it to pandas.
sirene_parquet_path = "projet-ape/extractions/20240124_sirene4.parquet"
df = pq.read_table(sirene_parquet_path, filesystem=fs).to_pandas()

df.head()


In [None]:
# Give the two raw columns the names used for the categorical features downstream.
categorical_column_names = {"activ_nat_et": "NAT_SICORE", "activ_surf_et": "SURF"}
df.rename(columns=categorical_column_names, inplace=True)

In [None]:
# Expose the raw `apet_finale` column under the target name passed to the preprocessor.
target_column_name = {"apet_finale": "APE_NIV5"}
df.rename(columns=target_column_name, inplace=True)

In [None]:
# Run the project preprocessor on the raw frame. It returns two frames, unpacked
# here as the training and test sets; the target, free-text and categorical
# columns are identified by the names set in the rename cells above.
df_train, df_test = preprocessor.preprocess(
    df=df,
    y="APE_NIV5",
    text_feature="activ_pr_lib_et",
    categorical_features=["NAT_SICORE", "SURF"],
)

In [None]:
# Preview the first rows of the preprocessed training frame.
df_train.head()

In [None]:
from transformers import CamembertTokenizer, Trainer, TrainingArguments
from camembert.camembert_model import CustomCamembertModel

In [None]:
import torch

# Prefer the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Tokenizer matching the pretrained checkpoint loaded for the model below.
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base")

In [None]:
def tokenize(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level tokenizer.

    Truncation is enabled (``truncation=True``); no padding is requested here.

    Args:
        examples: Mapping-like object exposing a "text" entry.

    Returns:
        The result of calling the tokenizer on that text.
    """
    text = examples["text"]
    return tokenizer(text, truncation=True)

In [None]:
# Column roles reused by the cells below.
text_feature = "activ_pr_lib_et"  # free-text input column
categorical_features = ["NAT_SICORE", "SURF"]  # extra categorical inputs
y = "APE_NIV5"  # target column

In [None]:
from utils.mappings import mappings

In [None]:
# The second positional argument is the number of entries in the APE_NIV5 mapping
# (presumably the number of target classes — confirm against CustomCamembertModel).
n_ape_niv5_entries = len(mappings.get("APE_NIV5"))
model = CustomCamembertModel.from_pretrained(
    "camembert/camembert-base",
    n_ape_niv5_entries,
    categorical_features=categorical_features,
)

In [None]:
# Training hyperparameters consumed by the split and TrainingArguments cells below.
learning_rate = 5e-5
batch_size = 8  # used for both the train and eval per-device batch size
num_epochs = 2
train_proportion = 0.8  # the remainder is held out by train_test_split

In [None]:
# Names of all input columns: the text column first, then the categorical ones (if any).
extra_features = [] if categorical_features is None else categorical_features
features = [text_feature, *extra_features]

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [None]:
# Switch to the column names used from here on: "text" (read by `tokenize`) and "labels".
training_column_names = {text_feature: "text", y: "labels"}
df_train = df_train.rename(columns=training_column_names)
df_train.head()

In [None]:
# Inspect the distinct values present in the NAT_SICORE column.
df_train.NAT_SICORE.unique()

In [None]:
# Inspect the distinct values present in the SURF column (before the cast below).
df_train.SURF.unique()

In [None]:
# Replace missing SURF values with 0, then store the column as integers.
surf_without_nan = df_train["SURF"].fillna(0)
df_train["SURF"] = surf_without_nan.astype("int")
df_train.head()

In [None]:
# Pack the categorical feature values of each row into one list per row.
# Converting the sub-frame to a NumPy array once replaces the row-wise
# `apply(lambda ..., axis=1)`, which calls a Python function for every row;
# it also behaves sensibly on an empty frame.
df_train["categorical_inputs"] = df_train[categorical_features].to_numpy().tolist()
df_train.head()

In [None]:
# The individual categorical columns now live in "categorical_inputs"; drop them.
df_train = df_train.drop(categorical_features, axis="columns")
df_train.head()

In [None]:
# Drop the other APE level columns; only "labels" is kept as the target.
other_ape_levels = ["APE_NIV1", "APE_NIV2", "APE_NIV3", "APE_NIV4"]
df_train = df_train.drop(columns=other_ape_levels)
df_train.head()

In [None]:
# Keep only the rows whose label is present.
has_label = df_train["labels"].notna()
df_train = df_train[has_label]
df_train.head()

In [None]:
# Store the labels as integers (rows with a missing label were dropped above).
df_train["labels"] = df_train["labels"].astype(int)
df_train.head()

In [None]:
# Keep only the first 2000 rows (current row order, no sampling) for the rest of the notebook.
df_train = df_train.iloc[:2000]

In [None]:
# Shuffled train/validation split with a fixed seed; the validation share is the
# complement of `train_proportion`.
model_columns = ["text", "labels", "categorical_inputs"]
train_df, val_df = train_test_split(
    df_train[model_columns],
    shuffle=True,
    random_state=0,
    test_size=1 - train_proportion,
)

In [None]:
# Preview the training part of the split.
train_df.head()

In [None]:
# Build Hugging Face datasets from the two pandas splits. Both frames come out of
# a shuffled train_test_split, so their index is not a plain RangeIndex;
# preserve_index=False keeps it from being stored as an extra
# `__index_level_0__` column in the datasets.
train_ds = Dataset.from_pandas(train_df, split="train", preserve_index=False)
# NOTE: `test_ds` holds the validation split (`val_df`), not `df_test`.
test_ds = Dataset.from_pandas(val_df, split="test", preserve_index=False)

In [None]:
# Show the first example of the training dataset.
train_ds[0]

In [None]:
# Apply `tokenize` to both datasets (training split, then the validation split
# stored in `test_ds`).
tokenized_train_ds = train_ds.map(tokenize)
tokenized_test_ds = test_ds.map(tokenize)

In [None]:
# Show the first tokenized training example.
tokenized_train_ds[0]

In [None]:
!pip install accelerate -U

In [None]:
# Training configuration, driven by the hyperparameters defined earlier in the
# notebook (learning_rate, batch_size, num_epochs). Outputs go to "camembert_model".
training_args = TrainingArguments(
    output_dir="camembert_model",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # save on the same schedule as evaluation
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, the training arguments and the tokenized datasets into a
# transformers `Trainer`.
# NOTE(review): this rebinds `trainer`, replacing the CamembertTrainer instance
# created near the top of the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,  # validation split (built from `val_df`)
    tokenizer=tokenizer,
)

In [None]:
# Launch training with the configuration above; the returned value is displayed as the cell output.
trainer.train()