In [25]:

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel , prepare_model_for_kbit_training , get_peft_model
from trl import SFTTrainer


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np

In [27]:
import os
# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"

In [28]:
df = pd.read_csv('fr_aides.csv', on_bad_lines='skip')

In [29]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

In [30]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [31]:
import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
import torch
import json

df = pd.read_csv('fr_aides.csv', delimiter=';')


# Map each unique tag to a numerical label
tag_to_label = {tag: idx for idx, tag in enumerate(df['tag'].unique())}
df['labels'] = df['tag'].map(tag_to_label)


with open('tag_to_label.json', 'w') as f:
    json.dump(tag_to_label, f)


dataset = Dataset.from_pandas(df)

model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(tag_to_label))
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['projets'], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

device = torch.device('cpu')

# Move the model to the CPU
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    no_cuda=True,  # Explicitly disable CUDA
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Train the model
trainer.train()


                                           aide_href  \
0  https://les-aides.fr/aide/cUFf3w/ddfip/frr-exo...   
1  https://les-aides.fr/aide/RjNf3w/region-nouvel...   
2  https://les-aides.fr/aide/RjNv3w/region-nouvel...   
3  https://les-aides.fr/aide/cjmP3w/region-nouvel...   
4  https://les-aides.fr/aide/cRWf3w/region-nouvel...   

                               tag  \
0  Allègement des charges fiscales   
1                       Subvention   
2                       Subvention   
3                       Subvention   
4        Prise en charge des coûts   

                                             aid  \
0    FRR : exonération d'impôt sur les bénéfices   
1               Aide à l'hôtellerie indépendante   
2  Aide à l'hôtellerie de plein air indépendante   
3       Soutien aux hébergements - Gîtes d'étape   
4                      Objectif transmission TPE   

                                             projets  
0  Les entreprises créées ou reprises en zone Fra...  
1  Accompag

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1874 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/1770 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.8549115061759949, 'eval_runtime': 227.4494, 'eval_samples_per_second': 8.239, 'eval_steps_per_second': 0.519, 'epoch': 1.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.6406155824661255, 'eval_runtime': 2155.4064, 'eval_samples_per_second': 0.869, 'eval_steps_per_second': 0.055, 'epoch': 2.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.5542246103286743, 'eval_runtime': 229.7639, 'eval_samples_per_second': 8.156, 'eval_steps_per_second': 0.514, 'epoch': 3.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.4253044128417969, 'eval_runtime': 315.5357, 'eval_samples_per_second': 5.939, 'eval_steps_per_second': 0.374, 'epoch': 4.0}
{'loss': 0.7957, 'grad_norm': 2.2997381687164307, 'learning_rate': 1.4350282485875708e-05, 'epoch': 4.24}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.3245341181755066, 'eval_runtime': 206.3267, 'eval_samples_per_second': 9.083, 'eval_steps_per_second': 0.572, 'epoch': 5.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.25927257537841797, 'eval_runtime': 221.3598, 'eval_samples_per_second': 8.466, 'eval_steps_per_second': 0.533, 'epoch': 6.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.2661759853363037, 'eval_runtime': 230.1316, 'eval_samples_per_second': 8.143, 'eval_steps_per_second': 0.513, 'epoch': 7.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.1619272083044052, 'eval_runtime': 218.0867, 'eval_samples_per_second': 8.593, 'eval_steps_per_second': 0.541, 'epoch': 8.0}
{'loss': 0.3315, 'grad_norm': 18.470346450805664, 'learning_rate': 8.700564971751413e-06, 'epoch': 8.47}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.13669157028198242, 'eval_runtime': 220.3377, 'eval_samples_per_second': 8.505, 'eval_steps_per_second': 0.536, 'epoch': 9.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.12305154651403427, 'eval_runtime': 228.5675, 'eval_samples_per_second': 8.199, 'eval_steps_per_second': 0.516, 'epoch': 10.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.09621628373861313, 'eval_runtime': 1179.8651, 'eval_samples_per_second': 1.588, 'eval_steps_per_second': 0.1, 'epoch': 11.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.08449530601501465, 'eval_runtime': 231.057, 'eval_samples_per_second': 8.111, 'eval_steps_per_second': 0.511, 'epoch': 12.0}
{'loss': 0.1386, 'grad_norm': 0.4945200979709625, 'learning_rate': 3.0508474576271192e-06, 'epoch': 12.71}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.07695002108812332, 'eval_runtime': 221.5933, 'eval_samples_per_second': 8.457, 'eval_steps_per_second': 0.533, 'epoch': 13.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.07259970903396606, 'eval_runtime': 228.3828, 'eval_samples_per_second': 8.206, 'eval_steps_per_second': 0.517, 'epoch': 14.0}


  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.07161016762256622, 'eval_runtime': 222.9146, 'eval_samples_per_second': 8.407, 'eval_steps_per_second': 0.529, 'epoch': 15.0}
{'train_runtime': 31584.788, 'train_samples_per_second': 0.89, 'train_steps_per_second': 0.056, 'train_loss': 0.37260923008460783, 'epoch': 15.0}


TrainOutput(global_step=1770, training_loss=0.37260923008460783, metrics={'train_runtime': 31584.788, 'train_samples_per_second': 0.89, 'train_steps_per_second': 0.056, 'total_flos': 3724455450193920.0, 'train_loss': 0.37260923008460783, 'epoch': 15.0})

In [32]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(labels, pred)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

trainer.evaluate()

results = trainer.evaluate()
print(results)



  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

{'eval_loss': 0.07161016762256622, 'eval_runtime': 236.7788, 'eval_samples_per_second': 7.915, 'eval_steps_per_second': 0.498, 'epoch': 15.0}


In [33]:
model.save_pretrained('./results/model')
tokenizer.save_pretrained('./results/tokenizer')

model = AutoModelForSequenceClassification.from_pretrained('./results/model')
tokenizer = AutoTokenizer.from_pretrained('./results/tokenizer')


In [34]:
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

import json

tag_to_label = {tag: idx for idx, tag in enumerate(df['tag'].unique())}
df['labels'] = df['tag'].map(tag_to_label)

with open('tag_to_label.json', 'w') as f:
    json.dump(tag_to_label, f)


with open('tag_to_label.json', 'r') as f:
    tag_to_label = json.load(f)


label_to_tag = {v: k for k, v in tag_to_label.items()}


def predict(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    text_label = label_to_tag[predictions.item()]
    return text_label




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
df = pd.read_csv('fr_aides.csv', delimiter=';')

sample_df = df.sample(n=500, random_state=42)

test_texts = sample_df['projets'].tolist()
true_labels = sample_df['tag'].tolist()


model_name = "./results/model"  # Path to the fine-tuned model
tokenizer_name = "./results/tokenizer"  # Path to the fine-tuned tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

with open('tag_to_label.json', 'r') as f:
    tag_to_label = json.load(f)

# Reverse the mapping to get label_to_tag
label_to_tag = {v: k for k, v in tag_to_label.items()}


In [36]:
def test_model(texts, true_labels):
    predicted_labels = [predict(text) for text in texts]
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

results = test_model(test_texts, true_labels)
print(results)

{'accuracy': 0.982, 'precision': 0.9749900161030596, 'recall': 0.982, 'f1': 0.9764624186245748}


  _warn_prf(average, modifier, msg_start, len(result))
