# Analyse dataset

## Misc

In [1]:
import torch
print(torch.__version__)

2.6.0+cu118


In [2]:
import torch
print("CUDA disponible :", torch.cuda.is_available())
print("Nombre de GPU :", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Nom du GPU :", torch.cuda.get_device_name(0))
    print("Version CUDA utilisée par PyTorch :", torch.version.cuda)

CUDA disponible : True
Nombre de GPU : 1
Nom du GPU : NVIDIA GeForce GTX 1650 Ti
Version CUDA utilisée par PyTorch : 11.8


In [3]:
import logging

# Niveau de log : DEBUG pour tout voir
logging.basicConfig(level=logging.WARNING)

In [4]:
import torch
torch.cuda.empty_cache()

In [5]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [6]:
!nvidia-smi

Sun Apr 27 12:04:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   61C    P8              4W /   50W |     379MiB /   4096MiB |     35%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


## Analyse

In [None]:
import pandas as pd
import json
from ydata_profiling import ProfileReport

In [None]:
df = pd.read_json('dataset_big_patent_v3.json')
profile = ProfileReport(df, title="Profiling Report")

In [None]:
profile.to_file("report.html")

2 versions :
https://sbert.net/docs/sentence_transformer/loss_overview.html
1. (anchor, positive, negative) triplets -> Ommission de query -> MultipleNegativesRankingLoss
2. Create a cutom Loss function for quadruplet -> (anchor, query, positive, negative)

Nouvelle idée : mix des deux triplets et une seule loss fonction pour les deux

# V1

In [9]:
!pip install sentence-transformers



In [10]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

# Load a model to finetune
model = SentenceTransformer(
    "intfloat/multilingual-e5-large-instruct", # Restart kernel if not working but it is a functionnal V0-> "sentence-transformers/all-MiniLM-L6-v2",
    trust_remote_code=True,
    model_card_data=SentenceTransformerModelCardData(
        language="en"
    )
)




In [11]:
dataset = load_dataset("json", data_files="dataset_big_patent_v3.json", split="train")

dataset = dataset.remove_columns('query')

# Split en train (80%) et test (20%)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [12]:
# from transformers import MarianMTModel, MarianTokenizer

# def back_translate(text, src_lang='en', tgt_lang='fr'):
#     # Traduction source -> cible
#     model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
#     tokenizer = MarianTokenizer.from_pretrained(model_name)
    
#     model = MarianMTModel.from_pretrained(model_name)
#     translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
#     intermediate_text = tokenizer.decode(translated[0], skip_special_tokens=True)

#     # Source
#     model_name = f'Helsinki-NLP/opus-mt-{tgt_lang}-{src_lang}'
    
#     tokenizer = MarianTokenizer.from_pretrained(model_name)
#     model = MarianMTModel.from_pretrained(model_name)
#     back_translated = model.generate(**tokenizer(intermediate_text, return_tensors="pt", padding=True))
#     return tokenizer.decode(back_translated[0], skip_special_tokens=True)

In [13]:
!pip install nlpaug



In [14]:
!pip install sentencepiece



In [15]:
!pip install sacremoses



In [16]:
!pip install huggingface_hub[hf_xet]



In [17]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Matts\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [18]:
# Data augmentation -> Too long

# import nlpaug.augmenter.word as naw
# from transformers import pipeline
# from tqdm import tqdm

# def augment_triplet(anchor, positive, negative):

#     paraphraser = pipeline('text2text-generation', model='t5-base')
#     positive_aug = paraphraser(positive)[0]['generated_text']
    
#     # Remplacer 30% des mots dans negative
#     aug = naw.SynonymAug(aug_src='wordnet')
#     negative_aug = aug.augment(negative)[0]
    
#     return (anchor, positive_aug, negative_aug)

# augmented_data = []
# for a, p, n in tqdm(zip(train_dataset["anchor"], train_dataset["positive"], train_dataset["negative"])):
#     augmented_data.append(augment_triplet(a, p, n))

# dataset = Dataset.from_dict({
#     "anchor": [t[0] for t in augmented_data],
#     "positive": [t[1] for t in augmented_data],
#     "negative": [t[2] for t in augmented_data]
# })


In [19]:
loss = MultipleNegativesRankingLoss(model)

In [20]:
args = SentenceTransformerTrainingArguments(
    # Required
    output_dir="models/big-patent-triplet",
    # Optional
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    fp16=True,  
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="big-patent-triplet-V1", 
)

In [21]:
model.max_seq_length = 64

In [22]:
dev_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name="big-patent-dev",
    batch_size=2
)
dev_evaluator(model)

{'big-patent-dev_cosine_accuracy': 0.6399999856948853}

In [23]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

triplet_evaluator = TripletEvaluator(
    anchors=train_dataset["anchor"],
    positives=train_dataset["positive"],
    negatives=train_dataset["negative"],
    name="V1_eval_triplet"
)
triplet_evaluator(model)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss


{'V1_eval_triplet_cosine_accuracy': 0.7794486284255981}

In [27]:
model.save_pretrained("models/big-patent-e5-large-triplet-V2/final")

In [25]:
# Next step :
# Data augmentation : Trop long
# fine tuning : Done -> 0.78 acc
# changer de modèle : ? 
# Multi triplet : Out Of memory -> Modèle moins gourmand ?

In [None]:
from transformers.utils import is_torch_bf16_gpu_available
print(is_torch_bf16_gpu_available())

In [None]:
# https://huggingface.co/blog/train-sentence-transformers

# V2 -> CUDA OUT OF MEMORY

In [29]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import CoSENTLoss, MultipleNegativesRankingLoss, SoftmaxLoss

# Load a model to finetune
model = SentenceTransformer(
    "intfloat/multilingual-e5-large-instruct", # Restart kernel if not working but it is a functionnal V0-> "sentence-transformers/all-MiniLM-L6-v2",
    trust_remote_code=True,
    model_card_data=SentenceTransformerModelCardData(
        language="en"
    )
)

In [36]:
# 2. Loadseveral Datasets to train with
# (anchor, positive, negative)
anchor_triplet_train = load_dataset("json", data_files="dataset_big_patent_v3.json", split="train")
# (query, positive, negative)
query_triplet_train = load_dataset("json", data_files="dataset_big_patent_v3.json", split="train")

anchor_triplet_train = anchor_triplet_train.remove_columns('query')
query_triplet_train = query_triplet_train.remove_columns('anchor')

# Split en train (80%) et test (20%)
def split(dataset) :
    split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = split_dataset["train"]
    test_dataset = split_dataset["test"]
    return train_dataset, test_dataset

train_dataset_anchor, test_dataset_anchor = split(anchor_triplet_train) 
train_dataset_query, test_dataset_query = split(query_triplet_train) 

# Combine all datasets into a dictionary with dataset names to datasets
train_dataset = {
    "anchor-triplet": train_dataset_anchor,
    "query-triplet": train_dataset_query,
}

# Use a dictionary for the evaluation dataset too, or just use one dataset or none at all
test_dataset  = {
    "anchor-triplet": test_dataset_anchor,
    "query-triplet": test_dataset_query,
}

In [31]:
# 4. Load several loss functions to train with
mnrl_loss = MultipleNegativesRankingLoss(model)

In [33]:
from sentence_transformers.evaluation import SequentialEvaluator

# Create a mapping with dataset names to loss functions, so the trainer knows which loss to apply where
# Note: You can also just use one loss if all your training/evaluation datasets use the same loss
losses = {
    "anchor-triplet": mnrl_loss,
    "query-triplet": mnrl_loss,
}

# Evaluators

dev_evaluator_1 = TripletEvaluator(
    anchors=train_dataset_anchor["anchor"],
    positives=train_dataset_anchor["positive"],
    negatives=train_dataset_anchor["negative"],
    name="big-patent-dev",
    batch_size=2
)

dev_evaluator_2 = TripletEvaluator(
    anchors=train_dataset_query["query"],
    positives=train_dataset_query["positive"],
    negatives=train_dataset_query["negative"],
    name="big-patent-dev",
    batch_size=2
)

# Combines evaluators
dev_evaluator = SequentialEvaluator([dev_evaluator_1, dev_evaluator_2], 
                                    main_score_function=lambda scores: scores[0])

In [38]:
args = SentenceTransformerTrainingArguments(
    # Required
    output_dir="models/big-patent-triplet",
    # Optional
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    fp16=True,  
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="big-patent-triplet-V1", 
)

In [39]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss=losses,
    evaluator=dev_evaluator,
)
trainer.train()

anchor_triplet_evaluator = TripletEvaluator(
    anchors=test_dataset_anchor["anchor"],
    positives=test_dataset_anchor["positive"],
    negatives=test_dataset_anchor["negative"],
    name="V1_eval_triplet"
)
anchor_triplet_evaluator(model)

query_triplet_evaluator = TripletEvaluator(
    anchors=test_dataset_query["query"],
    positives=test_dataset_query["positive"],
    negatives=test_dataset_query["negative"],
    name="V1_eval_triplet"
)
query_triplet_evaluator(model)

OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 17.23 GiB is allocated by PyTorch, and 606.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# 6. Save the trained model and optionally push it to the Hugging Face Hub
model.save_pretrained("big-patent-e5-large-multi-dataset")