# Analyse dataset

In [1]:
import torch
print(torch.__version__)

2.6.0+cu118


In [2]:
import torch
print("CUDA disponible :", torch.cuda.is_available())
print("Nombre de GPU :", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Nom du GPU :", torch.cuda.get_device_name(0))
    print("Version CUDA utilisée par PyTorch :", torch.version.cuda)

CUDA disponible : True
Nombre de GPU : 1
Nom du GPU : NVIDIA GeForce GTX 1650 Ti
Version CUDA utilisée par PyTorch : 11.8


In [3]:
import logging

# Niveau de log : DEBUG pour tout voir
logging.basicConfig(level=logging.WARNING)

In [4]:
import torch
torch.cuda.empty_cache()

In [5]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [6]:
!nvidia-smi

Sat Apr 26 19:14:44 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   65C    P0             10W /   50W |     292MiB /   4096MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [8]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from datasets import load_dataset, Dataset, DatasetDict

# dataset = load_dataset("json", data_files="dataset_big_patent_v3.json", split="train")

# dataset = dataset.remove_columns('query')

# # Split en train (80%) et test (20%)
# split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
# train_dataset = split_dataset["train"]
# test_dataset = split_dataset["test"]

2 versions :
https://sbert.net/docs/sentence_transformer/loss_overview.html
1. (anchor, positive, negative) triplets -> Ommission de query -> MultipleNegativesRankingLoss
2. Create a cutom Loss function for quadruplet -> (anchor, query, positive, negative)

In [9]:
!pip install sentence-transformers



In [10]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

# 1. Load a model to finetune with 2. (Optional) model card data
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2", # Restart kernel if not working but it is a functionnal V0-> "sentence-transformers/all-MiniLM-L6-v2",
    trust_remote_code=True,
    model_card_data=SentenceTransformerModelCardData(
        language="en"
    )
)




In [11]:
# 3. Load a dataset to finetune on
dataset = load_dataset("json", data_files="dataset_big_patent_v3.json", split="train")

dataset = dataset.remove_columns('query')

# Split en train (80%) et test (20%)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [12]:
# 4. Define a loss function
loss = MultipleNegativesRankingLoss(model)

In [13]:
# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/big-patent-triplet",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if your GPU can't handle FP16
    bf16=False,  # Set to True if your GPU supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # Losses using "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="big-patent-triplet-V1",  # Used in W&B if `wandb` is installed
)

In [14]:
model.max_seq_length = 64

In [15]:
# 6. (Optional) Create an evaluator & evaluate the base model
dev_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name="big-patent-dev",
    batch_size=2
)
dev_evaluator(model)

{'big-patent-dev_cosine_accuracy': 0.6600000262260437}

In [17]:
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

# (Optional) Evaluate the trained model on the test set, after training completes
triplet_evaluator = TripletEvaluator(
    anchors=train_dataset["anchor"],
    positives=train_dataset["positive"],
    negatives=train_dataset["negative"],
    name="V1_eval_triplet"
)
triplet_evaluator(model)

Step,Training Loss,Validation Loss


{'V1_eval_triplet_cosine_accuracy': 0.7343358397483826}

In [19]:
# 8. Save the trained model
model.save_pretrained("models/big-patent-all-MiniLM-L6-triplet/final")

In [None]:
# Data vizualisation
import pandas as pd

df = pd.DataFrame(dataset)
del df["query"]
df

In [None]:
# # Quadruplet original
# quadruplet = ("anchor", "query", "positive", "negative")

# # Transformation
# triplet_1 = InputExample(texts=[quadruplet[0], quadruplet[2], quadruplet[3]])  # (anchor, positive, negative)
# triplet_2 = InputExample(texts=[quadruplet[1], quadruplet[2], quadruplet[3]])  # (query, positive, negative)

In [None]:
# results = triplet_evaluator(model)
# print(triplet_evaluator.primary_metric)  # ex: "mon_eval_triplet_cosine_accuracy"
# print(results[triplet_evaluator.primary_metric])  # ex: 0.92 pour 92% d’exactitude

In [None]:
# loss = losses.CachedMultipleNegativesRankingLoss(
#     model,
#     mini_batch_size=128  # Traite 128 ex. par sous-lot
# )

In [None]:
from transformers.utils import is_torch_bf16_gpu_available
print(is_torch_bf16_gpu_available())

In [None]:
# # Pendant le fit :
# model.fit(
#     train_objectives=[...],
#     evaluator=evaluator,
#     epochs=1,
#     evaluation_steps=1000
# )

In [None]:
# https://huggingface.co/blog/train-sentence-transformers

In [None]:
# def preprocess_function(dataset):
#     texts = [f"Context: {c}\nQuestion: {q}\nAnswer: {a}" for c, q, a in zip(dataset["anchor"], dataset["query"], dataset["positive"])]
#     # Tokenize
#     model_inputs = tokenizer(texts, max_length=384, truncation=True, padding="max_length")
#     # labels = input_ids
#     model_inputs["labels"] = model_inputs["input_ids"].copy()
#     return model_inputs

In [None]:
import torch

# https://www.learnpytorch.io/pytorch_cheatsheet/
# Setup device-agnostic code 
if torch.cuda.is_available():
    device = "cuda" # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = "mps" # Apple GPU
else:
    device = "cpu" # Defaults to CPU if NVIDIA GPU/Apple GPU aren't available

In [None]:
# # Zero-shot performance

# prompt = "Question: How does the crowdsourcing method is used to adjust a video game element ?\nAnswer:" # Expected : A processor retrieves a plurality of received game element feedback data from a plurality of users of a game and causes the game element to be adjusted during execution of the game 
# inputs = tokenizer(prompt, return_tensors="pt").to(device)
# outputs = model.generate(**inputs, max_new_tokens=50)
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(response)

In [None]:
# # Fine-tuned performance after

# prompt = "Question: How does the crowdsourcing method is used to adjust a video game element ?\nAnswer:" # Expected : A processor retrieves a plurality of received game element feedback data from a plurality of users of a game and causes the game element to be adjusted during execution of the game
# inputs = tokenizer(prompt, return_tensors="pt").to(device)
# outputs = model.generate(**inputs, max_new_tokens=50)
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(response)