# Analyze dataset

In [None]:
pip install ydata-profiling

In [None]:
import pandas as pd
import json
from ydata_profiling import ProfileReport

In [None]:
df = pd.read_json('dataset_big_patent_v3.json')
profile = ProfileReport(df, title="Profiling Report")

In [None]:
profile.to_file("report.html")

In [None]:
# profile.to_notebook_iframe()

In [None]:
# with open('dataset_big_patent_v3.json') as f:
#     data = json.load(f)

# df = pd.json_normalize(data, meta=['anchor', "query", "positive", "negative",])

# # Display the DataFrame
# df

In [None]:
df.isna().sum()

In [None]:
# Special display to see better
from IPython.display import display, HTML

display(HTML(df.sample(n=10).to_html()))

In [None]:
# Fine-tune a LLM

In [None]:
# Nom des colonnes à tokeniser
column_names = df.columns.tolist()

# Choses à faire demain

1. jupyter notebook gpu acceleration -> Done for TF (useless) and Torch with Cuda
2. https://www.datacamp.com/fr/tutorial/fine-tuning-large-language-models
3. Savoir quoi tokenizer
4. Supervised learning -> Q&A # https://towardsdatascience.com/fine-tuning-large-language-models-llms-23473d763b91/

In [None]:
import torch
print(torch.__version__)

In [None]:
import torch
print("CUDA disponible :", torch.cuda.is_available())
print("Nombre de GPU :", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Nom du GPU :", torch.cuda.get_device_name(0))
    print("Version CUDA utilisée par PyTorch :", torch.version.cuda)

In [None]:
import logging

# Niveau de log : DEBUG pour tout voir
logging.basicConfig(level=logging.DEBUG)

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:64'
import torch

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)

In [None]:
!nvidia-smi

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict

dataset = load_dataset("json", data_files="dataset_big_patent_v3.json", split="train")

# Split en train (80%) et test (20%)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [9]:
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    quantization_config=bnb_config,
    padding_side="left", add_eos_token=True, add_bos_token=True, use_fast=False, # NOT TESTED
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# A tester avec padding
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True, padding_side="left", add_eos_token=True, add_bos_token=True, use_fast=False)
# tokenizer.pad_token = tokenizer.eos_token

In [3]:
import pandas as pd

pd.DataFrame(dataset)

Unnamed: 0,anchor,query,positive,negative
0,RELATED APPLICATIONS This application claims t...,What are the key advantages and applications o...,The present technology introduces an innovativ...,The invention relates to the design and utilit...
1,RELATED APPLICATIONS This application claims t...,How does a magnetic energy harvester operate w...,The advanced energy accumulation equipment bei...,The invention relates to the design and utilit...
2,RELATED APPLICATIONS This application claims t...,How does an energy harvester operate without a...,The invention relates to the design and utilit...,The present technology introduces an innovativ...
3,BACKGROUND OF THE INVENTION I. Field of the In...,How can buffer blocks for ruminant animals be ...,The innovative technique pertains to mineral s...,The latest invention provides novel systems an...
4,RELATED APPLICATION The present application cl...,What advancements does the described patent pr...,The current text discusses a novel mechanical ...,The present invention addresses various improv...
...,...,...,...,...
494,CROSS-REFERENCE TO RELATED APPLICATION [0001] ...,What is the role of Onjisaponin B in the treat...,Delineation of the effect of a botanical enhan...,This invention pertains to a unique compound f...
495,CROSS-REFERENCE TO RELATED APPLICATION [0001] ...,What is the mechanism by which Onjisaponin B e...,This invention pertains to a unique compound f...,Delineation of the effect of a botanical enhan...
496,RELATED APPLICATIONS [0001] This application i...,What are the advantages of the improved dental...,The present innovations focus on an enhanced s...,[0001] This document covers the detailed aspec...
497,FIELD [0001] The invention refers to a ventric...,What are the benefits and mechanisms of a nove...,The technology relates to an innovation in hem...,The present technology involves an advancement...


In [4]:
def preprocess_function(dataset):
    texts = [f"Context: {c}\nQuestion: {q}\nAnswer: {a}" for c, q, a in zip(dataset["anchor"], dataset["query"], dataset["positive"])]
    # Tokenize
    model_inputs = tokenizer(texts, max_length=384, truncation=True, padding="max_length")
    # labels = input_ids
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

In [10]:
tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./qwen-qa-finetune",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4, # TO FINE TUNE
    fp16=True,
    save_strategy="epoch",
    logging_steps=10,
)

In [13]:
import torch

# https://www.learnpytorch.io/pytorch_cheatsheet/
# Setup device-agnostic code 
if torch.cuda.is_available():
    device = "cuda" # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = "mps" # Apple GPU
else:
    device = "cpu" # Defaults to CPU if NVIDIA GPU/Apple GPU aren't available

In [14]:
# Zero-shot performance

prompt = "Question: How does the crowdsourcing method is used to adjust a video game element ?\nAnswer:" # Expected : A processor retrieves a plurality of received game element feedback data from a plurality of users of a game and causes the game element to be adjusted during execution of the game 
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)



Question: How does the crowdsourcing method is used to adjust a video game element ?
Answer: Crowdsourced software development can be used to create new games. This is done by creating a community of people who want to contribute to the creation of a game. The players are not paid for their contributions, but they are compensated based on how well


In [15]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8, # Rang des matrices LoRA : élevé-> adaptation grande mais aussi mémoire utilisée
    lora_alpha=32, # Facteur de mise à l’échelle pour les matrices LoRA.
    lora_dropout=0.05, # Evite l'overfitting
    target_modules=["q_proj", "v_proj"],  # adapte selon le modèle
)
model = get_peft_model(model, lora_config)

In [16]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return metric.compute(predictions=predictions, references=labels)

In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.3545
20,2.1632
30,2.127
40,2.1853
50,2.1135
60,2.1487
70,2.0251
80,2.2434
90,1.9581
100,2.0695


TrainOutput(global_step=372, training_loss=2.0596542153307187, metrics={'train_runtime': 7081.6916, 'train_samples_per_second': 0.211, 'train_steps_per_second': 0.053, 'total_flos': 1227202636087296.0, 'train_loss': 2.0596542153307187, 'epoch': 2.9779559118236474})

In [20]:
# 6. Sauvegarde du modèle fine-tuned
model.save_pretrained("./qwen-qa-saved")
tokenizer.save_pretrained("./qwen-qa-saved")

('./qwen-qa-saved\\tokenizer_config.json',
 './qwen-qa-saved\\special_tokens_map.json',
 './qwen-qa-saved\\vocab.json',
 './qwen-qa-saved\\merges.txt',
 './qwen-qa-saved\\added_tokens.json',
 './qwen-qa-saved\\tokenizer.json')

In [18]:
import evaluate

trainer.evaluate()

{'eval_loss': 1.9980943202972412,
 'eval_runtime': 237.5271,
 'eval_samples_per_second': 0.421,
 'eval_steps_per_second': 0.421,
 'epoch': 2.9779559118236474}

In [19]:
# Fine-tuned performance after

prompt = "Question: How does the crowdsourcing method is used to adjust a video game element ?\nAnswer:" # Expected : A processor retrieves a plurality of received game element feedback data from a plurality of users of a game and causes the game element to be adjusted during execution of the game
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Question: How does the crowdsourcing method is used to adjust a video game element ?
Answer: The crowdsourcing method is used in order to adapt a video game element. Crowdsourcing involves bringing together an audience of people who are interested and knowledgeable about a particular topic, such as the video game itself. In this case, the video game would
