* https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html
* https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/

In [None]:
# Install the fine-tuning dependencies. peft and trl are pinned to exact
# versions; torch, bitsandbytes and accelerate are left unpinned.
# NOTE(review): transformers is upgraded to the newest release on the next
# line while peft/trl stay pinned to older versions — confirm that this
# combination is still compatible before re-running.
!pip install -q  torch peft==0.4.0 bitsandbytes  trl==0.4.7 accelerate
!pip install --upgrade transformers

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

import json
from google.colab import drive
import os
import io
from datasets import Dataset

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Path to your JSON file in Google Drive
# NOTE(review): the path below is under /content, not under the
# /content/drive mount point used above — confirm where the file really lives.
file_path = '/content/finetuning_dataset.json'  # Change this to the path of your file



def _make_r_io_base(f, mode: str):
    """Return a stream for reading from *f*.

    Args:
        f: Either an ``io.IOBase`` instance, which is handed back untouched,
            or anything ``open`` accepts as a file name.
        mode: Mode forwarded to ``open`` when *f* is not already a stream.

    Returns:
        The stream itself, or a newly opened file object.
    """
    if isinstance(f, io.IOBase):
        return f
    return open(f, mode=mode)


def _make_w_io_base(f, mode: str):
    """Return a stream for writing to *f*, creating parent folders if needed.

    Args:
        f: Either an ``io.IOBase`` instance, which is handed back untouched,
            or a file name to open.
        mode: Mode forwarded to ``open`` when *f* is not already a stream.

    Returns:
        The stream itself, or a newly opened file object.
    """
    if isinstance(f, io.IOBase):
        return f

    parent_dir = os.path.dirname(f)
    # A bare file name has no directory component, so there is nothing to create.
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)
    return open(f, mode=mode)


def jdump(obj, f, mode="w", indent=4, default=str):
    """Write *obj* to *f*: dicts and lists as JSON, strings verbatim.

    Args:
        obj: A dict or list (serialized with ``json.dump``) or a str
            (written unchanged).
        f: A string path to the location on disk, or an already-open
            ``io.IOBase`` stream.
        mode: Mode for opening the file when *f* is a path.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If *obj* is not a dict, list or str. The destination is
            not opened in that case.
    """
    # Validate before opening: opening in "w" mode truncates an existing file
    # and may create directories, which must not happen for rejected input.
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")

    f = _make_w_io_base(f, mode)
    # try/finally releases the handle even when serialization or the write fails.
    try:
        if isinstance(obj, str):
            f.write(obj)
        else:
            json.dump(obj, f, indent=indent, default=default)
    finally:
        f.close()


def jload(f, mode="r"):
    """Load a .json file and return the decoded object.

    Args:
        f: A string path to the file on disk, or an already-open
            ``io.IOBase`` stream.
        mode: Mode for opening the file when *f* is a path.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict, a list, ...).
    """
    f = _make_r_io_base(f, mode)
    # try/finally closes the handle even when the content is not valid JSON.
    try:
        return json.load(f)
    finally:
        f.close()

# Read every record from the JSON file.
data = jload(file_path)

# Only the first quarter of the records is kept.
quarter_length = len(data) // 4
dataset = data[:quarter_length]

# Move each record's 's' value under the 'text' key (records are changed in place).
for entry in dataset:
    entry.update(text=entry.pop('s'))

# Build a Hugging Face Dataset with a single 'text' column from the kept records.
hf_dataset = Dataset.from_dict({'text': [row['text'] for row in dataset]})

Mounted at /content/drive
{'text': '<question:>  I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc! <answer> Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional vertigo (BPPV), a type of peripheral vertigo. In this condition, the most common symptom is dizziness or giddiness, which is made worse with movements. Accompanying nausea and vomiting are common. The condition is due to problem in the ear, and imp

In [None]:
# Dataset
training_data = hf_dataset

# Model and tokenizer names
base_model_name = "epfl-llm/meditron-7b"
refined_model = "refined_model"  # You can give it your own name

# Hugging Face access token, read from the HF_TOKEN environment variable so
# that no credential is stored in the notebook. When the variable is unset
# this is None and no explicit token is passed.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")

# Tokenizer
meditron_tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    token=hf_token,
)
# Setting a padding token if it's not already set in the tokenizer
# NOTE(review): '[PAD]' is assigned by name only; nothing here adds it to the
# vocabulary — confirm which id it resolves to if this branch is ever taken.
if meditron_tokenizer.pad_token is None:
    meditron_tokenizer.pad_token = '[PAD]'

meditron_tokenizer.padding_side = "right"  # Fix for fp16

# 4-bit loading is requested through an explicit BitsAndBytesConfig rather
# than the bare `load_in_4bit` keyword on `from_pretrained`.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    local_files_only=False,
    trust_remote_code=True,
    quantization_config=bnb_config,
    token=hf_token,
    device_map={"": 0},  # place the whole model on device 0
)

# Configuring the base model: disabling cache to save memory and setting pretraining_tp to 1
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
# LoRA adapter settings
peft_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Hyper-parameters handed to the trainer
train_params = TrainingArguments(
    # outputs, checkpointing and logging
    output_dir="./results_modified",
    save_steps=25,
    logging_steps=25,
    report_to="tensorboard",
    # run length and batching
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # optimizer and learning-rate schedule
    # NOTE(review): a warmup ratio is set together with the "constant"
    # scheduler — confirm that the warmup is actually applied.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # both reduced-precision flags are switched off
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer: base model + LoRA config + the 'text' column
fine_tuning = SFTTrainer(
    model=base_model,
    tokenizer=meditron_tokenizer,
    peft_config=peft_parameters,
    train_dataset=training_data,
    dataset_text_field="text",
    args=train_params,
)

# Run the training loop
fine_tuning.train()

In [None]:
# Save Model
# Saves the model held by the trainer under the name stored in `refined_model`.
fine_tuning.model.save_pretrained(refined_model)

In [None]:
# TEST
# Reload a model and a tokenizer for a quick manual generation check.
# NOTE(review): the model comes from a hard-coded checkpoint directory under
# the training output folder, not from `refined_model` — confirm that this
# checkpoint exists and is the intended one.
model_path = "/content/results_modified/checkpoint-1750"
tokenizer_path = "epfl-llm/meditron-7b"  # Or the local path if the tokenizer was modified and saved

model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Text-generation pipeline built from the reloaded model and tokenizer.
# NOTE(review): max_length=200 presumably counts the prompt tokens as well —
# confirm it leaves enough room for the answer with long questions.
text_gen = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

In [None]:
query = "My mother is 80 yrs. detected with vulva cancer. vulva is swollen red ,around discolourisation, white growth now extending to rectum. biopsy shows keratinizing squamous carcinoma vulva(welldifferentiated,invasive).she is little asthmatic, has cervical spondeolytis . "

# The prompt must use the same markers as the fine-tuning records. The sample
# record printed after the data-loading cell starts with "<question:>  " (colon
# inside the tag) and puts " <answer>" right after the question, so the former
# "<question>: " prefix did not match what the model was trained on.
# NOTE(review): the exact spacing mirrors that single sample — confirm against
# the full dataset.
prompt = f"<question:>  {query.strip()} <answer>"

output = text_gen(prompt)
output[0]["generated_text"]

#END TEST