<a href="https://colab.research.google.com/github/Miguel9712/Estadia/blob/main/CAREQA_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install unsloth



In [2]:
!pip install unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -y


In [3]:
import torch
if torch.cuda.get_device_capability()[0] >= 8:
  !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [4]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 1024 #Choose any! Unsloth auto supports RoPE Scaling internally.
dtype = None
load_in_4bit = True #We use this for 4bit quantization to reduce memory usage. Can be false.


fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.11: Fast Gemma2 patching. Transformers: 4.54.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en")

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank de LoRA. Un valor común es 16 o 32.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",], # Capas a las que se aplicará LoRA. Estas son comunes para modelos como Llama y Gemma.
    lora_alpha = 16, # Escala para los pesos LoRA.
    lora_dropout = 0, # Dropout para las capas LoRA.
    bias = "none",    # Tipo de bias para las capas LoRA.
    use_gradient_checkpointing = "unsloth", # Ayuda a reducir el uso de memoria durante el entrenamiento.
    random_state = 3407, # Semilla para la reproducibilidad.
    use_rslora = False,  # Usar o no rslora.
    loftq_config = None, # Configuración de LoFTQ si se usa.
)

Unsloth 2025.7.11 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Define los argumentos de entrenamiento
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = ds["test"], # Usamos el split de "test" como ejemplo, cámbialo si tienes un split de entrenamiento diferente
    # dataset_text_field = "text", # Reemplaza "text" con el nombre de la columna en tu dataset que contiene el texto para entrenar
    formatting_func = formatting_func, # Agregamos la función de formato aquí
    max_seq_length = max_seq_length,
    dataset_num_proc = 2, # Número de procesos para procesar el dataset
    packing = False, # Empaquetar secuencias cortas para entrenar más rápido. Puede aumentar el uso de memoria.
    args = TrainingArguments(
        per_device_train_batch_size = 2, # Tamaño del lote por dispositivo (GPU)
        per_device_eval_batch_size = 1, # Reducimos el tamaño de lote de evaluación
        gradient_accumulation_steps = 4, # Pasos de acumulación de gradiente
        warmup_steps = 5, # Pasos de calentamiento del scheduler de tasa de aprendizaje
        max_steps = 60, # Número máximo de pasos de entrenamiento (ajusta según tu necesidad)
        learning_rate = 2e-4, # Tasa de aprendizaje
        fp16 = not torch.cuda.is_bf16_supported(), # Usar FP16 si bfloat16 no es soportado
        bf16 = torch.cuda.is_bf16_supported(), # Usar bfloat16 si es soportado
        logging_steps = 1, # Registrar métricas cada 1 paso
        optim = "adamw_8bit", # Optimizador
        weight_decay = 0.01, # Regularización L2
        lr_scheduler_type = "linear", # Tipo de scheduler de tasa de aprendizaje
        seed = 3407, # Semilla para la reproducibilidad
        output_dir = "outputs", # Directorio de salida para los checkpoints y logs
        remove_unused_columns=False, # Añadimos esto para intentar evitar que se eliminen columnas
    ),
)

In [9]:
def formatting_func(examples):
    text = []
    for i in range(len(examples["question"])):
        question = examples["question"][i]
        op1 = examples["op1"][i]
        op2 = examples["op2"][i]
        op3 = examples["op3"][i]
        op4 = examples["op4"][i]
        # cop = examples["cop"][i] # Puedes incluir la respuesta correcta si el objetivo es que el modelo la prediga

        # Formato de ejemplo: Puedes ajustarlo según la tarea y el modelo.
        # Si es para generar la respuesta correcta, podrías incluirla.
        formatted_text = f"Pregunta: {question}\nOpciones:\n1. {op1}\n2. {op2}\n3. {op3}\n4. {op4}"
        text.append(formatted_text)
    return text # Modificado para retornar la lista directamente

In [20]:
# Inicia el entrenamiento
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,621 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 54,018,048 of 9,295,724,032 (0.58% trained)


Step,Training Loss
1,2.1848
2,2.0961
3,1.9178
4,1.8226
5,1.5031
6,1.2977
7,1.0369
8,0.8609
9,0.7752
10,0.7238


TrainOutput(global_step=200, training_loss=0.8889316090941429, metrics={'train_runtime': 1448.4469, 'train_samples_per_second': 1.105, 'train_steps_per_second': 0.138, 'total_flos': 1.37260378784256e+16, 'train_loss': 0.8889316090941429})

In [21]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# EOS_TOKEN = tokenizer.eos_token #EOS: End Of Sequence, SOS: Start Of Sequence, UNK: Unkwon Token
# Using the tokenizer's EOS token is generally recommended
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    instructions = ["Responde la siguiente pregunta de opción múltiple seleccionando el número de la opción correcta."] * len(examples["question"])
    # Combine question and options for the Input section
    inputs = [
        f"Pregunta: {q}\nOpciones:\n1. {op1}\n2. {op2}\n3. {op3}\n4. {op4}"
        for q, op1, op2, op3, op4 in zip(
            examples["question"],
            examples["op1"],
            examples["op2"],
            examples["op3"],
            examples["op4"]
        )
    ]
    # Use the correct option (cop) as the Response
    outputs = [str(cop) for cop in examples["cop"]] # Ensure cop is a string


    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Format the prompt and add the EOS token
        text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

# Use your loaded dataset instead of loading a new one
# Assuming ds["test"] is the split you want to use for training
dataset = ds["test"].map(formatting_prompts_func, batched=True)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text", # Use the 'text' column created by formatting_prompts_func
    max_seq_length = max_seq_length, # Use the max_seq_length defined earlier
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 200, # Aumentamos el número de pasos de entrenamiento
        learning_rate = 2e-4,
        fp16 = False, # Desactivar FP16
        bf16 = False, # Desactivar BF16
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407, # Using the same seed as before for consistency
        output_dir = "outputs",
        report_to = "none", # Keep report_to none to avoid external logging issues
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/5621 [00:00<?, ? examples/s]

In [22]:
# Re-enable native 2x faster inference after potential training/evaluation modes
FastLanguageModel.for_inference(model)

# --- Add this line to ensure model is on device ---
model.to(model.device)
# --------------------------------------------------

# Define your question and options for inference
# Replace these with the actual question and options you want to ask
inference_question = "In relation to iron metabolism and its control mediated by hepcidin, it is true that:"
inference_op1 = "The drop in partial oxygen pressure promotes the activation of the hypoxia-inducible factor (HIF), which increases the expression of hepcidin."
inference_op2 = "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin."
inference_op3 = "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1)."
inference_op4 = "In hereditary hemochromatosis type 1, mutations in the human hemochromatosis protein (HFE) cause an increase in the production of hepcidin."

# Adapt the alpaca_prompt format to your dataset structure for inference
# The Instruction and Input sections will be filled, Response will be empty for generation
inference_prompt = alpaca_prompt.format(
    "Responde la siguiente pregunta de opción múltiple seleccionando el número de la opción correcta.", # Instruction
    f"Pregunta: {inference_question}\nOpciones:\n1. {inference_op1}\n2. {inference_op2}\n3. {inference_op3}\n4. {inference_op4}", # Input
    "", # Response (empty for generation)
)

# Tokenize the input prompt
inputs = tokenizer([inference_prompt], return_tensors="pt").to(model.device)

# --- Example 1: Direct generation ---
print("--- Direct Generation ---")
outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True, pad_token_id=tokenizer.eos_token_id) # Reduced max_new_tokens for a concise answer (option number)
# Decode only the newly generated tokens
generated_tokens = outputs[0][inputs.input_ids.shape[-1]:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(f"Respuesta generada: {response}")


print("\n--- Example 2: Generation with TextStreamer ---")
# --- Example 2: Generation with TextStreamer ---
# You might need to re-tokenize if the previous generation consumed tokens differently
inputs_stream = tokenizer([inference_prompt], return_tensors="pt").to(model.device)
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
# Generate and stream the response
_ = model.generate(**inputs_stream, streamer=text_streamer, max_new_tokens=10, use_cache=True, pad_token_id=tokenizer.eos_token_id) # Reduced max_new_tokens


--- Direct Generation ---
Respuesta generada: 2

--- Example 2: Generation with TextStreamer ---
<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Responde la siguiente pregunta de opción múltiple seleccionando el número de la opción correcta.

### Input:
Pregunta: In relation to iron metabolism and its control mediated by hepcidin, it is true that:
Opciones:
1. The drop in partial oxygen pressure promotes the activation of the hypoxia-inducible factor (HIF), which increases the expression of hepcidin.
2. The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.
3. Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).
4. In hereditary hemochromatosis type 1, mutations in the human hemochromatosis protein (HFE