# Generación y enternamiento del modelo

## Librerías

In [None]:
!pip install transformers torch unsloth trl datasets
!pip install gdown

from IPython.display import clear_output
clear_output()

In [None]:
import torch
import gdown
import json
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
import transformers
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Obtención y pre-procesamiento de datos

In [None]:
%%capture
files_to_download = {
    "formatted_train.json": "1Fm7aPdCv6bguP7UgoCqdUVcm8xpDo18_",
    "formatted_test.json": "1a4YeF--Sks7WA1ZIQL2zDZx4teKk7p4m",
    "formatted_validation.json": "1PC9OhZhNZt8lFO9wifhydy0BrIYe8Dkm"
}

for destination, file_id in files_to_download.items():
    gdown.download(f"https://drive.google.com/uc?id={file_id}", destination, quiet=False)

In [None]:
# RUTAS
ruta_train = "/content/formatted_train.json"
ruta_test = "/content/formatted_test.json"
ruta_validation = "/content/formatted_validation.json"

# abrimos y leemos archivos
with open(ruta_train, 'r') as archivo:
    datos_train = json.load(archivo)

with open(ruta_test, 'r') as archivo:
    datos_test = json.load(archivo)

with open(ruta_validation, 'r') as archivo:
    datos_validation = json.load(archivo)

In [None]:
dataset = []
for conversacion in datos_train:
    diccionario = {
        "instructions": "",
        "conversation": []
    }
    for i in range(len(conversacion) - 1):
        if conversacion[i]["role"] == "system":
            diccionario["instructions"] = conversacion[i]["content"]
        else:
            role = "user" if conversacion[i]["role"] == "user" else "assistant"
            diccionario["conversation"].append(f"{role}: " + conversacion[i]["content"])
    dataset.append(diccionario)

print(dataset[:2])  # Muestra los primeros pares para verificar

[{'instructions': "You are a movie recommendation assistant. Use the user's movie history and preferences to suggest movies that align with their interests.\n\nSeen movies and their corresponding evaluation and details:\nMovie: Creature from the Black Lagoon (1954)\nPositive comments: Lots of action, suspense, and a brilliant 50s vibe. Julie Adams was wonderful to watch.\nNegative comments: Light on dialogue.\nDetails:\nTitle: Creature from the Black Lagoon (1954)\nGenre: Horror, Sci-Fi\nDirector: Jack Arnold\nCast: Richard Carlson, Julie Adams, Richard Denning\nAbstract: [Like] \n- The movie was one of the last truly great monster movies produced by Universal Studios\n- The change in setting to the Amazon River in the jungles of Brazil created an atmosphere of fear and dread\n- The underwater photography was stunning and breathtaking\n- The movie was a completely original monster design, not based on a novel or folklore\n\n[Dislike]\n- Certain aspects of the picture have not aged well






## Modelo

In [None]:
model_name = "unsloth/gemma-2-9b-bnb-4bit"  # Nombre del modelo de Gemma

max_seq = 2048  # Longitud máxima de la secuencia de entrada

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_length=max_seq,
    dtype=None,
    load_in_4bit=True  # Para cargar el modelo en formato de 4 bits y ahorrar memoria
)

==((====))==  Unsloth 2024.12.1: Fast Gemma2 patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.1 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [None]:
multi_turn_with_instruction_prompt = """Below is an instruction followed by a multi-turn conversation between a user and an AI. The task is to continue the conversation appropriately based on the instruction provided.

### Instruction:
{}

### Conversation:
{}

Response:
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def format_conversation_with_instructions(examples):
    instructions = examples["instructions"]  # Columna con las instrucciones
    conversations = examples["conversation"]  # Lista de turnos de diálogo
    formatted_texts = []

    for instruction, conversation in zip(instructions, conversations):
        # Concatenar todo el historial de conversación
        history = "\n".join([turn for turn in conversation])

        # Formatear la conversación con la instrucción y el historial
        text = multi_turn_with_instruction_prompt.format(instruction, history) + EOS_TOKEN
        formatted_texts.append(text)

    return {"text": formatted_texts}

dataset_train = Dataset.from_list(dataset)
dataset_train = dataset_train.map(format_conversation_with_instructions, batched = True,)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = "text",
    max_seq_length = max_seq,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = transformers.TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 200,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)


Map (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 200
 "-____-"     Number of trainable parameters = 54,018,048


Step,Training Loss
1,1.6487
2,1.6517
3,1.5437
4,1.655
5,1.62
6,1.4072
7,1.3289
8,1.2615
9,1.2168
10,1.1979


In [None]:
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(trainer_stats.metrics)

7429.3404 seconds used for training.
123.82 minutes used for training.
{'train_runtime': 7429.3404, 'train_samples_per_second': 0.215, 'train_steps_per_second': 0.027, 'total_flos': 8.838331768429363e+16, 'train_loss': 1.0795770359039307, 'epoch': 0.032}


## Guardando el modelo

In [None]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [None]:
!zip -r lora_model.zip lora_model/
!zip -r train_output.zip train_output/

  adding: lora_model/ (stored 0%)
  adding: lora_model/tokenizer.json (deflated 84%)
  adding: lora_model/adapter_config.json (deflated 54%)
  adding: lora_model/README.md (deflated 66%)
  adding: lora_model/tokenizer.model (deflated 51%)
  adding: lora_model/tokenizer_config.json (deflated 96%)
  adding: lora_model/adapter_model.safetensors (deflated 7%)
  adding: lora_model/special_tokens_map.json (deflated 76%)

zip error: Nothing to do! (try: zip -r train_output.zip . -i train_output/)


# Obtención de Métricas
Se debe reiniciar la sesión del nb para vaciar la RAM de la GPU

## Librerías

In [None]:
!pip install transformers torch unsloth trl datasets
!pip install gdown
!pip install evaluate bert_score

from IPython.display import clear_output
clear_output()

In [None]:
import torch
import gdown
import json
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
import transformers
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

## Carga del modelo entrenado

In [None]:
!unzip -q lora_model.zip -d ./

In [None]:
max_seq = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model",
    max_seq_length = max_seq,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.12.2: Fast Gemma2 patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.2 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

## Carga y pre-procesamiento de set de testeo

In [None]:
%%capture
files_to_download = {
    "formatted_train.json": "1Fm7aPdCv6bguP7UgoCqdUVcm8xpDo18_",
    "formatted_test.json": "1a4YeF--Sks7WA1ZIQL2zDZx4teKk7p4m",
    "formatted_validation.json": "1PC9OhZhNZt8lFO9wifhydy0BrIYe8Dkm"
}

for destination, file_id in files_to_download.items():
    gdown.download(f"https://drive.google.com/uc?id={file_id}", destination, quiet=False)

In [None]:
with open("formatted_test.json", "r") as f:
    data = json.load(f)

# dataset = Dataset.from_dict({"conversations": data})
print(data)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
dataset = []
for conversacion in data:
    diccionario = {
        "instructions": "",
        "conversation": []
    }
    for i in range(len(conversacion) - 1):
        if conversacion[i]["role"] == "system":
            diccionario["instructions"] = conversacion[i]["content"]
        else:
            role = "user" if conversacion[i]["role"] == "user" else "assistant"
            diccionario["conversation"].append(f"{role}: " + conversacion[i]["content"])


    dataset.append(diccionario)

print(dataset[:2])  # Muestra los primeros pares para verificar

[{'instructions': "You are a movie recommendation assistant. Use the user's movie history and preferences to suggest movies that align with their interests.\n\nSeen movies and their corresponding evaluation and details:\nMovie: Death Race  (2008)\nPositive comments: - The movie delivers exactly what the target audience wants- Embraces its ridiculous and cheesy elements- Entertaining and fun for those who enjoy this type of movie\nNegative comments: - Only one race track throughout the entire film\nDetails:\nTitle: Death Race  (2008)\nGenre: Action, Sci-Fi, Thriller\nDirector: Paul W.S. Anderson\nCast: Jason Statham, Joan Allen, Ian McShane\nAbstract: [Like]\n- Honest to God action movie without bigger-than-life pretensions\n- Follows a tested and tried recipe\n- Influences from racing video games are well mastered\n- Nervous and thrilling narrative\n- Professional effects and balanced violence\n- Admirable cinematography with chromatic and plastic values\n\n[Dislike]\n- None\n\nMovie: 

In [None]:
multi_turn_with_instruction_prompt = """Below is an instruction followed by a multi-turn conversation between a user and an AI. The task is to continue the conversation appropriately based on the instruction provided.

### Instruction:
{}

### Conversation:
{}

Response:
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def format_conversation_with_instructions(examples):
    instructions = examples["instructions"]  # Columna con las instrucciones
    conversations = examples["conversation"]  # Lista de turnos de diálogo
    formatted_texts = []

    for instruction, conversation in zip(instructions, conversations):
        # Concatenar todo el historial de conversación
        history = "\n".join([turn for turn in conversation])

        # Formatear la conversación con la instrucción y el historial
        text = multi_turn_with_instruction_prompt.format(instruction, history) + EOS_TOKEN
        formatted_texts.append(text)

    return {"text": formatted_texts}

dataset_test = Dataset.from_list(dataset)
dataset_test = dataset_test.map(format_conversation_with_instructions, batched = True,)

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

In [None]:
%%capture
!pip install tqdm

In [None]:
print(dataset_test)

Dataset({
    features: ['instructions', 'conversation', 'text'],
    num_rows: 2277
})


## Medición

In [None]:
percentage = 0.2
sample_size = int(len(dataset_test) * percentage)
subset_test_dataset = dataset_test.shuffle(seed=42).select(range(sample_size))

print(f"Usando {sample_size} ejemplos para la evaluación.")

Usando 455 ejemplos para la evaluación.


In [None]:
from tqdm import tqdm

generation_args = {
    "max_new_tokens": 100,
    "temperature": 0.3,
    "use_cache": True,
    "top_p": 0.9,
    "top_k": 50,
}

# Listas para predicciones y referencias
predictions = []
references = []

for example in tqdm(subset_test_dataset):

    # Crear la entrada para el modelo

    inputs = tokenizer(
    [
        multi_turn_with_instruction_prompt.format(
            example["instructions"], # instruction
            example["conversation"][-12:-2],
        )
    ], return_tensors = "pt").to("cuda")


    # Generar respuesta
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_args)

    # Decodificar respuesta generada
    generated_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    if "assistant" in generated_response:
        generated_response = generated_response.split("assistant: ")[-1].strip()

    if "\']\n\nResponse:" in generated_response:
        generated_response = generated_response.split("\']\n\nResponse:")[0].strip()

    predictions.append(generated_response)

    # Última respuesta esperada (referencia)
    references.append(example["conversation"][-1].split("assistant: ")[-1].strip())

100%|██████████| 455/455 [10:33<00:00,  1.39s/it]


### BLEU

In [None]:
from nltk.translate.bleu_score import corpus_bleu

print(len(references))
print(len(predictions))
bleu_score = corpus_bleu(
    [[ref] for ref in references],  # Referencias esperadas
    predictions  # Respuestas generadas
)
print(f"BLEU Score: {bleu_score}")

455
455
BLEU Score: 0.47419060471318714


### ROUGE

In [None]:
%%capture
!pip install rouge_score

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]

rouge1 = sum([score["rouge1"].fmeasure for score in rouge_scores]) / len(rouge_scores)
rouge2 = sum([score["rouge2"].fmeasure for score in rouge_scores]) / len(rouge_scores)
rougeL = sum([score["rougeL"].fmeasure for score in rouge_scores]) / len(rouge_scores)

print(f"ROUGE-1: {rouge1}")
print(f"ROUGE-2: {rouge2}")
print(f"ROUGE-L: {rougeL}")

ROUGE-1: 0.40776916044428474
ROUGE-2: 0.16173604267308014
ROUGE-L: 0.29841473133895424


In [None]:
from collections import Counter

from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
import numpy as np


def distinct(seqs):
    """ Calculate intra/inter distinct 1/2. """
    """ Recuperado de https://github.com/PaddlePaddle/models/blob/release/1.6/PaddleNLP/Research/Dialogue-PLATO/plato/metrics/metrics.py"""
    batch_size = len(seqs)
    intra_dist1, intra_dist2 = [], []
    unigrams_all, bigrams_all = Counter(), Counter()
    for seq in seqs:
        unigrams = Counter(seq)
        bigrams = Counter(zip(seq, seq[1:]))
        intra_dist1.append((len(unigrams)+1e-12) / (len(seq)+1e-5))
        intra_dist2.append((len(bigrams)+1e-12) / (max(0, len(seq)-1)+1e-5))

        unigrams_all.update(unigrams)
        bigrams_all.update(bigrams)

    inter_dist1 = (len(unigrams_all)+1e-12) / (sum(unigrams_all.values())+1e-5)
    inter_dist2 = (len(bigrams_all)+1e-12) / (sum(bigrams_all.values())+1e-5)
    intra_dist1 = np.average(intra_dist1)
    intra_dist2 = np.average(intra_dist2)
    return intra_dist1, intra_dist2, inter_dist1, inter_dist2

### Distinct

In [None]:
import nltk
nltk.download('punkt_tab')

tokenized_text = [nltk.word_tokenize(text) for text in predictions]
resultado = distinct(tokenized_text)
print(resultado)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


(0.8208746454816914, 0.9873834492439653, 0.11170435905888247, 0.3630365596666008)


### BERTScore

In [None]:
from evaluate import load
bertscore = load("bertscore")
results = bertscore.compute(predictions=predictions, references=references, lang="en", model_type="distilbert-base-uncased")

In [None]:
from statistics import mean
print(results.keys())
print(f"F1: {mean(results['f1'])}")
print(f"Precision: {mean(results['precision'])}")
print(f"Recall: {mean(results['recall'])}")

dict_keys(['precision', 'recall', 'f1', 'hashcode'])
F1: 0.8366881264435067
Precision: 0.8355197403457139
Recall: 0.8382421057302873


---

In [None]:
predictions[0]

'How about "Pirates of the Caribbean: Dead Man\\\'s Chest"? It has fascinating new characters, consistently good acting, and an outstanding music score. The movie also has a right mix of action and lulls, as well as impressive special effects and stunts. Plus, it looks absolutely stunning and it\\\'s very entertaining.'

In [None]:
references[0]

'How about "The Curious Case of Benjamin Button"? It has vibrant characters, powerful storytelling, and heartwarming moments, along with subtle messages about kindness and strength. It\'s definitely a movie with deep emotional impact and character development that I think you would appreciate.'

# Métricas Zeroshot

In [None]:
!pip install transformers torch unsloth trl datasets
!pip install gdown
!pip install evaluate bert_score

from IPython.display import clear_output
clear_output()

In [None]:
import torch
import gdown
import json
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
import transformers
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
model_name = "unsloth/gemma-2-9b-bnb-4bit"  # Nombre del modelo de Gemma

max_seq = 2048  # Longitud máxima de la secuencia de entrada

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_length=max_seq,
    dtype=None,
    load_in_4bit=True  # Para cargar el modelo en formato de 4 bits y ahorrar memoria
)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.12.4: Fast Gemma2 patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (pre_feedforward_lay

In [None]:
%%capture
files_to_download = {
    "formatted_train.json": "1Fm7aPdCv6bguP7UgoCqdUVcm8xpDo18_",
    "formatted_test.json": "1a4YeF--Sks7WA1ZIQL2zDZx4teKk7p4m",
    "formatted_validation.json": "1PC9OhZhNZt8lFO9wifhydy0BrIYe8Dkm"
}

for destination, file_id in files_to_download.items():
    gdown.download(f"https://drive.google.com/uc?id={file_id}", destination, quiet=False)

In [None]:
with open("formatted_test.json", "r") as f:
    data = json.load(f)

# dataset = Dataset.from_dict({"conversations": data})
print(data)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
dataset = []
for conversacion in data:
    diccionario = {
        "instructions": "",
        "conversation": []
    }
    for i in range(len(conversacion) - 1):
        if conversacion[i]["role"] == "system":
            diccionario["instructions"] = conversacion[i]["content"]
        else:
            role = "user" if conversacion[i]["role"] == "user" else "assistant"
            diccionario["conversation"].append(f"{role}: " + conversacion[i]["content"])


    dataset.append(diccionario)

print(dataset[:2])  # Muestra los primeros pares para verificar

[{'instructions': "You are a movie recommendation assistant. Use the user's movie history and preferences to suggest movies that align with their interests.\n\nSeen movies and their corresponding evaluation and details:\nMovie: Death Race  (2008)\nPositive comments: - The movie delivers exactly what the target audience wants- Embraces its ridiculous and cheesy elements- Entertaining and fun for those who enjoy this type of movie\nNegative comments: - Only one race track throughout the entire film\nDetails:\nTitle: Death Race  (2008)\nGenre: Action, Sci-Fi, Thriller\nDirector: Paul W.S. Anderson\nCast: Jason Statham, Joan Allen, Ian McShane\nAbstract: [Like]\n- Honest to God action movie without bigger-than-life pretensions\n- Follows a tested and tried recipe\n- Influences from racing video games are well mastered\n- Nervous and thrilling narrative\n- Professional effects and balanced violence\n- Admirable cinematography with chromatic and plastic values\n\n[Dislike]\n- None\n\nMovie: 

In [None]:
multi_turn_with_instruction_prompt = """Below is an instruction followed by a multi-turn conversation between a user and an AI. The task is to continue the conversation appropriately based on the instruction provided.

### Instruction:
{}

### Conversation:
{}

Response:
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def format_conversation_with_instructions(examples):
    instructions = examples["instructions"]  # Columna con las instrucciones
    conversations = examples["conversation"]  # Lista de turnos de diálogo
    formatted_texts = []

    for instruction, conversation in zip(instructions, conversations):
        # Concatenar todo el historial de conversación
        history = "\n".join([turn for turn in conversation])

        # Formatear la conversación con la instrucción y el historial
        text = multi_turn_with_instruction_prompt.format(instruction, history) + EOS_TOKEN
        formatted_texts.append(text)

    return {"text": formatted_texts}

dataset_test = Dataset.from_list(dataset)
dataset_test = dataset_test.map(format_conversation_with_instructions, batched = True,)

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

In [None]:
%%capture
!pip install tqdm

In [None]:
print(dataset_test)

Dataset({
    features: ['instructions', 'conversation', 'text'],
    num_rows: 2277
})


In [None]:
percentage = 0.2
sample_size = int(len(dataset_test) * percentage)
subset_test_dataset = dataset_test.shuffle(seed=42).select(range(sample_size))

print(f"Usando {sample_size} ejemplos para la evaluación.")

Usando 455 ejemplos para la evaluación.


In [None]:
from tqdm import tqdm

generation_args = {
    "max_new_tokens": 100,
    "temperature": 0.3,
    "use_cache": True,
    "top_p": 0.9,
    "top_k": 50,
}

# Listas para predicciones y referencias
predictions = []
references = []

for example in tqdm(subset_test_dataset):

    # Crear la entrada para el modelo

    inputs = tokenizer(
    [
        multi_turn_with_instruction_prompt.format(
            example["instructions"], # instruction
            example["conversation"][-12:-2],
        )
    ], return_tensors = "pt").to("cuda")


    # Generar respuesta
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_args)

    # Decodificar respuesta generada
    generated_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    if "assistant" in generated_response:
        generated_response = generated_response.split("assistant: ")[-1].strip()

    if "\']\n\nResponse:" in generated_response:
        generated_response = generated_response.split("\']\n\nResponse:")[0].strip()

    predictions.append(generated_response)

    # Última respuesta esperada (referencia)
    references.append(example["conversation"][-1].split("assistant: ")[-1].strip())

100%|██████████| 455/455 [1:10:02<00:00,  9.24s/it]


In [None]:
from nltk.translate.bleu_score import corpus_bleu

print(len(references))
print(len(predictions))
bleu_score = corpus_bleu(
    [[ref] for ref in references],  # Referencias esperadas
    predictions  # Respuestas generadas
)
print(f"BLEU Score: {bleu_score}")

455
455
BLEU Score: 0.44002352278661394


In [None]:
%%capture
!pip install rouge_score

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]

rouge1 = sum([score["rouge1"].fmeasure for score in rouge_scores]) / len(rouge_scores)
rouge2 = sum([score["rouge2"].fmeasure for score in rouge_scores]) / len(rouge_scores)
rougeL = sum([score["rougeL"].fmeasure for score in rouge_scores]) / len(rouge_scores)

print(f"ROUGE-1: {rouge1}")
print(f"ROUGE-2: {rouge2}")
print(f"ROUGE-L: {rougeL}")

ROUGE-1: 0.3707030860551115
ROUGE-2: 0.14187971784423734
ROUGE-L: 0.26611283522413376


In [None]:
from collections import Counter

from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
import numpy as np


def distinct(seqs):
    """ Calculate intra/inter distinct 1/2. """
    """ Recuperado de https://github.com/PaddlePaddle/models/blob/release/1.6/PaddleNLP/Research/Dialogue-PLATO/plato/metrics/metrics.py"""
    batch_size = len(seqs)
    intra_dist1, intra_dist2 = [], []
    unigrams_all, bigrams_all = Counter(), Counter()
    for seq in seqs:
        unigrams = Counter(seq)
        bigrams = Counter(zip(seq, seq[1:]))
        intra_dist1.append((len(unigrams)+1e-12) / (len(seq)+1e-5))
        intra_dist2.append((len(bigrams)+1e-12) / (max(0, len(seq)-1)+1e-5))

        unigrams_all.update(unigrams)
        bigrams_all.update(bigrams)

    inter_dist1 = (len(unigrams_all)+1e-12) / (sum(unigrams_all.values())+1e-5)
    inter_dist2 = (len(bigrams_all)+1e-12) / (sum(bigrams_all.values())+1e-5)
    intra_dist1 = np.average(intra_dist1)
    intra_dist2 = np.average(intra_dist2)
    return intra_dist1, intra_dist2, inter_dist1, inter_dist2

In [None]:
import nltk
nltk.download('punkt_tab')

tokenized_text = [nltk.word_tokenize(text) for text in predictions]
resultado = distinct(tokenized_text)
print(resultado)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


(0.8184753938208926, 0.9826326491458297, 0.10768999130210792, 0.34409466898587066)


In [None]:
from evaluate import load
bertscore = load("bertscore")
results = bertscore.compute(predictions=predictions, references=references, lang="en", model_type="distilbert-base-uncased")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
from statistics import mean
print(results.keys())
print(f"F1: {mean(results['f1'])}")
print(f"Precision: {mean(results['precision'])}")
print(f"Recall: {mean(results['recall'])}")

dict_keys(['precision', 'recall', 'f1', 'hashcode'])
F1: 0.8166963628360203
Precision: 0.8129520743757814
Recall: 0.8211153537362487


In [None]:
predictions[0]

'"Pirates of the Caribbean: Dead Man\\\'s Chest" sounds like a great choice! It has all the elements you mentioned, and it\\\'s known for its clever dialogue, interesting characters, and a fast-paced story. Plus, the music score is outstanding and the special effects and stunts are impressive. I think you\\\'ll really enjoy it.\']'

In [None]:
references[0]

'How about "The Curious Case of Benjamin Button"? It has vibrant characters, powerful storytelling, and heartwarming moments, along with subtle messages about kindness and strength. It\'s definitely a movie with deep emotional impact and character development that I think you would appreciate.'