In [1]:
!pip install -U transformers trl datasets torch==2.6.0 accelerate bitsandbytes peft

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

# ✅ 1. Model ID (switched from PhoGPT to Vistral)
model_id = "Viet-Mistral/Vistral-7B-Chat"

# ✅ 2. Configure 4-bit or 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Use 8-bit quantization
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type="nf8",
    bnb_8bit_compute_dtype=torch.float16
)

# ✅ 3. Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# ✅ 4. Prepare model for PEFT + LoRA
model = prepare_model_for_kbit_training(model) # Removed this line

tokenizer_config.json:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/597k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

In [7]:
!pip install nltk rouge-score tqdm

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=69abad96f4ec868458848aac2e37adc37b6b8d919d33f58e278b131b26c31a4e
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [3]:
import json
from datasets import Dataset
from transformers import AutoTokenizer

def prepare_dataset(data_path):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Viet-Mistral/Vistral-7B-Chat", trust_remote_code=True)

    # Load your JSON data
    with open(data_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON file: {e}")
            print("Please check if the JSON file is correctly formatted.")
            return None

    # Ensure the loaded data is a list
    if not isinstance(data, list):
        print("JSON data is not a list of objects. Please check the file format.")
        return None

    # Format the data according to the chat template
    formatted_data = []
    for item in data:
        # Skip items that have qa_pairs
        if isinstance(item, dict) and "qa_pairs" in item:
            continue

        # Process items that don't have qa_pairs
        if isinstance(item, dict):
            if "prompt" in item and "response" in item:
                messages = [
                    {"role": "user", "content": item["prompt"]},
                    {"role": "assistant", "content": item["response"]}
                ]
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
                formatted_data.append({"text": prompt})
            else:
                print(f"Skipping item due to missing 'prompt' or 'response' key: {item}")
        else:
            print(f"Skipping item as it is not a dictionary: {item}")

    # Return Dataset.from_list only if formatted_data is not empty
    if formatted_data:
        return Dataset.from_list(formatted_data)
    else:
        print("No valid data found to create a dataset.")
        return None

# Save dataset to HuggingFace format
dataset = prepare_dataset("/content/combined_qa_dataset.json")

# Only save to disk if the dataset was successfully created
if dataset is not None:
    dataset.save_to_disk("/content/sample_instruction_following_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1418 [00:00<?, ? examples/s]

In [4]:
from datasets import load_from_disk

# Load the dataset we just prepared
dataset = load_from_disk("/content/sample_instruction_following_dataset")

train_dataset, eval_dataset = dataset.train_test_split(test_size=0.2).values()

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(eval_dataset)}")

Training dataset size: 1134
Validation dataset size: 284


In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel # Although not used in this cell, keep for consistency
from datasets import load_from_disk # Although not used in this cell, keep for consistency
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm
import json
import re

def extract_prompt_and_response(text):
    # Extract prompt from [INST] tags
    prompt_match = re.search(r'\[INST\](.*?)\[/INST\]', text)
    if prompt_match:
        prompt = prompt_match.group(1).strip()
    else:
        prompt = ""

    # Extract response after [/INST]
    response_match = re.search(r'\[/INST\](.*?)(?:</s>|$)', text, re.DOTALL)
    if response_match:
        response = response_match.group(1).strip()
    else:
        response = ""

    return prompt, response

def generate_response(model, tokenizer, prompt, max_new_tokens=512):
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the assistant's response from the formatted output
    response_split = response.split("[/INST]")
    if len(response_split) > 1:
      response = response_split[-1].strip()
    else:
      response = "" # Or handle cases where the split doesn't work as expected


    return response

def calculate_metrics(predictions, references):
    # Initialize metrics
    bleu_scores = []
    rouge_scores = {
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    # Initialize ROUGE scorer
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smoothie = SmoothingFunction().method1

    # Calculate metrics for each prediction-reference pair
    for pred, ref in zip(predictions, references):
        # BLEU Score
        # Ensure reference is a list of tokens
        reference_tokens = [ref.split()]
        prediction_tokens = pred.split()
        bleu_score = sentence_bleu(reference_tokens, prediction_tokens, smoothing_function=smoothie)
        bleu_scores.append(bleu_score)

        # ROUGE Scores
        scores = rouge_scorer_obj.score(ref, pred)
        for metric in rouge_scores.keys():
            rouge_scores[metric].append(scores[metric].fmeasure)

    # Calculate averages
    metrics = {
        'bleu': np.mean(bleu_scores),
        'rouge1': np.mean(rouge_scores['rouge1']),
        'rouge2': np.mean(rouge_scores['rouge2']),
        'rougeL': np.mean(rouge_scores['rougeL'])
    }

    return metrics

def evaluate_model(model, tokenizer, test_dataset, num_samples=None):
    predictions = []
    references = []
    prompts = []

    # If num_samples is provided, limit the evaluation
    if num_samples:
        test_dataset = test_dataset.select(range(min(num_samples, len(test_dataset))))

    print("Generating predictions...")
    for item in tqdm(test_dataset):
        # Extract prompt and reference from the text field
        # Assuming the format is "[INST] prompt [/INST] response </s>"
        prompt, reference = extract_prompt_and_response(item['text'])

        if prompt and reference:  # Only process if both prompt and reference are found
            # Generate prediction
            prediction = generate_response(model, tokenizer, prompt)

            predictions.append(prediction)
            references.append(reference)
            prompts.append(prompt)

    # Calculate metrics
    print("Calculating metrics...")
    metrics = calculate_metrics(predictions, references)

    return metrics, predictions, references, prompts

def save_results(metrics, predictions, references, prompts, output_file):
    results = {
        'metrics': metrics,
        'examples': [
            {
                'prompt': p,
                'prediction': pred,
                'reference': ref
            }
            for p, pred, ref in zip(prompts, predictions, references)
        ]
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def load_base_model_and_tokenizer(base_model_name):
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return model, tokenizer

def main_evaluate_base_model():
    # Model path
    base_model_name = "Viet-Mistral/Vistral-7B-Chat"

    # Load base model and tokenizer
    print("Loading base model and tokenizer...")
    base_model, base_tokenizer = load_base_model_and_tokenizer(base_model_name)

    # Load test dataset
    print("Loading evaluation dataset...")
    test_dataset = eval_dataset

    # Evaluate base model
    print("Starting evaluation of base model...")
    metrics, predictions, references, prompts = evaluate_model(
        base_model,
        base_tokenizer,
        test_dataset,
        num_samples=100  # Evaluate on 100 samples for quicker results
    )

    # Print metrics
    print("\nBase Model Evaluation Metrics:")
    print(f"BLEU Score: {metrics['bleu']:.4f}")
    print(f"ROUGE-1: {metrics['rouge1']:.4f}")
    print(f"ROUGE-2: {metrics['rouge2']:.4f}")
    print(f"ROUGE-L: {metrics['rougeL']:.4f}")

    # Save results
    save_results(metrics, predictions, references, prompts, "base_model_evaluation_results.json")
    print("\nBase model results saved to base_model_evaluation_results.json")

if __name__ == "__main__":
    main_evaluate_base_model()

Loading base model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading evaluation dataset...
Starting evaluation of base model...
Generating predictions...


100%|██████████| 100/100 [07:28<00:00,  4.49s/it]


Calculating metrics...

Base Model Evaluation Metrics:
BLEU Score: 0.0857
ROUGE-1: 0.3968
ROUGE-2: 0.2248
ROUGE-L: 0.3158

Base model results saved to base_model_evaluation_results.json


In [9]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# Configure LoRA (correct target modules)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Apply LoRA
model = get_peft_model(model, lora_config) # Re-added this line

In [10]:
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,307,538,432 || trainable%: 0.1865


In [11]:
from trl import SFTTrainer
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./vistral-lora-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=0.3,
    report_to="none"  # Disable WandB if not used
)

# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,

)

# Start training
trainer.train()

# Save the final model and tokenizer
model.save_pretrained("./vistral-lora-checkpoint")
tokenizer.save_pretrained("./vistral-lora-checkpoint")

Converting train dataset to ChatML:   0%|          | 0/1134 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1134 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1134 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1134 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
10,3.2029
20,2.4024
30,1.9377
40,1.5965
50,1.3324
60,1.1931
70,1.1527
80,0.9708
90,0.9384
100,1.0866


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

('./vistral-lora-checkpoint/tokenizer_config.json',
 './vistral-lora-checkpoint/special_tokens_map.json',
 './vistral-lora-checkpoint/chat_template.jinja',
 './vistral-lora-checkpoint/tokenizer.model',
 './vistral-lora-checkpoint/added_tokens.json',
 './vistral-lora-checkpoint/tokenizer.json')

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def load_model_and_tokenizer(base_model_name, adapter_path):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    # Load LoRA adapter
    model = PeftModel.from_pretrained(model, adapter_path)

    return model, tokenizer

def generate_response(model, tokenizer, prompt, max_new_tokens=512):
    # Format the prompt using the chat template
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the input
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode and return the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response
    response = response.split("Assistant: ")[-1].strip()
    return response

def main():
    # Model paths
    base_model_name = "Viet-Mistral/Vistral-7B-Chat"
    adapter_path = "./vistral-lora-checkpoint"

    # Load model and tokenizer
    print("Loading model and tokenizer...")
    model, tokenizer = load_model_and_tokenizer(base_model_name, adapter_path)
    print("Model and tokenizer loaded successfully!")

    # Example question
    question = "Đại học Quốc tế được thành lập vào ngày nào?"
    print(f"\nQuestion: {question}")

    # Generate and print response
    response = generate_response(model, tokenizer, question)
    print(f"\nAnswer: {response}")

if __name__ == "__main__":
    main()

Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!

Question: Đại học Quốc tế được thành lập vào ngày nào?

Answer: [INST]  Đại học Quốc tế được thành lập vào ngày nào? [/INST]  Trường Đại học Quốc tế được Thủ tướng Chính phủ ký quyết định thành lập chính thức vào ngày 05/12/2003.


In [15]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_from_disk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm
import json
import re

def load_model_and_tokenizer(base_model_name, adapter_path):
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    model = PeftModel.from_pretrained(model, adapter_path)
    return model, tokenizer

def extract_prompt_and_response(text):
    # Extract prompt from [INST] tags
    prompt_match = re.search(r'\[INST\](.*?)\[/INST\]', text)
    if prompt_match:
        prompt = prompt_match.group(1).strip()
    else:
        prompt = ""

    # Extract response after [/INST]
    response_match = re.search(r'\[/INST\](.*?)(?:</s>|$)', text, re.DOTALL)
    if response_match:
        response = response_match.group(1).strip()
    else:
        response = ""

    return prompt, response

def generate_response(model, tokenizer, prompt, max_new_tokens=512):
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("Assistant: ")[-1].strip()
    return response

def calculate_metrics(predictions, references):
    # Initialize metrics
    bleu_scores = []
    rouge_scores = {
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    # Initialize ROUGE scorer
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smoothie = SmoothingFunction().method1

    # Calculate metrics for each prediction-reference pair
    for pred, ref in zip(predictions, references):
        # BLEU Score
        bleu_score = sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
        bleu_scores.append(bleu_score)

        # ROUGE Scores
        scores = rouge_scorer_obj.score(ref, pred)
        for metric in rouge_scores.keys():
            rouge_scores[metric].append(scores[metric].fmeasure)

    # Calculate averages
    metrics = {
        'bleu': np.mean(bleu_scores),
        'rouge1': np.mean(rouge_scores['rouge1']),
        'rouge2': np.mean(rouge_scores['rouge2']),
        'rougeL': np.mean(rouge_scores['rougeL'])
    }

    return metrics

def evaluate_model(model, tokenizer, test_dataset, num_samples=None):
    predictions = []
    references = []
    prompts = []

    # If num_samples is provided, limit the evaluation
    if num_samples:
        test_dataset = test_dataset.select(range(min(num_samples, len(test_dataset))))

    print("Generating predictions...")
    for item in tqdm(test_dataset):
        # Extract prompt and reference from the text field
        prompt, reference = extract_prompt_and_response(item['text'])

        if prompt and reference:  # Only process if both prompt and reference are found
            # Generate prediction
            prediction = generate_response(model, tokenizer, prompt)

            predictions.append(prediction)
            references.append(reference)
            prompts.append(prompt)

    # Calculate metrics
    print("Calculating metrics...")
    metrics = calculate_metrics(predictions, references)

    return metrics, predictions, references, prompts

def save_results(metrics, predictions, references, prompts, output_file):
    results = {
        'metrics': metrics,
        'examples': [
            {
                'prompt': p,
                'prediction': pred,
                'reference': ref
            }
            for p, pred, ref in zip(prompts, predictions, references)
        ]
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

def main():
    # Model paths
    base_model_name = "Viet-Mistral/Vistral-7B-Chat"
    adapter_path = "./vistral-lora-checkpoint"

    # Load model and tokenizer
    print("Loading model and tokenizer...")
    model, tokenizer = load_model_and_tokenizer(base_model_name, adapter_path)

    # Load test dataset
    print("Loading test dataset...")
    test_dataset = eval_dataset

    # Evaluate model
    print("Starting evaluation...")
    metrics, predictions, references, prompts = evaluate_model(
        model,
        tokenizer,
        test_dataset,
        num_samples=100  # Evaluate on 100 samples
    )

    # Print metrics
    print("Finetuned Model Evaluation Metrics:")
    print(f"BLEU Score: {metrics['bleu']:.4f}")
    print(f"ROUGE-1: {metrics['rouge1']:.4f}")
    print(f"ROUGE-2: {metrics['rouge2']:.4f}")
    print(f"ROUGE-L: {metrics['rougeL']:.4f}")

    # Save results
    save_results(metrics, predictions, references, prompts, "evaluation_results.json")
    print("\nResults saved to evaluation_results.json")

if __name__ == "__main__":
    main()

Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading test dataset...
Starting evaluation...
Generating predictions...


100%|██████████| 100/100 [03:55<00:00,  2.35s/it]


Calculating metrics...
Finetuned Model Evaluation Metrics:
BLEU Score: 0.1917
ROUGE-1: 0.5496
ROUGE-2: 0.3645
ROUGE-L: 0.4430

Results saved to evaluation_results.json
