## Evaluation: Generating Responses Using the RAFT Fine-Tuned Model

This evaluation script generates model responses for comparing the fine-tuned model's performance. RAFT training specifically focused on teaching the model to distinguish relevant ("golden") context from distracting documents in a RAG setting - a critical skill for accurate retrieval-based responses. The script processes evaluation examples containing both oracle and distractor documents, generating responses that will be evaluated using Azure AI Studio's metrics including groundedness (factual alignment with provided context), relevance (appropriateness to the query), and F1 score (precision and recall balance). These metrics will be compared against a baseline model's performance on the same dataset to quantify the effectiveness of the RAFT fine-tuning approach.

In [None]:
!pip install peft
!pip install transformers
!pip install torch
!pip install tqdm
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
!pip install --upgrade peft

Collecting peft
  Using cached peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Using cached peft-0.13.2-py3-none-any.whl (320 kB)
Installing collected packages: peft
Successfully installed peft-0.13.2


In [None]:
import json
import peft
import torch
from tqdm import tqdm
from datasets import load_dataset
from peft import PeftModel, PeftConfig
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:

# Print the PyTorch / CUDA build in use so the recorded cell output documents
# the environment the responses were generated in.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Paths and model IDs used by main().
EVAL_FILE = "/content/eval.jsonl"  # evaluation examples (JSON Lines); update this path for your environment
ANSWER_FILE = "/content/eval_output.jsonl"  # where main() saves the generated answers; update this path
BASE_MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"  # base model and tokenizer passed to from_pretrained()
ADAPTER_MODEL_ID = "ijuliet/Llama-2-7b-chat-hf-mental-health"  # PEFT adapter loaded on top of the base model

PyTorch version: 2.5.0+cu121
CUDA available: True
CUDA version: 12.1


In [None]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.
        An empty (or whitespace-only) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of depending on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no record and would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(data, file_path):
    """Write every item of ``data`` to ``file_path`` as one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, 'w') as sink:
        for record in data:
            # Serialized record first, then the newline that terminates it.
            sink.write(json.dumps(record))
            sink.write('\n')

def generate_answer(model, tokenizer, instruction, device, max_new_tokens=1024):
    """Generate the model's response for a single prompt.

    Args:
        model: Model exposing ``to`` and ``generate``; its
            ``config.max_position_embeddings`` (when present) is used as the
            context-window size.
        tokenizer: Tokenizer used to encode ``instruction`` and decode the output.
        instruction: Prompt text.
        device: Device the inputs and the model are moved to.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded first returned sequence with special tokens skipped.
        NOTE(review): for a causal LM this presumably contains the prompt
        followed by the continuation - confirm against downstream parsing.
    """
    # ``truncation=True`` has no effect unless a maximum length is known, so
    # derive one: the prompt may use whatever part of the context window is not
    # reserved for the tokens we are about to generate.
    context_window = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    tokenizer_kwargs = {"return_tensors": "pt", "truncation": True}
    if isinstance(context_window, int) and context_window > max_new_tokens:
        # Which end of the prompt is cut follows the tokenizer's own truncation settings.
        tokenizer_kwargs["max_length"] = context_window - max_new_tokens
    inputs = tokenizer(instruction, **tokenizer_kwargs).to(device)
    # Move the model to the device if it's not already there
    model.to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main(start_index=10, end_index=500):
    """Generate answers for a slice of the evaluation set and save them as JSONL.

    Loads the base model with the PEFT adapter, reads the evaluation examples
    from ``EVAL_FILE``, generates one answer per example and writes the
    collected results to ``ANSWER_FILE``.

    Args:
        start_index: Index of the first evaluation example to process.
        end_index: Index one past the last example to process; clamped to the
            dataset size so smaller files do not raise.
    """
    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load model and tokenizer
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

    # Load the base model
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.float16,
    )

    # Load the PEFT adapter
    model.load_adapter(ADAPTER_MODEL_ID)

    # Place the model on the target device once, before the generation loop.
    model.to(device)
    model.eval()

    # Load evaluation data from the configured path, so the log line below and
    # the file that is actually read cannot drift apart.
    print(f"Loading evaluation data from {EVAL_FILE}")
    dataset = load_dataset("json", data_files=EVAL_FILE, split='train')

    # Limit samples to reduce memory usage; clamp the bounds to the dataset size.
    stop = min(end_index, len(dataset))
    start = min(start_index, stop)
    eval_data = dataset.select(range(start, stop))

    # Generate answers
    results = []
    for item in tqdm(eval_data, desc="Generating answers"):
        instruction = item['instruction']
        gold_answer = item['gold_answer']

        # Generate answer using the model
        model_answer = generate_answer(model, tokenizer, instruction, device)

        # Keep prompt, reference answer and model output together for later scoring.
        results.append({
            'instruction': instruction,
            'gold_answer': gold_answer,
            'model_answer': model_answer
        })

    # Save results
    print(f"Saving results to {ANSWER_FILE}")
    save_jsonl(results, ANSWER_FILE)
    print("Evaluation complete!")


In [None]:
# NOTE: The script was first tested on the first 10 datapoints, whose results were saved separately;
# this run covers the remaining 490. The two result sets were eventually combined.
if __name__ == "__main__":
    main()

Using device: cuda
Loading model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Loading evaluation data from /content/eval.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Generating answers:   0%|          | 0/490 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Generating answers:  63%|██████▎   | 310/490 [1:05:45<35:13, 11.74s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Generating answers: 100%|██████████| 490/490 [1:44:30<00:00, 12.80s/it]

Saving results to /content/eval_output1.jsonl
Evaluation complete!





In [None]:
# format output nicely
def format_jsonl(input_file, output_file):
    """Rewrite a results JSONL file with the final answer extracted from each model output.

    For every non-blank input line, reads the "instruction", "gold_answer" and
    "model_answer" fields (each defaulting to "N/A" when absent) and writes one
    JSON line with the keys "Instruction", "Gold Answer" and
    "Extracted Model Answer".

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write; existing content is replaced.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    answer_marker = "<ANSWER>:"
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines (e.g. a trailing newline) hold no record; skip them
            # instead of letting json.loads raise.
            if not line.strip():
                continue

            # Load each line as JSON
            data = json.loads(line)

            # Extract the necessary fields
            instruction = data.get("instruction", "N/A")
            gold_answer = data.get("gold_answer", "N/A")
            model_answer = data.get("model_answer", "N/A")

            # Keep only the text after the LAST marker; without a marker the
            # model answer is passed through unchanged.
            if answer_marker in model_answer:
                extracted_answer = model_answer.split(answer_marker)[-1].strip()
            else:
                extracted_answer = model_answer

            # Format the output
            formatted_output = {
                "Instruction": instruction,
                "Gold Answer": gold_answer,
                "Extracted Model Answer": extracted_answer
            }

            # Write the formatted JSON to the output file
            outfile.write(json.dumps(formatted_output) + '\n')

# Usage example: the two paths below appear to be placeholders - point input_file at a
# JSONL file whose records carry "instruction", "gold_answer" and "model_answer"
# fields before running this cell.
input_file = 'input.jsonl'   # replace with the actual input file path
output_file = 'formatted_output.jsonl'  # desired output file path
format_jsonl(input_file, output_file)
