In [None]:
! pip install -q datasets huggingface_hub google-generativeai tqdm
! pip install -q transformers accelerate bitsandbytes peft scipy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## 1. Configuration

Set your API keys and workspace configurations here.

In [None]:
import os
from llm_engineering.settings import settings

GEMINI_API_KEY = settings.GOOGLE_API_KEY
HF_TOKEN = settings.HF_TOKEN

# Configuration
DATASET_HUGGINGFACE_WORKSPACE = "K-1303"
MODEL_HUGGINGFACE_WORKSPACE = "K-1303"
IS_DUMMY = "no"
USE_MERGED_MODELS = "yes"

print("\n====== EVAL PARAMETERS ======")
print(f"Dataset Workspace: {DATASET_HUGGINGFACE_WORKSPACE}")
print(f"Model Workspace: {MODEL_HUGGINGFACE_WORKSPACE}")
print(f"Dummy Mode: {IS_DUMMY}")
print(f"Merged Models: {USE_MERGED_MODELS}")
print("=============================")


Dataset Workspace: K-1303
Model Workspace: K-1303
Dummy Mode: no
Merged Models: yes


## 2. Import Libraries

In [None]:
import gc
import json
import torch
import google.generativeai as genai
from typing import Tuple

from datasets import load_dataset
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import time
import queue
from threading import Lock
from tqdm import tqdm
from datasets import load_dataset

genai.configure(api_key=GEMINI_API_KEY)

## 3. Helper Functions

In [4]:
def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str:
    api = HfApi()
    try:
        api.model_info(model_id, token=HF_TOKEN)
        print(f"Found: '{model_id}'")
        return model_id
    except RepositoryNotFoundError:
        print(f"Not found: '{model_id}'")
        print(f"Using: '{default_value}'")
        return default_value

def check_if_huggingface_dataset_exists(dataset_id: str, default_value: str) -> str:
    api = HfApi()
    try:
        api.dataset_info(dataset_id, token=HF_TOKEN)
        print(f"Found: '{dataset_id}'")
        return dataset_id
    except RepositoryNotFoundError:
        print(f"Not found: '{dataset_id}'")
        print(f"Using: '{default_value}'")
        return default_value

## 4. Model Loading Functions

In [5]:
def load_merged_model(model_name: str) -> Tuple:
    """Load a merged model (post SFT+DPO+merge)"""
    print(f"Loading merged model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=HF_TOKEN,
        trust_remote_code=True,
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        token=HF_TOKEN,
    )

    print("Merged model loaded")
    return model, tokenizer


def load_model_with_adapter(
    base_model_name: str,
    adapter_model_name: str,
    load_in_4bit: bool = True,
) -> Tuple:
    """Load base model with LoRA adapter (if model not merged yet)"""
    print(f"Loading base: {base_model_name}")
    print(f"Loading adapter: {adapter_model_name}")

    # Configure quantization
    bnb_config = None
    if load_in_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        print("Using 4-bit quantization")

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if not load_in_4bit else None,
        token=HF_TOKEN,
    )

    # Load adapter
    model = PeftModel.from_pretrained(
        model,
        adapter_model_name,
        token=HF_TOKEN,
    )

    # Merge adapter into base for inference
    print("Merging adapter...")
    model = model.merge_and_unload()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        token=HF_TOKEN,
        trust_remote_code=True,
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model with adapter loaded")
    return model, tokenizer

## 5. Answer Generation

In [6]:
def format_prompt(sample):
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{sample['instruction']}\n\n### Response:\n"
    )


def generate_answers(
    model_id: str,
    dataset_name: str,
    use_merged: bool = True,
    base_model: str = None,
):
    """Generate answers using the model"""
    # Load dataset
    dataset = load_dataset(dataset_name, split="test", token=HF_TOKEN)

    if IS_DUMMY:
        try:
            dataset = dataset.select(range(10))
            print("Dummy mode: 10 samples")
        except:
            pass

    print(f"Dataset size: {len(dataset)}")
    dataset = dataset.map(lambda sample: {"prompt": format_prompt(sample)})

    # Load model
    if use_merged:
        model, tokenizer = load_merged_model(model_id)
    else:
        if base_model is None:
            raise ValueError("base_model required when use_merged=False")
        model, tokenizer = load_model_with_adapter(base_model, model_id)

    # Generate
    print(f"Generating answers...")
    answers = []

    for prompt in tqdm(dataset["prompt"], desc="Generating"):
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.8,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        generated = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        )
        answers.append(generated)

    dataset = dataset.add_column("answers", answers)

    # Upload
    result_repo = f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results"
    print(f"Uploading to {result_repo}")
    dataset.push_to_hub(result_repo, token=HF_TOKEN)

    # Cleanup
    del model, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return dataset

## 6. Evaluation Functions (Using Gemini)

In [7]:
class RateLimiter:
    def __init__(self, max_requests_per_minute: int = 10):
        self.max_requests = max_requests_per_minute
        self.requests = queue.Queue()
        self.lock = Lock()

    def acquire(self):
        with self.lock:
            now = time.time()
            # Remove timestamps older than 60s
            while not self.requests.empty():
                if now - self.requests.queue[0] > 60:
                    self.requests.get()
                else:
                    break

            # If we hit limit, sleep until next slot opens
            if self.requests.qsize() >= self.max_requests:
                sleep_time = 60 - (now - self.requests.queue[0]) + 0.5
                print(f"Rate limit hit, sleeping for {sleep_time:.1f}s...")
                time.sleep(sleep_time)

            # Record new request timestamp
            self.requests.put(time.time())

In [8]:
rate_limiter = RateLimiter(max_requests_per_minute=15)

In [9]:
def evaluate_answer(instruction: str, answer: str, model) -> dict:
    prompt = f"""You are an expert judge. Evaluate the answer based on:

1. Accuracy: How factually correct is the information?
2. Style: Is the tone appropriate for blog/social media?

Accuracy scale:
1 (Poor): Factual errors or misleading
2 (Good): Mostly accurate with minor errors
3 (Excellent): Highly accurate and comprehensive

Style scale:
1 (Poor): Too formal, overly complex words
2 (Good): Good balance but still formal
3 (Excellent): Accessible language, simple technical terms

Example bad style: "The Llama2 7B model constitutes a noteworthy progression..."
Example excellent style: "Llama2 7B outperforms the original Llama model."

Instruction: {instruction}

Answer: {answer}

Respond ONLY with valid JSON:
{{
    "accuracy": {{"analysis": "...", "score": 0}},
    "style": {{"analysis": "...", "score": 0}}
}}"""

    # Apply rate limiter before calling Gemini
    rate_limiter.acquire()

    try:
        response = model.generate_content(prompt)
        text = response.text.strip()
        if text.startswith("```json"):
            text = text[7:-3].strip()
        elif text.startswith("```"):
            text = text[3:-3].strip()
        return json.loads(text)
    except Exception as e:
        return {
            "accuracy": {"analysis": f"Error: {e}", "score": None},
            "style": {"analysis": f"Error: {e}", "score": None}
        }

# ─────────────────────────────
# Batch Evaluation Helper
# ─────────────────────────────
def evaluate_batch(batch, start_index, model):
    return [(i, evaluate_answer(instr, ans, model))
            for i, (instr, ans) in enumerate(batch, start=start_index)]

# ─────────────────────────────
# Main Batch Evaluator
# ─────────────────────────────
def evaluate_answers(model_id: str, num_threads: int = 5, batch_size: int = 5):
    gemini_model = genai.GenerativeModel('gemini-2.5-flash')

    result_repo = f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results"
    print(f"Loading {result_repo}")
    dataset = load_dataset(result_repo, split="all", token=HF_TOKEN)

    batches = [
        (i, list(zip(
            dataset["instruction"][i:i + batch_size],
            dataset["answers"][i:i + batch_size],
            strict=False
        )))
        for i in range(0, len(dataset), batch_size)
    ]

    evaluations = [None] * len(dataset)
    print(f"Evaluating {len(dataset)} answers (rate-limited)...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(evaluate_batch, batch, idx, gemini_model)
                   for idx, batch in batches]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for index, evaluation in future.result():
                evaluations[index] = evaluation

    # Add evaluations
    if "evaluation" in dataset.column_names:
        dataset = dataset.remove_columns(["evaluation"])
    dataset = dataset.add_column("evaluation", evaluations)

    # Extract scores
    accuracy_scores = []
    style_scores = []

    for evaluation in dataset["evaluation"]:
        try:
            eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
            accuracy_scores.append(eval_dict["accuracy"]["score"])
            style_scores.append(eval_dict["style"]["score"])
        except:
            accuracy_scores.append(None)
            style_scores.append(None)

    # Replace columns
    for col_name, values in [("accuracy", accuracy_scores), ("style", style_scores)]:
        if col_name in dataset.column_names:
            dataset = dataset.remove_columns([col_name])
        dataset = dataset.add_column(col_name, values)

    print(f"Uploading results")
    dataset.push_to_hub(result_repo, token=HF_TOKEN)

    return dataset

## 7. Analysis Functions

In [10]:
def analyze_results(model_id: str):
    result_repo = f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results"
    dataset = load_dataset(result_repo, split="all", token=HF_TOKEN)

    valid_acc = [s for s in dataset["accuracy"] if s is not None]
    valid_sty = [s for s in dataset["style"] if s is not None]

    if valid_acc:
        acc = sum(valid_acc) / len(valid_acc)
        print(f"{model_id.split('/')[-1]}")
        print(f"   Accuracy: {acc:.2f}")

    if valid_sty:
        sty = sum(valid_sty) / len(valid_sty)
        print(f"Style: {sty:.2f}")

    return dataset

## 8. Model Selection

**Choose which model to evaluate in this runtime session.**

In [14]:
# Define models with their base models (for adapter loading)
ALL_MODELS = [
    {
        "name": "SFT MODEL",
        "model": f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-KD-16bit",
        "default": "mlabonne/TwinLlama-3.1-8B",
        "base": "meta-llama/Llama-3.1-8B-Instruct",
    },
    {
        "name": "DPO MODEL",
        "model": f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-KD-DPO",
        "default": "mlabonne/TwinLlama-3.1-8B-DPO",
        "base": "meta-llama/Llama-3.1-8B-Instruct",
    },
    # {
    #     "name": "Llama-3.1-8B-Instruct",
    #     "model": "meta-llama/Llama-3.1-8B-Instruct",
    #     "default": "meta-llama/Llama-3.1-8B-Instruct",
    #     "base": "meta-llama/Llama-3.1-8B-Instruct",
    # },
]

print("Available models:")
for i, m in enumerate(ALL_MODELS, 1):
    print(f"  {i}. {m['name']}")

model_choice = int(input("\nSelect model (1-3): ")) - 1
selected = ALL_MODELS[model_choice]

print(f"\nSelected: {selected['name']}")
CURRENT_MODEL = check_if_huggingface_model_exists(selected['model'], selected['default'])
BASE_MODEL = selected['base']
print(f"Model: {CURRENT_MODEL}")
print(f"Base: {BASE_MODEL}")

Available models:
  1. SFT MODEL
  2. DPO MODEL

Select model (1-3): 2

Selected: DPO MODEL
Found: 'K-1303/TwinLlama-3.1-8B-KD-DPO'
Model: K-1303/TwinLlama-3.1-8B-KD-DPO
Base: meta-llama/Llama-3.1-8B-Instruct


## 9. Run Generation

Generate answers for the selected model.

In [None]:
dataset_name = check_if_huggingface_dataset_exists(
    f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin",
    "mlabonne/llmtwin"
)

print(f"\n{'='*50}")
print(f"GENERATING: {CURRENT_MODEL}")
print(f"{'='*50}\n")

result_dataset = generate_answers(
    CURRENT_MODEL,
    dataset_name=dataset_name,
    use_merged=USE_MERGED_MODELS,
    base_model=BASE_MODEL if not USE_MERGED_MODELS else None,
)

print("\nGeneration complete!")

Found: 'K-1303/llmtwin'

GENERATING: K-1303/TwinLlama-3.1-8B-KD-DPO



README.md:   0%|          | 0.00/408 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/139k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/896 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dummy mode: 10 samples
Dataset size: 10


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Loading merged model: K-1303/TwinLlama-3.1-8B-KD-DPO


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/196 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

Merged model loaded
Generating answers...


Generating: 100%|██████████| 10/10 [07:50<00:00, 47.06s/it]


Uploading to K-1303/TwinLlama-3.1-8B-KD-DPO-results


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              : 100%|##########| 15.8kB / 15.8kB            


✅ Generation complete!


## 10. Run Evaluation

Evaluate the generated answers using Gemini.

In [None]:
print(f"\n{'='*50}")
print(f"EVALUATING: {CURRENT_MODEL}")
print(f"{'='*50}\n")

evaluated_dataset = evaluate_answers(CURRENT_MODEL)
print("\nEvaluation complete!")


EVALUATING: K-1303/TwinLlama-3.1-8B-KD-DPO

Loading K-1303/TwinLlama-3.1-8B-KD-DPO-results
Evaluating 10 answers (rate-limited)...


100%|██████████| 2/2 [01:37<00:00, 48.89s/it]


Uploading results


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              : 100%|##########| 27.0kB / 27.0kB            


Evaluation complete!


## 11. Analyze Results

View the final scores for the evaluated model.

In [13]:
print(f"\n{'='*50}")
print(f"RESULTS: {CURRENT_MODEL}")
print(f"{'='*50}\n")

final_dataset = analyze_results(CURRENT_MODEL)
print("\nDone! Restart runtime for next model.")


RESULTS: K-1303/TwinLlama-3.1-8B-KD-16bit



README.md:   0%|          | 0.00/717 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/24.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10 [00:00<?, ? examples/s]

TwinLlama-3.1-8B-KD-16bit
   Accuracy: 2.00
Style: 2.50

Done! Restart runtime for next model.


In [15]:
print(f"\n{'='*50}")
print(f"RESULTS: {CURRENT_MODEL}")
print(f"{'='*50}\n")

final_dataset = analyze_results(CURRENT_MODEL)
print("\nDone! Restart runtime for next model.")


RESULTS: K-1303/TwinLlama-3.1-8B-KD-DPO



README.md:   0%|          | 0.00/717 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/27.0k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10 [00:00<?, ? examples/s]

TwinLlama-3.1-8B-KD-DPO
   Accuracy: 2.12
Style: 2.62

Done! Restart runtime for next model.
