In [1]:
import polars as pl
import os, math, string
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
from transformers import BitsAndBytesConfig
import torch
import gc
from tqdm import tqdm
import numpy as np
import markdown

import lancedb
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from collections import Counter
import string
import random
import emoji

# === NLTK SETUP ===
nltk.download('punkt_tab')
nltk.download("wordnet")
smooth_fn = SmoothingFunction().method4

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/zorin17/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zorin17/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from huggingface_hub import login

# Insert your token here
login(token="")

In [3]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [4]:
df = pl.read_csv('hf://datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')
# Data cleaning
# fill in null
df = df.with_columns([
    pl.col("instruction")
      .cast(str)
      .str.to_lowercase()
      .fill_null(""),
    
    pl.col("response")
      .cast(str)
      .str.to_lowercase()
      .fill_null(""),
    
    pl.col("intent")
      .cast(str)
      .fill_null("unknown")
])

# Remove emoji
def remove_emojis(text: str) -> str:
    return emoji.replace_emoji(text, replace="")  # Remove all emojis safely

# Apply to instruction and response
df = df.with_columns([
    pl.col("instruction").map_elements(remove_emojis).alias("instruction")
])

df = df.with_columns([
    pl.col("response").map_elements(remove_emojis).alias("response")
])

#Exclude noisy flags
# Filter out rows where 'flags' contains Z, Q, or W ===
flag = ["flags"]
df_z = df.filter(
    ~pl.col(flag).cast(str).str.contains("Z")
)

df_zw = df.filter(
    ~pl.col(flag).cast(str).str.contains("Z") &
    ~pl.col(flag).cast(str).str.contains("W")
)

df_clean = df.filter(
    ~pl.col(flag).cast(str).str.contains("Z") &
    ~pl.col(flag).cast(str).str.contains("W") &
    ~pl.col(flag).cast(str).str.contains("Q")
)

print(df_z.height)
print(df_zw.height)
print(df_clean.height)

category_counts = (
    df_clean
    .group_by("category")
    .agg(pl.count().alias("counts"))
    .sort("counts", descending=True)
)

print(category_counts)

# Filter only selected categories
selected_categories = ["ORDER", "REFUND", "SHIPPING", "DELIVERY"]
df_selected = df_clean.filter(
    pl.col("category").is_in(selected_categories)
)

  df = df.with_columns([
  df = df.with_columns([


21586
20517
14454
shape: (11, 2)
┌──────────────┬────────┐
│ category     ┆ counts │
│ ---          ┆ ---    │
│ str          ┆ u32    │
╞══════════════╪════════╡
│ ACCOUNT      ┆ 3251   │
│ ORDER        ┆ 2152   │
│ REFUND       ┆ 1527   │
│ SHIPPING     ┆ 1156   │
│ DELIVERY     ┆ 1102   │
│ …            ┆ …      │
│ INVOICE      ┆ 1076   │
│ PAYMENT      ┆ 1028   │
│ FEEDBACK     ┆ 1004   │
│ CANCEL       ┆ 539    │
│ SUBSCRIPTION ┆ 537    │
└──────────────┴────────┘


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("counts"))


In [5]:
# Split dataset by category
# === Configuration ===
LABEL_COL = "category"  # 🔁 Replace with "intent" or any stratification column
SPLIT_RATIO_TRAIN = 0.7
SPLIT_RATIO_VAL = 0.15
SEED = 123
df_final = df_selected.clone()

# === Stratified split logic ===
random.seed(SEED)
train_parts = []
test_parts = []
val_parts = []

for label in df_final[LABEL_COL].unique().to_list():
    group_df = df_final.filter(pl.col(LABEL_COL) == label)
    group_df = group_df.sample(n=len(group_df), shuffle=True, seed=SEED)

    n = len(group_df)
    train_idx = int(n * SPLIT_RATIO_TRAIN)
    val_idx = int(n * (SPLIT_RATIO_TRAIN + SPLIT_RATIO_VAL))

    train_parts.append(group_df[:train_idx])
    val_parts.append(group_df[train_idx:val_idx])
    test_parts.append(group_df[val_idx:])

# === Combine all groups
train_df = pl.concat(train_parts).sort(["category", "instruction"])
val_df = pl.concat(val_parts).sort(["category", "instruction"])
test_df = pl.concat(test_parts).sort(["category", "instruction"])

print("✅ Split sizes:")
print(f"Train: {len(train_df)}")
print(f"Val:   {len(val_df)}")
print(f"Test:  {len(test_df)}")

print("\n📊 Category distribution in test set:")
print(test_df.select([pl.col(LABEL_COL)]).to_series().value_counts())

✅ Split sizes:
Train: 4154
Val:   890
Test:  893

📊 Category distribution in test set:
shape: (4, 2)
┌──────────┬───────┐
│ category ┆ count │
│ ---      ┆ ---   │
│ str      ┆ u32   │
╞══════════╪═══════╡
│ DELIVERY ┆ 166   │
│ ORDER    ┆ 323   │
│ REFUND   ┆ 230   │
│ SHIPPING ┆ 174   │
└──────────┴───────┘


In [6]:
# === Configuration ===
scenario_no = "F"
scenario    = "rag_prompt_ft"  

# Prompt-tuned generators: base + adapter
MODELS = {
    # "llama": {
    #     "base":    "meta-llama/Llama-3.2-1B",
    #     "adapter": "prompt-tune-outputs/Llama-3.2-1B-faq",
    # },
    # "qwen": {
    #     "base":    "Qwen/Qwen3-0.6B-Base",
    #     "adapter": "prompt-tune-outputs/Qwen3-0.6B-Base-faq",
    # },
    "olmo": {
        "base":    "allenai/OLMo-2-0425-1B",
        "adapter": "prompt-tune-outputs/OLMo-2-0425-1B-faq",
    },
}

OFFLOAD_DIR = "./offload"
USE_CUDA = torch.cuda.is_available()
SAVE_EVERY = 20

# === Load test set (pre-split externally)
questions = test_df["instruction"].cast(str).to_list()
answers = test_df["response"].cast(str).to_list()

# ----------------- RAG pieces -----------------

# Read data from LanceDB table
def load_faq_table():
    db = lancedb.connect("/home/zorin17/Desktop/LLM/")
    return db.open_table("LANCEDB_FAQ")

table = load_faq_table()

# Search reranker
def search(query, table, top_k=5, category=None, intent=None):
    s = table.search(query).limit(top_k)
    if category:
        s = s.where(f"category = '{category}'")
    if intent:
        s = s.where(f"intent = '{intent}'")

    result = s.to_list()
    if not result:
        return "[Context 1]:\n(no relevant context found)\n"

    blocks = []
    for i, r in enumerate(result, 1):
        text = (r.get("text") or "").strip()
        # optional: guard against empty rows
        if not text:
            text = "(empty context)"
        blocks.append(f"[Context {i}]:\n{text}\n")
    return "\n".join(blocks)

# Prompt template (system + user with explicit sections)
base_prompt = """You are a helpful retail assistant. Your task is to answer the user question using provided contexts as the answer. 
You must make your response organized and structured.

User question: {}
Contexts:
{}
"""
custom_template = """{% if messages | selectattr('role','equalto','system') | list %}
System: {{ (messages | selectattr('role','equalto','system') | map(attribute='content') | list) | join('\\n') }}
{% endif %}
{% for m in messages %}
{% if m['role'] == 'user' -%}
User: {{ m['content'] }}
{% elif m['role'] == 'assistant' -%}
Assistant: {{ m['content'] }}
{% elif m['role'] == 'tool' -%}
Tool: {{ m['content'] }}
{% elif m['role'] == 'developer' -%}
System: {{ m['content'] }}
{% else -%}
{{ m['role']|capitalize }}: {{ m['content'] }}
{% endif -%}
{% endfor %}
Assistant:"""

# ----------------- RAG pieces End -----------------

# ----------------- Utilities -----------------
# === Safe append function
def append_to_csv(new_data: pl.DataFrame, path: str):
    if os.path.exists(path):
        existing = pl.read_csv(path)
        combined = existing.vstack(new_data)
    else:
        combined = new_data
    combined.write_csv(path)

def adaptive_max_new_tokens(gold_text: str, tokenizer, min_nt=48, max_nt=300, mult=1.3, bonus=16) -> int:
    n_gold = len(tokenizer(gold_text, add_special_tokens=False).input_ids)
    est = int(round(mult * n_gold) + bonus)
    return int(np.clip(est, min_nt, max_nt))

def _count_tokens(s, tok): 
    return len(tok(s, add_special_tokens=False).input_ids)

def safe_pack_contexts(question, contexts, tokenizer, max_src_tokens=700):
    head = "You are a helpful retail assistant. Your task is to answer the user question using provided contexts as the answer.\n\n"
    used = _count_tokens(head + f"User question: {question}\nContexts:\n", tokenizer)
    kept, buf = [], contexts.splitlines()
    for c in buf:
        block = c.strip() + "\n"
        t = _count_tokens(block, tokenizer)
        if used + t > max_src_tokens:
            break
        kept.append(block); used += t
    return "".join(kept)

def generate_one(tokenizer, hf_model, base_prompt, question, context,
                 temperature=0.0, max_new_tokens=128, return_html=False):
    system_content = base_prompt.format(question, context)
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user",   "content": f"Question: {question}\n\nContext:\n{context}"}
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=custom_template
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(hf_model.device)
    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        outputs = hf_model.generate(
            **inputs,
            do_sample=(temperature > 0),
            temperature=temperature,
            top_p=0.95,
            max_new_tokens=max_new_tokens,
            no_repeat_ngram_size=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            return_dict_in_generate=True,
        )

    new_tokens = outputs.sequences[0, input_len:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return markdown.markdown(text) if return_html else text

def rag_answer(tokenizer, hf_model, question, temperature=0.0, category=None, intent=None, max_new_tokens=128):
    raw_ctx = search(question, table, top_k=5, category=category, intent=intent)
    context = safe_pack_contexts(question, raw_ctx, tokenizer, max_src_tokens=700)
    return generate_one(tokenizer, hf_model, base_prompt, question, context,
                        temperature=temperature, max_new_tokens=max_new_tokens, return_html=False)

# === Prompt-tuning loader ===
def load_tokenizer(model_or_path: str):
    tok = AutoTokenizer.from_pretrained(model_or_path, use_fast=True, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"
    return tok

def load_base_4bit(base_name: str, offload_dir: str):
    supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if supports_bf16 else torch.float16,
    )
    return AutoModelForCausalLM.from_pretrained(
        base_name,
        quantization_config=bnb,
        device_map="auto",
        attn_implementation="sdpa",
        torch_dtype=torch.bfloat16 if supports_bf16 else torch.float16,
        offload_folder=offload_dir,
        trust_remote_code=True,
    )

def enable_inference_cache(model):
    if getattr(model.config, "use_cache", None) is not True:
        model.config.use_cache = True

# ----------------- Run all Prompt-tuned generators -----------------
for key, cfg in MODELS.items():
    BASE = cfg["base"]
    ADAPTER_DIR = cfg["adapter"]

    # sanity: adapter exists
    assert os.path.exists(ADAPTER_DIR), f"Adapter not found for {key}: {ADAPTER_DIR}"

    OUTPUT_CSV = f"chatbot_result/{scenario}/scenario_{scenario_no}_{scenario}_results_{key}.csv"
    print(f"\n🚀 Scenario {scenario_no} (RAG + Prompt-Tuned) — Inference for: {BASE} + {ADAPTER_DIR}")
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

    torch.cuda.empty_cache(); torch.cuda.ipc_collect()

    # --- Load base (4-bit) + attach LoRA adapter ---
    tokenizer = load_tokenizer(BASE)
    base_model = load_base_4bit(BASE, OFFLOAD_DIR)
    hf_model  = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
    enable_inference_cache(hf_model)
    hf_model.eval()

    # resume
    if os.path.exists(OUTPUT_CSV):
        existing = pl.read_csv(OUTPUT_CSV)
        done = set(existing["instruction"].to_list())
        print(f"🔄 Resuming from {len(done)} rows...")
    else:
        done = set()

    results = []
    for question, gold in tqdm(zip(questions, answers), total=len(questions), desc=f"{key.upper()} RAG+FT"):
        if question in done:
            continue

        max_nt = adaptive_max_new_tokens(gold, tokenizer, min_nt=48, max_nt=300, mult=1.3, bonus=16)

        try:
            pred = rag_answer(
                tokenizer, hf_model, question,
                temperature=0.0,
                category=None, intent=None,
                max_new_tokens=max_nt
            )
            if "Assistant:" in pred:
                pred = pred.split("Assistant:")[-1].strip()

        except Exception as e:
            print(f"⚠️ Error on: {question[:60]}... Skip. Reason: {e}")
            continue

        results.append((question, gold, pred))

        if len(results) >= SAVE_EVERY:
            chunk_df = pl.DataFrame(results, schema=["instruction", "response", "prediction"])
            enriched = test_df.join(chunk_df, on=["instruction", "response"], how="inner")
            append_to_csv(enriched, OUTPUT_CSV)
            results = []

    if results:
        chunk_df = pl.DataFrame(results, schema=["instruction", "response", "prediction"])
        enriched = test_df.join(chunk_df, on=["instruction", "response"], how="inner")
        append_to_csv(enriched, OUTPUT_CSV)

    print(f"✅ Done with {key}. Results saved to {OUTPUT_CSV}")

    # cleanup VRAM
    del tokenizer, hf_model, base_model
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


🚀 Scenario F (RAG + Prompt-Tuned) — Inference for: allenai/OLMo-2-0425-1B + prompt-tune-outputs/OLMo-2-0425-1B-faq


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OLMO RAG+FT:   0%|          | 0/893 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
OLMO RAG+FT:   0%|          | 1/893 [00:13<3:15:11, 13.13s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
OLMO RAG+FT:   0%|          | 2/893 [00:34<4:26:07, 17.92s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
OLMO RAG+FT:   0%|          | 3/893 [00:41<3:11:26, 12.91s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
OLMO RAG+FT:   0%|          | 4/893 [00:47<2:33:23, 10.35s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for

✅ Done with olmo. Results saved to chatbot_result/rag_prompt_ft/scenario_F_rag_prompt_ft_results_olmo.csv


In [2]:
# === Config ===
scenario_no = "F"
scenario    = "rag_prompt_ft"
DEVICE = "cuda"
MODELS = {
    "llama":    "meta-llama/Llama-3.2-1B",
    "qwen":     "Qwen/Qwen3-0.6B-Base",
    "olmo":     "allenai/OLMo-2-0425-1B",
}

BERTSCORE_MODEL = "microsoft/deberta-base-mnli"  # fast & solid for FAQ/QA

# NEW: memory safety knobs (no impact on scores)
import os, torch
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
BERTSCORE_BATCH_SIZE = 4     # smaller batch to reduce VRAM peak (no score change)
BERTSCORE_CHUNK_SIZE = 600   # process pairs in chunks (no score change)

# Output folder
OUT_DIR = "Evaluation"
os.makedirs(OUT_DIR, exist_ok=True)

# BLEU smoothing (method4 is friendlier for short answers)
smooth_fn = SmoothingFunction().method4
# Cache rouge scorer once
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# === Helpers ===
def _safe_text(x):
    return "" if x is None or (isinstance(x, float) and math.isnan(x)) else str(x)

def normalize(text: str) -> list[str]:
    text = _safe_text(text).lower().translate(str.maketrans('', '', string.punctuation))
    return wordpunct_tokenize(text)

def f1_score_overlap(pred: str, gold: str) -> float:
    pred_tokens = normalize(pred)
    gold_tokens = normalize(gold)
    if not pred_tokens or not gold_tokens:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def bleu_score(pred: str, gold: str) -> float:
    # BLEU-2 is fairer for short FAQ answers
    return sentence_bleu(
        [normalize(gold)], normalize(pred),
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smooth_fn
    )

def rouge_l_score(pred: str, gold: str) -> float:
    return rouge.score(_safe_text(gold), _safe_text(pred))["rougeL"].fmeasure

def meteor(pred: str, gold: str) -> float:
    return meteor_score([normalize(gold)], normalize(pred))

def clean_text(t: str) -> str:
    t = _safe_text(t).strip()
    for pre in ("answer:", "response:", "reply:"):
        if t.lower().startswith(pre):
            t = t[len(pre):].strip()
    return t.strip("`").strip()

# --- BERTScore with GPU-first, CPU fallback on OOM (identical results) ---
def _bertscore_mean_F1(preds_nz: list[str], golds_nz: list[str]) -> float:
    if len(preds_nz) == 0:
        return 0.0

    def _run(device: str) -> float:
        if device.startswith("cuda"):
            torch.cuda.empty_cache()
        all_f1 = []
        for i in range(0, len(preds_nz), BERTSCORE_CHUNK_SIZE):
            p_chunk = preds_nz[i:i+BERTSCORE_CHUNK_SIZE]
            g_chunk = golds_nz[i:i+BERTSCORE_CHUNK_SIZE]
            P, R, F1 = bertscore(
                p_chunk, g_chunk,
                lang="en",
                model_type=BERTSCORE_MODEL,
                rescale_with_baseline=True,
                device=device,                      # cuda OR cpu
                batch_size=BERTSCORE_BATCH_SIZE,
            )
            all_f1.append(F1)
            if device.startswith("cuda"):
                torch.cuda.empty_cache()
        return float(torch.cat(all_f1).mean()) * 100.0

    # Try CUDA first; if OOM, re-run entirely on CPU
    try:
        return _run(DEVICE)
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            return _run("cpu")  # same model/settings; numerically equivalent up to tiny fp noise
        raise

def compute_metrics(preds_raw, golds_raw):
    preds = [clean_text(p) for p in preds_raw]
    golds = [clean_text(g) for g in golds_raw]

    f1_list, bleu_list, rouge_list, meteor_list = [], [], [], []
    for pred, gold in zip(preds, golds):
        f1_list.append(f1_score_overlap(pred, gold))
        bleu_list.append(bleu_score(pred, gold))
        rouge_list.append(rouge_l_score(pred, gold))
        meteor_list.append(meteor(pred, gold))

    # BERTScore; drop empty pairs
    mask = [bool(p.strip()) and bool(g.strip()) for p, g in zip(preds, golds)]
    preds_nz = [p for p, m in zip(preds, mask) if m]
    golds_nz = [g for g, m in zip(golds, mask) if m]
    bert_f1_avg = _bertscore_mean_F1(preds_nz, golds_nz)

    n = max(1, len(f1_list))
    metrics = ["F1-overlap", "BLEU", "ROUGE-L", "METEOR", "BERTScore-F1"]
    scores = [
        round(sum(f1_list)    / n * 100, 2),
        round(sum(bleu_list)  / n * 100, 2),
        round(sum(rouge_list) / n * 100, 2),
        round(sum(meteor_list)/ n * 100, 2),
        round(bert_f1_avg, 2),
    ]
    rows = [{"Metric": m, "Score": s} for m, s in zip(metrics, scores)]
    return pl.DataFrame(rows)

# === Run all models and combine outputs ===
all_summaries = []

for key, MODEL_NAME in MODELS.items():
    OUTPUT_CSV = f"chatbot_result/{scenario}/scenario_{scenario_no}_{scenario}_results_{key}.csv"

    print(f"\n🚀 Running Evaluation for: {MODEL_NAME}")

    result_df = pl.read_csv(OUTPUT_CSV)
    preds = result_df["prediction"].to_list()
    golds = result_df["response"].to_list()

    summary = compute_metrics(preds, golds).with_columns(pl.lit(key).alias("Model"))
    all_summaries.append(summary)

# Combine all models (long)
if all_summaries:
    final_df = pl.concat(all_summaries).select(["Model", "Metric", "Score"])
    final_df.write_csv(os.path.join(OUT_DIR, f"Scenario_{scenario_no}_Evaluation_AllModels.csv"))

    # Pivot for side-by-side comparison
    pivot_df = final_df.pivot(index="Metric", columns="Model", values="Score")
    pivot_df.write_csv(os.path.join(OUT_DIR, f"Scenario_{scenario_no}_Evaluation_Pivot.csv"))

    print("\n📊 Combined (pivot) comparison:")
    print(pivot_df)
    print("\n✅ Saved under 'Evaluation/'")
    print(f"- Scenario_{scenario_no}_Evaluation_AllModels.csv")
    print(f"- Scenario_{scenario_no}_Evaluation_Pivot.csv")
else:
    print("⚠️ No model results were processed.")



🚀 Running Evaluation for: meta-llama/Llama-3.2-1B

🚀 Running Evaluation for: Qwen/Qwen3-0.6B-Base

🚀 Running Evaluation for: allenai/OLMo-2-0425-1B

📊 Combined (pivot) comparison:
shape: (5, 4)
┌──────────────┬───────┬───────┬───────┐
│ Metric       ┆ llama ┆ qwen  ┆ olmo  │
│ ---          ┆ ---   ┆ ---   ┆ ---   │
│ str          ┆ f64   ┆ f64   ┆ f64   │
╞══════════════╪═══════╪═══════╪═══════╡
│ F1-overlap   ┆ 27.83 ┆ 12.79 ┆ 21.98 │
│ BLEU         ┆ 9.07  ┆ 4.1   ┆ 7.77  │
│ ROUGE-L      ┆ 17.8  ┆ 8.76  ┆ 15.19 │
│ METEOR       ┆ 17.72 ┆ 7.92  ┆ 13.54 │
│ BERTScore-F1 ┆ 19.2  ┆ 21.31 ┆ 4.08  │
└──────────────┴───────┴───────┴───────┘

✅ Saved under 'Evaluation/'
- Scenario_F_Evaluation_AllModels.csv
- Scenario_F_Evaluation_Pivot.csv


  pivot_df = final_df.pivot(index="Metric", columns="Model", values="Score")
