In [None]:
import os
import json
import glob
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, BertTokenizer, BertModel
from peft import AutoPeftModelForCausalLM
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.util import ngrams
from nltk import download
from rouge_score import rouge_scorer
import torch.nn.functional as F

# =========================
# Optional metric libraries
# =========================
# Availability flags start out False and are flipped to True only after the
# corresponding import succeeds, so a failed import reliably disables the
# metric instead of leaving a stale True behind.
_HAS_BERTSCORE = False
_HAS_SENTENCE_TRANSFORMERS = False

try:
    from bert_score import score as bertscore
    _HAS_BERTSCORE = True
except Exception as e:
    print("[Info] bert-score not available; official BERTScore will be skipped.", e)

try:
    from sentence_transformers import SentenceTransformer, util as sbert_util
    _HAS_SENTENCE_TRANSFORMERS = True
except Exception as e:
    print("[Info] sentence-transformers not available; SBERT Similarity will be skipped.", e)

# ------------------------------------
# Speed/parallelism friendly settings
# ------------------------------------
# Allow TF32 matmuls and, when a GPU is present, request "high" float32
# matmul precision from PyTorch.
torch.backends.cuda.matmul.allow_tf32 = True
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")

# Download NLTK data if needed (may require adjustment if no internet)
# NOTE(review): presumably required by meteor_score's WordNet matching - confirm.
download('wordnet', quiet=True)
download('omw-1.4', quiet=True)  # For multilingual, but assuming English

# Set up paths (adjust if needed)
# model_id is only assigned in the code shown here; the tokenizer and model
# are both loaded from final_dir.
model_id = "Qwen/Qwen3-4B-Instruct-2507"
final_dir = "checkpoint-400"
test_glob = r"Eval/**/*.jsonl"  # recursive pattern for the evaluation JSONL files
results_file = "metrics_results.json"
plot_file = "metrics_plot.png"

# -------------------------
# Device & batch utilities
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# You can override with env var, otherwise pick a sensible default
# (24 on CUDA, 16 on CPU). The BATCH_SIZE env var, when set, must parse as int.
DEFAULT_BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 24 if DEVICE == "cuda" else 16))

print(f"[Info] Using device: {DEVICE}, default batch size: {DEFAULT_BATCH_SIZE}")

# ============================================================
# FIXED TOKENIZER LOADING (MATCHING TRAINING CODE EXACTLY)
# ============================================================
# Load the tokenizer stored in the checkpoint directory `final_dir`.
print(f"Loading tokenizer from: {final_dir}")
tokenizer = AutoTokenizer.from_pretrained(final_dir)

# Apply the EXACT same tokenizer configuration as training
# Training used: tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token = tokenizer.eos_token
print(f"Set pad_token to eos_token: {tokenizer.pad_token}")

# Set padding side to left (matches training)
# Left padding also keeps every prompt flush against its generated tokens,
# which generate_outputs relies on when it drops the first `input_length` ids
# of each generated sequence.
tokenizer.padding_side = "left"
print("Set padding_side to 'left' (matches training)")

# BOS token is None by design in Qwen3 (matches training)
tokenizer.bos_token = None
print("BOS token set to None (Qwen3 design)")

# Verify token configuration matches training
print(f"EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
print(f"PAD token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f"BOS token: {tokenizer.bos_token}")

# Verify PAD and EOS are the same (critical for consistency with training)
# NOTE(review): pad_token was assigned from eos_token just above, so this
# presumably always holds; it mainly guards against later edits. Being an
# `assert`, it is skipped when Python runs with -O.
assert tokenizer.pad_token_id == tokenizer.eos_token_id, "PAD and EOS token IDs must match training setup"
print("✓ Tokenizer configuration verified: PAD = EOS")

# ============================================================
# FIXED MODEL LOADING WITH PROPER CONFIG ALIGNMENT
# ============================================================
# Load the fine-tuned checkpoint from `final_dir` through PEFT's auto class.
# On CUDA the weights are requested in float16 with device_map="auto";
# otherwise float32 is used and no device map is passed.
print(f"Loading model from: {final_dir}")
model = AutoPeftModelForCausalLM.from_pretrained(
    final_dir,
    device_map="auto" if DEVICE == "cuda" else None,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
)
# Without a device map the model is moved to DEVICE explicitly.
if DEVICE != "cuda":
    model = model.to(DEVICE)

# Function to align model config with tokenizer (simplified to match training)
def fix_model_config_tokens(model, tokenizer):
    """Copy the tokenizer's special-token ids onto the model's configs.

    On ``model.config`` each of ``pad_token_id``, ``bos_token_id`` and
    ``eos_token_id`` is overwritten only when the attribute already exists.
    On ``model.generation_config`` (when present and not None) all three are
    always set. PAD and EOS come from the tokenizer; BOS is forced to None.

    Returns None; the model is modified in place and a confirmation line is
    printed.
    """
    desired_ids = (
        ("pad_token_id", tokenizer.pad_token_id),
        ("bos_token_id", None),  # Qwen3 doesn't use BOS
        ("eos_token_id", tokenizer.eos_token_id),
    )

    # Main config: touch only the fields it already declares.
    config = model.config
    for attr_name, token_id in desired_ids:
        if hasattr(config, attr_name):
            setattr(config, attr_name, token_id)

    # Generation config: set every field unconditionally.
    generation_config = getattr(model, "generation_config", None)
    if generation_config is not None:
        for attr_name, token_id in desired_ids:
            setattr(generation_config, attr_name, token_id)

    print("Model config aligned with tokenizer (matches training)")

# Apply the tokenizer config fix to the model
# Push the tokenizer's PAD/EOS ids (and BOS=None) onto the loaded model.
fix_model_config_tokens(model, tokenizer)

# Set model to evaluation mode
model.eval()

# Load BERT for custom BERTScore (mean-pooled cosine)
# Both objects are module-level globals read by compute_bertscore.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(DEVICE).eval()

# Optionally load SBERT
# sbert_model stays None when sentence-transformers is unavailable or the
# model cannot be loaded; in the latter case the availability flag is also
# cleared so compute_sbert_similarity returns None.
sbert_model = None
if _HAS_SENTENCE_TRANSFORMERS:
    # strong general-purpose sentence model
    # (override the model name with the SBERT_MODEL environment variable)
    sbert_name = os.environ.get("SBERT_MODEL", "sentence-transformers/all-mpnet-base-v2")
    try:
        sbert_model = SentenceTransformer(sbert_name, device=DEVICE)
        print(f"[Info] Loaded SBERT model: {sbert_name}")
    except Exception as e:
        print("[Info] Could not load SBERT model; SBERT Similarity will be skipped.", e)
        _HAS_SENTENCE_TRANSFORMERS = False

# Load test examples
# Collect the evaluation files in sorted order so the example order (and the
# sample outputs printed later) is reproducible across filesystems.
test_files = sorted(glob.glob(test_glob, recursive=True))
examples = []
for file in test_files:
    # JSONL: one JSON object per line. Read as UTF-8 regardless of the
    # platform default, and tolerate blank lines (e.g. a trailing newline).
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            # Only the conversation itself is kept for evaluation.
            examples.append(data["messages"])

# Function to determine module type
def get_module_type(system_content):
    """Classify a system prompt by the marker phrase it contains.

    Matching is case-insensitive and the markers are tried in a fixed order,
    so the first one found wins. Returns "response", "summary" or "classify",
    or "unknown" when no marker is present.
    """
    text = system_content.lower()
    marker_to_module = (
        ("therapeutic response generator", "response"),
        ("cumulative summary ai", "summary"),
        ("phq-8 classification ai", "classify"),
    )
    for marker, module in marker_to_module:
        if marker in text:
            return module
    return "unknown"

# Split the examples by module. Each example is tagged once from the content
# of its first message, then routed into the matching list (input order kept).
_example_modules = [get_module_type(ex[0]["content"]) for ex in examples]
response_examples = [ex for ex, module in zip(examples, _example_modules) if module == "response"]
summary_examples = [ex for ex, module in zip(examples, _example_modules) if module == "summary"]
classify_examples = [ex for ex, module in zip(examples, _example_modules) if module == "classify"]

# -----------------------------------
# Batched generation (with fallback)
# -----------------------------------
def build_prompt(messages):
    """Render every message except the last through the chat template.

    The final message is treated as the reference answer and left out; the
    template is asked to append its generation prompt and to return text
    rather than token ids.
    """
    context = messages[:-1]
    return tokenizer.apply_chat_template(
        context,
        tokenize=False,
        add_generation_prompt=True,
    )

def _generate_single(messages, max_new_tokens):
    """Greedy-decode the reply for one conversation, without batching or padding."""
    prompt = build_prompt(messages)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device if DEVICE == "cuda" else DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding; sampling knobs are not needed
            pad_token_id=tokenizer.pad_token_id,  # EOS token ID
            eos_token_id=tokenizer.eos_token_id,  # Same as pad_token_id
            bos_token_id=None,  # Qwen3 doesn't use BOS
        )
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


def generate_outputs(examples_list, batch_size=DEFAULT_BATCH_SIZE, max_new_tokens=512, min_batch_size=1, force_single_batch=False):
    """
    Generate one completion per example, batched, with tokenizer settings matching training.

    Args:
        examples_list: list of conversations; the last message of each is the
            reference and is excluded from the prompt (see build_prompt).
        batch_size: number of prompts per generate() call.
        max_new_tokens: generation budget, honoured by the batched path and by
            the single-example fallback alike.
        min_batch_size: lower bound for batch_size, also used when halving
            after a CUDA out-of-memory error.
        force_single_batch: run with batch_size 1 and never retry with a
            different batch size.

    Returns:
        list of decoded, stripped completions in the order of examples_list.

    Failure handling: on a CUDA out-of-memory error the whole list is retried
    with half the batch size (down to min_batch_size); any other error, or an
    OOM that cannot be retried, falls back to generating one example at a
    time. Exceptions raised by the fallback propagate to the caller.
    """
    if force_single_batch:
        batch_size = 1
        print("[Info] Force single batch mode enabled.")

    # Clamp to min_batch_size and never below 1: a zero stride would make
    # range() raise and silently push everything onto the slow fallback.
    batch_size = max(batch_size, min_batch_size, 1)

    on_cuda = DEVICE == "cuda"
    target_device = model.device if on_cuda else DEVICE
    hit_oom = False
    retry_batch_size = None

    try:
        outputs_all = []
        prompts = [build_prompt(msgs) for msgs in examples_list]

        if on_cuda:
            print(f"[Mem] Pre-batch max allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i:i+batch_size]
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=tokenizer.model_max_length if hasattr(tokenizer, "model_max_length") else 2048
            ).to(target_device)

            with torch.inference_mode():
                gen_out = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,  # greedy decoding; sampling knobs are not needed
                    # Use the same token IDs as training
                    pad_token_id=tokenizer.pad_token_id,  # This is now EOS token ID
                    eos_token_id=tokenizer.eos_token_id,  # Same as pad_token_id
                    bos_token_id=None,  # Qwen3 doesn't use BOS
                )

            # Every row in the batch has the same padded prompt width, so
            # everything past that width is newly generated text.
            input_length = inputs["input_ids"].shape[1]
            for out_ids in gen_out:
                text = tokenizer.decode(out_ids[input_length:], skip_special_tokens=True).strip()
                outputs_all.append(text)

        if on_cuda:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            print(f"[Mem] Post-batch max allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

        return outputs_all

    except Exception as e:
        # Only decide what to do here. The recovery itself runs after this
        # block: while the exception is alive its traceback keeps the failed
        # call's tensors referenced, so freeing or retrying in here would
        # still see that memory as occupied.
        if on_cuda and "out of memory" in str(e).lower():
            hit_oom = True
            print(f"[Warn] Batch size {batch_size} failed (likely OOM), clearing cache and retrying smaller batch.", e)
            new_batch = max(batch_size // 2, min_batch_size, 1)
            if new_batch < batch_size and not force_single_batch:
                retry_batch_size = new_batch
        else:
            print("[Warn] Batched generation failed, falling back to single-example generation.", e)

    if hit_oom:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    if retry_batch_size is not None:
        return generate_outputs(examples_list, batch_size=retry_batch_size, max_new_tokens=max_new_tokens, min_batch_size=min_batch_size)

    # Fallback with consistent token settings and the caller's token budget.
    singles = [_generate_single(msgs, max_new_tokens) for msgs in examples_list]
    if on_cuda:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    return singles
        
# -------------------------------
# Custom BERTScore (mean-pooled BERT embedding cosine similarity)
# -------------------------------
def _masked_mean_pool(last_hidden_state, attention_mask):
    """Average token embeddings over real (non-padding) positions only."""
    weights = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    return (last_hidden_state * weights).sum(dim=1) / token_counts


def compute_bertscore(hyps, refs, batch_size=DEFAULT_BATCH_SIZE):
    """Mean cosine similarity between mean-pooled BERT embeddings of hyp/ref pairs.

    Args:
        hyps: list of hypothesis strings.
        refs: list of reference strings, aligned with hyps.
        batch_size: number of pairs embedded per forward pass.

    Returns:
        float average similarity, or 0.0 when hyps is empty.

    Padding positions are excluded from the pooling, so a pair's score does
    not depend on which other texts happen to share its batch and matches
    what embedding each text on its own would give.
    """
    if not hyps:
        return 0.0
    scores = []
    with torch.no_grad():
        for i in range(0, len(hyps), batch_size):
            h_batch = hyps[i:i+batch_size]
            r_batch = refs[i:i+batch_size]
            h_inputs = bert_tokenizer(h_batch, return_tensors="pt", truncation=True, max_length=512, padding=True).to(DEVICE)
            r_inputs = bert_tokenizer(r_batch, return_tensors="pt", truncation=True, max_length=512, padding=True).to(DEVICE)
            h_pooled = _masked_mean_pool(bert_model(**h_inputs).last_hidden_state, h_inputs["attention_mask"])  # [B, H]
            r_pooled = _masked_mean_pool(bert_model(**r_inputs).last_hidden_state, r_inputs["attention_mask"])
            batch_scores = F.cosine_similarity(h_pooled, r_pooled, dim=1)  # [B]
            scores.extend(batch_scores.detach().cpu().tolist())
    return float(np.mean(scores)) if scores else 0.0

# ---------------------------------
# Official BERTScore (bert-score package, optional)
# ---------------------------------
def compute_official_bertscore(hyps, refs, lang="en", batch_size=DEFAULT_BATCH_SIZE):
    """
    Score hypotheses against references with the bert-score package.

    Returns the corpus means as a (Precision, Recall, F1) tuple of floats.
    Returns (None, None, None) when bert-score is unavailable, when hyps is
    empty, or when scoring raises (the error is printed, not propagated).
    """
    skipped = (None, None, None)
    if not (_HAS_BERTSCORE and hyps):
        return skipped

    scoring_options = {
        "lang": lang,
        "rescale_with_baseline": True,  # rescaled against the package baseline
        "batch_size": batch_size,
    }
    try:
        precision, recall, f1 = bertscore(hyps, refs, **scoring_options)
        return float(precision.mean()), float(recall.mean()), float(f1.mean())
    except Exception as e:
        print("[Info] Official BERTScore failed; skipping.", e)
        return skipped

# -----------------------------------------
# SBERT Similarity (new, added, sentence-level)
# -----------------------------------------
def compute_sbert_similarity(hyps, refs, batch_size=DEFAULT_BATCH_SIZE):
    """
    Mean cosine similarity between SBERT embeddings of aligned hyp/ref pairs.

    Args:
        hyps: list of hypothesis strings.
        refs: list of reference strings, aligned with hyps.
        batch_size: encode batch size forwarded to SentenceTransformer.

    Returns:
        float mean similarity, or None when SBERT is unavailable, the model
        was not loaded, hyps is empty, or encoding fails (error is printed).
    """
    if not _HAS_SENTENCE_TRANSFORMERS or sbert_model is None or not hyps:
        return None
    try:
        # Encode in batches under-the-hood; SentenceTransformer handles batching
        hyp_embs = sbert_model.encode(hyps, convert_to_tensor=True, normalize_embeddings=True, batch_size=batch_size)
        ref_embs = sbert_model.encode(refs, convert_to_tensor=True, normalize_embeddings=True, batch_size=batch_size)
        # Compare row i with row i directly instead of building the full
        # N x N similarity matrix only to read its diagonal.
        n_pairs = min(hyp_embs.shape[0], ref_embs.shape[0])
        sims = F.cosine_similarity(hyp_embs[:n_pairs], ref_embs[:n_pairs], dim=1)
        return float(sims.mean().item())
    except Exception as e:
        print("[Info] SBERT similarity failed; skipping.", e)
        return None

# -------------------
# Perplexity function
# -------------------
def compute_perplexity(texts, batch_size=DEFAULT_BATCH_SIZE):
    """Average per-batch perplexity of `texts` under the evaluated model.

    Args:
        texts: list of strings to score (truncated to 512 tokens each).
        batch_size: number of texts per forward pass.

    Returns:
        float mean of exp(loss) over the batches that produced a finite loss,
        or 0.0 when there is nothing to score.

    Padding positions are excluded from the loss by setting their labels to
    -100. Otherwise the pad tokens (which share the EOS id here) would be
    scored as if they were text and skew the result with the amount of
    padding in each batch.
    """
    if not texts:
        return 0.0
    device = model.device if DEVICE == "cuda" else DEVICE
    ppl_vals = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            # Empty strings carry no tokens to score; drop them up front.
            batch = [text for text in texts[i:i+batch_size] if text]
            if not batch:
                continue
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
            # Shift labels are handled by transformers when labels are given;
            # -100 marks positions the loss must ignore.
            labels = inputs["input_ids"].clone()
            labels[inputs["attention_mask"] == 0] = -100
            loss = model(**inputs, labels=labels).loss
            if not torch.isfinite(loss):
                # E.g. every sequence had a single token, leaving no target.
                print("[Warn] Non-finite loss in perplexity batch; skipping it.")
                continue
            ppl_vals.append(torch.exp(loss).item())
    return float(np.mean(ppl_vals)) if ppl_vals else 0.0

# -------------------------
# Function to compute scores
# -------------------------
def compute_scores(hypotheses, references, module_name, results_dict):
    """Score generated texts against references and store the metrics.

    Computes BLEU, ROUGE-1/2/L, METEOR, the custom BERTScore, the optional
    official BERTScore and SBERT similarity, perplexity of the references,
    character-length ratio and bigram diversity. The metrics are stored under
    results_dict[module_name] and printed.

    Args:
        hypotheses: list of generated strings.
        references: list of reference strings aligned with hypotheses.
        module_name: key used in results_dict and as the printed label.
        results_dict: dict updated in place.

    Returns:
        results_dict (untouched when hypotheses is empty).
    """
    if not hypotheses:
        print(f"No {module_name} module examples found.")
        return results_dict

    aligned = list(zip(references, hypotheses))

    # BLEU on whitespace tokens, one reference per hypothesis
    bleu_score = corpus_bleu(
        [[ref.split()] for ref in references],
        [hyp.split() for hyp in hypotheses],
        smoothing_function=SmoothingFunction().method1,
    )

    # ROUGE F-measures, averaged over pairs
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    pair_scores = [scorer.score(ref, hyp) for ref, hyp in aligned]
    avg_rouge1 = float(np.mean([s['rouge1'].fmeasure for s in pair_scores]))
    avg_rouge2 = float(np.mean([s['rouge2'].fmeasure for s in pair_scores]))
    avg_rougeL = float(np.mean([s['rougeL'].fmeasure for s in pair_scores]))

    # METEOR
    meteor_scores = [meteor_score([ref.split()], hyp.split()) for ref, hyp in aligned]
    if meteor_scores:
        avg_meteor = float(np.mean(meteor_scores))
    else:
        avg_meteor = 0.0

    # Embedding/model based metrics (the optional ones return None when skipped)
    avg_bertscore_custom = compute_bertscore(hypotheses, references)
    bP, bR, bF1 = compute_official_bertscore(hypotheses, references)
    sbert_sim = compute_sbert_similarity(hypotheses, references)

    # Perplexity is measured on the references, not on the generations
    avg_ppl = compute_perplexity(references)

    # Length ratio in characters; pairs with an empty reference are skipped
    len_ratio_vals = [len(hyp) / len(ref) for ref, hyp in aligned if len(ref) > 0]
    if len_ratio_vals:
        len_ratio = float(np.mean(len_ratio_vals))
    else:
        len_ratio = 0.0

    # Bigram diversity: distinct bigrams / total bigrams over all hypotheses
    hyp_bigrams = [bigram for hyp in hypotheses for bigram in ngrams(hyp.split(), 2)]
    if len(hyp_bigrams) > 0:
        diversity = float(len(set(hyp_bigrams)) / len(hyp_bigrams))
    else:
        diversity = 0.0

    # Always-present metrics first, optional ones appended afterwards
    res = {
        "BLEU": bleu_score,
        "ROUGE-1": avg_rouge1,
        "ROUGE-2": avg_rouge2,
        "ROUGE-L": avg_rougeL,
        "METEOR": avg_meteor,
        "BERTScore (Custom)": avg_bertscore_custom,
        "Perplexity": avg_ppl,
        "Length Ratio": len_ratio,
        "Bigram Diversity": diversity
    }
    optional_metrics = (
        ("BERTScore-P", bP),
        ("BERTScore-R", bR),
        ("BERTScore-F1", bF1),
        ("SBERT Similarity", sbert_sim),
    )
    for metric_name, metric_value in optional_metrics:
        if metric_value is not None:
            res[metric_name] = metric_value

    results_dict[module_name] = res

    print(f"{module_name.capitalize()} BLEU: {bleu_score:.4f}")
    print(f"{module_name.capitalize()} ROUGE-1: {avg_rouge1:.4f}")
    print(f"{module_name.capitalize()} ROUGE-2: {avg_rouge2:.4f}")
    print(f"{module_name.capitalize()} ROUGE-L: {avg_rougeL:.4f}")
    print(f"{module_name.capitalize()} METEOR: {avg_meteor:.4f}")
    print(f"{module_name.capitalize()} BERTScore (Custom): {avg_bertscore_custom:.4f}")
    if bF1 is None:
        print(f"{module_name.capitalize()} BERTScore (official): [skipped]")
    else:
        print(f"{module_name.capitalize()} BERTScore-P (official): {bP:.4f}")
        print(f"{module_name.capitalize()} BERTScore-R (official): {bR:.4f}")
        print(f"{module_name.capitalize()} BERTScore-F1 (official): {bF1:.4f}")
    if sbert_sim is None:
        print(f"{module_name.capitalize()} SBERT Similarity: [skipped]")
    else:
        print(f"{module_name.capitalize()} SBERT Similarity: {sbert_sim:.4f}")
    print(f"{module_name.capitalize()} Perplexity: {avg_ppl:.4f}")
    print(f"{module_name.capitalize()} Length Ratio: {len_ratio:.4f}")
    print(f"{module_name.capitalize()} Bigram Diversity: {diversity:.4f}")

    return results_dict

# =================
# Run evaluations in new order: Response -> Classification -> Summary
# =================
results = {}

# 1. RESPONSE MODULE EVALUATION
print("Evaluating Response Module...")
response_hypotheses = generate_outputs(response_examples, batch_size=DEFAULT_BATCH_SIZE)
response_references = [ex[-1]["content"] for ex in response_examples]


def _therapist_text(raw):
    """Extract the 'therapist_response' string from a JSON-object string.

    Returns None when `raw` is not valid JSON or does not decode to an object
    (a model can emit a bare list, string or number). Returns '' when the
    field is missing or is not a string, so the metric code always receives
    text it can split.
    """
    try:
        payload = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None
    if not isinstance(payload, dict):
        return None
    value = payload.get('therapist_response', '')
    return value if isinstance(value, str) else ''


# Additional evaluation for therapist_response only (preserved)
print("\nEvaluating Response Therapist Only...")
response_hyps_therapist = []
response_refs_therapist = []
for hyp, ref in zip(response_hypotheses, response_references):
    hyp_text = _therapist_text(hyp)
    ref_text = _therapist_text(ref)
    # If either side cannot be parsed the pair is kept, but blanked on both
    # sides, so the two lists stay aligned with the full-response lists.
    if hyp_text is None or ref_text is None:
        hyp_text = ref_text = ''
    response_hyps_therapist.append(hyp_text)
    response_refs_therapist.append(ref_text)

# Print first 3 generated and true therapist responses (preserved)
print("First 3 generated and true therapist responses:")
for i in range(min(3, len(response_hyps_therapist))):
    print(f"Generated {i+1}: {response_hyps_therapist[i]}")
    print(f"True {i+1}: {response_refs_therapist[i]}")
    print("---")

results = compute_scores(response_hyps_therapist, response_refs_therapist, "response_therapist", results)
print(results)

# Original full response evaluation (preserved)
print("\nEvaluating Response Full...")
results = compute_scores(response_hypotheses, response_references, "response", results)

# 2. CLASSIFICATION MODULE EVALUATION
# References are the content of each example's final message.
print("\nEvaluating Classify Module...")
classify_hypotheses = generate_outputs(classify_examples, batch_size=DEFAULT_BATCH_SIZE)
classify_references = [ex[-1]["content"] for ex in classify_examples]
results = compute_scores(classify_hypotheses, classify_references, "classify", results)

# 3. SUMMARY MODULE EVALUATION (with single batch fallback)
# force_single_batch=True makes generate_outputs run with batch size 1.
print("\nEvaluating Summary Module...")
summary_hypotheses = generate_outputs(summary_examples, batch_size=DEFAULT_BATCH_SIZE, force_single_batch=True)
summary_references = [ex[-1]["content"] for ex in summary_examples]
results = compute_scores(summary_hypotheses, summary_references, "summary", results)

# 4. OVERALL EVALUATION
# Pools the full outputs of the three modules; the therapist-only texts are
# not part of this pool.
print("\nEvaluating Overall...")
overall_hypotheses = response_hypotheses + classify_hypotheses + summary_hypotheses
overall_references = response_references + classify_references + summary_references
results = compute_scores(overall_hypotheses, overall_references, "overall", results)

# Save results to JSON (explicit UTF-8 so the file does not depend on the
# platform's default encoding)
with open(results_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)
print(f"\nSaved results to {results_file}")

# Plot results
# One group of bars per evaluated module (the keys of `results`).
modules = list(results.keys())
# Extend metric list with new ones (while preserving originals)
metrics = [
    "BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR",
    "BERTScore (Custom)",
    "BERTScore-P", "BERTScore-R", "BERTScore-F1",
    "SBERT Similarity",
    "Perplexity", "Length Ratio", "Bigram Diversity"
]

plt.figure(figsize=(14, 8))
bar_width = 0.06  # narrower since we have more metrics now
x = np.arange(len(modules))

# Each metric is drawn as one bar series, shifted right by its index so the
# bars of a module sit side by side.
# NOTE(review): all metrics share one y-axis, so Perplexity can dwarf the
# 0..1 scores, and a skipped metric is drawn as 0 rather than omitted.
for i, metric in enumerate(metrics):
    values = [results[m].get(metric, 0) for m in modules]  # Use 0 if metric not present
    plt.bar(x + i * bar_width, values, width=bar_width, label=metric)

# Centre each module label under its group of bars.
plt.xticks(x + bar_width * (len(metrics) - 1) / 2, modules)
plt.ylabel("Score")
plt.title("Evaluation Metrics per Module")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(plot_file)
print(f"Saved plot to {plot_file}")

In [None]:
import os
import json
import glob
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, BertTokenizer, BertModel
from peft import AutoPeftModelForCausalLM
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.util import ngrams
from nltk import download
from rouge_score import rouge_scorer
import torch.nn.functional as F

# Download NLTK data if needed (may require adjustment if no internet)
# NOTE(review): presumably required by meteor_score's WordNet matching - confirm.
download('wordnet', quiet=True)
download('omw-1.4', quiet=True)  # For multilingual, but assuming English

# Set up paths (adjust if needed)
# model_id is only assigned in this cell; tokenizer and model are loaded
# from final_dir.
model_id = "Qwen/Qwen3-4B-Instruct-2507"
final_dir = "./qwen3_lora_finetuned_final"
test_glob = r"Eval/**/*.jsonl"  # recursive pattern for the evaluation JSONL files
results_file = "metrics_results.json"
plot_file = "metrics_plot.png"

# Load tokenizer and model once
# The model is loaded with device_map="auto", in float16 when CUDA is
# available and float32 otherwise, then switched to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(final_dir)
model = AutoPeftModelForCausalLM.from_pretrained(
    final_dir,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
)
model.eval()

# Load BERT for custom BERTScore
# The BERT model is not moved to any device here; compute_bertscore feeds it
# tokenizer outputs that are likewise not moved.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').eval()

# Load test examples
# Files are visited in sorted order so the example order is reproducible
# across filesystems.
test_files = sorted(glob.glob(test_glob, recursive=True))
examples = []
for file in test_files:
    # JSONL: one JSON object per line. Read as UTF-8 regardless of the
    # platform default, and tolerate blank lines (e.g. a trailing newline).
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            # Only the conversation itself is kept for evaluation.
            examples.append(data["messages"])

# Function to determine module type
def get_module_type(system_content):
    """Return which module a system prompt belongs to.

    The prompt is lower-cased and searched for three marker phrases in a fixed
    order; the first match decides between "response", "summary" and
    "classify". Prompts with no marker yield "unknown".
    """
    lowered = system_content.lower()
    ordered_markers = {
        "therapeutic response generator": "response",
        "cumulative summary ai": "summary",
        "phq-8 classification ai": "classify",
    }
    matches = (module for marker, module in ordered_markers.items() if marker in lowered)
    return next(matches, "unknown")

# Filter examples: an example belongs to a module when the content of its
# first message maps to that module type.
def _examples_of(module):
    """Return the examples whose first message identifies `module`, in order."""
    return [ex for ex in examples if get_module_type(ex[0]["content"]) == module]


response_examples = _examples_of("response")
summary_examples = _examples_of("summary")
classify_examples = _examples_of("classify")

# Generation function
def generate_output(messages):
    """Generate the model's reply to a conversation, excluding its last message.

    All messages but the final one are rendered through the chat template
    with a generation prompt appended. Up to 512 new tokens are decoded
    without sampling, and only the newly generated part is returned, decoded
    and stripped.
    """
    history = messages[:-1]
    prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": 512,
        "do_sample": False,
        "temperature": 0.0,
        "top_p": 1.0,
    }
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    # Drop the prompt tokens that precede the completion.
    prompt_length = encoded["input_ids"].shape[1]
    completion_ids = sequences[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Custom BERTScore function (sentence-level average embedding cosine)
def compute_bertscore(hyps, refs):
    """Mean cosine similarity of mean-pooled BERT embeddings for hyp/ref pairs.

    Each text is embedded on its own (truncated to 512 tokens) by averaging
    the last hidden states over the token axis. Returns 0.0 when hyps is
    empty, otherwise the NumPy mean of the pairwise similarities.
    """
    if not hyps:
        return 0.0

    def embed(text):
        # One text per forward pass, so no padding enters the average.
        encoded = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            return bert_model(**encoded).last_hidden_state.mean(dim=1)

    similarities = []
    for hyp_text, ref_text in zip(hyps, refs):
        hyp_vec = embed(hyp_text)
        ref_vec = embed(ref_text)
        similarities.append(F.cosine_similarity(hyp_vec, ref_vec).item())
    return np.mean(similarities)

# Perplexity function
def compute_perplexity(texts):
    """Average per-text perplexity of `texts` under the evaluated model.

    Each text (truncated to 512 tokens) is scored on its own with its input
    ids as labels; the perplexity is exp(loss). Returns 0.0 when texts is
    empty, otherwise the NumPy mean over all texts.
    """
    if not texts:
        return 0.0

    def perplexity_of(text):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            nll = model(**encoded, labels=encoded["input_ids"]).loss
        return torch.exp(nll).item()

    return np.mean([perplexity_of(text) for text in texts])

# Function to compute scores
def compute_scores(hypotheses, references, module_name, results_dict):
    """Score generated texts against references and store the metrics.

    Computes BLEU, ROUGE-1/2/L, METEOR, the custom BERTScore, perplexity of
    the references, character-length ratio and bigram diversity. The metrics
    are stored under results_dict[module_name] and printed.

    Returns results_dict, which is left untouched when hypotheses is empty.
    """
    if not hypotheses:
        print(f"No {module_name} module examples found.")
        return results_dict

    aligned = list(zip(references, hypotheses))

    # BLEU on whitespace tokens, one reference per hypothesis
    bleu_score = corpus_bleu(
        [[ref.split()] for ref in references],
        [hyp.split() for hyp in hypotheses],
        smoothing_function=SmoothingFunction().method1,
    )

    # ROUGE F-measures, averaged over pairs
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    pair_scores = [scorer.score(ref, hyp) for ref, hyp in aligned]
    avg_rouge1 = np.mean([s['rouge1'].fmeasure for s in pair_scores])
    avg_rouge2 = np.mean([s['rouge2'].fmeasure for s in pair_scores])
    avg_rougeL = np.mean([s['rougeL'].fmeasure for s in pair_scores])

    # METEOR
    avg_meteor = np.mean([meteor_score([ref.split()], hyp.split()) for ref, hyp in aligned])

    # Custom BERTScore
    avg_bertscore = compute_bertscore(hypotheses, references)

    # Perplexity is measured on the references, not on the generations
    avg_ppl = compute_perplexity(references)

    # Length ratio in characters; pairs with an empty reference are skipped
    len_ratio = np.mean([len(hyp) / len(ref) for ref, hyp in aligned if len(ref) > 0])

    # Bigram diversity: distinct bigrams / total bigrams over all hypotheses
    hyp_bigrams = [bigram for hyp in hypotheses for bigram in ngrams(hyp.split(), 2)]
    if len(hyp_bigrams) > 0:
        diversity = len(set(hyp_bigrams)) / len(hyp_bigrams)
    else:
        diversity = 0.0

    # Record the metrics for this module
    module_metrics = {
        "BLEU": bleu_score,
        "ROUGE-1": avg_rouge1,
        "ROUGE-2": avg_rouge2,
        "ROUGE-L": avg_rougeL,
        "METEOR": avg_meteor,
        "BERTScore (Custom)": avg_bertscore,
        "Perplexity": avg_ppl,
        "Length Ratio": len_ratio,
        "Bigram Diversity": diversity
    }
    results_dict[module_name] = module_metrics

    print(f"{module_name.capitalize()} BLEU: {bleu_score:.4f}")
    print(f"{module_name.capitalize()} ROUGE-1: {avg_rouge1:.4f}")
    print(f"{module_name.capitalize()} ROUGE-2: {avg_rouge2:.4f}")
    print(f"{module_name.capitalize()} ROUGE-L: {avg_rougeL:.4f}")
    print(f"{module_name.capitalize()} METEOR: {avg_meteor:.4f}")
    print(f"{module_name.capitalize()} BERTScore (Custom): {avg_bertscore:.4f}")
    print(f"{module_name.capitalize()} Perplexity: {avg_ppl:.4f}")
    print(f"{module_name.capitalize()} Length Ratio: {len_ratio:.4f}")
    print(f"{module_name.capitalize()} Bigram Diversity: {diversity:.4f}")
    return results_dict

# Run evaluations
# Each module is generated one example at a time with generate_output; the
# reference for an example is the content of its final message.
results = {}

print("Evaluating Response Module...")
response_hypotheses = [generate_output(ex) for ex in response_examples]
response_references = [ex[-1]["content"] for ex in response_examples]
results = compute_scores(response_hypotheses, response_references, "response", results)

print("\nEvaluating Summary Module...")
summary_hypotheses = [generate_output(ex) for ex in summary_examples]
summary_references = [ex[-1]["content"] for ex in summary_examples]
results = compute_scores(summary_hypotheses, summary_references, "summary", results)

print("\nEvaluating Classify Module...")
classify_hypotheses = [generate_output(ex) for ex in classify_examples]
classify_references = [ex[-1]["content"] for ex in classify_examples]
results = compute_scores(classify_hypotheses, classify_references, "classify", results)

# Overall: the three modules' outputs pooled and scored as one set.
print("\nEvaluating Overall...")
overall_hypotheses = response_hypotheses + summary_hypotheses + classify_hypotheses
overall_references = response_references + summary_references + classify_references
results = compute_scores(overall_hypotheses, overall_references, "overall", results)

# Save results to JSON (explicit UTF-8 so the file does not depend on the
# platform's default encoding)
with open(results_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)
print(f"\nSaved results to {results_file}")

# Plot results
# One group of bars per evaluated module (the keys of `results`).
modules = list(results.keys())
metrics = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERTScore (Custom)", "Perplexity", "Length Ratio", "Bigram Diversity"]

plt.figure(figsize=(14,8))
bar_width = 0.1
x = np.arange(len(modules))

# Each metric is drawn as one bar series, shifted right by its index so the
# bars of a module sit side by side.
# NOTE(review): all metrics share one y-axis, so Perplexity can dwarf the
# 0..1 scores.
for i, metric in enumerate(metrics):
    values = [results[m].get(metric, 0) for m in modules]  # Use 0 if metric not present
    plt.bar(x + i*bar_width, values, width=bar_width, label=metric)

# Centre each module label under its group of bars.
plt.xticks(x + bar_width*(len(metrics)-1)/2, modules)
plt.ylabel("Score")
plt.title("Evaluation Metrics per Module")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(plot_file)
print(f"Saved plot to {plot_file}")