In [1]:
import polars as pl
import numpy as np
import os, math, string
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import gc
from tqdm import tqdm

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from collections import Counter
import string
import random
import emoji

# === NLTK SETUP ===
# Fetch the NLTK resources this notebook relies on; the download call reports
# "already up-to-date" when a package is present locally.
for _nltk_resource in ("punkt_tab", "wordnet"):
    nltk.download(_nltk_resource)

# Smoothing function handed to sentence-level BLEU.
smooth_fn = SmoothingFunction().method4

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/zorin17/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zorin17/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from huggingface_hub import login

# Read the Hugging Face access token from the HF_TOKEN environment variable so
# the secret never has to be pasted into (and saved with) the notebook.
# When the variable is unset or empty, `token=None` is passed instead of an
# empty string, which lets `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [3]:
# Drop unreachable Python objects first so any tensors they hold are released.
gc.collect()
# Guard the CUDA housekeeping so this cell also runs on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [4]:
# Location of the Bitext customer-support CSV on the Hugging Face Hub.
DATASET_URL = (
    "hf://datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/"
    "Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
)
df = pl.read_csv(DATASET_URL)

In [5]:
# Data cleaning
# Free-text columns: cast to string, lowercase, and turn nulls into "".
_lowercased_text = [
    pl.col(name).cast(str).str.to_lowercase().fill_null("")
    for name in ("instruction", "response")
]
# The intent label keeps its casing; missing values become "unknown".
_intent_filled = pl.col("intent").cast(str).fill_null("unknown")

df = df.with_columns(_lowercased_text + [_intent_filled])

# Remove emoji
def remove_emojis(text: str) -> str:
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji

# Apply to instruction and response in a single pass.
# `return_dtype` is stated explicitly so polars does not have to infer the
# output type of the Python callback (calling map_elements without it emits a
# warning).
df = df.with_columns([
    pl.col(name).map_elements(remove_emojis, return_dtype=pl.String).alias(name)
    for name in ("instruction", "response")
])

# Exclude noisy flags
# Three progressively stricter subsets are built from the 'flags' column:
# rows without Z, rows without Z or W, and rows without Z, W or Q.
flag = ["flags"]
_flags_text = pl.col(flag).cast(str)
_no_z = ~_flags_text.str.contains("Z")
_no_w = ~_flags_text.str.contains("W")
_no_q = ~_flags_text.str.contains("Q")

df_z = df.filter(_no_z)
df_zw = df.filter(_no_z & _no_w)
df_clean = df.filter(_no_z & _no_w & _no_q)

# Report how many rows survive each filter.
for _subset in (df_z, df_zw, df_clean):
    print(_subset.height)

# Number of cleaned rows per category, most frequent first.
# `pl.len()` is the replacement for `pl.count()`, which polars deprecated in
# version 0.20.5.
category_counts = (
    df_clean
    .group_by("category")
    .agg(pl.len().alias("counts"))
    .sort("counts", descending=True)
)

print(category_counts)

# Keep only the rows whose category is one of the selected ones.
selected_categories = ["ORDER", "REFUND", "SHIPPING", "DELIVERY"]
_in_selected = pl.col("category").is_in(selected_categories)
df_selected = df_clean.filter(_in_selected)


  df = df.with_columns([
  df = df.with_columns([


21586
20517
14454
shape: (11, 2)
┌──────────────┬────────┐
│ category     ┆ counts │
│ ---          ┆ ---    │
│ str          ┆ u32    │
╞══════════════╪════════╡
│ ACCOUNT      ┆ 3251   │
│ ORDER        ┆ 2152   │
│ REFUND       ┆ 1527   │
│ SHIPPING     ┆ 1156   │
│ DELIVERY     ┆ 1102   │
│ …            ┆ …      │
│ INVOICE      ┆ 1076   │
│ PAYMENT      ┆ 1028   │
│ FEEDBACK     ┆ 1004   │
│ CANCEL       ┆ 539    │
│ SUBSCRIPTION ┆ 537    │
└──────────────┴────────┘


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("counts"))


In [6]:
# Split dataset by category
# === Configuration ===
LABEL_COL = "category"  # 🔁 Replace with "intent" or any stratification column
SPLIT_RATIO_TRAIN = 0.7
SPLIT_RATIO_VAL = 0.15
SEED = 123
df_final = df_selected.clone()

# The test split is whatever remains after train and validation, so the two
# configured ratios must leave room for it.
assert SPLIT_RATIO_TRAIN > 0 and SPLIT_RATIO_VAL >= 0, "split ratios must not be negative"
assert SPLIT_RATIO_TRAIN + SPLIT_RATIO_VAL < 1, "train + val ratios must leave room for the test split"

# === Stratified split logic ===
# Seeds Python's global RNG; the polars sampling below takes its own `seed`.
random.seed(SEED)
train_parts = []
test_parts = []
val_parts = []

for label in df_final[LABEL_COL].unique().to_list():
    group_df = df_final.filter(pl.col(LABEL_COL) == label)
    # Reproducibly shuffle the whole group before slicing it.
    group_df = group_df.sample(n=len(group_df), shuffle=True, seed=SEED)

    # Slice boundaries: [0, train_idx) -> train, [train_idx, val_idx) -> val,
    # [val_idx, n) -> test.
    n = len(group_df)
    train_idx = int(n * SPLIT_RATIO_TRAIN)
    val_idx = int(n * (SPLIT_RATIO_TRAIN + SPLIT_RATIO_VAL))

    train_parts.append(group_df[:train_idx])
    val_parts.append(group_df[train_idx:val_idx])
    test_parts.append(group_df[val_idx:])

# === Combine all groups
# Sort by the configured label column (not a hard-coded "category") so the
# ordering follows LABEL_COL when it is changed.
sort_keys = [LABEL_COL, "instruction"]
train_df = pl.concat(train_parts).sort(sort_keys)
val_df = pl.concat(val_parts).sort(sort_keys)
test_df = pl.concat(test_parts).sort(sort_keys)

# Every input row must land in exactly one split; rows whose label is null
# would match no group and be lost silently without this check.
assert len(train_df) + len(val_df) + len(test_df) == len(df_final), \
    "split sizes do not add up to the input size"

print("✅ Split sizes:")
print(f"Train: {len(train_df)}")
print(f"Val:   {len(val_df)}")
print(f"Test:  {len(test_df)}")

print("\n📊 Category distribution in test set:")
print(test_df.select([pl.col(LABEL_COL)]).to_series().value_counts())

✅ Split sizes:
Train: 4154
Val:   890
Test:  893

📊 Category distribution in test set:
shape: (4, 2)
┌──────────┬───────┐
│ category ┆ count │
│ ---      ┆ ---   │
│ str      ┆ u32   │
╞══════════╪═══════╡
│ DELIVERY ┆ 166   │
│ ORDER    ┆ 323   │
│ REFUND   ┆ 230   │
│ SHIPPING ┆ 174   │
└──────────┴───────┘


In [7]:
# === Configuration ===
scenario = "zero_shot"

# Short key -> Hugging Face model id. Only uncommented entries are run.
MODELS = {
    "llama":    "meta-llama/Llama-3.2-1B",
    # "qwen":     "Qwen/Qwen3-0.6B-Base",
    # "olmo":     "allenai/OLMo-2-0425-1B",
}

SAVE_EVERY = 20                        # flush buffered predictions every N rows
OFFLOAD_DIR = "./offload"              # passed to from_pretrained as offload_folder
USE_CUDA = torch.cuda.is_available()   # selects fp16 (GPU) vs fp32 (CPU) weights

# === Prompts and reference answers taken from the test split
questions, answers = (
    test_df[column].cast(str).to_list()
    for column in ("instruction", "response")
)

# === Safe append function
def append_to_csv(new_data: "pl.DataFrame", path: str):
    """Append ``new_data`` to the CSV at ``path``, creating the file if needed.

    If ``path`` already exists, its contents are read back, ``new_data`` is
    stacked underneath and the combined frame is written out again; otherwise
    ``new_data`` alone is written. The parent directory is created first so the
    very first write does not fail because the output folder is missing.

    Args:
        new_data: Rows to add. They are combined with the existing file via
            ``vstack``, so the columns are expected to match.
        path: Destination CSV path.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.exists(path):
        existing = pl.read_csv(path)
        combined = existing.vstack(new_data)
    else:
        combined = new_data
    combined.write_csv(path)

def adaptive_max_new_tokens(gold_text: str, tokenizer,
                            min_nt=48, max_nt=200, mult=1.3, bonus=16) -> int:
    """Derive a ``max_new_tokens`` budget from the length of ``gold_text``.

    The text is tokenised without special tokens; the token count is scaled by
    ``mult`` (rounded), ``bonus`` is added as headroom, and the result is
    limited to the range ``[min_nt, max_nt]``.

    Args:
        gold_text: Text whose token length drives the estimate.
        tokenizer: Callable returning an object with an ``input_ids`` sequence.
        min_nt: Lower bound of the returned budget.
        max_nt: Upper bound of the returned budget.
        mult: Multiplier applied to the token count.
        bonus: Constant added after scaling.

    Returns:
        The bounded token budget as an ``int``.
    """
    gold_ids = tokenizer(gold_text, add_special_tokens=False).input_ids
    scaled = round(mult * len(gold_ids))
    budget = int(scaled + bonus)
    # Raise to the floor first, then cap at the ceiling.
    return int(min(max(budget, min_nt), max_nt))

def _flush_results(rows, output_csv):
    """Persist buffered predictions to ``output_csv``.

    ``rows`` is a list of ``(instruction, response, prediction)`` tuples. They
    are inner-joined with ``test_df`` so the saved file carries the original
    columns next to the prediction, then appended via ``append_to_csv``.
    """
    # orient="row" is explicit: without it polars has to guess whether the
    # tuples are rows or columns, which is ambiguous for a 3-row chunk.
    chunk_df = pl.DataFrame(
        rows,
        schema=["instruction", "response", "prediction"],
        orient="row",
    )
    enriched = test_df.join(chunk_df, on=["instruction", "response"], how="inner")
    append_to_csv(enriched, output_csv)


# === Loop through each model
for key, MODEL_NAME in MODELS.items():
    OUTPUT_CSV = f"chatbot_result/{scenario}/scenario_A_zero_shot_results_{key}.csv"

    print(f"\n🚀 Running inference for: {MODEL_NAME}")
    os.makedirs(OFFLOAD_DIR, exist_ok=True)
    # The results folder must exist before the first CSV write.
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

    # CUDA housekeeping only applies when a GPU is present.
    if USE_CUDA:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    # Load tokenizer + model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=True, use_fast=True, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16 if USE_CUDA else torch.float32,
        device_map="auto",
        offload_folder=OFFLOAD_DIR
    )
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # Resume support: instructions already present in the output file are skipped.
    if os.path.exists(OUTPUT_CSV):
        existing = pl.read_csv(OUTPUT_CSV)
        done = set(existing["instruction"].to_list())
        print(f"🔄 Resuming from {len(done)} completed rows...")
    else:
        done = set()

    results = []
    for question, ground_truth in tqdm(zip(questions, answers), total=len(questions), desc=f"{key.upper()} Inference"):
        if question in done:
            continue

        prompt = f"""You are a helpful retail assistant. Answer the following customer query briefly and accurately.\n\nCustomer: {question}\nAnswer:"""

        try:
            max_nt = adaptive_max_new_tokens(ground_truth, tokenizer,
                                            min_nt=48, max_nt=300, mult=1.3, bonus=16)

            output = pipe(prompt, max_new_tokens=max_nt, do_sample=False)[0]["generated_text"]
            if "Answer:" in output:
                # Keep the text after the first "Answer:" and cut it off where
                # the model starts another "Customer:" turn, if it does.
                answer_part = output.split("Answer:")[1]
                predicted = answer_part.split("Customer:")[0].strip()
            else:
                predicted = output.strip()
        except Exception as e:
            # Best effort: report the failing question and move on.
            print(f"⚠️ Error processing question: {question[:60]}... Skipping. Reason: {e}")
            continue

        results.append((question, ground_truth, predicted))

        # Periodically write the buffer so an interrupted run can resume.
        if len(results) >= SAVE_EVERY:
            _flush_results(results, OUTPUT_CSV)
            results = []

    # Write whatever is left in the buffer.
    if results:
        _flush_results(results, OUTPUT_CSV)

    print(f"✅ Done with {key}. Results saved to {OUTPUT_CSV}")

    # Release this model before the next one is loaded so two models are
    # never resident at the same time.
    del pipe, model, tokenizer
    gc.collect()
    if USE_CUDA:
        torch.cuda.empty_cache()



🚀 Running inference for: meta-llama/Llama-3.2-1B


Device set to use cuda:0
LLAMA Inference:   0%|          | 0/893 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
LLAMA Inference:   0%|          | 1/893 [00:00<12:48,  1.16it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
LLAMA Inference:   0%|          | 2/893 [00:05<42:52,  2.89s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
LLAMA Inference:   0%|          | 3/893 [00:09<53:53,  3.63s/it]The following generation flags are not valid and may be ignored: ['temperature'

✅ Done with llama. Results saved to chatbot_result/zero_shot/scenario_A_zero_shot_results_llama.csv





In [3]:
# === Config ===
scenario = "zero_shot"
# Run BERTScore on the GPU when one is available; otherwise fall back to the
# CPU instead of failing on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODELS = {
    "llama":    "meta-llama/Llama-3.2-1B",
    "qwen":     "Qwen/Qwen3-0.6B-Base",
    "olmo":     "allenai/OLMo-2-0425-1B",
}

BERTSCORE_MODEL = "microsoft/deberta-base-mnli"  # fast & solid for FAQ/QA

# Output folder
OUT_DIR = "Evaluation"
os.makedirs(OUT_DIR, exist_ok=True)

# BLEU smoothing (method4 is friendlier for short answers)
smooth_fn = SmoothingFunction().method4
# Cache rouge scorer once
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# === Helpers ===
def _safe_text(x):
    return "" if x is None or (isinstance(x, float) and math.isnan(x)) else str(x)

def normalize(text: str) -> list[str]:
    """Lowercase ``text``, delete ``string.punctuation`` characters and tokenise.

    ``None``/NaN inputs are treated as the empty string via ``_safe_text``.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = _safe_text(text).lower()
    return wordpunct_tokenize(lowered.translate(drop_punctuation))

# def exact_match(pred: str, gold: str) -> int:
#     return int(_safe_text(pred).strip().lower() == _safe_text(gold).strip().lower())

def f1_score_overlap(pred: str, gold: str) -> float:
    """Token-level F1 between ``pred`` and ``gold``.

    Both texts are tokenised with ``normalize``; the overlap is the multiset
    intersection of their tokens. Returns ``0.0`` when either side has no
    tokens or when nothing overlaps.
    """
    pred_tokens, gold_tokens = normalize(pred), normalize(gold)
    if not (pred_tokens and gold_tokens):
        return 0.0

    shared = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(shared.values())
    if not overlap:
        return 0.0

    precision, recall = overlap / len(pred_tokens), overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def bleu_score(pred: str, gold: str) -> float:
    """Smoothed sentence-level BLEU of ``pred`` against the single reference ``gold``.

    Only unigram and bigram precision carry weight (0.5 each), i.e. BLEU-2.
    """
    references = [normalize(gold)]
    hypothesis = normalize(pred)
    unigram_bigram_weights = (0.5, 0.5, 0, 0)
    return sentence_bleu(
        references,
        hypothesis,
        weights=unigram_bigram_weights,
        smoothing_function=smooth_fn,
    )

def rouge_l_score(pred: str, gold: str) -> float:
    """ROUGE-L F-measure of ``pred`` scored against ``gold`` (the target)."""
    scores = rouge.score(_safe_text(gold), _safe_text(pred))
    return scores["rougeL"].fmeasure

def meteor(pred: str, gold: str) -> float:
    """METEOR score of the tokenised ``pred`` against the tokenised ``gold``."""
    reference_tokens = normalize(gold)
    candidate_tokens = normalize(pred)
    return meteor_score([reference_tokens], candidate_tokens)

def clean_text(t: str) -> str:
    """Trim ``t`` and remove leading answer labels and surrounding backticks.

    The labels "answer:", "response:" and "reply:" are checked once each, in
    that order and case-insensitively; a matching label is cut off and the
    remainder re-trimmed. Finally backticks at both ends are stripped.
    """
    cleaned = _safe_text(t).strip()
    for label in ("answer:", "response:", "reply:"):
        if not cleaned.lower().startswith(label):
            continue
        cleaned = cleaned[len(label):].strip()
    without_ticks = cleaned.strip("`")
    return without_ticks.strip()

def compute_metrics(preds_raw, golds_raw):
    """Score predictions against references and return a Metric/Score table.

    Both inputs are cleaned with ``clean_text`` and paired positionally.
    F1-overlap, BLEU, ROUGE-L and METEOR are averaged over all pairs.
    BERTScore-F1 is averaged only over pairs where both texts are non-empty
    (and is 0.0 when no such pair exists). All scores are reported as
    percentages rounded to two decimals.

    Args:
        preds_raw: Iterable of model outputs.
        golds_raw: Iterable of reference answers, aligned with ``preds_raw``.

    Returns:
        A polars DataFrame with the columns "Metric" and "Score".
    """
    preds = [clean_text(p) for p in preds_raw]
    golds = [clean_text(g) for g in golds_raw]
    pairs = list(zip(preds, golds))

    # Lexical metrics, one value per pair.
    f1_list = [f1_score_overlap(p, g) for p, g in pairs]
    bleu_list = [bleu_score(p, g) for p, g in pairs]
    rouge_list = [rouge_l_score(p, g) for p, g in pairs]
    meteor_list = [meteor(p, g) for p, g in pairs]

    # BERTScore on DEVICE; empty pairs are dropped to avoid warnings.
    non_empty = [(p, g) for p, g in pairs if p.strip() and g.strip()]
    if not non_empty:
        bert_f1_avg = 0.0
    else:
        preds_nz = [p for p, _ in non_empty]
        golds_nz = [g for _, g in non_empty]
        _, _, F1 = bertscore(
            preds_nz, golds_nz,
            lang="en",
            model_type=BERTSCORE_MODEL,
            rescale_with_baseline=True,
            device=DEVICE
        )
        bert_f1_avg = float(F1.mean()) * 100.0

    # Avoid dividing by zero when there are no pairs at all.
    n = max(1, len(f1_list))

    def _mean_pct(values):
        return round(sum(values) / n * 100, 2)

    summary = {
        "F1-overlap": _mean_pct(f1_list),
        "BLEU": _mean_pct(bleu_list),
        "ROUGE-L": _mean_pct(rouge_list),
        "METEOR": _mean_pct(meteor_list),
        "BERTScore-F1": round(bert_f1_avg, 2),
    }

    rows = [{"Metric": metric, "Score": score} for metric, score in summary.items()]
    return pl.DataFrame(rows)

# === Run all models and combine outputs ===
all_summaries = []

for key, MODEL_NAME in MODELS.items():
    OUTPUT_CSV = f"chatbot_result/{scenario}/scenario_A_zero_shot_results_{key}.csv"

    print(f"\n🚀 Running Evaluation for: {MODEL_NAME}")

    # A model listed in MODELS may not have been run through inference yet;
    # skip it instead of aborting the whole evaluation on a missing file.
    if not os.path.exists(OUTPUT_CSV):
        print(f"⚠️ No results file found at {OUTPUT_CSV}. Skipping {key}.")
        continue

    result_df = pl.read_csv(OUTPUT_CSV)
    preds = result_df["prediction"].to_list()
    golds = result_df["response"].to_list()

    # Tag each metric row with the model key so the tables can be stacked.
    summary = compute_metrics(preds, golds).with_columns(pl.lit(key).alias("Model"))
    all_summaries.append(summary)

# Combine all models (long)
if all_summaries:
    final_df = pl.concat(all_summaries).select(["Model", "Metric", "Score"])
    final_df.write_csv(os.path.join(OUT_DIR, "Scenario_A_Evaluation_AllModels.csv"))

    # Pivot for side-by-side comparison: one row per metric, one column per
    # model. `on=` is the current name of the deprecated `columns=` argument.
    pivot_df = final_df.pivot(on="Model", index="Metric", values="Score")
    pivot_df.write_csv(os.path.join(OUT_DIR, "Scenario_A_Evaluation_Pivot.csv"))

    print("\n📊 Combined (pivot) comparison:")
    print(pivot_df)
    print("\n✅ Saved under 'Evaluation/'")
    print("- Scenario_A_Evaluation_AllModels.csv")
    print("- Scenario_A_Evaluation_Pivot.csv")
else:
    print("⚠️ No model results were processed.")


🚀 Running Evaluation for: meta-llama/Llama-3.2-1B

🚀 Running Evaluation for: Qwen/Qwen3-0.6B-Base

🚀 Running Evaluation for: allenai/OLMo-2-0425-1B

📊 Combined (pivot) comparison:
shape: (5, 4)
┌──────────────┬───────┬───────┬───────┐
│ Metric       ┆ llama ┆ qwen  ┆ olmo  │
│ ---          ┆ ---   ┆ ---   ┆ ---   │
│ str          ┆ f64   ┆ f64   ┆ f64   │
╞══════════════╪═══════╪═══════╪═══════╡
│ F1-overlap   ┆ 14.01 ┆ 21.6  ┆ 14.04 │
│ BLEU         ┆ 1.7   ┆ 3.36  ┆ 1.15  │
│ ROUGE-L      ┆ 11.85 ┆ 16.39 ┆ 11.39 │
│ METEOR       ┆ 6.92  ┆ 10.77 ┆ 6.52  │
│ BERTScore-F1 ┆ 8.67  ┆ 20.91 ┆ 11.25 │
└──────────────┴───────┴───────┴───────┘

✅ Saved under 'Evaluation/'
- Scenario_A_Evaluation_AllModels.csv
- Scenario_A_Evaluation_Pivot.csv


  pivot_df = final_df.pivot(index="Metric", columns="Model", values="Score")
