In [None]:
!pip install -q "transformers>=4.45.0" "datasets>=2.20.0" "peft>=0.18.0" "accelerate>=1.2.0" bitsandbytes scikit-learn huggingface_hub

In [None]:
# Open the interactive Hugging Face login prompt (rendered as the widget below).
# NOTE(review): presumably required because the default base model used later
# is access-gated on the Hub — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from google.colab import drive
import os

# Mount Google Drive and make sure a cache folder exists on it
# (presumably so downloads survive between Colab sessions).
drive.mount("/content/drive")
cache_dir = "/content/drive/MyDrive/hf_cache"
os.makedirs(cache_dir, exist_ok=True)
# Written to the process environment, so child processes started from this
# kernel inherit it.
# NOTE(review): a later cell reassigns HF_HOME to "/content/hf_cache" —
# confirm which location is actually intended.
os.environ["HF_HOME"] = cache_dir

print(f"Hugging Face cache: {cache_dir}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Hugging Face cache is now set to: /content/drive/MyDrive/hf_cache


In [None]:
%%writefile task2_inference_fix.py
import argparse
import os
import re
import glob
from typing import List, Dict

import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from sklearn.metrics import f1_score
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import PeftModel

# Human-readable description for each evasion label. build_prompt() iterates
# this dict in insertion order to render the "- <label>: <description>" list
# shown to the model, so reordering entries or editing text changes the prompt.
# NOTE(review): keys are presumably the exact `evasion_label` strings used by
# the dataset — confirm before renaming any of them.
EVASION_LABEL_DESCRIPTIONS = {
    "Claims ignorance": "The speaker says they do not know or are not aware.",
    "Clarification": "The speaker clarifies or asks for clarification.",
    "Declining to answer": "The speaker explicitly refuses to answer.",
    "Deflection": "The speaker changes the subject.",
    "Dodging": "The speaker avoids answering without explicitly refusing.",
    "Explicit": "The speaker directly answers.",
    "General": "The speaker answers vaguely.",
    "Implicit": "The speaker implies the answer without stating it clearly.",
    "Partial/half-answer": "The speaker answers only part of the question.",
}

def build_label_mappings(labels: List[str]):
    """Create forward and reverse lookups between label names and integer ids.

    Duplicate labels are collapsed and the distinct names are ordered
    alphabetically, so the id assigned to a name depends only on the set of
    names, not on the order in which they appear in ``labels``.

    Args:
        labels: Label names, possibly with repeats.

    Returns:
        A ``(label2id, id2label)`` tuple of dicts that are inverses of each other.
    """
    ordered_names = list(set(labels))
    ordered_names.sort()
    id2label = dict(enumerate(ordered_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return label2id, id2label

def build_prompt(question, full_question, answer, inference_mode=False):
    """Build the classification prompt for one question/answer pair.

    The prompt consists of a fixed instruction header, one bullet per entry of
    ``EVASION_LABEL_DESCRIPTIONS``, and a question/answer block that ends with
    ``"Evasion label:"`` so the model's continuation is the label itself.

    Args:
        question: Preferred question text; may be ``None`` or blank.
        full_question: Used instead of ``question`` when that is empty after
            stripping; may be ``None``.
        answer: Answer text. ``None`` is treated as an empty answer rather
            than raising ``AttributeError``.
        inference_mode: Accepted for call-site compatibility but not used; the
            same prompt is produced regardless of its value.

    Returns:
        The full prompt as a single string.
    """
    q = (question or "").strip()
    if not q:
        q = (full_question or "").strip()

    # Guard the answer the same way as the question fields: rows with a
    # missing answer should yield a prompt, not a crash.
    answer_text = (answer or "").strip()

    system_instructions = (
        "You are an expert at analyzing political and media interviews.\n"
        "Your task is to classify an answer into one of the following evasion techniques:\n\n"
    )

    label_list_str = "".join(
        f"- {name}: {desc}\n" for name, desc in EVASION_LABEL_DESCRIPTIONS.items()
    )

    user_block = (
        f"Question: {q}\n"
        f"Answer: {answer_text}\n\n"
        "From the list above, which single evasion label best describes this answer?\n"
        "Respond with exactly one label name (no extra words).\n"
        "Evasion label:"
    )
    return system_instructions + label_list_str + "\n" + user_block

def extract_label_robust(text, valid_labels, default="Explicit"):
    """Map free-form generated text onto one of ``valid_labels``.

    Matching is case-insensitive and tried in decreasing order of strictness:

    1. the stripped text equals a label;
    2. the text starts with a label (the longest such label wins, so a label
       that is a prefix of another cannot shadow it);
    3. the text contains a label (the label mentioned *earliest* in the text
       wins, independent of the order of ``valid_labels``).

    Args:
        text: Generated text; ``None`` is treated as empty.
        valid_labels: Iterable of candidate label names.
        default: Label returned when nothing matches. Defaults to
            ``"Explicit"``, the fallback this function has always used.

    Returns:
        The matching label with its original casing, or ``default``.
    """
    haystack = (text or "").strip().lower()
    candidates = [(lab, lab.lower()) for lab in valid_labels]

    # 1) exact match
    for lab, lowered in candidates:
        if lowered == haystack:
            return lab

    # 2) prefix match, preferring the most specific (longest) label
    prefix_hits = [lab for lab, lowered in candidates if haystack.startswith(lowered)]
    if prefix_hits:
        return max(prefix_hits, key=len)

    # 3) substring match, preferring the first label mentioned in the text;
    #    ties on position go to the longer label
    hits = []
    for lab, lowered in candidates:
        pos = haystack.find(lowered)
        if pos != -1:
            hits.append((pos, -len(lab), lab))
    if hits:
        return min(hits, key=lambda hit: hit[:2])[2]

    return default

def _checkpoint_step(path):
    """Return the step number encoded in a ``checkpoint-<step>`` name, or -1 if absent."""
    match = re.search(r"checkpoint-(\d+)$", os.path.basename(os.path.normpath(path)))
    return int(match.group(1)) if match else -1


def _resolve_adapter_path(adapter_dir):
    """Return ``(path, is_checkpoint)`` for the adapter directory to load.

    Prefers the ``checkpoint-*`` sub-directory with the highest step number and
    falls back to ``adapter_dir`` itself when there is none.
    """
    candidates = [
        path
        for path in glob.glob(os.path.join(adapter_dir, "checkpoint-*"))
        if os.path.isdir(path)
    ]
    if not candidates:
        return adapter_dir, False
    # Order by the step in the directory name; modification time is only a
    # tie-breaker because copies/syncs of the folder can rewrite mtimes.
    newest = max(candidates, key=lambda path: (_checkpoint_step(path), os.path.getmtime(path)))
    return newest, True


def main():
    """Score a LoRA adapter on a held-out dev split and write test predictions.

    Steps:
      1. Parse ``--base_model``, ``--adapter_dir`` (required) and ``--output_file``.
      2. Load the ``ailsntua/QEvasion`` dataset and carve a stratified 10% dev
         split out of ``train`` with ``seed=42``.
      3. Load the 4-bit quantised base model and attach the adapter found in
         the newest ``checkpoint-*`` directory (or ``--adapter_dir`` itself).
      4. Generate a label for every dev row and print the macro F1 score.
      5. Generate labels for the test rows and save them as a CSV inside
         ``--adapter_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("--adapter_dir", type=str, required=True)  # Where your training saved
    parser.add_argument("--output_file", type=str, default="task2_predictions_fixed.csv")
    args = parser.parse_args()

    print("Loading Dataset...")
    raw = load_dataset("ailsntua/QEvasion")
    if "test" not in raw:
        # The stand-in rows come from the training split, so say so.
        print("Warning: No test split found, using the first training rows as a dummy test set.")
        test_raw = raw["train"].select(range(min(10, len(raw["train"]))))
    else:
        test_raw = raw["test"]

    train_raw = raw["train"]
    label2id, id2label = build_label_mappings(train_raw["evasion_label"])
    label_ids = [label2id[l] for l in train_raw["evasion_label"]]
    train_raw = train_raw.add_column("evasion_label_id", label_ids)
    # The integer column is class-encoded before being used for stratification.
    train_raw = train_raw.class_encode_column("evasion_label_id")

    # NOTE(review): presumably mirrors the split used at training time so that
    # DEV rows were never seen by the adapter — confirm against the training script.
    splits = train_raw.train_test_split(test_size=0.1, stratify_by_column="evasion_label_id", seed=42)
    dev_ds = splits["test"]

    print(f"Loading Base Model: {args.base_model}...")
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    tokenizer.pad_token = tokenizer.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(
        args.base_model,
        quantization_config=bnb_cfg,
        device_map="auto"
    )

    adapter_path, is_checkpoint = _resolve_adapter_path(args.adapter_dir)
    if is_checkpoint:
        print(f"Found checkpoint: {adapter_path}")
    else:
        print(f"Using root dir: {adapter_path}")

    print("Loading LoRA Adapter...")
    model = PeftModel.from_pretrained(base_model, adapter_path)
    model.eval()

    print("\nStarting FIXED Inference (Direct, No CoT, Repetition Penalty)...")

    # Built once; the candidate label set does not change between rows.
    valid_labels = list(label2id.keys())

    def run_predict(dataset, desc):
        """Generate one label per row; return ``(preds, golds)``.

        ``golds`` only contains entries for rows that carry an
        ``evasion_label`` key.
        """
        preds = []
        golds = []

        # Generation config to stop looping
        gen_config = {
            "max_new_tokens": 10,
            "do_sample": False,
            "repetition_penalty": 1.2,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id
        }

        for i, row in enumerate(dataset):
            q = row.get("question", row.get("interview_question"))
            iq = row.get("interview_question", q)
            # A missing answer becomes an empty string so prompt building cannot crash.
            a = row.get("interview_answer") or ""

            prompt = build_prompt(q, iq, a, inference_mode=False)
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

            with torch.no_grad():
                out = model.generate(**inputs, **gen_config)

            # Decode only the newly generated tokens, not the echoed prompt.
            gen_text = tokenizer.decode(out[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
            pred = extract_label_robust(gen_text, valid_labels)
            preds.append(pred)

            if "evasion_label" in row:
                golds.append(row["evasion_label"])

            if i % 50 == 0:
                print(f"[{desc}] {i}/{len(dataset)} | Gen: {gen_text.strip()} -> Pred: {pred}")

        return preds, golds

    # check Dev Score
    dev_preds, dev_golds = run_predict(dev_ds, "DEV")
    dev_ids = [label2id[g] for g in dev_golds]
    pred_ids = [label2id[p] for p in dev_preds]

    macro_f1 = f1_score(dev_ids, pred_ids, average="macro")
    print(f"\n=== NEW MACRO F1 SCORE: {macro_f1:.4f} ===")

    print("Generating Test Predictions...")
    test_preds, _ = run_predict(test_raw, "TEST")

    # Fall back to positional ids when the split carries no "index" column.
    if "index" in test_raw.column_names:
        row_ids = list(test_raw["index"])
    else:
        row_ids = list(range(len(test_preds)))

    out_path = os.path.join(args.adapter_dir, args.output_file)
    df = pd.DataFrame({"index": row_ids, "evasion_label": test_preds})
    df.to_csv(out_path, index=False)
    print(f"Saved predictions to: {out_path}")

if __name__ == "__main__":
    main()

Writing task2_inference_fix.py


In [1]:
import os

# Import the hub client *before* changing HF_HOME so that, in this kernel, it
# still resolves the token saved by `login()` under the original home directory.
from huggingface_hub import get_token

# Use the VM's local disk for the Hugging Face cache from here on.
os.environ["HF_HOME"] = "/content/hf_cache"

# The inference script runs in a child process (`!python ...`) that looks for
# the saved token under the *new* HF_HOME, where `login()` never wrote it.
# Hand the token over through HF_TOKEN so the model download stays authenticated.
_hf_token = get_token()
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token

In [None]:
# Run the inference script written by the %%writefile cell above. Only
# --adapter_dir is required; --base_model and --output_file keep the defaults
# declared in the script's argument parser.
!python task2_inference_fix.py \
    --adapter_dir "/content/drive/MyDrive/SemEval_Clarity/v2_results"

2025-11-28 04:07:55.522352: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-28 04:07:55.541100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764302875.563577   70659 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764302875.570436   70659 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764302875.587496   70659 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 