In [None]:
!pip install -q "transformers>=4.45.0" "datasets>=2.20.0" "peft>=0.18.0" "accelerate>=1.2.0" bitsandbytes scikit-learn huggingface_hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os

# huggingface_hub resolves HF_HOME (and the token location beneath it) when it is
# first imported, so the variable has to be set *before* the import. Otherwise
# login() stores the token under the default home while subprocesses started later
# with `!python` inherit the new HF_HOME and cannot find it.
os.environ["HF_HOME"] = "/content/hf_cache"

from huggingface_hub import login

# Opens the interactive token prompt; required for gated checkpoints.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
# Point the HF_HOME environment variable at /content/hf_cache for this process.
# NOTE(review): presumably meant to relocate the Hugging Face cache; confirm it
# takes effect for libraries that were already imported before this cell runs.
os.environ["HF_HOME"] = "/content/hf_cache"

In [None]:
%%writefile task2_llama_lora_v3.py
import argparse
import os
import random
from collections import Counter
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict, concatenate_datasets
from sklearn.metrics import f1_score

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    set_seed,
)

from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)


# Label name -> one-sentence description.
# build_prompt() iterates this dict and renders one "- name: description" line per
# entry, so the insertion order here is the order the labels appear in the prompt.
EVASION_LABEL_DESCRIPTIONS = {
    "Claims ignorance": "The speaker says they do not know or are not aware.",
    "Clarification": "The speaker clarifies or asks for clarification.",
    "Declining to answer": "The speaker explicitly refuses to answer.",
    "Deflection": "The speaker changes the subject.",
    "Dodging": "The speaker avoids answering without explicitly refusing.",
    "Explicit": "The speaker directly answers.",
    "General": "The speaker answers vaguely.",
    "Implicit": "The speaker implies the answer without stating it clearly.",
    "Partial/half-answer": "The speaker answers only part of the question.",
}

def build_label_mappings(labels: List[str]) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Build forward and reverse lookups between label strings and integer ids.

    Ids are assigned by the sorted order of the distinct labels, so the result
    does not depend on the order (or multiplicity) of the input.

    Args:
        labels: Label strings, duplicates allowed.

    Returns:
        A ``(label2id, id2label)`` pair of dictionaries that are inverses of
        each other.
    """
    uniq = sorted(set(labels))
    label2id = {lab: i for i, lab in enumerate(uniq)}
    id2label = dict(enumerate(uniq))
    return label2id, id2label


def balance_dataset(ds, label_column="evasion_label", target_ratio=0.3, seed=42):
    """Oversample minority classes so each reaches a floor relative to the largest class.

    Every class with fewer than ``int(max_count * target_ratio)`` rows is topped
    up to exactly that many rows by appending whole copies of its rows plus, for
    the remainder, a seeded random subset of them. Classes already at or above
    the floor are left untouched.

    Args:
        ds: Dataset to balance; must support ``ds[label_column]``, ``filter``,
            ``shuffle`` and ``select``.
        label_column: Name of the column holding the class label.
        target_ratio: Floor for each class, as a fraction of the largest class.
        seed: Seed for the remainder sampling and the final shuffle.

    Returns:
        The shuffled, oversampled dataset (or ``ds`` itself when it is empty).
    """
    labels = ds[label_column]
    counts = Counter(labels)
    if not counts:
        # max() below would raise on an empty dataset; there is nothing to balance.
        return ds

    max_count = max(counts.values())

    # Target count: We don't want to explode the dataset size,
    # so every class is only raised to a fraction of the max count.
    target_count = int(max_count * target_ratio)

    shards_to_add = [ds]

    print(f"Original Counts: {dict(counts)}")

    for label, count in counts.items():
        if count >= target_count:
            continue

        shortfall = target_count - count
        # filter rows for this label (evaluated eagerly, so the closure is safe)
        subset = ds.filter(lambda x: x[label_column] == label)

        # Whole copies cover most of the shortfall; the remainder is drawn as a
        # random partial copy. Flooring the division alone would leave classes
        # short of the target, or untouched when shortfall < count.
        full_copies, remainder = divmod(shortfall, count)
        shards_to_add.extend([subset] * full_copies)
        if remainder:
            shards_to_add.append(subset.shuffle(seed=seed).select(range(remainder)))

    balanced_ds = concatenate_datasets(shards_to_add)
    balanced_ds = balanced_ds.shuffle(seed=seed)

    print(f"Balanced Dataset Size: {len(balanced_ds)} (Original: {len(ds)})")
    return balanced_ds


def build_prompt(question: str, full_question: str, answer: str) -> str:
    """Render the classification prompt for one question/answer pair.

    The prompt consists of a fixed instruction header, one bullet per entry of
    EVASION_LABEL_DESCRIPTIONS, the question and answer, and ends with
    ``"Evasion label:"`` so that the label is the natural continuation.

    Args:
        question: The specific question; used when non-blank.
        full_question: Fallback used when ``question`` is empty or None.
        answer: The interviewee's answer; None is treated as an empty string.

    Returns:
        The full prompt text, without a trailing space or label.
    """
    q = (question or "").strip() or (full_question or "").strip()
    # Guard against a missing answer the same way the question fields are guarded.
    ans = (answer or "").strip()

    system_instructions = (
        "You are an expert at analyzing political and media interviews.\n"
        "Your task is to classify an answer into one of the following evasion techniques:\n\n"
    )

    label_list_str = "".join(
        f"- {name}: {desc}\n" for name, desc in EVASION_LABEL_DESCRIPTIONS.items()
    )

    user_block = (
        f"Question: {q}\n"
        f"Answer: {ans}\n\n"
        "From the list above, which single evasion label best describes this answer?\n"
        "Respond with exactly one label name.\n"
        "Evasion label:"
    )

    return system_instructions + label_list_str + "\n" + user_block

def extract_label_robust(text: str, valid_labels: List[str]) -> str:
    """Map free-form generated text onto one of ``valid_labels``.

    Matching is case-insensitive and tried in three passes:

    1. the stripped text equals a label;
    2. the text starts with a label (the longest such label wins);
    3. the text mentions a label anywhere (the earliest mention wins, and the
       longest label wins when two start at the same position).

    Args:
        text: Raw model output; None is treated as an empty string.
        valid_labels: Candidate label names.

    Returns:
        The matched label as spelled in ``valid_labels``, or ``"Explicit"``
        when nothing matches.
    """
    lowered = (text or "").strip().lower()

    # exact Match
    for lab in valid_labels:
        if lab.lower() == lowered:
            return lab

    # Longest-first so a label that is a prefix of another cannot shadow it;
    # empty labels are skipped because they would match every text.
    by_length = sorted((lab for lab in valid_labels if lab), key=len, reverse=True)

    # starts With
    for lab in by_length:
        if lowered.startswith(lab.lower()):
            return lab

    # contains: pick the label mentioned first in the text rather than the one
    # that happens to come first in the label list.
    best_label = None
    best_pos = None
    for lab in by_length:
        pos = lowered.find(lab.lower())
        if pos != -1 and (best_pos is None or pos < best_pos):
            best_label, best_pos = lab, pos
    if best_label is not None:
        return best_label

    return "Explicit"

def prepare_splits(seed: int = 42):
    """Load QEvasion and return train/dev/test splits plus the label mappings.

    A stratified 10% of the original train split is held out as ``dev``; the
    remaining train rows are passed through ``balance_dataset``. Label ids are
    derived from the labels of the full original train split.

    Args:
        seed: Seed for the stratified train/dev split.

    Returns:
        ``(DatasetDict with "train", "dev", "test", label2id, id2label)``.
    """
    raw = load_dataset("ailsntua/QEvasion")

    # Use the published test split when present, else the first 10 train rows.
    if "test" in raw:
        test_raw = raw["test"]
    else:
        test_raw = raw["train"].select(range(10))

    full_train = raw["train"]
    label2id, id2label = build_label_mappings(full_train["evasion_label"])

    # Stratified Split: a temporary ClassLabel column drives the stratification
    # and is dropped again from both halves afterwards.
    helper_col = "evasion_label_id"
    encoded = full_train.add_column(
        helper_col, [label2id[name] for name in full_train["evasion_label"]]
    )
    encoded = encoded.class_encode_column(helper_col)

    parts = encoded.train_test_split(
        test_size=0.1, stratify_by_column=helper_col, seed=seed
    )
    dev_ds = parts["test"].remove_columns([helper_col])
    train_ds = parts["train"].remove_columns([helper_col])

    print("Balancing training data...")
    train_ds = balance_dataset(train_ds)

    return DatasetDict({"train": train_ds, "dev": dev_ds, "test": test_raw}), label2id, id2label

def tokenize_for_causal_lm(ds: DatasetDict, tokenizer, label2id, max_length):
    """Tokenize the train split as ``prompt + " " + label`` with the prompt masked out.

    Each training text is padded/truncated to ``max_length``. The ``labels``
    column is -100 everywhere except on the tokens that follow the prompt, so
    only the label tokens contribute to the loss. Only ``ds["train"]`` is
    mapped; the other splits are returned untouched.

    Args:
        ds: Dataset dictionary containing a ``"train"`` split with the columns
            ``question``, ``interview_question``, ``interview_answer`` and
            ``evasion_label``.
        tokenizer: Tokenizer used for both the full text and the prompt-length probe.
        label2id: Mapping whose keys are the valid label names.
        max_length: Fixed sequence length after padding/truncation.

    Returns:
        ``ds`` with its ``"train"`` split replaced by the tokenized version.
    """
    valid_labels = set(label2id)

    def _tokenize_train(examples):
        prompts = []
        texts = []
        for q, iq, a, lab in zip(
            examples["question"],
            examples["interview_question"],
            examples["interview_answer"],
            examples["evasion_label"],
        ):
            prompt = build_prompt(q, iq, a)
            target = lab.strip()
            if target not in valid_labels:
                target = "Explicit"
            prompts.append(prompt)
            texts.append(prompt + " " + target)

        enc = tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,
        )

        bos_id = tokenizer.bos_token_id
        final_labels = []
        for ids, mask, prompt in zip(enc["input_ids"], enc["attention_mask"], prompts):
            row_labels = [-100] * len(ids)

            # Locate the real (non-padding) tokens via the attention mask; comparing
            # against pad_token_id is ambiguous when the pad token is also EOS.
            n_real = sum(mask)
            if n_real == 0:
                final_labels.append(row_labels)
                continue
            first = mask.index(1)
            end = first + n_real

            prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]

            # The full text is encoded with special tokens, the prompt probe
            # without them: account for a prepended BOS so the last prompt token
            # is not supervised.
            lead = 0
            if bos_id is not None and ids[first] == bos_id and (
                not prompt_ids or prompt_ids[0] != bos_id
            ):
                lead = 1

            # NOTE(review): when truncation has cut off the label entirely, the
            # clamp below supervises the last surviving token so the row is never
            # fully masked. That token belongs to the prompt; consider shortening
            # the answer instead so the label always survives.
            start = min(first + lead + len(prompt_ids), end - 1)
            row_labels[start:end] = ids[start:end]
            final_labels.append(row_labels)

        enc["labels"] = final_labels
        return enc

    ds["train"] = ds["train"].map(_tokenize_train, batched=True)
    return ds


def main():
    """Command-line entry point: fine-tune with LoRA, score the dev split, predict the test split.

    Steps, in order: parse arguments, build the splits, load the 4-bit
    quantized base model and wrap it with LoRA adapters, tokenize the train
    split, train, generate a label for every dev row and print macro F1, then
    generate labels for the test rows and write them to
    ``<output_dir>/predictions_v3.csv``.
    """
    import inspect  # local import: only used to probe the installed Trainer signature

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("--output_dir", type=str, default="./clarity_task2_llama_lora_v3")
    parser.add_argument("--max_length", type=int, default=512)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=16)
    parser.add_argument("--num_train_epochs", type=float, default=3.0)
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--bf16", action="store_true")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args.seed)

    print("Loading Data & Balancing...")
    ds, label2id, id2label = prepare_splits(args.seed)

    print("Loading Model...")
    compute_dtype = torch.bfloat16 if args.bf16 else torch.float16
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    # Reuse EOS as the padding token so max_length padding has a token to pad with.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(args.model_name, quantization_config=bnb_cfg, device_map="auto")
    model = prepare_model_for_kbit_training(model)

    # increased Dropout to 0.1 to fight overfitting
    lora_cfg = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.1,
        target_modules="all-linear",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_cfg)

    print("Tokenizing...")
    ds = tokenize_for_causal_lm(ds, tokenizer, label2id, args.max_length)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        num_train_epochs=args.num_train_epochs,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        fp16=not args.bf16,
        bf16=args.bf16,
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=1,
        report_to="none",
        neftune_noise_alpha=5,
    )

    # Newer transformers releases accept the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
    # Pick whichever the installed Trainer exposes.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        **{tokenizer_kwarg: tokenizer},
    )

    print("Training...")
    trainer.train()
    print("Running Inference...")
    model.eval()

    # NOTE(review): repetition_penalty also applies to tokens that occur in the
    # prompt, which lists every label name; confirm it does not hurt accuracy.
    gen_config = {
        "max_new_tokens": 10,
        "do_sample": False,
        "repetition_penalty": 1.2, # prevents looping error
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id
    }

    def run_inference(dataset_split, desc="dev"):
        """Generate one label per row; return (predictions, gold labels found)."""
        preds = []
        golds = []
        for i in range(len(dataset_split)):
            row = dataset_split[i]
            q = row.get("question", row.get("interview_question"))
            iq = row.get("interview_question", q)
            a = row.get("interview_answer")
            prompt = build_prompt(q, iq, a)

            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            with torch.no_grad():
                out = model.generate(**inputs, **gen_config)

            # Decode only the newly generated tokens, not the echoed prompt.
            gen_text = tokenizer.decode(out[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
            pred_label = extract_label_robust(gen_text, list(label2id.keys()))
            preds.append(pred_label)

            if "evasion_label" in row: golds.append(row["evasion_label"])
            if i % 50 == 0:
                print(f"[{desc}] {i}/{len(dataset_split)} | Gen: {gen_text.strip()} -> Pred: {pred_label}")

        return preds, golds

    # Dev Eval
    dev_preds, dev_golds = run_inference(ds["dev"], "dev")
    dev_ids = [label2id[g] for g in dev_golds]
    pred_ids = [label2id[p] for p in dev_preds]

    print("\n=== FINAL MACRO F1: ===")
    print(f1_score(dev_ids, pred_ids, average="macro"))

    # Test Eval
    print("Predicting Test Set...")
    test_split = ds["test"]
    test_preds, _ = run_inference(test_split, "test")

    # Fall back to positional ids when the split has no "index" column, so a
    # finished training run is not lost to a KeyError at the very end.
    if "index" in test_split.column_names:
        test_index = list(test_split["index"])
    else:
        test_index = list(range(len(test_split)))

    df = pd.DataFrame({"index": test_index, "evasion_label": test_preds})
    df.to_csv(os.path.join(args.output_dir, "predictions_v3.csv"), index=False)
    print("Done!")

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Writing task2_llama_lora_v3.py


In [None]:
!python task2_llama_lora_v3.py \
    --output_dir "./v3_results_local" \
    --num_train_epochs 3 \
    --batch_size 1 \
    --gradient_accumulation_steps 16 \
    --learning_rate 2e-4 \
    --bf16

import shutil
destination = "/content/drive/MyDrive/SemEval_Clarity/v3_predictions.csv"
shutil.copy("./v3_results_local/predictions_v3.csv", destination)
print(f"Success! Predictions saved to {destination}")

2025-11-28 04:42:38.014912: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-28 04:42:38.032056: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764304958.053444    7429 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764304958.059999    7429 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764304958.076485    7429 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/SemEval_Clarity/v3_predictions.csv'