In [1]:
!pip install -q "transformers>=4.45.0" "datasets>=2.20.0" "peft>=0.18.0" "accelerate>=1.2.0" bitsandbytes scikit-learn huggingface_hub

In [10]:
!pip install -q -U torch transformers peft accelerate datasets scikit-learn pandas

import os
import torch
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import f1_score
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed
)
from google.colab import files


os.environ["HF_HOME"] = "/content/hf_cache"
set_seed(42)


MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
OUTPUT_DIR = "./task1_llama_lora_16bit"


dataset = load_dataset("ailsntua/QEvasion")

full_train = dataset["train"]
official_test = dataset["test"]
full_train = full_train.class_encode_column("clarity_label")


splits = full_train.train_test_split(test_size=0.1, seed=42, stratify_by_column="clarity_label")
train_ds = splits["train"]
dev_ds = splits["test"]

def format_prompt(example):
    q = example['question'] or example['interview_question']
    a = example['interview_answer']
    label_int = example['clarity_label']
    features = full_train.features['clarity_label']
    label_str = features.int2str(label_int)

    text = (f"Question: {q}\nAnswer: {a}\n\n"
            "Classify the clarity of this answer. Options:\n"
            "- Clear Reply\n- Ambivalent\n- Clear Non-Reply\n\n"
            f"Label: {label_str}")
    return {"text": text, "label": label_str}

train_ds = train_ds.map(format_prompt)
dev_ds = dev_ds.map(format_prompt)

test_ds = official_test.map(lambda x: {
    "text": f"Question: {x['question']}\nAnswer: {x['interview_answer']}\n\n"
            "Classify the clarity of this answer. Options:\n"
            "- Clear Reply\n- Ambivalent\n- Clear Non-Reply\n\nLabel:",
    "label": x["clarity_label"]
})


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

def tokenize_fn(examples):
    outputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    outputs["labels"] = outputs["input_ids"]
    return outputs

cols_to_remove = train_ds.column_names
tokenized_train = train_ds.map(tokenize_fn, batched=True, remove_columns=cols_to_remove)
tokenized_dev = dev_ds.map(tokenize_fn, batched=True, remove_columns=cols_to_remove)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=10,
    logging_steps=10,
    fp16=False,
    bf16=True,
    eval_strategy="epoch",        
    save_strategy="epoch",     
    load_best_model_at_end=True, 
    metric_for_best_model="eval_loss", 
    greater_is_better=False,     
    save_total_limit=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    args=training_args,
    processing_class=tokenizer,
)


trainer.train()


model.eval()

predictions = []
gen_config = {
    "max_new_tokens": 10,
    "do_sample": False,
    "repetition_penalty": 1.2,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id
}

def clean_prediction(text):
    text = text.lower()
    if "non-reply" in text: return "Clear Non-Reply"
    if "ambivalent" in text or "ambiguous" in text: return "Ambivalent"
    if "clear reply" in text: return "Clear Reply"
    return "Clear Reply"

for i, row in enumerate(test_ds):
    inputs = tokenizer(row["text"], return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, **gen_config)

    gen_text = tokenizer.decode(out[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
    pred = clean_prediction(gen_text)
    predictions.append(pred)

    if i % 50 == 0:
        print(f"Test {i}: {gen_text} Pred: {pred}")

from sklearn.metrics import f1_score
label_map = {"Clear Reply": 0, "Ambivalent": 1, "Clear Non-Reply": 2}

y_true = []
y_pred = []

for t, p in zip(test_ds["label"], predictions):
    if t in label_map and p in label_map:
        y_true.append(label_map[t])
        y_pred.append(label_map[p])

final_f1 = f1_score(y_true, y_pred, average="macro")
print(f"Task 1 Macro F1: {final_f1:.4f}")

df = pd.DataFrame({"index": test_ds["index"], "clarity_label": predictions})
df.to_csv("llama_task1_predictions_lora.csv", index=False)
files.download("llama_task1_predictions_lora.csv")

Map:   0%|          | 0/3103 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196


Map:   0%|          | 0/3103 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


Epoch,Training Loss,Validation Loss
1,1.2159,1.156969
2,0.9037,1.001858
3,0.6722,0.872299
4,0.4336,0.784149
5,0.3015,0.729516
6,0.1844,0.71143
7,0.1388,0.734914
8,0.0942,0.776741
9,0.0707,0.858176
10,0.0514,0.922108


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Test 0: Ambivalent Pred: Ambivalent
Test 50: Ambivalent Pred: Ambivalent
Test 100: Ambivalent Pred: Ambivalent
Test 150: Clear Reply Pred: Clear Reply
Test 200: Ambivalent Pred: Ambivalent
Test 250: Ambivalent Pred: Ambivalent
Test 300: Ambivalent Pred: Ambivalent
Task 1 Macro F1: 0.6066


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>