In [None]:
# Install required packages and login
!pip install -q transformers datasets peft trl accelerate bitsandbytes scikit-learn tqdm

from huggingface_hub import login
login()

import os
import torch
from torch import bfloat16
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_recall_fscore_support,
    classification_report
)
from tqdm import tqdm
from peft import LoraConfig

# Configuration
# Hyperparameters and paths shared by every later cell of the notebook.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Selects the label set below: "evasion_based_clarity" or "direct_clarity".
experiment = "evasion_based_clarity"
batch_size = 2
num_epochs = 4
lr = 1e-4
output_dir = "./outputs_llama3_lora_offload"
os.makedirs(output_dir, exist_ok=True)

# Label mapping
# `mapping_labels` maps the label strings to integer class ids and
# `label_field` names the dataset column holding the label string.
if experiment == "evasion_based_clarity":
    mapping_labels = {
        "Explicit": 0,
        "Implicit": 1,
        "Dodging": 2,
        "General": 3,
        "Deflection": 4,
        "Partial/half-answer": 5,
        "Declining to answer": 6,
        "Claims ignorance": 7,
        "Clarification": 8,
    }
    label_field = "evasion_label"
elif experiment == "direct_clarity":
    mapping_labels = {
        "Clear Reply": 0,
        "Ambivalent": 1,
        "Clear Non-Reply": 2,
    }
    label_field = "clarity_label"
else:
    # Fail here with a clear message instead of a NameError on
    # `mapping_labels` a few lines further down.
    raise ValueError(
        f"Unknown experiment {experiment!r}; expected "
        "'evasion_based_clarity' or 'direct_clarity'."
    )

# Reverse lookup: integer class id -> label string.
id2label = {v: k for k, v in mapping_labels.items()}
num_labels = len(mapping_labels)
print(f"Using {num_labels} labels for experiment: {experiment}")

# Instruction header
# System prompt used for every example. It is assembled in three parts:
# a fixed preamble, the experiment-specific label definitions, and a fixed
# closing that states the expected output format.
INSTRUCTION_HEADER = """You are an expert annotator for political interview clarity classification.

You are given an interview question and an answer from a politician.
Your task is to classify how the answer addresses the question,
using exactly one of the following labels:

"""

# NOTE: the numbering in the evasion list below is presentation order only; it
# does not match the integer ids in `mapping_labels` (e.g. "General" is item 6
# here but id 3 there).
if experiment == "evasion_based_clarity":
    INSTRUCTION_HEADER += """1. Explicit – The information requested is explicitly stated (in the requested form).
2. Implicit – The information requested is given, but without being explicitly stated (not in the expected form).
3. Dodging – Ignoring the question altogether.
4. Deflection – Starts on topic but shifts the focus and makes a different point than what is asked.
5. Partial/half-answer – Offers only a specific component of the requested information.
6. General – The information provided is too general/lacks the requested specificity.
7. Declining to answer – Acknowledge the question but directly or indirectly refusing to answer at the moment.
8. Claims ignorance – The answerer claims/admits not to know the answer themselves.
9. Clarification – Does not provide the requested information and asks for clarification.
"""
# Any other `experiment` value falls through to the three-way clarity labels.
else:
    INSTRUCTION_HEADER += """1. Clear Reply – A clear, direct answer to the question.
2. Ambivalent – The answer is partially addressing the question or is ambiguous.
3. Clear Non-Reply – The answer does not address the question at all.
"""

# Closing part: asks for the `Label: <LABEL>` output format.
INSTRUCTION_HEADER += """

Read the following interview question and answer segment.
Then output the label in the format: "Label: <LABEL>".

"""

# Prompt building functions
def build_prompt(data):
    """Convert one dataset row into a system/user/assistant chat.

    Args:
        data: Mapping with the keys ``interview_question``,
            ``interview_answer`` and the column named by ``label_field``.

    Returns:
        A list of three ``{"role", "content"}`` dicts. The system turn is
        ``INSTRUCTION_HEADER``, the user turn holds the question and answer
        and ends with ``Label:``, and the assistant turn is the gold label.
    """
    question = data['interview_question']
    answer = data['interview_answer']
    user_turn = (
        f"Interview Question: {question}\n\n"
        f"Full Answer: {answer}\n\n"
        "Label:"
    )
    gold_label = f"{data[label_field]}"

    conversation = []
    for role, content in (
        ("system", INSTRUCTION_HEADER),
        ("user", user_turn),
        ("assistant", gold_label),
    ):
        conversation.append({"role": role, "content": content})
    return conversation

def build_messages_dataset(raw_dataset):
    """Wrap every row's chat in a ``{"messages": ...}`` record.

    Args:
        raw_dataset: Iterable of dataset rows accepted by ``build_prompt``.

    Returns:
        A list with one ``{"messages": [...]}`` dict per input row, in input
        order.
    """
    return [{"messages": build_prompt(example)} for example in raw_dataset]

# Load and split dataset
# Materialise the "train" split as a list of row dicts, then hold out 10% of
# it for evaluation, stratified on the label column.
raw_dataset = load_dataset("ailsntua/QEvasion", split="train")
rows = list(raw_dataset)
labels = [example[label_field] for example in rows]

train_rows, eval_rows = train_test_split(
    rows,
    stratify=labels,
    test_size=0.1,
    random_state=3407,
)

# `dataset` (the plain list of training chats) is reused later to size the
# training schedule.
dataset = build_messages_dataset(train_rows)
train_dataset = Dataset.from_list(dataset)
eval_messages = build_messages_dataset(eval_rows)
eval_dataset = Dataset.from_list(eval_messages)

for split_name, split in (("Train", train_dataset), ("Eval", eval_dataset)):
    print(f"{split_name} set size: {len(split)}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25h

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Using 9 labels for experiment: evasion_based_clarity


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Train set size: 3103
Eval set size: 345


In [None]:
# Setup tokenizer, model, and trainer
# Jinja chat template in the Llama-3 header format
# (<|start_header_id|>role<|end_header_id|> ... <|eot_id|>).
# Unlike non-assistant turns, the content of assistant turns is wrapped in
# {% generation %} ... {% endgeneration %}; presumably this is what
# `assistant_only_loss=True` in the SFTConfig relies on to restrict the loss
# to assistant tokens (see the TRL / transformers documentation to confirm).
LLAMA3_MASKING_TEMPLATE = (
    "{% set loop_messages = messages %}"
    "{% for message in loop_messages %}"
        # The BOS token is emitted only in front of the first message.
        "{% if loop.index0 == 0 %}"
            "{% set start_token = bos_token %}"
        "{% else %}"
            "{% set start_token = '' %}"
        "{% endif %}"

        # Assistant turn: the header sits outside the generation span, while
        # the trimmed content plus <|eot_id|> sits inside it.
        "{% if message['role'] == 'assistant' %}"
            "{{ start_token + '<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n\n' }}"
            "{% generation %}"
            "{{ message['content'] | trim + '<|eot_id|>' }}"
            "{% endgeneration %}"
        # Any other role: header, trimmed content and <|eot_id|> in one piece.
        "{% else %}"
            "{{ start_token + '<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n\n' + message['content'] | trim + '<|eot_id|>' }}"
        "{% endif %}"
    "{% endfor %}"
    # At inference time, open an assistant turn for the model to complete.
    "{% if add_generation_prompt %}"
        "{{ '<|start_header_id|>assistant<|end_header_id|>' + '\n\n' }}"
    "{% endif %}"
)

# Tokenizer: fall back to the EOS token for padding when the checkpoint does
# not define a pad token, then install the masking chat template.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.chat_template = LLAMA3_MASKING_TEMPLATE

# LoRA adapter configuration (attention projections plus `down_proj`).
lora_args = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training schedule. The accumulation factor is named once so the step budget
# below cannot drift out of sync with what the trainer actually uses.
gradient_accumulation_steps = 8
effective_batch_size = batch_size * gradient_accumulation_steps
total_training_examples = len(dataset) * num_epochs
# One optimizer step consumes `effective_batch_size` examples.
max_train_steps = total_training_examples // effective_batch_size
# Warm up over 1/20 (5%) of the optimizer steps.
warmup_train_steps = total_training_examples // (effective_batch_size * 20)

sft_args = SFTConfig(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_steps=max_train_steps,
    warmup_steps=warmup_train_steps,
    learning_rate=lr,
    logging_steps=10,
    optim="adamw_torch",
    weight_decay=0.001,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir=output_dir,
    batch_eval_metrics=True,
    dataset_text_field=None,
    packing=False,
    # Used together with the {% generation %} markers of the chat template.
    assistant_only_loss=True,
    report_to="none",
)

# Base model in bfloat16; `device_map="auto"` lets the library place the
# weights on the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=bfloat16,
    device_map="auto",
)

# The trainer receives the LoRA config via `peft_config` and the tokenizer via
# `processing_class`.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=sft_args,
    peft_config=lora_args,
    processing_class=tokenizer
)

print("Setup complete. Ready to train.")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Tokenizing train dataset:   0%|          | 0/3103 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3103 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/345 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/345 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Setup complete. Ready to train.


In [None]:
# Train the model
# Runs the `trainer` built in the setup cell; progress messages are printed
# before and after the (blocking) call to `trainer.train()`.
print("Starting training...")
trainer.train()
print("Training complete!")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


Starting training...


Step,Training Loss
10,9.7617
20,1.7059
30,0.7608
40,0.7883
50,0.6598
60,0.8386
70,0.6555
80,0.7844
90,0.7544
100,0.5808


Training complete!


In [None]:
# Run inference and calculate metrics
# Guard against running this cell in a fresh kernel: both `model` and
# `tokenizer` must already exist (run previous cells first!)
if not ('model' in globals() and 'tokenizer' in globals()):
    raise NameError("Model or tokenizer not defined. Please run cells 1, 2, and 3 first!")

# Pad on the left so that, in a padded batch, every prompt ends at the same
# position and the generated tokens can be sliced off after the input length.
tokenizer.padding_side = "left"
model.eval()

def format_prompt_for_inference(example):
    """Render the generation prompt for one evaluation example.

    The final message of ``example["messages"]`` (the gold assistant answer)
    is dropped; the remaining messages are rendered with the tokenizer's chat
    template, followed by an open assistant header for the model to complete.

    Args:
        example: Mapping with a ``"messages"`` list of chat-message dicts whose
            last entry is the gold answer.

    Returns:
        The prompt as a plain string (not tokenized).

    Raises:
        ValueError: If nothing is left once the final message is removed.
    """
    input_messages = example["messages"][:-1]
    # An empty prompt would make the model answer without any question;
    # report it explicitly instead of failing on an incidental index error.
    if not input_messages:
        raise ValueError(
            "Example has no prompt messages before its final (gold) message."
        )
    return tokenizer.apply_chat_template(
        input_messages,
        tokenize=False,
        add_generation_prompt=True,
    )

# Gold label string of every evaluation example (content of its last message).
ground_truths = [ex["messages"][-1]["content"] for ex in eval_dataset]

# Run batch inference
eval_batch_size = 8
generated_texts = []

print("Starting Inference...")
torch.cuda.empty_cache()

for i in tqdm(range(0, len(eval_dataset), eval_batch_size)):
    batch_indices = range(i, min(i + eval_batch_size, len(eval_dataset)))
    batch_examples = [eval_dataset[idx] for idx in batch_indices]

    prompts = [format_prompt_for_inference(ex) for ex in batch_examples]

    # The chat template already writes the BOS token into each prompt string,
    # so the tokenizer must not add its own special tokens here; otherwise
    # every prompt would start with two BOS tokens, unlike the training data.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        add_special_tokens=False,
    ).to(model.device)

    # Greedy decoding; 20 new tokens is enough room for a short label.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens: with left padding all prompts in
    # the batch occupy the first `input_len` positions.
    input_len = inputs.input_ids.shape[1]
    decoded_output = tokenizer.batch_decode(outputs[:, input_len:], skip_special_tokens=True)
    generated_texts.extend(decoded_output)

print(f"Generated {len(generated_texts)} predictions")

# Parse predictions and ground truth
def parse_prediction(text, labels=None):
    """Map a generated string to its integer class id.

    Matching is case-insensitive and prefix-based: the text, stripped of
    surrounding whitespace and of an optional leading ``Label:``, must start
    with one of the label names.

    Args:
        text: Raw text generated by the model.
        labels: Optional mapping of label name to class id. Defaults to the
            module-level ``mapping_labels``.

    Returns:
        The class id of the matching label, or ``-1`` when nothing matches.
    """
    if labels is None:
        labels = mapping_labels

    cleaned = text.strip().lower()
    # The system prompt asks for `Label: <LABEL>`, so the model may echo the
    # prefix; drop it before matching.
    prefix = "label:"
    if cleaned.startswith(prefix):
        cleaned = cleaned[len(prefix):].lstrip()

    # Try longer names first so that a label which is a prefix of another
    # label can never shadow it.
    for label_name in sorted(labels, key=len, reverse=True):
        if cleaned.startswith(label_name.lower()):
            return labels[label_name]
    return -1

# Integer class ids for predictions and gold labels; -1 marks "unknown".
y_pred = list(map(parse_prediction, generated_texts))
y_true = [mapping_labels.get(gt, -1) for gt in ground_truths]

# Filter out invalid predictions (-1)
# NOTE(review): examples dropped here do not count as errors in the metrics
# computed afterwards.
valid_indices = [
    idx
    for idx, (pred, gold) in enumerate(zip(y_pred, y_true))
    if pred != -1 and gold != -1
]
y_pred_valid = [y_pred[idx] for idx in valid_indices]
y_true_valid = [y_true[idx] for idx in valid_indices]

print(f"Valid predictions: {len(y_pred_valid)}/{len(y_pred)}")
num_dropped = len(y_pred) - len(y_pred_valid)
if num_dropped > 0:
    print(f"Warning: {num_dropped} predictions could not be parsed")

# Calculate all metrics
accuracy = accuracy_score(y_true_valid, y_pred_valid)

# Macro-averaged metrics
# A single call yields precision, recall and F1 together, all with the same
# `zero_division=0` handling (no undefined-metric warnings for empty classes).
macro_scores = precision_recall_fscore_support(
    y_true_valid, y_pred_valid, average="macro", zero_division=0
)
macro_precision, macro_recall, macro_f1 = macro_scores[:3]

# Per-class metrics
# `labels=` fixes the order of the returned arrays to the order of the class
# ids in `mapping_labels`, including classes that never occur.
precision, recall, f1, support = precision_recall_fscore_support(
    y_true_valid, y_pred_valid, average=None, zero_division=0, labels=list(mapping_labels.values())
)

# Print results
# Console report: overall accuracy, macro averages, a per-class table and
# finally scikit-learn's own classification report.
banner = "=" * 80
print("\n" + banner)
print("EVALUATION METRICS")
print(banner)
print(f"\nOverall Accuracy: {accuracy:.4f}")
print("\nMacro-averaged Metrics:")
for metric_title, metric_value in (
    ("  Precision: ", macro_precision),
    ("  Recall:    ", macro_recall),
    ("  F1-Score:  ", macro_f1),
):
    print(f"{metric_title}{metric_value:.4f}")

print("\nPer-class Metrics:")
print(f"{'Label':<25} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
print("-" * 80)
# Rows are printed in ascending class-id order; the id doubles as the index
# into the per-class metric arrays.
for label_name, label_id in sorted(mapping_labels.items(), key=lambda pair: pair[1]):
    if label_id >= len(precision):
        continue
    print(f"{label_name:<25} {precision[label_id]:<12.4f} {recall[label_id]:<12.4f} {f1[label_id]:<12.4f} {support[label_id]:<10}")

print("\n" + banner)
print("Detailed Classification Report:")
print(banner)
report_text = classification_report(
    y_true_valid, y_pred_valid,
    target_names=list(mapping_labels.keys()),
    labels=list(mapping_labels.values()),
    zero_division=0
)
print(report_text)

# Save the fine-tuned model and results
model_save_dir = os.path.join(output_dir, "new_model")
trainer.save_model(model_save_dir)
print(f"\nModel saved to {output_dir}/new_model")

# Plain-text report: headline metrics first, then every generated answer next
# to its gold label. The encoding is set explicitly because generated text may
# contain non-ASCII characters and the platform default is not always UTF-8.
results_path = os.path.join(output_dir, "evaluation_results.txt")
with open(results_path, "w", encoding="utf-8") as f:
    f.write(f"Model output - {experiment}\n")
    f.write(f"\nOverall Accuracy: {accuracy:.4f}\n")
    f.write(f"Macro Precision: {macro_precision:.4f}\n")
    f.write(f"Macro Recall: {macro_recall:.4f}\n")
    f.write(f"Macro F1-Score: {macro_f1:.4f}\n")
    f.write("\n" + "="*80 + "\n")
    f.write("\nDetailed Results:\n")
    for example_number, text in enumerate(generated_texts, start=1):
        f.write(f"\nExample {example_number}:\n")
        f.write(f"Generated: {text}\n")
        f.write(f"Ground Truth: {ground_truths[example_number - 1]}\n")
        f.write("-" * 80 + "\n")

print(f"Results saved to {output_dir}/evaluation_results.txt")


Starting Inference...


  0%|          | 0/44 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 44/44 [00:54<00:00,  1.23s/it]


Generated 345 predictions
Valid predictions: 345/345

EVALUATION METRICS

Overall Accuracy: 0.4203

Macro-averaged Metrics:
  Precision: 0.4063
  Recall:    0.4096
  F1-Score:  0.4073

Per-class Metrics:
Label                     Precision    Recall       F1-Score     Support   
--------------------------------------------------------------------------------
Explicit                  0.5327       0.5429       0.5377       105       
Implicit                  0.2593       0.2857       0.2718       49        
Dodging                   0.4026       0.4366       0.4189       71        
General                   0.2857       0.2564       0.2703       39        
Deflection                0.3158       0.3158       0.3158       38        
Partial/half-answer       0.0000       0.0000       0.0000       8         
Declining to answer       0.6154       0.5714       0.5926       14        
Claims ignorance          0.5455       0.5000       0.5217       12        
Clarification             0.700