In [3]:
import os
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    EarlyStoppingCallback
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


### Importing Data

In [None]:
DEV_DIR = "data_poleval/dev-0"

# 1. SETUP DIRECTORIES & DATA
os.makedirs(DEV_DIR, exist_ok=True)

input_path = os.path.join(DEV_DIR, 'in.tsv')
expected_path = os.path.join(DEV_DIR, 'expected.tsv')
train_csv_path = os.path.join('train_data.csv')
output_dir = os.path.join('pllama-qa-results')

# Load and process TSV files
with open(input_path, 'r', encoding='utf-8') as f:
    questions = [line.strip() for line in f]
with open(expected_path, 'r', encoding='utf-8') as f:
    answers = [line.strip().split('\t')[0] for line in f]

# Save to CSV for the trainer
pd.DataFrame({'instruction': questions, 'output': answers}).to_csv(train_csv_path, index=False)

EVAL_DIR = "data_poleval/test-A"
eval_input_path = os.path.join(EVAL_DIR, 'in.tsv')
eval_expected_path = os.path.join(EVAL_DIR, 'expected.tsv')
eval_csv_path = os.path.join('eval_data.csv')

# Load and process TSV files
with open(eval_input_path, 'r', encoding='utf-8') as f:
    questions = [line.strip() for line in f]
with open(eval_expected_path, 'r', encoding='utf-8') as f:
    answers = [line.strip().split('\t')[0] for line in f]

# Save to CSV for the trainer
pd.DataFrame({'instruction': questions, 'output': answers}).to_csv(eval_csv_path, index=False)

### 2. Model & Tokenizer

In [5]:
model_id = "radlab/pLLama3.2-1B"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

def formatting_prompts_func(example):
    text = f"<|im_start|>user\n{example['instruction']}<|im_end|>\n<|im_start|>assistant\n{example['output']}<|im_end|>"
    return text

### 3. Training Preparation

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

sft_config = SFTConfig(
    output_dir=output_dir,
    max_length=128,          # Define it here
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=10,
    load_best_model_at_end=True,
    bf16=True,
    report_to="none",
    use_mps_device=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset=load_dataset("csv", data_files=train_csv_path, split="train"),
    eval_dataset=load_dataset("csv", data_files=eval_csv_path, split="train"),
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    processing_class=tokenizer,
    args=sft_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

Generating train split: 1000 examples [00:00, 31606.46 examples/s]
Generating train split: 2500 examples [00:00, 393034.22 examples/s]
W0102 15:47:09.135000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for triton kernels
W0102 15:47:09.137000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for triton kernels
W0102 15:47:09.137000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for triton kernels
W0102 15:47:09.138000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for triton kernels
W0102 15:47:09.139000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for triton kernels
W0102 15:47:09.142000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for triton kernels
W0102 15:47:09.143000 79848 torch/utils/flop_counter.py:45] triton not found; flop counting will not work for trito

### Training

In [None]:
trainer.train()
trainer.save_model(output_dir)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.
  super().__init__(loader)


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
10,2.2825,1.607907,1.809729,4487.0,0.712882
20,1.5746,1.451807,1.655814,8976.0,0.739007
30,1.3979,1.409509,1.541388,13289.0,0.745745
40,1.4249,1.389011,1.501902,17642.0,0.748039
50,1.4013,1.374961,1.538428,22032.0,0.749151
60,1.4526,1.359279,1.507017,26481.0,0.750364
70,1.2822,1.345212,1.468903,30758.0,0.750785
80,1.2831,1.325749,1.477009,35289.0,0.753218
90,1.3552,1.30889,1.412238,39701.0,0.754939
100,1.2613,1.299299,1.392642,43859.0,0.756026


Model Evaluation

In [7]:
del model
torch.mps.empty_cache()

In [10]:
base_model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(base_model, output_dir)

In [11]:
def ask_question(question):
    prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

    device = "mps" if torch.backends.mps.is_available() else "cpu"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=32, eos_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True).strip()

In [12]:
ask_question("W ktÃ³rym mieÅ›cie mieszkaÅ‚ Batman?")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


'w Gotham<|im_end|>'