In [1]:
import os
os.environ["HF_HOME"] = "pfss/mlde/workspaces/mlde_wsp_AISR_Dianovi/MaxU/Thesis/hf-cache"

from typing import Any
from trl import SFTConfig, SFTTrainer
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
torch.__version__

'2.7.1+cu126'

In [3]:
torch.set_default_dtype(torch.bfloat16)

In [4]:
for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")

Device 0: NVIDIA A100-SXM4-80GB
Device 1: NVIDIA A100-SXM4-80GB


In [5]:
model_name = 'Qwen/Qwen3-1.7B'
output_dir = './model_checkpoints'

In [6]:
def count_conversation_tokens(conversations: list[str], used_tokenizer: Any) -> int:
    tokens = used_tokenizer(
        conversations,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    return sum(len(ids) for ids in tokens["input_ids"])

In [7]:
num_epochs:int = 3
batch_size:int = 1
lr:float = 1e-5
seed:int = 1
num_max_tokens:int = 8192

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="balanced"
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
tokenizer= AutoTokenizer.from_pretrained(model_name)

In [10]:
pretrain_data = pd.read_feather('data/all_german_data_combined_completion_format.feather').rename(columns={'conversation':'text'})
data = Dataset.from_pandas(pretrain_data)

In [11]:
#n_tokens = count_conversation_tokens(pretrain_data['text'].tolist(), tokenizer)
#n_tokens

In [12]:
training_args = SFTConfig(
    neftune_noise_alpha=5, # improves performance according to https://huggingface.co/papers/2310.05914
    output_dir=output_dir,
    save_strategy='epoch',
    bf16=True,
    per_device_train_batch_size=batch_size,
    seed=seed,
    data_seed=seed,
    dataloader_pin_memory=False,
    assistant_only_loss=True, # only penalize deviations from the models answers
    max_length=num_max_tokens,
    logging_steps=1000
)

In [13]:
trainer = SFTTrainer(
    model,
    train_dataset=data,
    args=training_args,
)


Tokenizing train dataset:   0%|          | 0/141324 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/141324 [00:00<?, ? examples/s]



[2025-07-18 09:37:00,530] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


In [14]:
trainer.train()

Step,Training Loss
1000,0.4266
2000,0.3027
3000,0.3182
4000,0.3172
5000,0.3372
6000,0.3084
7000,0.3188
8000,0.3191
9000,0.3341
10000,0.3307


TrainOutput(global_step=423972, training_loss=0.2941263957577553, metrics={'train_runtime': 99707.3983, 'train_samples_per_second': 4.252, 'train_steps_per_second': 4.252, 'total_flos': 5.161252379596554e+18, 'train_loss': 0.2941263957577553})