## 2. Importe

In [1]:
import os
import re
import random
import json
from pathlib import Path
from datasets import Dataset

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding,
)
from peft import get_peft_config, get_peft_model, LoraConfig

import numpy as np
import sklearn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns




## 3 paths

In [2]:
# Deine lokalen Windows-Pfade
TRAIN_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\train")
DEV_DIR   = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\dev")
TEST_DIR  = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\test")

assert TRAIN_DIR.exists(), f"Train-Ordner nicht gefunden: {TRAIN_DIR}"
assert DEV_DIR.exists(),   f"Dev-Ordner nicht gefunden:   {DEV_DIR}"
assert TEST_DIR.exists(),  f"Test-Ordner nicht gefunden:  {TEST_DIR}"


## 4 load data

In [3]:
def load_docie_docs(folder: Path, recursive: bool = False):
    docs = []
    pattern = "**/*.json" if recursive else "*.json"
    for file in folder.glob(pattern):
        data = json.loads(file.read_text(encoding="utf-8"))
        if isinstance(data, list):
            docs.extend(data)
        else:
            docs.append(data)
    return docs

train_docs = load_docie_docs(TRAIN_DIR)
dev_docs   = load_docie_docs(DEV_DIR)
test_docs  = load_docie_docs(TEST_DIR, recursive=True)

print("Train:", len(train_docs), "│ Dev:", len(dev_docs), "│ Test:", len(test_docs))


Train: 51 │ Dev: 23 │ Test: 248


In [55]:
# -> new cell right after you do test_docs = load_docie_docs(...)
print(test_docs[0].keys())


dict_keys(['domain', 'document', 'RE_label_set', 'NER_label_set', 'id'])


## 5. Exploratory Data Analysis (EDA)

## 5.1 doc length

In [4]:
lengths = [len(doc["doc"].split()) for doc in train_docs]
print("Avg Tokens:", np.mean(lengths), "Max Tokens:", np.max(lengths))


Avg Tokens: 919.0784313725491 Max Tokens: 2560


## 5.2 Entity split

In [5]:
from collections import Counter
ctr = Counter(ent["type"] for doc in train_docs for ent in doc["entities"])
print("Entity-Typen:", ctr.most_common())


Entity-Typen: [('DATE', 647), ('MISC', 417), ('PERSON', 242), ('ORG', 241), ('CARDINAL', 224), ('GPE', 157), ('WORK_OF_ART', 65), ('NORP', 59), ('ORDINAL', 55), ('QUANTITY', 42), ('EVENT', 35), ('PRODUCT', 30), ('FAC', 30), ('MONEY', 29), ('PERCENT', 28), ('LOC', 24), ('LANGUAGE', 10), ('LAW', 9), ('TIME', 8)]


## 5.3 Relation split

In [6]:
ctr_rel = Counter(t["relation"] for doc in train_docs for t in doc["triples"])
print("Relation-Typen:", ctr_rel.most_common())

Relation-Typen: [('HasPart', 82), ('HasEffect', 67), ('DiplomaticRelation', 45), ('LocatedIn', 44), ('InterestedIn', 38), ('OwnerOf', 32), ('NominatedFor', 25), ('SaidToBeTheSameAs', 25), ('PartOf', 18), ('Creator', 17), ('Founded', 13), ('Country', 13), ('DifferentFrom', 11), ('SignificantEvent', 11), ('PrimeFactor', 11), ('InfluencedBy', 10), ('Follows', 10), ('UsedBy', 9), ('InspiredBy', 9), ('Uses', 8), ('FollowedBy', 8), ('SharesBorderWith', 8), ('AdjacentStation', 7), ('HasWorksInTheCollection', 6), ('PositionHeld', 6), ('OfficialLanguage', 5), ('Studies', 4), ('WorkLocation', 4), ('PracticedBy', 4), ('AcademicDegree', 3), ('Author', 3), ('CountryOfCitizenship', 3), ('EducatedAt', 3), ('LanguageUsed', 3), ('IssuedBy', 3), ('Affiliation', 2), ('MemberOf', 2), ('ApprovedBy', 2), ('Continent', 2), ('OwnedBy', 2), ('Location', 2), ('LanguageOfWorkOrName', 2), ('NativeLanguage', 2), ('OriginalLanguageOfFilmOrTvShow', 2), ('Employer', 2), ('AppliesToPeople', 1), ('HasQuality', 1), ('Pr

## 6. Label-Mapping

In [7]:
entity_types = train_docs[0]["entity_label_set"]

ner_labels = ["O"]
for t in entity_types:
    ner_labels += [f"B-{t}", f"I-{t}"]

label2id = {lab: i for i, lab in enumerate(ner_labels)}
id2label = {i: lab for lab, i in label2id.items()}

print("Anzahl NER-Labels:", len(ner_labels))
print("label2id['O'] =", label2id["O"])


Anzahl NER-Labels: 39
label2id['O'] = 0


## 7. load tokenizer

In [8]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

## 8. HF-Datasets

In [9]:
hf_train = Dataset.from_list(train_docs)
hf_dev   = Dataset.from_list(dev_docs)

## 9. encode_with_spans

In [12]:
max_length = 512
stride     = 128

# 9.2 Funktion für batched map
def tokenize_and_align_labels(examples):
    all_input_ids      = []
    all_attention_mask = []
    all_labels         = []

    for doc, entities in zip(examples["doc"], examples["entities"]):
        # Tokenize mit Overflow & Stride
        tokenized = tokenizer(
            doc,
            return_offsets_mapping=True,
            truncation=True,
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
        )

        # Pro Chunk ein neues Beispiel erzeugen
        for i in range(len(tokenized["input_ids"])):
            offsets       = tokenized["offset_mapping"][i]
            input_ids     = tokenized["input_ids"][i]
            attention_mask= tokenized["attention_mask"][i]

            # 9.2.1 O-Labels initialisieren
            chunk_labels = ["O"] * len(offsets)

            # 9.2.2 Mentions labeln
            for ent in entities:
                ent_type = ent["type"]
                for mention in ent["mentions"]:
                    start = doc.find(mention)
                    if start < 0: 
                        continue
                    end = start + len(mention)
                    for idx, (o_start, o_end) in enumerate(offsets):
                        if o_start >= start and o_end <= end:
                            prefix = "B" if o_start == start else "I"
                            chunk_labels[idx] = f"{prefix}-{ent_type}"

            # 9.2.3 Labels → IDs
            label_ids = [ label2id.get(lab, label2id["O"]) for lab in chunk_labels ]

            # 9.2.4 Ansammln
            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_labels.append(label_ids)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
        "labels": all_labels,
    }


## 10. tokenization & Label-Alignment

In [14]:
# 10.1 Original-Spalten, die wir nicht mehr brauchen
cols_to_remove = [
    "domain","title","doc","entities","triples",
    "label_set","entity_label_set"
]

# 10.2 Batched map mit Flattening
hf_train = hf_train.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=cols_to_remove,
)

hf_dev = hf_dev.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=cols_to_remove,
)


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

## 11. DataCollator & metrtics

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer)

### 11.2 metrics function

In [18]:
def compute_metrics_entity_only(pred):
    preds  = pred.predictions.argmax(-1).flatten()
    labels = pred.label_ids.flatten()
    mask = (labels != label2id["O"]) & (labels != -100)
    if mask.sum() == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
    p, r, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    return {"precision": p, "recall": r, "f1": f1}

## 12. Trainer & Smoke-Run

## 20. Final test set bert evaluation

In [160]:
print(test_docs[0].keys())


dict_keys(['domain', 'document', 'RE_label_set', 'NER_label_set', 'id'])


In [162]:
# === Kapitel 20: NER-Inferenz auf dem Test-Set mit dem besten Full-FT-BERT-Modell ===

import json
from pathlib import Path
from transformers import pipeline, AutoTokenizer

# ——————————————————————————————————————————————
# 1) Test-Daten laden
# ——————————————————————————————————————————————
TEST_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\test")

def load_docie_docs(folder: Path, recursive: bool = False):
    docs = []
    pattern = "**/*.json" if recursive else "*.json"
    for file in folder.glob(pattern):
        data = json.loads(file.read_text(encoding="utf-8"))
        if isinstance(data, list):
            docs.extend(data)
        else:
            docs.append(data)
    return docs

test_docs = load_docie_docs(TEST_DIR, recursive=True)
print(f"Anzahl Test-Dokumente: {len(test_docs)}")

# ——————————————————————————————————————————————
# 2) Tokenizer & Pipeline einrichten
# ——————————————————————————————————————————————
# Basis-Model-Name (Tokenizer kommt hierher)
base_model_name = "bert-base-uncased"

# Checkpoint-Ordner, den dein Trainer mit den besten Full-FT-Gewichten gefüllt hat
model_dir = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\outputs\bert-ner-full-ft-opt")

# Lade den Tokenizer aus dem Basismodell
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)

# Erstelle die NER-Pipeline, die die Gewichte aus model_dir lädt
ner_pipe = pipeline(
    "ner",
    model=str(model_dir),
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0   # GPU 0, für CPU -> device=-1
)

# ——————————————————————————————————————————————
# 3) Inferenz & Speicherung
# ——————————————————————————————————————————————
ner_results = []
for doc in test_docs:
    # Doc-ID unter "id", Text unter "document"
    entities = ner_pipe(doc["document"])
    ner_results.append({
        "id": doc["id"],
        "entities": entities
    })

# Speichern
with open("ner_test_preds_full_ft_bert.json", "w", encoding="utf-8") as fout:
    json.dump(ner_results, fout, ensure_ascii=False, indent=2)

print("✅ NER-Predictions gespeichert in ner_test_preds_full_ft_bert.json")


Anzahl Test-Dokumente: 248


ValueError: Could not load model C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\outputs\bert-ner-full-ft-opt with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForTokenClassification'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForTokenClassification'>). See the original errors:

while loading with AutoModelForTokenClassification, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\pipelines\base.py", line 291, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\models\auto\auto_factory.py", line 573, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\modeling_utils.py", line 272, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\modeling_utils.py", line 4317, in from_pretrained
    checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\modeling_utils.py", line 982, in _get_resolved_checkpoint_files
    raise EnvironmentError(
OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\outputs\bert-ner-full-ft-opt.

while loading with TFAutoModelForTokenClassification, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\pipelines\base.py", line 291, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\models\auto\auto_factory.py", line 573, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nmilo\anaconda3\Lib\site-packages\transformers\modeling_tf_utils.py", line 2797, in from_pretrained
    raise EnvironmentError(
OSError: Error no file named tf_model.h5, model.safetensors or pytorch_model.bin found in directory C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\outputs\bert-ner-full-ft-opt.




## 21: Setup für GPT-J

In [74]:
# 21.0 Load GPT-Neo tokenizer & model first
model_name = "EleutherAI/gpt-neo-125M"
tokenizer  = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model      = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id
)

# 21.1 Guarantee a PAD token exists and update the model
if tokenizer.pad_token is None:
    # 1) add a “[PAD]” token
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # 2) resize model embeddings
    model.resize_token_embeddings(len(tokenizer))
    # 3) tell the model to use that pad token
    model.config.pad_token_id = tokenizer.pad_token_id

# 21.2 DataCollator for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)

# 21.3 Compute–Metrics unchanged
def compute_metrics_entity_only(p):
    preds = p.predictions.argmax(-1).reshape(-1)
    labels = p.label_ids.reshape(-1)
    mask   = labels >= 0
    p_, r_, f_, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    return {"precision": p_, "recall": r_, "f1": f_}


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Chapter 22: GPT-Neo Smoke-Run Baseline

In [94]:
# === Kapitel 22 (Neo-Baseline im BERT-Stil) ===
from transformers import TrainingArguments

neo_baseline_args = TrainingArguments(
    output_dir="outputs/gptneo-ner-baseline",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    num_train_epochs=3,
    learning_rate=3e-3,
    weight_decay=0.0,
    fp16=torch.cuda.is_available(),
    save_only_model=True,
)

neo_baseline_trainer = Trainer(
    model=model,
    args=neo_baseline_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

neo_baseline_trainer.train()
baseline_metrics_neo = neo_baseline_trainer.evaluate()
print("🔖 GPT-Neo Baseline Dev-F1 (3 Epochen):", baseline_metrics_neo["eval_f1"])


  neo_baseline_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.087694,0.79206,0.79206,0.79206
2,No log,1.210977,0.743442,0.743442,0.743442
3,No log,1.484688,0.668711,0.668711,0.668711


🔖 GPT-Neo Baseline Dev-F1 (3 Epochen): 0.6687107564110087


## Chapter 23: Hyperparameter-Tuning Full Fine-Tuning for GPT-Neo

In [82]:
# === Chapter 23: Hyperparameter-Tuning Full Fine-Tuning for GPT-Neo (fixed) ===

import optuna
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import torch

def neo_ft_objective(trial):
    # 1) sample a learning rate and batch size
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    
    # 2) fresh model for each trial
    m = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id
    )
    # ─────────── FIX ───────────
    # resize embeddings so that PAD token (added once at Chapter 21) fits
    m.resize_token_embeddings(len(tokenizer))
    m.config.pad_token_id = tokenizer.pad_token_id
    # ────────────────────────────

    # 3) training args
    args = TrainingArguments(
        output_dir=f"tmp/gptneo-ft-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        evaluation_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    # 4) trainer & train
    trainer = Trainer(
        model=m,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    trainer.train()

    # 5) return Dev-F1 for Optuna to maximize
    return trainer.evaluate()["eval_f1"]

# 6) run the study overnight
study_neo_ft = optuna.create_study(direction="maximize")
study_neo_ft.optimize(neo_ft_objective, n_trials=8)

print("🏆 Best GPT-Neo Full-FT params:", study_neo_ft.best_params,
      "→ Dev-F1 =", study_neo_ft.best_value)


[I 2025-05-17 01:29:23,012] A new study created in memory with name: no-name-a381adde-6e59-40bc-8ea0-daa083131c8e
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.680027,0.882108,0.882108,0.882108
40,No log,0.636975,0.88337,0.88337,0.88337
60,No log,0.634295,0.882988,0.882988,0.882988
80,No log,0.636623,0.882959,0.882959,0.882959
100,No log,0.625744,0.882959,0.882959,0.882959


[I 2025-05-17 01:37:40,447] Trial 0 finished with value: 0.8829587465524323 and parameters: {'learning_rate': 3.782759033636916e-05, 'batch_size': 4}. Best is trial 0 with value: 0.8829587465524323.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.772614,0.875682,0.875682,0.875682
40,No log,0.680847,0.882167,0.882167,0.882167
60,No log,0.652842,0.882812,0.882812,0.882812
80,No log,0.64911,0.882871,0.882871,0.882871
100,No log,0.647396,0.882871,0.882871,0.882871


[I 2025-05-17 02:03:27,447] Trial 1 finished with value: 0.8828707235490875 and parameters: {'learning_rate': 1.182318193795246e-05, 'batch_size': 16}. Best is trial 0 with value: 0.8829587465524323.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.689386,0.882343,0.882343,0.882343
40,No log,0.648881,0.882929,0.882929,0.882929
60,No log,0.63418,0.882929,0.882929,0.882929
80,No log,0.634273,0.882959,0.882959,0.882959
100,No log,0.628873,0.882988,0.882988,0.882988


[I 2025-05-17 02:26:50,582] Trial 2 finished with value: 0.8829880875535473 and parameters: {'learning_rate': 2.7233372871192413e-05, 'batch_size': 16}. Best is trial 2 with value: 0.8829880875535473.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.804544,0.875213,0.875213,0.875213
40,No log,0.707226,0.880699,0.880699,0.880699
60,No log,0.680277,0.881902,0.881902,0.881902
80,No log,0.668567,0.882255,0.882255,0.882255
100,No log,0.668243,0.882343,0.882343,0.882343


[I 2025-05-17 02:39:40,217] Trial 3 finished with value: 0.8823425855290182 and parameters: {'learning_rate': 1.0562315509311206e-05, 'batch_size': 8}. Best is trial 2 with value: 0.8829880875535473.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.7148,0.879262,0.879262,0.879262
40,No log,0.659911,0.882548,0.882548,0.882548
60,No log,0.646126,0.882929,0.882929,0.882929
80,No log,0.65109,0.882841,0.882841,0.882841
100,No log,0.637015,0.882841,0.882841,0.882841


[I 2025-05-17 02:47:04,257] Trial 4 finished with value: 0.8828413825479726 and parameters: {'learning_rate': 2.5539261234013832e-05, 'batch_size': 4}. Best is trial 2 with value: 0.8829880875535473.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.74235,0.880553,0.880553,0.880553
40,No log,0.669068,0.882841,0.882841,0.882841
60,No log,0.642779,0.882929,0.882929,0.882929
80,No log,0.637915,0.882988,0.882988,0.882988
100,No log,0.636968,0.882988,0.882988,0.882988


[I 2025-05-17 02:59:41,602] Trial 5 finished with value: 0.8829880875535473 and parameters: {'learning_rate': 2.0512760971655216e-05, 'batch_size': 8}. Best is trial 2 with value: 0.8829880875535473.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.763084,0.867555,0.867555,0.867555
40,No log,0.672668,0.881697,0.881697,0.881697
60,No log,0.644924,0.882665,0.882665,0.882665
80,No log,0.644909,0.882783,0.882783,0.882783
100,No log,0.640749,0.882783,0.882783,0.882783


[I 2025-05-17 03:22:42,378] Trial 6 finished with value: 0.8827827005457425 and parameters: {'learning_rate': 1.6819568284149955e-05, 'batch_size': 16}. Best is trial 2 with value: 0.8829880875535473.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.685055,0.882489,0.882489,0.882489
40,No log,0.648965,0.882929,0.882929,0.882929
60,No log,0.630707,0.882959,0.882959,0.882959
80,No log,0.631525,0.882988,0.882988,0.882988
100,No log,0.626061,0.882988,0.882988,0.882988


[I 2025-05-17 03:45:37,757] Trial 7 finished with value: 0.8829880875535473 and parameters: {'learning_rate': 3.0720514907054634e-05, 'batch_size': 16}. Best is trial 2 with value: 0.8829880875535473.


🏆 Best GPT-Neo Full-FT params: {'learning_rate': 2.7233372871192413e-05, 'batch_size': 16} → Dev-F1 = 0.8829880875535473


## Chapter 24: LoRA Hyperparameter-Tuning for GPT-Neo

In [83]:
# === Chapter 24: LoRA Hyperparameter-Tuning for GPT-Neo (fixed) ===

import optuna
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

def neo_lora_objective(trial):
    # 1) sample hyperparameters
    lr      = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    r       = trial.suggest_categorical("r", [4, 8, 16])
    alpha   = trial.suggest_categorical("alpha", [16, 32])
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    bs      = trial.suggest_categorical("batch_size", [4, 8, 16])

    # 2) configure LoRA adapter
    lora_conf = LoraConfig(
        task_type="TOKEN_CLS",
        inference_mode=False,
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
    )
    base = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    lora_model = get_peft_model(base, lora_conf)

    # ─────────── FIX: ensure new PAD token is in embedding matrix ───────────
    lora_model.resize_token_embeddings(len(tokenizer))
    lora_model.config.pad_token_id = tokenizer.pad_token_id
    # ──────────────────────────────────────────────────────────────────────────

    # 3) training args
    args = TrainingArguments(
        output_dir=f"tmp/gptneo-lora-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        evaluation_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    # 4) trainer & train
    trainer = Trainer(
        model=lora_model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    trainer.train()

    # 5) return Dev-F1 for Optuna
    return trainer.evaluate()["eval_f1"]

# 6) run the study
study_neo_lora = optuna.create_study(direction="maximize")
study_neo_lora.optimize(neo_lora_objective, n_trials=8)

print("🏆 Best GPT-Neo LoRA params:", study_neo_lora.best_params,
      "→ Dev-F1 =", study_neo_lora.best_value)


[I 2025-05-17 03:45:37,773] A new study created in memory with name: no-name-3d390395-92e6-42d6-85d4-e84bb04a2440
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.749797,0.880083,0.880083,0.880083
40,No log,0.690616,0.881785,0.881785,0.881785
60,No log,0.658776,0.882489,0.882489,0.882489
80,No log,0.657126,0.882548,0.882548,0.882548
100,No log,0.653099,0.882665,0.882665,0.882665


[I 2025-05-17 04:05:53,095] Trial 0 finished with value: 0.8826653365412828 and parameters: {'learning_rate': 0.00017023382278520056, 'r': 16, 'alpha': 32, 'dropout': 0.27867312010953715, 'batch_size': 16}. Best is trial 0 with value: 0.8826653365412828.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.778641,0.877736,0.877736,0.877736
40,No log,0.720792,0.878176,0.878176,0.878176
60,No log,0.686131,0.881316,0.881316,0.881316
80,No log,0.677228,0.881961,0.881961,0.881961
100,No log,0.675843,0.88199,0.88199,0.88199


[I 2025-05-17 04:16:58,245] Trial 1 finished with value: 0.8819904935156387 and parameters: {'learning_rate': 0.00019670484556358483, 'r': 16, 'alpha': 16, 'dropout': 0.1399427413465223, 'batch_size': 8}. Best is trial 0 with value: 0.8826653365412828.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.115436,0.011502,0.011502,0.011502
40,No log,3.152608,0.117804,0.117804,0.117804
60,No log,2.419782,0.394724,0.394724,0.394724
80,No log,1.982409,0.598791,0.598791,0.598791
100,No log,1.839116,0.657708,0.657708,0.657708


[I 2025-05-17 04:22:59,148] Trial 2 finished with value: 0.6577078809928995 and parameters: {'learning_rate': 3.1048016853416664e-05, 'r': 16, 'alpha': 32, 'dropout': 0.15310587001879, 'batch_size': 4}. Best is trial 0 with value: 0.8826653365412828.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.232097,0.005399,0.005399,0.005399
40,No log,3.249663,0.062438,0.062438,0.062438
60,No log,2.496342,0.316237,0.316237,0.316237
80,No log,2.048756,0.549586,0.549586,0.549586
100,No log,1.901574,0.620386,0.620386,0.620386


[I 2025-05-17 04:34:00,047] Trial 3 finished with value: 0.6203861275746728 and parameters: {'learning_rate': 2.9884670409584333e-05, 'r': 8, 'alpha': 32, 'dropout': 0.02495077214792871, 'batch_size': 8}. Best is trial 0 with value: 0.8826653365412828.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.751109,0.874479,0.874479,0.874479
40,No log,0.660312,0.882871,0.882871,0.882871
60,No log,0.640613,0.883105,0.883105,0.883105
80,No log,0.641691,0.883105,0.883105,0.883105
100,No log,0.636624,0.883105,0.883105,0.883105


[I 2025-05-17 04:54:13,986] Trial 4 finished with value: 0.8831054515580071 and parameters: {'learning_rate': 0.00031630029815269686, 'r': 4, 'alpha': 32, 'dropout': 0.05831405767768296, 'batch_size': 16}. Best is trial 4 with value: 0.8831054515580071.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,2.169436,0.477407,0.477407,0.477407
40,No log,1.101261,0.83484,0.83484,0.83484
60,No log,0.826886,0.869462,0.869462,0.869462
80,No log,0.781257,0.87131,0.87131,0.87131
100,No log,0.77362,0.870782,0.870782,0.870782


[I 2025-05-17 05:05:14,654] Trial 5 finished with value: 0.8707822310897247 and parameters: {'learning_rate': 5.0282113237116235e-05, 'r': 8, 'alpha': 32, 'dropout': 0.18124097185338556, 'batch_size': 8}. Best is trial 4 with value: 0.8831054515580071.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.590802,0.003286,0.003286,0.003286
40,No log,4.059296,0.013233,0.013233,0.013233
60,No log,3.65203,0.036471,0.036471,0.036471
80,No log,3.397099,0.067572,0.067572,0.067572
100,No log,3.308103,0.083035,0.083035,0.083035


[I 2025-05-17 05:16:20,081] Trial 6 finished with value: 0.08303503315533126 and parameters: {'learning_rate': 1.7679371395531382e-05, 'r': 16, 'alpha': 32, 'dropout': 0.0567647209448687, 'batch_size': 8}. Best is trial 4 with value: 0.8831054515580071.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.693647,0.882665,0.882665,0.882665
40,No log,0.635894,0.883105,0.883105,0.883105
60,No log,0.614593,0.883105,0.883105,0.883105
80,No log,0.614911,0.883076,0.883076,0.883076
100,No log,0.61187,0.883076,0.883076,0.883076


[I 2025-05-17 05:36:38,878] Trial 7 finished with value: 0.8830761105568922 and parameters: {'learning_rate': 0.000928420074451464, 'r': 4, 'alpha': 32, 'dropout': 0.2968941921119626, 'batch_size': 16}. Best is trial 4 with value: 0.8831054515580071.


🏆 Best GPT-Neo LoRA params: {'learning_rate': 0.00031630029815269686, 'r': 4, 'alpha': 32, 'dropout': 0.05831405767768296, 'batch_size': 16} → Dev-F1 = 0.8831054515580071


## Chapter 25: Partial-Freeze Hyperparameter-Tuning for GPT-Neo

In [84]:
# === Chapter 25: Partial-Freeze Hyperparameter-Tuning for GPT-Neo (fixed) ===

import optuna
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

def neo_freeze_objective(trial):
    # 1) sample hyperparameters
    lr  = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs  = trial.suggest_categorical("batch_size", [4, 8, 16])
    pct = trial.suggest_float("freeze_pct", 0.25, 0.75)

    # 2) fresh model
    m = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    # ─────────── FIX: ensure pad token embedding exists ───────────
    m.resize_token_embeddings(len(tokenizer))
    m.config.pad_token_id = tokenizer.pad_token_id
    # ────────────────────────────────────────────────────────────────

    # 3) freeze first pct of transformer layers
    total = len([n for n,_ in m.named_parameters() if n.startswith("transformer.h.")])
    cutoff = int(total * pct)
    for name, param in m.named_parameters():
        if name.startswith("transformer.h.") and int(name.split(".")[2]) < cutoff:
            param.requires_grad = False

    # 4) training args
    args = TrainingArguments(
        output_dir=f"tmp/gptneo-freeze-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        evaluation_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    # 5) trainer & train
    trainer = Trainer(
        model=m,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    trainer.train()

    # 6) return Dev-F1
    return trainer.evaluate()["eval_f1"]

# 7) run the study
study_neo_freeze = optuna.create_study(direction="maximize")
study_neo_freeze.optimize(neo_freeze_objective, n_trials=8)

print("🏆 Best GPT-Neo Freeze params:", study_neo_freeze.best_params,
      "→ Dev-F1 =", study_neo_freeze.best_value)


[I 2025-05-17 05:36:38,889] A new study created in memory with name: no-name-034ad9c7-93db-439d-b948-ac41a0dc5d17
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,2.298176,0.423039,0.423039,0.423039
40,No log,1.397294,0.775571,0.775571,0.775571
60,No log,1.031718,0.842439,0.842439,0.842439
80,No log,0.90954,0.857843,0.857843,0.857843
100,No log,0.88103,0.860337,0.860337,0.860337


[I 2025-05-17 05:42:38,007] Trial 0 finished with value: 0.8603368346927998 and parameters: {'learning_rate': 4.693600573046882e-05, 'batch_size': 4, 'freeze_pct': 0.25052201868831997}. Best is trial 0 with value: 0.8603368346927998.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,2.957616,0.112171,0.112171,0.112171
40,No log,2.357226,0.373452,0.373452,0.373452
60,No log,1.970128,0.582272,0.582272,0.582272
80,No log,1.760023,0.673024,0.673024,0.673024
100,No log,1.692249,0.699284,0.699284,0.699284


[I 2025-05-17 05:53:25,410] Trial 1 finished with value: 0.699284079572795 and parameters: {'learning_rate': 2.054713098524259e-05, 'batch_size': 8, 'freeze_pct': 0.2613474753248602}. Best is trial 0 with value: 0.8603368346927998.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.868567,0.03204,0.03204,0.03204
40,No log,3.220285,0.112464,0.112464,0.112464
60,No log,2.777935,0.225045,0.225045,0.225045
80,No log,2.523161,0.312188,0.312188,0.312188
100,No log,2.438042,0.343524,0.343524,0.343524


[I 2025-05-17 05:59:25,421] Trial 2 finished with value: 0.34352444105392876 and parameters: {'learning_rate': 2.4083598074175995e-05, 'batch_size': 4, 'freeze_pct': 0.3274962445505534}. Best is trial 0 with value: 0.8603368346927998.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.213424,0.056716,0.056716,0.056716
40,No log,2.780108,0.170999,0.170999,0.170999
60,No log,2.484169,0.306203,0.306203,0.306203
80,No log,2.313495,0.396397,0.396397,0.396397
100,No log,2.256195,0.427,0.427,0.427


[I 2025-05-17 06:10:12,138] Trial 3 finished with value: 0.4269995892259844 and parameters: {'learning_rate': 1.4124656276571285e-05, 'batch_size': 8, 'freeze_pct': 0.3726377583154495}. Best is trial 0 with value: 0.8603368346927998.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.759666,0.040667,0.040667,0.040667
40,No log,3.04132,0.153952,0.153952,0.153952
60,No log,2.556643,0.304149,0.304149,0.304149
80,No log,2.282338,0.407312,0.407312,0.407312
100,No log,2.191968,0.44525,0.44525,0.44525


[I 2025-05-17 06:29:32,566] Trial 4 finished with value: 0.44524969191948827 and parameters: {'learning_rate': 2.37037188874923e-05, 'batch_size': 16, 'freeze_pct': 0.445193786067657}. Best is trial 0 with value: 0.8603368346927998.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.607296,0.025732,0.025732,0.025732
40,No log,2.884267,0.174902,0.174902,0.174902
60,No log,2.402962,0.418344,0.418344,0.418344
80,No log,2.132607,0.566721,0.566721,0.566721
100,No log,2.043869,0.61041,0.61041,0.61041


[I 2025-05-17 06:48:50,667] Trial 5 finished with value: 0.6104101871955872 and parameters: {'learning_rate': 2.392840501149533e-05, 'batch_size': 16, 'freeze_pct': 0.4993138528542309}. Best is trial 0 with value: 0.8603368346927998.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,2.843037,0.19236,0.19236,0.19236
40,No log,1.727253,0.734288,0.734288,0.734288
60,No log,1.186735,0.843466,0.843466,0.843466
80,No log,0.983125,0.862567,0.862567,0.862567
100,No log,0.932611,0.866322,0.866322,0.866322


[I 2025-05-17 06:54:49,828] Trial 6 finished with value: 0.8663223989202512 and parameters: {'learning_rate': 4.963009046390126e-05, 'batch_size': 4, 'freeze_pct': 0.4807295614021466}. Best is trial 6 with value: 0.8663223989202512.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.088184,0.079984,0.079984,0.079984
40,No log,2.570399,0.26178,0.26178,0.26178
60,No log,2.225145,0.445132,0.445132,0.445132
80,No log,2.030884,0.551992,0.551992,0.551992
100,No log,1.966712,0.583681,0.583681,0.583681


[I 2025-05-17 07:05:36,987] Trial 7 finished with value: 0.5836805351798603 and parameters: {'learning_rate': 1.7247268512087645e-05, 'batch_size': 8, 'freeze_pct': 0.7004998811199978}. Best is trial 6 with value: 0.8663223989202512.


🏆 Best GPT-Neo Freeze params: {'learning_rate': 4.963009046390126e-05, 'batch_size': 4, 'freeze_pct': 0.4807295614021466} → Dev-F1 = 0.8663223989202512


## Kapitel 26: Full Fine-Tuning with best Params

In [89]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# 26.1 Beste Full-FT-Params aus Optuna
best = study_neo_ft.best_params  
# z.B. {'learning_rate': 2.7233e-05, 'batch_size': 16}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels), id2label=id2label, label2id=label2id
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

args = TrainingArguments(
    output_dir="outputs/gptneo-full-opt",
    per_device_train_batch_size=best["batch_size"],
    per_device_eval_batch_size=best["batch_size"] * 2,
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=10,
    save_strategy="no",
    max_steps=200,
    learning_rate=best["learning_rate"],
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=hf_train, eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)
trainer.train()


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,1.1071,0.696423,0.88199,0.88199,0.88199
40,0.9236,0.646867,0.882988,0.882988,0.882988
60,0.8664,0.628279,0.883047,0.883047,0.883047
80,0.8833,0.630834,0.883017,0.883017,0.883017
100,0.7699,0.629973,0.882812,0.882812,0.882812
120,0.7376,0.631209,0.881902,0.881902,0.881902
140,0.6912,0.641137,0.88158,0.88158,0.88158
160,0.6479,0.649982,0.880729,0.880729,0.880729
180,0.6602,0.658206,0.880142,0.880142,0.880142
200,0.6307,0.660646,0.879731,0.879731,0.879731


TrainOutput(global_step=200, training_loss=0.8408329391479492, metrics={'train_runtime': 2760.5328, 'train_samples_per_second': 1.159, 'train_steps_per_second': 0.072, 'total_flos': 789123673681920.0, 'train_loss': 0.8408329391479492, 'epoch': 18.181818181818183})

## 27: GPT-Neo LoRA Fine-Tuning with best params

In [90]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# 27.1 Beste LoRA-Params aus Optuna
best = study_neo_lora.best_params  
# z.B. {'learning_rate':0.0003163,'r':4,'alpha':32,'dropout':0.0583,'batch_size':16}

lora_conf = LoraConfig(
    task_type="TOKEN_CLS", inference_mode=False,
    r=best["r"], lora_alpha=best["alpha"], lora_dropout=best["dropout"]
)
base = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(ner_labels),
    id2label=id2label, label2id=label2id
)
model = get_peft_model(base, lora_conf)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

args = TrainingArguments(
    output_dir="outputs/gptneo-lora-opt",
    per_device_train_batch_size=best["batch_size"],
    per_device_eval_batch_size=best["batch_size"] * 2,
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=10,
    save_strategy="no",
    max_steps=200,
    learning_rate=best["learning_rate"],
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=hf_train, eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)
trainer.train()


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,1.1319,0.72194,0.878646,0.878646,0.878646
40,0.9376,0.648442,0.882695,0.882695,0.882695
60,0.8972,0.634618,0.883017,0.883017,0.883017
80,0.9369,0.635759,0.883076,0.883076,0.883076
100,0.8537,0.61939,0.883047,0.883047,0.883047
120,0.8475,0.624178,0.883047,0.883047,0.883047
140,0.8106,0.618386,0.883047,0.883047,0.883047
160,0.7932,0.61172,0.882988,0.882988,0.882988
180,0.8212,0.612359,0.8829,0.8829,0.8829
200,0.7926,0.611126,0.882871,0.882871,0.882871


TrainOutput(global_step=200, training_loss=0.9494753837585449, metrics={'train_runtime': 2451.9182, 'train_samples_per_second': 1.305, 'train_steps_per_second': 0.082, 'total_flos': 790769927577600.0, 'train_loss': 0.9494753837585449, 'epoch': 18.181818181818183})

## 28: GPT-Neo Partial-Freeze Fine-Tuning with best params

In [91]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# 28.1 Beste Freeze-Params aus Optuna
best = study_neo_freeze.best_params  
# z.B. {'learning_rate':4.96e-05,'batch_size':4,'freeze_pct':0.4807}

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(ner_labels),
    id2label=id2label, label2id=label2id
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Freeze first pct of layers
total = len([n for n,_ in model.named_parameters() if n.startswith("transformer.h.")])
cut = int(total * best["freeze_pct"])
for n, p in model.named_parameters():
    if n.startswith("transformer.h.") and int(n.split(".")[2]) < cut:
        p.requires_grad = False

args = TrainingArguments(
    output_dir="outputs/gptneo-freeze-opt",
    per_device_train_batch_size=best["batch_size"],
    per_device_eval_batch_size=best["batch_size"] * 2,
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=10,
    save_strategy="no",
    max_steps=200,
    learning_rate=best["learning_rate"],
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=hf_train, eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)
trainer.train()


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,2.6082,1.997508,0.588287,0.588287,0.588287
40,1.457,1.0598,0.845666,0.845666,0.845666
60,1.0156,0.802541,0.873804,0.873804,0.873804
80,1.1632,0.750082,0.876621,0.876621,0.876621
100,1.0934,0.73035,0.877061,0.877061,0.877061
120,1.0264,0.71833,0.878235,0.878235,0.878235
140,0.9548,0.712973,0.87888,0.87888,0.87888
160,0.9838,0.709153,0.879027,0.879027,0.879027
180,0.9505,0.708594,0.879086,0.879086,0.879086
200,0.9741,0.707938,0.879086,0.879086,0.879086


TrainOutput(global_step=200, training_loss=1.3035770797729491, metrics={'train_runtime': 770.8462, 'train_samples_per_second': 1.038, 'train_steps_per_second': 0.259, 'total_flos': 206883665111808.0, 'train_loss': 1.3035770797729491, 'epoch': 4.761904761904762})

## 29. Relation-Extraction 

In [115]:
# === Kapitel 29: Relation Extraction – Beispiele aus Train/Dev/Test aufbereiten (robust) ===

from datasets import Dataset, DatasetDict
import random

examples = []

for split, docs in [("train", train_docs), ("dev", dev_docs), ("test", test_docs)]:
    for d in docs:
        # 1) Nur wirklich annotierte Docs verarbeiten
        if not d.get("triples") or not d.get("entities"):
            continue

        # 2) Text-Feld wählen (alt: "doc", neu: "document")
        text = d.get("doc") or d.get("document")
        if not text:
            continue

        # 3) Helfer: alle Entity-Mentions (jeweils der erste Span)
        ents = [e["mentions"][0] for e in d["entities"] if e.get("mentions")]

        # 4) Gold-Paare
        true_pairs = {(t["head"], t["tail"]) for t in d["triples"]}

        # 5) Pro Triple: 1 Positiv + 1 Zufalls-Negativ
        for triple in d["triples"]:
            # — Positiv
            examples.append({
                "split":    split,
                "sentence": text,
                "head":     triple["head"],
                "tail":     triple["tail"],
                "label":    triple["relation"],
            })
            # — Negativ (einfach solange random, bis kein Gold-Paar)
            while True:
                h, t = random.sample(ents, 2)
                if (h, t) not in true_pairs:
                    examples.append({
                        "split":    split,
                        "sentence": text,
                        "head":     h,
                        "tail":     t,
                        "label":    "no_relation",
                    })
                    break

# 6) In DatasetDict nach Split aufteilen
train_ex = [e for e in examples if e["split"] == "train"]
dev_ex   = [e for e in examples if e["split"] == "dev"]
test_ex  = [e for e in examples if e["split"] == "test"]

ds = DatasetDict({
    "train": Dataset.from_list(train_ex),
    "dev":   Dataset.from_list(dev_ex),
    "test":  Dataset.from_list(test_ex),
})

print(ds)


DatasetDict({
    train: Dataset({
        features: ['split', 'sentence', 'head', 'tail', 'label'],
        num_rows: 1222
    })
    dev: Dataset({
        features: ['split', 'sentence', 'head', 'tail', 'label'],
        num_rows: 606
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


## 30. Tokenisierung & Encoding

In [124]:
# === Kapitel 30: DatasetDict für RE bauen ===

from datasets import Dataset, DatasetDict
import pandas as pd

# 30.1: Aus den examples (Kapitel 29) ein DataFrame machen
# examples = [
#   {"split":"train","sentence":...,"head":...,"tail":...,"label":...}, ...
# ]
df = pd.DataFrame(examples)

# 30.2: Train/Dev trennen
train_df = df[df.split == "train"].reset_index(drop=True)
dev_df   = df[df.split   == "dev"].reset_index(drop=True)

# 30.3: Huggingface DatasetDict anlegen
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "dev":   Dataset.from_pandas(dev_df),
})


In [125]:
raw_re_ds = ds 

## Kapitel 31: Tokenisierung & Label-Mapping für RE

In [126]:
# === Kapitel 31: Tokenisierung & Label-Mapping für RE ===

# 31.1: Label2id für alle Relationsklassen (inkl. "no_relation")
all_labels = sorted(df.label.unique())
label2id_re = {lab:i for i,lab in enumerate(all_labels)}

# 31.2: Tokenizer & Max-Length festlegen (hier z.B. Bert)
from transformers import AutoTokenizer
tokenizer_re = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
max_length_re = 128

# 31.3: Tokenisierungsfunktion
def tokenize_re(example):
    # Wir packen head, tail und sentence in eines: "[HEAD] [SEP] [TAIL] [SEP] [SENTENCE]"
    enc = tokenizer_re(
        example["head"],
        example["tail"] + " " + example["sentence"],
        truncation=True,
        max_length=max_length_re,
        padding="max_length"
    )
    enc["labels"] = label2id_re[example["label"]]
    return enc

# 31.4: Map auf ds
ds = ds.map(
    tokenize_re,
    batched=False,
    remove_columns=ds["train"].column_names
)

# 31.5: Für PyTorch vorbereiten
ds.set_format("torch", columns=["input_ids","attention_mask","labels"])


Map:   0%|          | 0/1222 [00:00<?, ? examples/s]

Map:   0%|          | 0/606 [00:00<?, ? examples/s]

## Kapitel 32: Baseline-Trainingsloop für RE

In [127]:
# === Kapitel 32: Baseline-Trainingsloop für RE ===

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score

# 32.1: Model laden
model_re = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id_re)
)

# 32.2: TrainingArguments
training_args_re = TrainingArguments(
    output_dir="outputs/bert-re-baseline",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    fp16=torch.cuda.is_available()
)

# 32.3: Compute-Metrics
def compute_metrics_re(p):
    preds = p.predictions.argmax(-1)
    return {"eval_f1": f1_score(p.label_ids, preds, average="macro")}

# 32.4: Trainer instanziieren
trainer_re = Trainer(
    model=model_re,
    args=training_args_re,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    compute_metrics=compute_metrics_re
)

# 32.5: Train & Eval
trainer_re.train()
metrics_re = trainer_re.evaluate()
print("🔖 RE Baseline Dev-F1:", metrics_re["eval_f1"])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_re = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## 33: RE Hyperparameter‐Tuning – Full Fine-Tuning

In [135]:
# === Chapter 33: RE Hyperparameter-Tuning (Full Fine-Tuning) with Steps ===

# 33.1: Build your label2id/id2label exactly as before
re_labels   = sorted(set(raw_re_ds["train"]["label"]) | set(raw_re_ds["dev"]["label"]))
label2id_re = {l:i for i,l in enumerate(re_labels)}
id2label_re = {i:l for l,i in label2id_re.items()}

hf_train_re = ds["train"]
hf_dev_re   = ds["dev"]

def compute_metrics_re(p):
    preds = p.predictions.argmax(-1)
    labs  = p.label_ids
    prec, rec, f1, _ = precision_recall_fscore_support(labs, preds, average="micro", zero_division=0)
    acc = accuracy_score(labs, preds)
    return {"precision": prec, "recall": rec, "f1": f1, "accuracy": acc}

def re_ft_objective(trial):
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [8, 16, 32])

    args = TrainingArguments(
        output_dir=f"tmp/re-ft-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        evaluation_strategy="steps",
        eval_steps=10,
        logging_steps=10,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label2id_re),
        id2label=id2label_re,
        label2id=label2id_re,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train_re,
        eval_dataset=hf_dev_re,
        tokenizer=tokenizer_re,
        compute_metrics=compute_metrics_re,
    )
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_re_ft = optuna.create_study(direction="maximize")
study_re_ft.optimize(re_ft_objective, n_trials=8)
print("🏆 Best RE Full-FT params:", study_re_ft.best_params, "→ Dev-F1 =", study_re_ft.best_value)


[I 2025-05-17 16:47:20,361] A new study created in memory with name: no-name-e30ea694-5a9a-4969-88c5-23e06590f2a5
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.8274,3.300838,0.5,0.5,0.5,0.5
20,2.9761,2.830147,0.5,0.5,0.5,0.5
30,3.0733,2.755363,0.5,0.5,0.5,0.5
40,2.7047,2.710929,0.5,0.5,0.5,0.5
50,2.6366,2.691566,0.5,0.5,0.5,0.5
60,2.5242,2.652421,0.5,0.5,0.5,0.5
70,2.4889,2.657772,0.5,0.5,0.5,0.5
80,2.442,2.626473,0.5,0.5,0.5,0.5
90,2.1888,2.620448,0.5,0.5,0.5,0.5
100,2.0459,2.62436,0.5,0.5,0.5,0.5


[I 2025-05-17 16:54:19,938] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 4.3575643120387554e-05, 'batch_size': 16}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.235,4.106583,0.219472,0.219472,0.219472,0.219472
20,4.0054,3.849129,0.475248,0.475248,0.475248,0.475248
30,3.7442,3.648604,0.481848,0.481848,0.481848,0.481848
40,3.5108,3.494017,0.5,0.5,0.5,0.5
50,3.5875,3.336197,0.5,0.5,0.5,0.5
60,3.3916,3.198136,0.5,0.5,0.5,0.5
70,3.2046,3.114041,0.5,0.5,0.5,0.5
80,3.0321,3.066108,0.5,0.5,0.5,0.5
90,3.0115,3.037939,0.5,0.5,0.5,0.5
100,3.1329,3.026117,0.5,0.5,0.5,0.5


[I 2025-05-17 16:59:57,198] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 1.1302804238596893e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.1511,3.865378,0.5,0.5,0.5,0.5
20,3.5423,3.111426,0.5,0.5,0.5,0.5
30,2.7702,2.79796,0.5,0.5,0.5,0.5
40,2.6331,2.774837,0.5,0.5,0.5,0.5
50,3.0852,2.821862,0.5,0.5,0.5,0.5
60,2.848,2.760433,0.5,0.5,0.5,0.5
70,2.8087,2.778891,0.5,0.5,0.5,0.5
80,2.6164,2.725522,0.5,0.5,0.5,0.5
90,2.5237,2.718717,0.5,0.5,0.5,0.5
100,2.6635,2.717334,0.5,0.5,0.5,0.5


[I 2025-05-17 17:05:37,639] Trial 2 finished with value: 0.5 and parameters: {'learning_rate': 3.698405938088476e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.3236,4.160428,0.0,0.0,0.0,0.0
20,4.0937,3.88129,0.49835,0.49835,0.49835,0.49835
30,3.653,3.509754,0.5,0.5,0.5,0.5
40,3.3986,3.245943,0.5,0.5,0.5,0.5
50,3.3564,3.054633,0.5,0.5,0.5,0.5
60,3.1035,2.96065,0.5,0.5,0.5,0.5
70,3.0473,2.885209,0.5,0.5,0.5,0.5
80,2.8457,2.852964,0.5,0.5,0.5,0.5
90,2.7105,2.834198,0.5,0.5,0.5,0.5
100,2.8854,2.828054,0.5,0.5,0.5,0.5


[I 2025-05-17 17:11:17,778] Trial 3 finished with value: 0.5 and parameters: {'learning_rate': 1.5358595750311107e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.2347,3.863283,0.455446,0.455446,0.455446,0.455446
20,3.6824,3.309893,0.5,0.5,0.5,0.5
30,3.0893,2.896867,0.5,0.5,0.5,0.5
40,2.7692,2.7707,0.5,0.5,0.5,0.5
50,2.3951,2.721347,0.5,0.5,0.5,0.5
60,2.6733,2.701469,0.5,0.5,0.5,0.5
70,2.8029,2.699732,0.5,0.5,0.5,0.5
80,2.6483,2.673278,0.5,0.5,0.5,0.5
90,2.6305,2.671589,0.5,0.5,0.5,0.5
100,2.443,2.67239,0.5,0.5,0.5,0.5


[I 2025-05-17 17:21:28,920] Trial 4 finished with value: 0.5 and parameters: {'learning_rate': 2.371455620448719e-05, 'batch_size': 32}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.0268,3.790518,0.40264,0.40264,0.40264,0.40264
20,3.4489,3.146526,0.5,0.5,0.5,0.5
30,2.7317,2.802326,0.5,0.5,0.5,0.5
40,2.6283,2.783183,0.5,0.5,0.5,0.5
50,3.0356,2.750516,0.5,0.5,0.5,0.5
60,2.8173,2.696826,0.5,0.5,0.5,0.5
70,2.75,2.695716,0.5,0.5,0.5,0.5
80,2.5238,2.660494,0.5,0.5,0.5,0.5
90,2.4418,2.64685,0.5,0.5,0.5,0.5
100,2.5536,2.639212,0.5,0.5,0.5,0.5


[I 2025-05-17 17:26:53,339] Trial 5 finished with value: 0.5 and parameters: {'learning_rate': 4.3873912745852076e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.1142,3.754726,0.5,0.5,0.5,0.5
20,3.4349,2.964732,0.5,0.5,0.5,0.5
30,2.6827,2.794878,0.5,0.5,0.5,0.5
40,2.5647,2.773169,0.5,0.5,0.5,0.5
50,3.0534,2.779618,0.5,0.5,0.5,0.5
60,2.8437,2.736444,0.5,0.5,0.5,0.5
70,2.765,2.720011,0.5,0.5,0.5,0.5
80,2.5603,2.714036,0.5,0.5,0.5,0.5
90,2.456,2.706724,0.5,0.5,0.5,0.5
100,2.6246,2.701765,0.5,0.5,0.5,0.5


[I 2025-05-17 17:32:18,785] Trial 6 finished with value: 0.5 and parameters: {'learning_rate': 4.247069521480185e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.2975,4.077745,0.028053,0.028053,0.028053,0.028053
20,3.845,3.605651,0.5,0.5,0.5,0.5
30,3.4521,3.09205,0.5,0.5,0.5,0.5
40,3.0538,2.872872,0.5,0.5,0.5,0.5
50,2.7798,2.789164,0.5,0.5,0.5,0.5
60,2.7171,2.750902,0.5,0.5,0.5,0.5
70,2.7034,2.738456,0.5,0.5,0.5,0.5
80,2.6445,2.726763,0.5,0.5,0.5,0.5
90,2.4155,2.7218,0.5,0.5,0.5,0.5
100,2.3276,2.720441,0.5,0.5,0.5,0.5


[I 2025-05-17 17:39:21,046] Trial 7 finished with value: 0.5 and parameters: {'learning_rate': 2.0376354374117534e-05, 'batch_size': 16}. Best is trial 0 with value: 0.5.


🏆 Best RE Full-FT params: {'learning_rate': 4.3575643120387554e-05, 'batch_size': 16} → Dev-F1 = 0.5


## Chapter 34: RE Hyperparameter-Tuning (LoRA)

In [136]:
# === Chapter 34: RE Hyperparameter-Tuning (LoRA) with Steps ===

from peft import LoraConfig, get_peft_model

def re_lora_objective(trial):
    lr      = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    r       = trial.suggest_categorical("r", [4, 8, 16])
    alpha   = trial.suggest_categorical("alpha", [16, 32])
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    bs      = trial.suggest_categorical("batch_size", [8, 16, 32])

    lora_conf = LoraConfig(
        task_type="SEQ_CLS", inference_mode=False,
        r=r, lora_alpha=alpha, lora_dropout=dropout,
    )
    base = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label2id_re),
        id2label=id2label_re,
        label2id=label2id_re,
    )
    lora_model = get_peft_model(base, lora_conf)

    args = TrainingArguments(
        output_dir=f"tmp/re-lora-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        evaluation_strategy="steps",
        eval_steps=10,
        logging_steps=10,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=lora_model,
        args=args,
        train_dataset=hf_train_re,
        eval_dataset=hf_dev_re,
        tokenizer=tokenizer_re,
        compute_metrics=compute_metrics_re,
    )
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_re_lora = optuna.create_study(direction="maximize")
study_re_lora.optimize(re_lora_objective, n_trials=8)
print("🏆 Best RE LoRA params:", study_re_lora.best_params, "→ Dev-F1 =", study_re_lora.best_value)


[I 2025-05-17 17:39:21,055] A new study created in memory with name: no-name-ae73170c-7a86-408d-bfbc-c7d26bcee63a
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.252,4.306705,0.028053,0.028053,0.028053,0.028053
20,4.2047,4.246798,0.029703,0.029703,0.029703,0.029703
30,4.1097,4.192232,0.057756,0.057756,0.057756,0.057756
40,4.0178,4.138948,0.087459,0.087459,0.087459,0.087459
50,4.0962,4.088309,0.176568,0.176568,0.176568,0.176568
60,3.9908,4.044281,0.247525,0.247525,0.247525,0.247525
70,3.9316,4.006651,0.290429,0.290429,0.290429,0.290429
80,3.8262,3.978817,0.328383,0.328383,0.328383,0.328383
90,3.8273,3.961478,0.339934,0.339934,0.339934,0.339934
100,3.8437,3.95562,0.344884,0.344884,0.344884,0.344884


[I 2025-05-17 17:44:06,969] Trial 0 finished with value: 0.3448844884488449 and parameters: {'learning_rate': 3.819414458965424e-05, 'r': 16, 'alpha': 16, 'dropout': 0.030171143674280452, 'batch_size': 8}. Best is trial 0 with value: 0.3448844884488449.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.6284,4.467794,0.008251,0.008251,0.008251,0.008251
20,4.4364,4.330542,0.009901,0.009901,0.009901,0.009901
30,4.2258,4.19066,0.016502,0.016502,0.016502,0.016502
40,4.024,4.042905,0.156766,0.156766,0.156766,0.156766
50,3.8358,3.878495,0.392739,0.392739,0.392739,0.392739
60,3.7008,3.711313,0.458746,0.458746,0.458746,0.458746
70,3.5301,3.571662,0.4967,0.4967,0.4967,0.4967
80,3.4109,3.471841,0.5,0.5,0.5,0.5
90,3.2337,3.413323,0.49835,0.49835,0.49835,0.49835
100,3.1297,3.392461,0.49835,0.49835,0.49835,0.49835


[I 2025-05-17 17:50:24,275] Trial 1 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 8.386716846610819e-05, 'r': 8, 'alpha': 16, 'dropout': 0.11397258975734581, 'batch_size': 16}. Best is trial 1 with value: 0.49834983498349833.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.9381,3.393328,0.49835,0.49835,0.49835,0.49835
20,2.754,2.704489,0.5,0.5,0.5,0.5
30,2.8699,2.786217,0.5,0.5,0.5,0.5
40,2.4884,2.885568,0.5,0.5,0.5,0.5
50,2.4889,2.909358,0.5,0.5,0.5,0.5
60,2.5165,2.876363,0.5,0.5,0.5,0.5
70,2.4606,2.861381,0.5,0.5,0.5,0.5
80,2.4201,2.850537,0.5,0.5,0.5,0.5
90,2.2044,2.861014,0.5,0.5,0.5,0.5
100,2.0412,2.873683,0.5,0.5,0.5,0.5


[I 2025-05-17 17:56:42,957] Trial 2 finished with value: 0.5 and parameters: {'learning_rate': 0.00032032068892167987, 'r': 8, 'alpha': 32, 'dropout': 0.19912026952009523, 'batch_size': 16}. Best is trial 2 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.0305,3.784569,0.387789,0.387789,0.387789,0.387789
20,3.3423,3.000968,0.5,0.5,0.5,0.5
30,2.6301,2.723487,0.5,0.5,0.5,0.5
40,2.4213,2.796836,0.5,0.5,0.5,0.5
50,2.1141,2.842041,0.5,0.5,0.5,0.5
60,2.3881,2.866793,0.5,0.5,0.5,0.5
70,2.67,2.868516,0.5,0.5,0.5,0.5
80,2.455,2.864289,0.5,0.5,0.5,0.5
90,2.4754,2.870556,0.5,0.5,0.5,0.5
100,2.316,2.871215,0.5,0.5,0.5,0.5


[I 2025-05-17 18:05:38,132] Trial 3 finished with value: 0.5 and parameters: {'learning_rate': 0.00021580480439878258, 'r': 4, 'alpha': 16, 'dropout': 0.16005040240833235, 'batch_size': 32}. Best is trial 2 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.0742,4.178323,0.133663,0.133663,0.133663,0.133663
20,4.0933,4.129025,0.19802,0.19802,0.19802,0.19802
30,3.9884,4.082238,0.278878,0.278878,0.278878,0.278878
40,3.9481,4.037941,0.315182,0.315182,0.315182,0.315182
50,3.8357,3.99755,0.346535,0.346535,0.346535,0.346535
60,3.8653,3.961963,0.371287,0.371287,0.371287,0.371287
70,3.8645,3.933297,0.387789,0.387789,0.387789,0.387789
80,3.8245,3.912088,0.391089,0.391089,0.391089,0.391089
90,3.7997,3.898384,0.392739,0.392739,0.392739,0.392739
100,3.7393,3.893719,0.392739,0.392739,0.392739,0.392739


[I 2025-05-17 18:14:25,864] Trial 4 finished with value: 0.3927392739273928 and parameters: {'learning_rate': 1.7526175007357064e-05, 'r': 8, 'alpha': 16, 'dropout': 0.1255252236159072, 'batch_size': 32}. Best is trial 2 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.9873,4.001821,0.364686,0.364686,0.364686,0.364686
20,3.8488,3.759772,0.462046,0.462046,0.462046,0.462046
30,3.569,3.536906,0.4967,0.4967,0.4967,0.4967
40,3.3542,3.336749,0.49835,0.49835,0.49835,0.49835
50,2.9943,3.171877,0.5,0.5,0.5,0.5
60,2.9708,3.046108,0.5,0.5,0.5,0.5
70,3.0114,2.959699,0.5,0.5,0.5,0.5
80,2.7847,2.906876,0.5,0.5,0.5,0.5
90,2.7619,2.880267,0.5,0.5,0.5,0.5
100,2.6145,2.871498,0.5,0.5,0.5,0.5


[I 2025-05-17 18:23:17,393] Trial 5 finished with value: 0.5 and parameters: {'learning_rate': 4.9824137594537844e-05, 'r': 16, 'alpha': 32, 'dropout': 0.11096296747671479, 'batch_size': 32}. Best is trial 2 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,2.7455,3.315737,0.5,0.5,0.5,0.5
20,3.1113,3.077214,0.5,0.5,0.5,0.5
30,2.563,2.974251,0.5,0.5,0.5,0.5
40,2.3975,3.040703,0.5,0.5,0.5,0.5
50,2.808,2.919446,0.5,0.5,0.5,0.5
60,2.9045,2.883625,0.5,0.5,0.5,0.5
70,2.6205,2.934661,0.5,0.5,0.5,0.5
80,2.3653,2.920622,0.5,0.5,0.5,0.5
90,2.5239,2.916737,0.5,0.5,0.5,0.5
100,2.4634,2.911949,0.5,0.5,0.5,0.5


[I 2025-05-17 18:27:56,351] Trial 6 finished with value: 0.5 and parameters: {'learning_rate': 0.000726832978635972, 'r': 4, 'alpha': 32, 'dropout': 0.05691534174746442, 'batch_size': 8}. Best is trial 2 with value: 0.5.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.4895,4.238732,0.019802,0.019802,0.019802,0.019802
20,3.9671,3.748173,0.430693,0.430693,0.430693,0.430693
30,3.4106,3.275074,0.5,0.5,0.5,0.5
40,2.8868,2.868564,0.5,0.5,0.5,0.5
50,2.2961,2.721523,0.5,0.5,0.5,0.5
60,2.4363,2.775628,0.5,0.5,0.5,0.5
70,2.6955,2.793723,0.5,0.5,0.5,0.5
80,2.4734,2.791213,0.5,0.5,0.5,0.5
90,2.4806,2.799067,0.5,0.5,0.5,0.5
100,2.3142,2.801023,0.5,0.5,0.5,0.5


[I 2025-05-17 18:36:47,777] Trial 7 finished with value: 0.5 and parameters: {'learning_rate': 0.00018133579482868897, 'r': 16, 'alpha': 32, 'dropout': 0.13780930049521228, 'batch_size': 32}. Best is trial 2 with value: 0.5.


🏆 Best RE LoRA params: {'learning_rate': 0.00032032068892167987, 'r': 8, 'alpha': 32, 'dropout': 0.19912026952009523, 'batch_size': 16} → Dev-F1 = 0.5


## Chapter 35: RE Hyperparameter-Tuning (Partial-Freeze)

In [137]:
# === Chapter 35: RE Hyperparameter-Tuning (Partial-Freeze) with Steps ===

def re_freeze_objective(trial):
    lr  = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs  = trial.suggest_categorical("batch_size", [8, 16, 32])
    pct = trial.suggest_float("freeze_pct", 0.25, 0.75)

    m = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label2id_re),
        id2label=id2label_re,
        label2id=label2id_re,
    )
    total = len([n for n,_ in m.named_parameters() if n.startswith("bert.encoder.layer.")])
    cut   = int(total * pct)
    for name, param in m.named_parameters():
        if name.startswith("bert.encoder.layer.") and int(name.split(".")[3]) < cut:
            param.requires_grad = False

    args = TrainingArguments(
        output_dir=f"tmp/re-freeze-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        evaluation_strategy="steps",
        eval_steps=10,
        logging_steps=10,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=m,
        args=args,
        train_dataset=hf_train_re,
        eval_dataset=hf_dev_re,
        tokenizer=tokenizer_re,
        compute_metrics=compute_metrics_re,
    )
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_re_freeze = optuna.create_study(direction="maximize")
study_re_freeze.optimize(re_freeze_objective, n_trials=8)
print("🏆 Best RE Freeze params:", study_re_freeze.best_params, "→ Dev-F1 =", study_re_freeze.best_value)


[I 2025-05-17 18:36:47,788] A new study created in memory with name: no-name-cfceb62a-ae33-477e-9451-e56f1393b9df
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.9573,3.909965,0.364686,0.364686,0.364686,0.364686
20,3.7567,3.644211,0.422442,0.422442,0.422442,0.422442
30,3.4376,3.425158,0.466997,0.466997,0.466997,0.466997
40,3.2283,3.24841,0.478548,0.478548,0.478548,0.478548
50,2.8656,3.116383,0.488449,0.488449,0.488449,0.488449
60,2.955,3.022203,0.493399,0.493399,0.493399,0.493399
70,3.0395,2.962284,0.49505,0.49505,0.49505,0.49505
80,2.8624,2.924422,0.49505,0.49505,0.49505,0.49505
90,2.8266,2.903704,0.49505,0.49505,0.49505,0.49505
100,2.6529,2.897018,0.49505,0.49505,0.49505,0.49505


[I 2025-05-17 18:45:19,494] Trial 0 finished with value: 0.49504950495049505 and parameters: {'learning_rate': 2.9876834036079085e-05, 'batch_size': 32, 'freeze_pct': 0.4892829271955335}. Best is trial 0 with value: 0.49504950495049505.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.1902,4.162264,0.033003,0.033003,0.033003,0.033003
20,4.0603,4.025545,0.316832,0.316832,0.316832,0.316832
30,3.887,3.91308,0.394389,0.394389,0.394389,0.394389
40,3.761,3.812786,0.420792,0.420792,0.420792,0.420792
50,3.8737,3.735861,0.442244,0.442244,0.442244,0.442244
60,3.7322,3.671133,0.455446,0.455446,0.455446,0.455446
70,3.6584,3.62002,0.458746,0.458746,0.458746,0.458746
80,3.4821,3.584121,0.465347,0.465347,0.465347,0.465347
90,3.4528,3.561368,0.466997,0.466997,0.466997,0.466997
100,3.4816,3.553952,0.466997,0.466997,0.466997,0.466997


[I 2025-05-17 18:49:55,271] Trial 1 finished with value: 0.466996699669967 and parameters: {'learning_rate': 2.1344895723176682e-05, 'batch_size': 8, 'freeze_pct': 0.4751237028768773}. Best is trial 0 with value: 0.49504950495049505.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.5237,4.46056,0.008251,0.008251,0.008251,0.008251
20,4.3712,4.355582,0.008251,0.008251,0.008251,0.008251
30,4.2528,4.263684,0.011551,0.011551,0.011551,0.011551
40,4.1428,4.182294,0.051155,0.051155,0.051155,0.051155
50,4.0693,4.113589,0.148515,0.148515,0.148515,0.148515
60,4.027,4.056708,0.252475,0.252475,0.252475,0.252475
70,3.9921,4.012719,0.338284,0.338284,0.338284,0.338284
80,3.9502,3.981887,0.371287,0.371287,0.371287,0.371287
90,3.9172,3.963177,0.40099,0.40099,0.40099,0.40099
100,3.8535,3.956617,0.40429,0.40429,0.40429,0.40429


[I 2025-05-17 18:58:17,826] Trial 2 finished with value: 0.4042904290429043 and parameters: {'learning_rate': 1.3216685877568885e-05, 'batch_size': 32, 'freeze_pct': 0.7254800724399941}. Best is trial 0 with value: 0.49504950495049505.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.1788,4.120126,0.127063,0.127063,0.127063,0.127063
20,4.0218,3.952847,0.381188,0.381188,0.381188,0.381188
30,3.8317,3.806957,0.425743,0.425743,0.425743,0.425743
40,3.6764,3.677771,0.458746,0.458746,0.458746,0.458746
50,3.4666,3.569592,0.468647,0.468647,0.468647,0.468647
60,3.472,3.480932,0.488449,0.488449,0.488449,0.488449
70,3.4477,3.413135,0.49505,0.49505,0.49505,0.49505
80,3.368,3.367398,0.49505,0.49505,0.49505,0.49505
90,3.301,3.340787,0.49505,0.49505,0.49505,0.49505
100,3.1997,3.331587,0.4967,0.4967,0.4967,0.4967


[I 2025-05-17 19:06:45,618] Trial 3 finished with value: 0.4966996699669967 and parameters: {'learning_rate': 2.1641651415244203e-05, 'batch_size': 32, 'freeze_pct': 0.5514573431729268}. Best is trial 3 with value: 0.4966996699669967.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.2216,4.218982,0.0,0.0,0.0,0.0
20,4.1411,4.133258,0.112211,0.112211,0.112211,0.112211
30,4.0339,4.062064,0.239274,0.239274,0.239274,0.239274
40,3.9495,3.998391,0.341584,0.341584,0.341584,0.341584
50,4.046,3.949198,0.377888,0.377888,0.377888,0.377888
60,3.9502,3.907484,0.39604,0.39604,0.39604,0.39604
70,3.9099,3.874328,0.39934,0.39934,0.39934,0.39934
80,3.7602,3.850776,0.409241,0.409241,0.409241,0.409241
90,3.7538,3.835855,0.417492,0.417492,0.417492,0.417492
100,3.7698,3.830971,0.417492,0.417492,0.417492,0.417492


[I 2025-05-17 19:11:38,363] Trial 4 finished with value: 0.41749174917491755 and parameters: {'learning_rate': 1.3329082649775433e-05, 'batch_size': 8, 'freeze_pct': 0.5818039532995977}. Best is trial 3 with value: 0.4966996699669967.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.471,4.360388,0.008251,0.008251,0.008251,0.008251
20,4.2302,4.171486,0.074257,0.074257,0.074257,0.074257
30,4.0211,4.005983,0.344884,0.344884,0.344884,0.344884
40,3.8324,3.859475,0.438944,0.438944,0.438944,0.438944
50,3.6511,3.737576,0.476898,0.476898,0.476898,0.476898
60,3.6083,3.636594,0.490099,0.490099,0.490099,0.490099
70,3.5726,3.560004,0.4967,0.4967,0.4967,0.4967
80,3.4669,3.507465,0.49835,0.49835,0.49835,0.49835
90,3.4231,3.476377,0.49835,0.49835,0.49835,0.49835
100,3.3027,3.465648,0.49835,0.49835,0.49835,0.49835


[I 2025-05-17 19:20:04,837] Trial 5 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 2.3948435737103438e-05, 'batch_size': 32, 'freeze_pct': 0.45040950263606655}. Best is trial 5 with value: 0.49834983498349833.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.0855,3.974114,0.361386,0.361386,0.361386,0.361386
20,3.7935,3.668435,0.455446,0.455446,0.455446,0.455446
30,3.4058,3.42971,0.481848,0.481848,0.481848,0.481848
40,3.1641,3.237098,0.490099,0.490099,0.490099,0.490099
50,3.3461,3.109392,0.4967,0.4967,0.4967,0.4967
60,3.1069,3.015908,0.49835,0.49835,0.49835,0.49835
70,2.9747,2.953489,0.49835,0.49835,0.49835,0.49835
80,2.7376,2.918473,0.49835,0.49835,0.49835,0.49835
90,2.7013,2.897051,0.49835,0.49835,0.49835,0.49835
100,2.7762,2.890987,0.49835,0.49835,0.49835,0.49835


[I 2025-05-17 19:24:40,761] Trial 6 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 4.84701464382748e-05, 'batch_size': 8, 'freeze_pct': 0.5770077788946468}. Best is trial 5 with value: 0.49834983498349833.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,4.485,4.387046,0.008251,0.008251,0.008251,0.008251
20,4.2677,4.220619,0.016502,0.016502,0.016502,0.016502
30,4.0825,4.074772,0.231023,0.231023,0.231023,0.231023
40,3.9145,3.945223,0.410891,0.410891,0.410891,0.410891
50,3.7616,3.836937,0.448845,0.448845,0.448845,0.448845
60,3.7188,3.747179,0.475248,0.475248,0.475248,0.475248
70,3.6819,3.678478,0.485149,0.485149,0.485149,0.485149
80,3.5924,3.630522,0.493399,0.493399,0.493399,0.493399
90,3.5508,3.601624,0.493399,0.493399,0.493399,0.493399
100,3.445,3.591578,0.4967,0.4967,0.4967,0.4967


[I 2025-05-17 19:33:07,005] Trial 7 finished with value: 0.4966996699669967 and parameters: {'learning_rate': 2.108399631517385e-05, 'batch_size': 32, 'freeze_pct': 0.46364901563028477}. Best is trial 5 with value: 0.49834983498349833.


🏆 Best RE Freeze params: {'learning_rate': 2.3948435737103438e-05, 'batch_size': 32, 'freeze_pct': 0.45040950263606655} → Dev-F1 = 0.49834983498349833


## 36: Final Full Fine-Tuning für RE

In [143]:
# === Kapitel 36: Final Full Fine-Tuning für RE (korrigiert) ===
from datasets import concatenate_datasets
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# --- compute_metrics noch mal definieren, falls nicht global ---
def compute_metrics_re(p):
    preds  = p.predictions.argmax(-1)
    labels = p.label_ids
    mask   = labels >= 0
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    acc = accuracy_score(labels[mask], preds[mask])
    return {"precision": prec, "recall": rec, "f1": f1, "accuracy": acc}

# 1) Train+Dev zusammenfassen
full_train = concatenate_datasets([ds["train"], ds["dev"]])

# 2) Beste FT-Parameter (aus Kap.33)
best_ft = study_re_ft.best_params

# 3) DataCollator & Modell laden
data_collator_re = DataCollatorWithPadding(tokenizer=tokenizer_re)
ft_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)

# 4) Trainings-Arguments
ft_args = TrainingArguments(
    output_dir="outputs/re-final-ft",
    evaluation_strategy="steps",
    eval_steps=20,
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="no",
    max_steps=200,
    per_device_train_batch_size=best_ft["batch_size"],
    per_device_eval_batch_size=best_ft["batch_size"] * 2,
    learning_rate=best_ft["learning_rate"],
    fp16=torch.cuda.is_available(),
)

# 5) Trainer initialisieren — jetzt MIT eval_dataset!
ft_trainer = Trainer(
    model=ft_model,
    args=ft_args,
    train_dataset=full_train,
    eval_dataset=ds["dev"],           # ← hier fehlt(e) es vorher
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_re,  # ← und hier die Metriken
)

# 6) Train & abschließende Dev-/Test-Evaluation
ft_trainer.train()
dev_metrics  = ft_trainer.evaluate(eval_dataset=ds["dev"])
test_metrics = ft_trainer.evaluate(eval_dataset=ds["test"])
print("🔖 RE Final Full-FT Dev:",  dev_metrics)
print("🏁 RE Final Full-FT Test:", test_metrics)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  ft_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,3.7027,2.904846,0.5,0.5,0.5,0.5
40,2.6948,2.56347,0.5,0.5,0.5,0.5
60,2.5687,2.506935,0.5,0.5,0.5,0.5
80,2.5347,2.445755,0.5,0.5,0.5,0.5
100,2.5202,2.419361,0.5,0.5,0.5,0.5
120,2.3785,2.339497,0.5,0.5,0.5,0.5
140,2.3927,2.286658,0.5,0.5,0.5,0.5
160,2.2102,2.25308,0.5,0.5,0.5,0.5
180,2.3011,2.197468,0.5,0.5,0.5,0.5
200,2.3082,2.202842,0.5,0.5,0.5,0.5


KeyError: 'test'

## Kapitel 37: Final LoRA Fine-Tuning für RE

In [156]:
# === Kapitel 37: Final LoRA Fine-Tuning für RE ===

from datasets import concatenate_datasets
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch

# 1) Metrics-Funktion
def compute_metrics(p):
    preds  = p.predictions.argmax(-1)
    labels = p.label_ids
    mask   = labels >= 0
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    acc = accuracy_score(labels[mask], preds[mask])
    return {"precision": prec, "recall": rec, "f1": f1, "accuracy": acc}

# 2) Deine besten LoRA-Hyperparams
lr      = 0.00032032068892167987
r       = 8
alpha   = 32
dropout = 0.19912026952009523
bs      = 16

# 3) Modell + LoRA-Adapter
base_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
lora_conf = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=r,
    lora_alpha=alpha,
    lora_dropout=dropout,
)
model = get_peft_model(base_model, lora_conf)

# 4) DataCollator & TrainingsArguments
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_re)

training_args = TrainingArguments(
    output_dir="outputs/re-final-lora",
    evaluation_strategy="steps",
    eval_steps=20,
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="no",
    max_steps=200,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    learning_rate=lr,
    fp16=torch.cuda.is_available(),
)

# 5) Train+Dev zusammenfassen
full_train = concatenate_datasets([ds["train"], ds["dev"]])

# 6) Trainer initialisieren
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train,
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 7) Trainieren und auswerten
trainer.train()
dev_metrics  = trainer.evaluate(eval_dataset=ds["dev"])
test_metrics = trainer.evaluate(eval_dataset=ds["test"])

print("🔖 Final RE LoRA — Dev:",  dev_metrics)
print("🏁 Final RE LoRA — Test:", test_metrics)
mpute_metrics_re,
)
lora_trainer.train()


SyntaxError: unmatched ')' (1851520589.py, line 87)

## 38: Final Partial-Freeze Fine-Tuning für RE

In [148]:
# === Kapitel 38: Final Partial-Freeze Fine-Tuning für RE ===

def compute_metrics_re(p):
    preds  = p.predictions.argmax(-1)
    labels = p.label_ids
    mask   = labels >= 0
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    acc = accuracy_score(labels[mask], preds[mask])
    return {"precision": prec, "recall": rec, "f1": f1, "accuracy": acc}

# 1) Train+Dev zusammenfassen
full_train = concatenate_datasets([ds["train"], ds["dev"]])

# 2) Beste Freeze-Parameter (aus Kapitel 35)
best_freeze = study_re_freeze.best_params
pct         = best_freeze["freeze_pct"]
bs          = best_freeze["batch_size"]
lr          = best_freeze["learning_rate"]

# 3) Modell laden und einfrieren
m = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
total_layers = len([n for n,_ in m.named_parameters() if n.startswith("bert.encoder.layer.")])
cutoff = int(total_layers * pct)
for name, param in m.named_parameters():
    if name.startswith("bert.encoder.layer.") and int(name.split(".")[3]) < cutoff:
        param.requires_grad = False

# 4) DataCollator & TrainingArguments
data_collator_re = DataCollatorWithPadding(tokenizer=tokenizer_re)
freeze_args = TrainingArguments(
    output_dir="outputs/re-final-freeze",
    evaluation_strategy="steps",
    eval_steps=20,
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="no",
    max_steps=200,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    learning_rate=lr,
    fp16=torch.cuda.is_available(),
)

# 5) Trainer & Train/Metriken
freeze_trainer = Trainer(
    model=m,
    args=freeze_args,
    train_dataset=full_train,
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_re,
)
freeze_trainer.train()
dev_metrics  = freeze_trainer.evaluate(eval_dataset=ds["dev"])
test_metrics = freeze_trainer.evaluate(eval_dataset=ds["test"])
print("🔖 RE Final Freeze Dev:",  dev_metrics)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  freeze_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,4.1511,3.904321,0.40099,0.40099,0.40099,0.40099
40,3.7218,3.458064,0.49835,0.49835,0.49835,0.49835
60,3.3473,3.103712,0.5,0.5,0.5,0.5
80,3.0295,2.846551,0.5,0.5,0.5,0.5
100,2.9086,2.686925,0.5,0.5,0.5,0.5
120,2.6467,2.598918,0.5,0.5,0.5,0.5
140,2.6067,2.554867,0.5,0.5,0.5,0.5
160,2.5874,2.531546,0.5,0.5,0.5,0.5
180,2.6416,2.518265,0.5,0.5,0.5,0.5
200,2.531,2.513658,0.5,0.5,0.5,0.5


KeyError: 'test'

## Kapitel 39: Hyperparameter-Tuning Full Fine-Tuning für GPT-Neo (RE)

In [169]:
# === Kapitel 39: GPT-Neo RE Full Fine-Tuning Hyperparam-Tuning ===

import optuna
import torch
from transformers import (
    AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding
)

def neo_re_ft_objective(trial):
    # 1) Hyper-Parameter
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])

    # 2) Modell laden
    model = AutoModelForSequenceClassification.from_pretrained(
        "EleutherAI/gpt-neo-125M",
        num_labels=len(label2id_re),
        id2label=id2label_re,
        label2id=label2id_re,
    )
    # ↑ Fix: pad_token_id setzen auf Dein RE-Tokenizer pad (BERT-pad=0)
    model.config.pad_token_id = tokenizer_re.pad_token_id

    # 3) Trainings-Argumente
    args = TrainingArguments(
        output_dir=f"tmp/neo-re-ft-{trial.number}",
        evaluation_strategy="steps",
        eval_steps=20,
        logging_steps=10,
        save_strategy="no",
        max_steps=100,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    # 4) Trainer initialisieren
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["dev"],
        tokenizer=tokenizer_re,
        data_collator=DataCollatorWithPadding(tokenizer_re),
        compute_metrics=compute_metrics_re,
    )

    # 5) Train & Eval
    trainer.train()
    return trainer.evaluate()["eval_f1"]


study_neo_re_ft = optuna.create_study(direction="maximize")
study_neo_re_ft.optimize(neo_re_ft_objective, n_trials=8)

print("🏆 Best GPT-Neo RE Full-FT params:", study_neo_re_ft.best_params,
      "→ Dev-F1 =", study_neo_re_ft.best_value)


[I 2025-05-18 02:51:22,154] A new study created in memory with name: no-name-e975b468-04f0-46a6-a5bd-82e233d60e48
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.6988,3.065078,0.490099,0.490099,0.490099,0.490099
40,2.5335,2.9026,0.49835,0.49835,0.49835,0.49835
60,2.3938,2.871885,0.5,0.5,0.5,0.5
80,2.2888,2.851628,0.49505,0.49505,0.49505,0.49505
100,1.7555,2.866771,0.5,0.5,0.5,0.5


[I 2025-05-18 02:59:15,366] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 2.1593610305165778e-05, 'batch_size': 16}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.9604,3.273297,0.5,0.5,0.5,0.5
40,3.2299,2.957267,0.491749,0.491749,0.491749,0.491749
60,2.4438,2.874211,0.493399,0.493399,0.493399,0.493399
80,1.9337,2.857775,0.4967,0.4967,0.4967,0.4967
100,2.9538,2.849956,0.49835,0.49835,0.49835,0.49835


[I 2025-05-18 03:04:01,707] Trial 1 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 1.6017733468724378e-05, 'batch_size': 4}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,3.3643,3.138107,0.5,0.5,0.5,0.5
40,2.8051,2.843173,0.50165,0.50165,0.50165,0.50165
60,3.0108,2.834888,0.49505,0.49505,0.49505,0.49505
80,2.5669,2.820672,0.5,0.5,0.5,0.5
100,2.5155,2.801967,0.49835,0.49835,0.49835,0.49835


[I 2025-05-18 03:09:20,255] Trial 2 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 1.3189981191712024e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.7744,2.906713,0.5,0.5,0.5,0.5
40,2.5637,2.783278,0.5,0.5,0.5,0.5
60,2.241,2.798984,0.5,0.5,0.5,0.5
80,2.2355,2.771923,0.5,0.5,0.5,0.5
100,1.6265,2.806891,0.5,0.5,0.5,0.5


[I 2025-05-18 03:16:13,521] Trial 3 finished with value: 0.5 and parameters: {'learning_rate': 4.312986845029776e-05, 'batch_size': 16}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.9722,2.980735,0.4967,0.4967,0.4967,0.4967
40,2.523,2.847092,0.5,0.5,0.5,0.5
60,2.8345,2.829383,0.5,0.5,0.5,0.5
80,2.3391,2.829108,0.5,0.5,0.5,0.5
100,2.393,2.811469,0.5,0.5,0.5,0.5


[I 2025-05-18 03:21:19,210] Trial 4 finished with value: 0.5 and parameters: {'learning_rate': 4.690273591206409e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.9945,3.266162,0.5,0.5,0.5,0.5
40,3.2638,2.956435,0.432343,0.432343,0.432343,0.432343
60,2.3161,2.780778,0.5,0.5,0.5,0.5
80,1.9912,2.77197,0.5,0.5,0.5,0.5
100,2.8801,2.765402,0.5,0.5,0.5,0.5


[I 2025-05-18 03:25:30,021] Trial 5 finished with value: 0.5 and parameters: {'learning_rate': 2.1586648731832532e-05, 'batch_size': 4}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.993,2.925017,0.5,0.5,0.5,0.5
40,2.8796,2.817191,0.49835,0.49835,0.49835,0.49835
60,2.4232,2.803146,0.485149,0.485149,0.485149,0.485149
80,2.4913,2.774386,0.488449,0.488449,0.488449,0.488449
100,1.8743,2.781367,0.4967,0.4967,0.4967,0.4967


[I 2025-05-18 03:32:19,567] Trial 6 finished with value: 0.4966996699669967 and parameters: {'learning_rate': 1.2559225392801358e-05, 'batch_size': 16}. Best is trial 0 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,3.0042,3.069555,0.49505,0.49505,0.49505,0.49505
40,2.8427,2.905274,0.49835,0.49835,0.49835,0.49835
60,2.5763,2.831778,0.49835,0.49835,0.49835,0.49835
80,2.396,2.793318,0.5,0.5,0.5,0.5
100,1.9921,2.788931,0.5,0.5,0.5,0.5


[I 2025-05-18 03:39:09,607] Trial 7 finished with value: 0.5 and parameters: {'learning_rate': 1.0147296366379076e-05, 'batch_size': 16}. Best is trial 0 with value: 0.5.


🏆 Best GPT-Neo RE Full-FT params: {'learning_rate': 2.1593610305165778e-05, 'batch_size': 16} → Dev-F1 = 0.5


## Kapitel 40: GPT-Neo RE LoRA Hyperparam-Tuning

In [170]:
# === Kapitel 40: GPT-Neo RE LoRA Hyperparam-Tuning ===

import optuna
import torch
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding
)

def neo_re_lora_objective(trial):
    # 1) Hyper-Params
    lr      = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    r       = trial.suggest_categorical("r", [4, 8, 16])
    alpha   = trial.suggest_categorical("alpha", [16, 32])
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    bs      = trial.suggest_categorical("batch_size", [4, 8, 16])

    # 2) LoRA aufsetzen
    lora_conf = LoraConfig(
        task_type="SEQ_CLS",
        inference_mode=False,
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
    )
    base = AutoModelForSequenceClassification.from_pretrained(
        "EleutherAI/gpt-neo-125M",
        num_labels=len(label2id_re),
        id2label=id2label_re,
        label2id=label2id_re,
    )
    # Fix pad_token_id
    base.config.pad_token_id = tokenizer_re.pad_token_id
    lora_model = get_peft_model(base, lora_conf)

    # 3) Trainings-Args
    args = TrainingArguments(
        output_dir=f"tmp/neo-re-lora-{trial.number}",
        evaluation_strategy="steps",
        eval_steps=20,
        logging_steps=10,
        save_strategy="no",
        max_steps=100,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    # 4) Trainer
    trainer = Trainer(
        model=lora_model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["dev"],
        tokenizer=tokenizer_re,
        data_collator=DataCollatorWithPadding(tokenizer_re),
        compute_metrics=compute_metrics_re,
    )

    # 5) Train & Eval
    trainer.train()
    return trainer.evaluate()["eval_f1"]


study_neo_re_lora = optuna.create_study(direction="maximize")
study_neo_re_lora.optimize(neo_re_lora_objective, n_trials=8)

print("🏆 Best GPT-Neo RE LoRA params:", study_neo_re_lora.best_params,
      "→ Dev-F1 =", study_neo_re_lora.best_value)


[I 2025-05-18 03:39:09,619] A new study created in memory with name: no-name-3a553752-5f24-4ca7-9016-2c585c7090c3
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.7649,5.215204,0.00165,0.00165,0.00165,0.00165
40,4.9404,4.878578,0.00495,0.00495,0.00495,0.00495
60,4.7951,4.649555,0.011551,0.011551,0.011551,0.011551
80,4.5783,4.511791,0.016502,0.016502,0.016502,0.016502
100,4.3679,4.463237,0.024752,0.024752,0.024752,0.024752


[I 2025-05-18 03:43:31,260] Trial 0 finished with value: 0.024752475247524754 and parameters: {'learning_rate': 2.0656131108914558e-05, 'r': 16, 'alpha': 16, 'dropout': 0.026479857787921366, 'batch_size': 8}. Best is trial 0 with value: 0.024752475247524754.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,3.1047,3.496833,0.5,0.5,0.5,0.5
40,2.4767,3.250067,0.4967,0.4967,0.4967,0.4967
60,3.0008,3.265537,0.5,0.5,0.5,0.5
80,2.3255,3.245646,0.5,0.5,0.5,0.5
100,2.5439,3.229023,0.5,0.5,0.5,0.5


[I 2025-05-18 03:47:53,577] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 0.0002827866072068735, 'r': 16, 'alpha': 16, 'dropout': 0.255526969488984, 'batch_size': 8}. Best is trial 1 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.5646,3.922038,0.5,0.5,0.5,0.5
40,3.7145,3.370856,0.480198,0.480198,0.480198,0.480198
60,2.5049,3.273753,0.49505,0.49505,0.49505,0.49505
80,2.1256,3.295169,0.49835,0.49835,0.49835,0.49835
100,2.9896,3.273296,0.49835,0.49835,0.49835,0.49835


[I 2025-05-18 03:51:14,711] Trial 2 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 0.00043383939848244343, 'r': 4, 'alpha': 32, 'dropout': 0.09455558574559031, 'batch_size': 4}. Best is trial 1 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,2.3615,3.794927,0.5,0.5,0.5,0.5
40,3.7842,3.441583,0.466997,0.466997,0.466997,0.466997
60,2.5417,3.250133,0.49835,0.49835,0.49835,0.49835
80,2.1062,3.308109,0.49835,0.49835,0.49835,0.49835
100,3.0949,3.293999,0.49835,0.49835,0.49835,0.49835


[I 2025-05-18 03:54:34,629] Trial 3 finished with value: 0.49834983498349833 and parameters: {'learning_rate': 0.0003421651976998089, 'r': 4, 'alpha': 16, 'dropout': 0.1559480095913568, 'batch_size': 4}. Best is trial 1 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,4.1568,3.858274,0.10066,0.10066,0.10066,0.10066
40,2.6434,3.086512,0.5,0.5,0.5,0.5
60,2.9202,3.057168,0.5,0.5,0.5,0.5
80,2.3793,3.06798,0.491749,0.491749,0.491749,0.491749
100,2.6042,3.069056,0.491749,0.491749,0.491749,0.491749


[I 2025-05-18 03:58:58,397] Trial 4 finished with value: 0.49174917491749176 and parameters: {'learning_rate': 9.501753675852592e-05, 'r': 4, 'alpha': 32, 'dropout': 0.24248496327515434, 'batch_size': 8}. Best is trial 1 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,4.335,4.12328,0.041254,0.041254,0.041254,0.041254
40,2.8846,3.08818,0.465347,0.465347,0.465347,0.465347
60,2.9232,3.063572,0.5,0.5,0.5,0.5
80,2.4028,3.074744,0.5,0.5,0.5,0.5
100,2.6215,3.074718,0.5,0.5,0.5,0.5


[I 2025-05-18 04:03:19,248] Trial 5 finished with value: 0.5 and parameters: {'learning_rate': 9.797567727942517e-05, 'r': 16, 'alpha': 16, 'dropout': 0.10715511849770963, 'batch_size': 8}. Best is trial 1 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,3.4229,3.693619,0.339934,0.339934,0.339934,0.339934
40,2.6204,3.393147,0.5,0.5,0.5,0.5
60,3.0359,3.188347,0.50165,0.50165,0.50165,0.50165
80,2.2756,3.198689,0.5,0.5,0.5,0.5
100,2.3986,3.165586,0.5,0.5,0.5,0.5


[I 2025-05-18 04:07:42,179] Trial 6 finished with value: 0.5 and parameters: {'learning_rate': 0.0007101062145830903, 'r': 4, 'alpha': 32, 'dropout': 0.10464782707469777, 'batch_size': 8}. Best is trial 1 with value: 0.5.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.5175,5.494432,0.0033,0.0033,0.0033,0.0033
40,5.0932,5.195218,0.00495,0.00495,0.00495,0.00495
60,5.3153,4.975839,0.008251,0.008251,0.008251,0.008251
80,4.8381,4.840773,0.009901,0.009901,0.009901,0.009901
100,4.6956,4.799141,0.009901,0.009901,0.009901,0.009901


[I 2025-05-18 04:11:03,549] Trial 7 finished with value: 0.009900990099009901 and parameters: {'learning_rate': 2.288000633868335e-05, 'r': 4, 'alpha': 16, 'dropout': 0.08153525636079527, 'batch_size': 4}. Best is trial 1 with value: 0.5.


🏆 Best GPT-Neo RE LoRA params: {'learning_rate': 0.0002827866072068735, 'r': 16, 'alpha': 16, 'dropout': 0.255526969488984, 'batch_size': 8} → Dev-F1 = 0.5


## Kapitel 41: GPT-Neo RE Partial-Freeze Hyperparam-Tuning

In [171]:
# === Kapitel 41: GPT-Neo RE Partial-Freeze Hyperparam-Tuning ===

import optuna
import torch
from transformers import (
    AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding
)

def neo_re_freeze_objective(trial):
    # 1) Hyper-Params
    lr  = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs  = trial.suggest_categorical("batch_size", [4, 8, 16])
    pct = trial.suggest_float("freeze_pct", 0.25, 0.75)

    # 2) Modell laden & einfrieren
    m = AutoModelForSequenceClassification.from_pretrained(
        "EleutherAI/gpt-neo-125M",
        num_labels=len(label2id_re),
        id2label=id2label_re,
        label2id=label2id_re,
    )
    m.config.pad_token_id = tokenizer_re.pad_token_id

    total = len([n for n,_ in m.named_parameters() if n.startswith("transformer.h.")])
    cut   = int(total * pct)
    for name,param in m.named_parameters():
        if name.startswith("transformer.h.") and int(name.split(".")[2]) < cut:
            param.requires_grad = False

    # 3) Trainings-Args
    args = TrainingArguments(
        output_dir=f"tmp/neo-re-freeze-{trial.number}",
        evaluation_strategy="steps",
        eval_steps=20,
        logging_steps=10,
        save_strategy="no",
        max_steps=100,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
    )

    # 4) Trainer
    trainer = Trainer(
        model=m,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["dev"],
        tokenizer=tokenizer_re,
        data_collator=DataCollatorWithPadding(tokenizer_re),
        compute_metrics=compute_metrics_re,
    )

    # 5) Train & Eval
    trainer.train()
    return trainer.evaluate()["eval_f1"]


study_neo_re_freeze = optuna.create_study(direction="maximize")
study_neo_re_freeze.optimize(neo_re_freeze_objective, n_trials=8)

print("🏆 Best GPT-Neo RE Freeze params:", study_neo_re_freeze.best_params,
      "→ Dev-F1 =", study_neo_re_freeze.best_value)


[I 2025-05-18 04:11:03,560] A new study created in memory with name: no-name-0d2a3390-569c-4772-93c5-83461bd75b45
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.284,5.385854,0.0033,0.0033,0.0033,0.0033
40,5.0244,5.002536,0.008251,0.008251,0.008251,0.008251
60,4.6021,4.758359,0.009901,0.009901,0.009901,0.009901
80,4.0464,4.618761,0.014851,0.014851,0.014851,0.014851
100,4.2555,4.570947,0.016502,0.016502,0.016502,0.016502


[I 2025-05-18 04:15:29,880] Trial 0 finished with value: 0.0165016501650165 and parameters: {'learning_rate': 2.351527731326995e-05, 'batch_size': 8, 'freeze_pct': 0.48507277200501586}. Best is trial 0 with value: 0.0165016501650165.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,4.9419,4.781651,0.006601,0.006601,0.006601,0.006601
40,4.4674,4.246407,0.006601,0.006601,0.006601,0.006601
60,3.9709,3.908218,0.067657,0.067657,0.067657,0.067657
80,3.5835,3.724397,0.132013,0.132013,0.132013,0.132013
100,3.5247,3.66218,0.155116,0.155116,0.155116,0.155116


[I 2025-05-18 04:19:56,666] Trial 1 finished with value: 0.1551155115511551 and parameters: {'learning_rate': 3.464080464231026e-05, 'batch_size': 8, 'freeze_pct': 0.4393379142919582}. Best is trial 1 with value: 0.1551155115511551.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.4534,5.178004,0.0033,0.0033,0.0033,0.0033
40,5.1144,4.934453,0.00495,0.00495,0.00495,0.00495
60,4.7441,4.764187,0.006601,0.006601,0.006601,0.006601
80,4.679,4.661656,0.006601,0.006601,0.006601,0.006601
100,4.534,4.627132,0.006601,0.006601,0.006601,0.006601


[I 2025-05-18 04:25:52,008] Trial 2 finished with value: 0.006600660066006601 and parameters: {'learning_rate': 1.3682997912141893e-05, 'batch_size': 16, 'freeze_pct': 0.5351551386727706}. Best is trial 1 with value: 0.1551155115511551.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.5035,5.419997,0.00165,0.00165,0.00165,0.00165
40,5.6571,5.260497,0.00165,0.00165,0.00165,0.00165
60,5.1932,5.147106,0.00165,0.00165,0.00165,0.00165
80,4.5325,5.077394,0.00165,0.00165,0.00165,0.00165
100,5.2965,5.056298,0.00165,0.00165,0.00165,0.00165


[I 2025-05-18 04:29:21,943] Trial 3 finished with value: 0.0016501650165016502 and parameters: {'learning_rate': 1.0390261096975955e-05, 'batch_size': 4, 'freeze_pct': 0.3403133527773655}. Best is trial 1 with value: 0.1551155115511551.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.4487,5.173009,0.0033,0.0033,0.0033,0.0033
40,5.105,4.925798,0.00495,0.00495,0.00495,0.00495
60,4.7313,4.753035,0.006601,0.006601,0.006601,0.006601
80,4.6639,4.649024,0.006601,0.006601,0.006601,0.006601
100,4.5154,4.614009,0.006601,0.006601,0.006601,0.006601


[I 2025-05-18 04:35:18,965] Trial 4 finished with value: 0.006600660066006601 and parameters: {'learning_rate': 1.3896797585741956e-05, 'batch_size': 16, 'freeze_pct': 0.4348566215743214}. Best is trial 1 with value: 0.1551155115511551.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.2024,5.096635,0.00165,0.00165,0.00165,0.00165
40,5.1198,4.690338,0.011551,0.011551,0.011551,0.011551
60,4.2657,4.415658,0.031353,0.031353,0.031353,0.031353
80,3.4551,4.253765,0.059406,0.059406,0.059406,0.059406
100,4.3716,4.20664,0.066007,0.066007,0.066007,0.066007


[I 2025-05-18 04:38:50,142] Trial 5 finished with value: 0.066006600660066 and parameters: {'learning_rate': 2.7867044773781677e-05, 'batch_size': 4, 'freeze_pct': 0.6510004317785507}. Best is trial 1 with value: 0.1551155115511551.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.1509,5.03102,0.0033,0.0033,0.0033,0.0033
40,4.95,4.672598,0.006601,0.006601,0.006601,0.006601
60,4.5159,4.434186,0.00495,0.00495,0.00495,0.00495
80,4.266,4.297263,0.00495,0.00495,0.00495,0.00495
100,4.2012,4.249631,0.00495,0.00495,0.00495,0.00495


[I 2025-05-18 04:43:15,119] Trial 6 finished with value: 0.0049504950495049506 and parameters: {'learning_rate': 2.2426496344383954e-05, 'batch_size': 8, 'freeze_pct': 0.5896594545402926}. Best is trial 1 with value: 0.1551155115511551.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,5.2935,5.083532,0.00495,0.00495,0.00495,0.00495
40,4.6864,4.770689,0.006601,0.006601,0.006601,0.006601
60,4.5564,4.554609,0.00495,0.00495,0.00495,0.00495
80,4.4668,4.422428,0.006601,0.006601,0.006601,0.006601
100,4.3197,4.381373,0.006601,0.006601,0.006601,0.006601


[I 2025-05-18 04:46:45,066] Trial 7 finished with value: 0.006600660066006601 and parameters: {'learning_rate': 2.2559916493357485e-05, 'batch_size': 4, 'freeze_pct': 0.3952181431193523}. Best is trial 1 with value: 0.1551155115511551.


🏆 Best GPT-Neo RE Freeze params: {'learning_rate': 3.464080464231026e-05, 'batch_size': 8, 'freeze_pct': 0.4393379142919582} → Dev-F1 = 0.1551155115511551


## 42: Final Full Fine-Tuning für GPT-Neo RE mit den besten Hyper-Params

In [172]:
# === Kapitel 42: Final Full Fine-Tuning für GPT-Neo RE mit den besten Hyper-Params ===

from datasets import concatenate_datasets
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# 1) Train+Dev zusammenführen
full_train = concatenate_datasets([ds["train"], ds["dev"]])

# 2) Beste Hyper-Params aus Optuna-Study
best_params = {
    "learning_rate": 2.1593610305165778e-05,
    "batch_size":    16,
}

# 3) Modell & Pad-Token konfigurieren
model = AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
# Wichtig für GPT-Neo: sag dem Modell, welches Token es als PAD ignorieren soll
model.config.pad_token_id = tokenizer_re.pad_token_id

# 4) Data-Collator
data_collator_re = DataCollatorWithPadding(tokenizer_re)

# 5) Trainings-Argumente
ft_args = TrainingArguments(
    output_dir="outputs/neo-re-final-ft",
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    save_strategy="no",
    max_steps=100,
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"] * 2,
    fp16=torch.cuda.is_available(),
)

# 6) Trainer initialisieren
ft_trainer = Trainer(
    model=model,
    args=ft_args,
    train_dataset=full_train,
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_re,
)

# 7) Training & Auswertung
ft_trainer.train()
dev_metrics = ft_trainer.evaluate()
print("🔖 GPT-Neo RE Final Full-FT Dev-F1:     ", dev_metrics["eval_f1"])
print("🔖 GPT-Neo RE Final Full-FT Dev-Accuracy:", dev_metrics.get("eval_accuracy"))


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  ft_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.9689,3.269184,0.5,0.5,0.5,0.5
20,3.2935,2.821665,0.407591,0.407591,0.407591,0.407591
30,2.7732,2.515672,0.49505,0.49505,0.49505,0.49505
40,2.5674,2.463848,0.5,0.5,0.5,0.5
50,2.5286,2.420529,0.485149,0.485149,0.485149,0.485149
60,2.4072,2.352352,0.5,0.5,0.5,0.5
70,2.4921,2.32212,0.5,0.5,0.5,0.5
80,2.4059,2.305504,0.5,0.5,0.5,0.5
90,2.4647,2.294711,0.5,0.5,0.5,0.5
100,2.4151,2.288176,0.5,0.5,0.5,0.5


🔖 GPT-Neo RE Final Full-FT Dev-F1:      0.5
🔖 GPT-Neo RE Final Full-FT Dev-Accuracy: 0.5


## 43: Final LoRA Fine-Tuning für GPT-Neo RE

In [174]:
# === Kapitel 43: Final LoRA Fine-Tuning für GPT-Neo RE ===

from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# 1) Train+Dev zusammenführen
full_train = concatenate_datasets([ds["train"], ds["dev"]])

# 2) Beste LoRA-Hyper-Params
best_lora = {
    "learning_rate": 0.0002827866072068735,
    "r":             16,
    "alpha":         16,
    "dropout":       0.255526969488984,
    "batch_size":    8,
}

# 3) Basis-Modell laden
base = AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
# Pad-Token für GPT-Neo setzen
base.config.pad_token_id = tokenizer_re.pad_token_id

# 4) LoRA konfigurieren & anwenden
lora_conf = LoraConfig(
    task_type="SEQUENCE_CLS",
    inference_mode=False,
    r=best_lora["r"],
    lora_alpha=best_lora["alpha"],
    lora_dropout=best_lora["dropout"],
)
model = get_peft_model(base, lora_conf)

# 5) DataCollator
data_collator_re = DataCollatorWithPadding(tokenizer_re)

# 6) Trainings-Argumente
lora_args = TrainingArguments(
    output_dir="outputs/neo-re-final-lora",
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=10,
    save_strategy="no",
    max_steps=200,
    learning_rate=best_lora["learning_rate"],
    per_device_train_batch_size=best_lora["batch_size"],
    per_device_eval_batch_size=best_lora["batch_size"] * 2,
    fp16=torch.cuda.is_available(),
)

# 7) Trainer initialisieren & trainieren
lora_trainer = Trainer(
    model=model,
    args=lora_args,
    train_dataset=full_train,
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_re,
)

lora_trainer.train()
lora_metrics = lora_trainer.evaluate()
print("🔖 GPT-Neo RE Final LoRA Dev-F1:     ", lora_metrics["eval_f1"])
print("🔖 GPT-Neo RE Final LoRA Dev-Accuracy:", lora_metrics.get("eval_accuracy"))

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Invalid task type: 'SEQUENCE_CLS'. Must be one of the following task types: SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION.

## 44: Final Partial-Freeze Fine-Tuning für GPT-Neo RE

In [176]:
best_freeze = {
    "learning_rate": 3.464080464231026e-05,
    "batch_size":    8,
    "freeze_pct":    0.4393379142919582,
}

# 3) Modell laden & Pad-Token setzen
model = AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
model.config.pad_token_id = tokenizer_re.pad_token_id

# 4) Freeze-Anteil der Layers
total_layers = model.config.num_layers
cutoff = int(total_layers * best_freeze["freeze_pct"])
for name, param in model.named_parameters():
    if name.startswith("transformer.h.") and int(name.split(".")[2]) < cutoff:
        param.requires_grad = False

# 5) DataCollator & Trainings-Argumente
freeze_args = TrainingArguments(
    output_dir="outputs/neo-re-final-freeze",
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    save_strategy="no",
    max_steps=100,
    learning_rate=best_freeze["learning_rate"],
    per_device_train_batch_size=best_freeze["batch_size"],
    per_device_eval_batch_size=best_freeze["batch_size"] * 2,
    fp16=torch.cuda.is_available(),
)

# 6) Trainer initialisieren & trainieren
freeze_trainer = Trainer(
    model=model,
    args=freeze_args,
    train_dataset=full_train,
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_re,
)

freeze_trainer.train()
freeze_metrics = freeze_trainer.evaluate()
print("🔖 GPT-Neo RE Final Freeze Dev-F1:     ", freeze_metrics["eval_f1"])
print("🔖 GPT-Neo RE Final Freeze Dev-Accuracy:", freeze_metrics.get("eval_accuracy"))

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  freeze_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.4486,3.22363,0.5,0.5,0.5,0.5
20,2.8198,2.905955,0.493399,0.493399,0.493399,0.493399
30,2.951,2.799154,0.493399,0.493399,0.493399,0.493399
40,3.0805,2.639364,0.49505,0.49505,0.49505,0.49505
50,2.6641,2.558183,0.49835,0.49835,0.49835,0.49835
60,2.4998,2.519373,0.5,0.5,0.5,0.5
70,2.147,2.487896,0.5,0.5,0.5,0.5
80,2.8962,2.488675,0.5,0.5,0.5,0.5
90,2.2115,2.477239,0.5,0.5,0.5,0.5
100,2.8254,2.469461,0.5,0.5,0.5,0.5


🔖 GPT-Neo RE Final Freeze Dev-F1:      0.5
🔖 GPT-Neo RE Final Freeze Dev-Accuracy: 0.5


## Basline GPT RE

In [177]:
# === Kapitel 32b: Baseline-Training & -Eval für RE mit GPT-Neo-125M ===

from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import f1_score, accuracy_score

# 32b.1: GPT-Neo als Sequence-Classifier laden
model_neo_re = AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
# sicherheitshalber Pad-Token setzen (GPT-Neo braucht das zum Padden)
model_neo_re.config.pad_token_id = tokenizer_re.pad_token_id

# 32b.2: TrainingArguments sehr ähnlich zu BERT-Baseline
neo_re_baseline_args = TrainingArguments(
    output_dir="outputs/neo-re-baseline",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
)

# 32b.3: DataCollator (pad auf max der Batch)
data_collator_re = DataCollatorWithPadding(tokenizer=tokenizer_re)

# 32b.4: Compute-Metrics (F1 + Accuracy)
def compute_metrics_re(p):
    preds = p.predictions.argmax(-1)
    return {
        "eval_f1":       f1_score(p.label_ids, preds, average="macro"),
        "eval_accuracy": accuracy_score(p.label_ids, preds),
    }

# 32b.5: Trainer instanziieren
trainer_neo_re = Trainer(
    model=model_neo_re,
    args=neo_re_baseline_args,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_re,
)

# 32b.6: Train & Eval
trainer_neo_re.train()
metrics_neo_re = trainer_neo_re.evaluate()
print("🔖 GPT-Neo RE Baseline Dev-F1:",       metrics_neo_re["eval_f1"])
print("🔖 GPT-Neo RE Baseline Dev-Accuracy:", metrics_neo_re["eval_accuracy"])


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_neo_re = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,2.3561,2.842299,0.021505,0.5
2,2.0065,2.919954,0.021553,0.5
3,1.5117,2.988148,0.023764,0.5


🔖 GPT-Neo RE Baseline Dev-F1: 0.02376404835672195
🔖 GPT-Neo RE Baseline Dev-Accuracy: 0.5


## LOOP with NEO same as bert

In [178]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import torch
from sklearn.metrics import f1_score, accuracy_score

# 1) Load GPT-Neo for sequence classification with your RE label‐maps
model_re_neo_ft = AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re,
)
# make sure Neo has a pad token
model_re_neo_ft.config.pad_token_id = tokenizer_re.pad_token_id

# 2) Your best FT hyperparameters for GPT-Neo RE
best_neo_ft = {
    "learning_rate": 2.1593610305165778e-05,
    "batch_size":    16,
}

# 3) TrainingArguments (100 steps, log & eval every 10)
training_args_neo_ft = TrainingArguments(
    output_dir="outputs/re-neo-final-ft",
    evaluation_strategy="steps",
    eval_steps=10,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    max_steps=100,
    per_device_train_batch_size=best_neo_ft["batch_size"],
    per_device_eval_batch_size=best_neo_ft["batch_size"] * 2,
    learning_rate=best_neo_ft["learning_rate"],
    fp16=torch.cuda.is_available(),
)

# 4) Data collator
data_collator_re = DataCollatorWithPadding(tokenizer=tokenizer_re)

# 5) Metrics (F1 + accuracy)
def compute_metrics_neo_ft(p):
    preds = p.predictions.argmax(-1)
    return {
        "eval_f1":       f1_score(p.label_ids, preds, average="macro"),
        "eval_accuracy": accuracy_score(p.label_ids, preds),
    }

# 6) Trainer
trainer_re_neo_ft = Trainer(
    model=model_re_neo_ft,
    args=training_args_neo_ft,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    data_collator=data_collator_re,
    compute_metrics=compute_metrics_neo_ft,
)

# 7) Train & Evaluate
trainer_re_neo_ft.train()
metrics = trainer_re_neo_ft.evaluate()
print("🔖 GPT-Neo RE Final-FT Dev-F1:",       metrics["eval_f1"])
print("🔖 GPT-Neo RE Final-FT Dev-Accuracy:", metrics["eval_accuracy"])

# 8) (Optional) Save
trainer_re_neo_ft.save_model("outputs/re-neo-final-ft")


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_re_neo_ft = Trainer(


Step,Training Loss,Validation Loss,F1,Accuracy
10,3.9931,3.277524,0.021577,0.5
20,2.9695,2.907632,0.033351,0.473597
30,3.0046,2.811207,0.021577,0.49835
40,2.6699,2.835491,0.021529,0.5
50,2.4416,2.780799,0.021481,0.493399
60,2.3166,2.782464,0.021457,0.491749
70,2.4331,2.775737,0.021482,0.49505
80,2.3621,2.756645,0.021529,0.4967
90,2.0923,2.77126,0.021529,0.5
100,1.7614,2.775944,0.021529,0.5


🔖 GPT-Neo RE Final-FT Dev-F1: 0.021529060679266732
🔖 GPT-Neo RE Final-FT Dev-Accuracy: 0.5
