## 1. Setup & Imports

In [9]:
import os
import re
import random
import json
from pathlib import Path
from datasets import Dataset, DatasetDict

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding,
)
from peft import get_peft_config, get_peft_model, LoraConfig

import numpy as np
import sklearn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Pfade & Daten laden

In [10]:
# Deine lokalen Windows-Pfade
TRAIN_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\train")
DEV_DIR   = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\dev")
TEST_DIR  = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\test")

assert TRAIN_DIR.exists(), f"Train-Ordner nicht gefunden: {TRAIN_DIR}"
assert DEV_DIR.exists(),   f"Dev-Ordner nicht gefunden:   {DEV_DIR}"
assert TEST_DIR.exists(),  f"Test-Ordner nicht gefunden:  {TEST_DIR}"

In [11]:

def load_docie_docs(folder: Path, recursive: bool = False):
    docs = []
    pattern = "**/*.json" if recursive else "*.json"
    for file in folder.glob(pattern):
        data = json.loads(file.read_text(encoding="utf-8"))
        if isinstance(data, list):
            docs.extend(data)
        else:
            docs.append(data)
    return docs

train_docs = load_docie_docs(TRAIN_DIR)
dev_docs   = load_docie_docs(DEV_DIR)
test_docs  = load_docie_docs(TEST_DIR, recursive=True)

print("Train:", len(train_docs), "│ Dev:", len(dev_docs), "│ Test:", len(test_docs))
print("Train-Keys:", train_docs[0].keys())
print("Dev-Keys:  ", dev_docs[0].keys())
print("Test-Keys: ", test_docs[0].keys())

Train: 51 │ Dev: 23 │ Test: 248
Train-Keys: dict_keys(['domain', 'title', 'doc', 'entities', 'triples', 'label_set', 'entity_label_set'])
Dev-Keys:   dict_keys(['domain', 'title', 'doc', 'entities', 'triples', 'label_set', 'entity_label_set'])
Test-Keys:  dict_keys(['domain', 'document', 'RE_label_set', 'NER_label_set', 'id'])


In [12]:
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    add_prefix_space=True
)

# ── WICHTIG: Pad-Token hinzufügen **vor** jeder Tokenisierung ──
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

## 3. HuggingFace-Datasets erstellen

In [13]:
# Robustes Mapping für Text-Feld
def docs_to_hf(docs):
    records = []
    for doc in docs:
        if "doc" in doc:
            text = doc["doc"]
        elif "document" in doc:
            text = doc["document"]
        else:
            raise KeyError(f"Kein Text-Feld in {doc.keys()}")
        entities = doc.get("entities", [])
        records.append({"text": text, "entities": entities})
    return Dataset.from_list(records)

hf_datasets = DatasetDict({
    "train": docs_to_hf(train_docs),
    "dev":   docs_to_hf(dev_docs),
    "test":  docs_to_hf(test_docs),
})
print(hf_datasets)


DatasetDict({
    train: Dataset({
        features: ['text', 'entities'],
        num_rows: 51
    })
    dev: Dataset({
        features: ['text', 'entities'],
        num_rows: 23
    })
    test: Dataset({
        features: ['text', 'entities'],
        num_rows: 248
    })
})


## 4. Label Mapping

In [14]:
# Entity-Typen aus Trainingsset
entity_types = train_docs[0]["entity_label_set"]
ner_labels   = ["O"] + [f"{p}-{t}" for t in entity_types for p in ("B","I")]
label2id     = {l:i for i,l in enumerate(ner_labels)}
id2label     = {i:l for l,i in label2id.items()}
print("Anzahl NER-Labels:", len(ner_labels))


Anzahl NER-Labels: 39


## 5. Tokenizer & Data Collator

In [15]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id
)

# ── WICHTIG: Embedding-Matrix ans erweiterte Vokabular anpassen ──
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

data_collator = DataCollatorForTokenClassification(tokenizer)



Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


## 6. Tokenize & Align Labels (Char-Offset Mapping)

In [16]:
# ─── Chapter 6: Tokenize & Align Labels mit Chunking (BERT-Style) ───

import re

def tokenize_and_align_labels(examples):
    # 6.1 Tokenize mit Chunking und overlap
    tokenized = tokenizer(
        examples["text"],
        return_offsets_mapping=True,
        truncation=True,
        max_length=1024,
        stride=128,
        return_overflowing_tokens=True,
        return_special_tokens_mask=True,
        padding="max_length",
    )

    # 6.2 Hol Dir die Mappings, die nachher weggespeichert werden
    sample_map    = tokenized.pop("overflow_to_sample_mapping")
    offsets       = tokenized.pop("offset_mapping")
    special_masks = tokenized.pop("special_tokens_mask")

    labels = []
    # 6.3 Pro Chunk eine Label-Liste bauen
    for chunk_idx, doc_idx in enumerate(sample_map):
        # Original-Bezug
        text     = examples["text"][doc_idx]
        entities = examples["entities"][doc_idx]

        # Initial: alle Tokens auf "O"
        chunk_offsets = offsets[chunk_idx]
        chunk_labels  = [label2id["O"]] * len(chunk_offsets)

        # Für jede Entität alle Vorkommen markieren
        for ent in entities:
            b_id = label2id[f"B-{ent['type']}"]
            i_id = label2id[f"I-{ent['type']}"]
            for mention in ent.get("mentions", []):
                for m in re.finditer(re.escape(mention), text):
                    s_char, e_char = m.span()
                    # finde Token-IDs, die vollständig innerhalb des Mention-Spans liegen
                    covered = [
                        tok_i for tok_i, (s,e) in enumerate(chunk_offsets)
                        if s >= s_char and e <= e_char
                    ]
                    if not covered:
                        continue
                    chunk_labels[covered[0]] = b_id
                    for tid in covered[1:]:
                        chunk_labels[tid] = i_id

        # Maskiere Sonder-Tokens und overflowend Tokens auf -100
        for i, mask in enumerate(special_masks[chunk_idx]):
            if mask == 1:        # [CLS],[SEP],[PAD]…
                chunk_labels[i] = -100

        labels.append(chunk_labels)

    # 6.4 Füge die fertigen Labels wieder hinzu
    tokenized["labels"] = labels
    return tokenized

# 6.5 Anwenden auf Train & Dev
hf_train = hf_datasets["train"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["text","entities"]
)
hf_dev = hf_datasets["dev"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["text","entities"]
)

print(hf_train)
print(hf_dev)


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 90
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 41
})


## 7. Eval-Metrik

In [17]:
def compute_metrics_entity_only(p):
    preds  = p.predictions.argmax(-1).reshape(-1)
    labels = p.label_ids.reshape(-1)
    mask   = labels >= 0
    p_, r_, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    acc = accuracy_score(labels[mask], preds[mask])
    return {"precision": p_, "recall": r_, "f1": f1, "accuracy": acc}


## 8. GPT NER baseline

In [25]:
from sklearn.metrics import accuracy_score

# ─── GPT-Neo NER Training mit 100 Steps & Evaluation alle 10 Steps ───

# 1. Trainingsargumente
baseline_args = TrainingArguments(
    output_dir="outputs/gptneo-ner-baseline",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    evaluation_strategy="steps",     # Evaluation alle X Schritte
    eval_steps=10,
    save_strategy="no",              # Keine Checkpoints speichern
    max_steps=100,                   # Maximal 100 Trainingsschritte
    learning_rate=3e-3,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    disable_tqdm=False,
    report_to=[],
)

# 2. Modell laden & Embeddings resize
baseline_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id
)
baseline_model.resize_token_embeddings(len(tokenizer))
baseline_model.config.pad_token_id = tokenizer.pad_token_id

# 3. Trainer definieren
baseline_trainer = Trainer(
    model=baseline_model,
    args=baseline_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

# 4. Training & finale Evaluation
baseline_trainer.train()
baseline_metrics = baseline_trainer.evaluate()
print("🔖 GPT-Neo Baseline Dev-F1:", baseline_metrics["eval_f1"])


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  baseline_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.1939,1.06027,0.873791,0.873791,0.873791,0.873791
20,1.0815,0.714682,0.873791,0.873791,0.873791,0.873791
30,0.9589,0.660523,0.873791,0.873791,0.873791,0.873791
40,0.8294,0.631003,0.874129,0.874129,0.874129,0.874129
50,0.7123,0.620202,0.863845,0.863845,0.863845,0.863845
60,0.635,0.706414,0.867668,0.867668,0.867668,0.867668
70,0.5641,0.786124,0.860497,0.860497,0.860497,0.860497


KeyboardInterrupt: 

## 9: GPT NER Full fine tuning

In [23]:
# ─── Chapter 9: Schnelles Full-FT mit Best-Params (keine Chunking) ───

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# 9.1: Beste Hyperparameter
best = {"learning_rate": 2.7233372871192413e-05, "batch_size": 16}

# 9.2: Model laden + Embedding-Resize
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# 9.3: TrainingArguments wie im Optuna-Notebook
args = TrainingArguments(
    output_dir="outputs/gptneo-full-opt",
    per_device_train_batch_size=best["batch_size"],
    per_device_eval_batch_size=best["batch_size"] * 2,
    evaluation_strategy="steps",   # alle X Schritte evaluiert
    eval_steps=10,
    logging_steps=10,
    save_strategy="no",            # keine Checkpoints
    max_steps=100,                 # nur 200 Training-Steps
    learning_rate=best["learning_rate"],
    fp16=torch.cuda.is_available(),
)

# 9.4: Trainer (mit simplem Tokenizer-Setup)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=hf_train,   # Achtung: hier hf_train ohne Chunking! s.u.
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

# 9.5: Trainieren
trainer.train()


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

## 10. Test Inference