## 1. Setup & Imports

In [60]:
import os
import re
import random
import json
from pathlib import Path
from datasets import Dataset

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding,
)
from peft import get_peft_config, get_peft_model, LoraConfig

import numpy as np
import sklearn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Pfade & Daten laden

In [61]:
# Deine lokalen Windows-Pfade
TRAIN_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\train")
DEV_DIR   = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\dev")
TEST_DIR  = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\test")

assert TRAIN_DIR.exists(), f"Train-Ordner nicht gefunden: {TRAIN_DIR}"
assert DEV_DIR.exists(),   f"Dev-Ordner nicht gefunden:   {DEV_DIR}"
assert TEST_DIR.exists(),  f"Test-Ordner nicht gefunden:  {TEST_DIR}"

In [62]:
def load_docie_docs(folder: Path, recursive: bool = False):
    docs = []
    pattern = "**/*.json" if recursive else "*.json"
    for file in folder.glob(pattern):
        data = json.loads(file.read_text(encoding="utf-8"))
        if isinstance(data, list):
            docs.extend(data)
        else:
            docs.append(data)
    return docs

train_docs = load_docie_docs(TRAIN_DIR)
dev_docs   = load_docie_docs(DEV_DIR)
test_docs  = load_docie_docs(TEST_DIR, recursive=True)

print("Train:", len(train_docs), "│ Dev:", len(dev_docs), "│ Test:", len(test_docs))


Train: 51 │ Dev: 23 │ Test: 248


## 3. Label-Mapping

In [63]:
# Entity-Typen & B-I Labels
entity_types = train_docs[0]["entity_label_set"]
ner_labels = ["O"] + [f"{p}-{t}" for t in entity_types for p in ("B","I")]
label2id   = {l:i for i,l in enumerate(ner_labels)}
id2label   = {i:l for l,i in label2id.items()}
print("Anzahl NER-Labels:", len(ner_labels))


Anzahl NER-Labels: 39


## 4. Tokenizer and dataset objects

In [64]:
model_name = "bert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# HF-Datasets
hf_train = Dataset.from_list(train_docs)
hf_dev   = Dataset.from_list(dev_docs)

max_length, stride = 512, 128


## 5. Tokenize & Align Labels

In [65]:
def tokenize_and_align_labels(examples):
    all_input_ids, all_attention_mask, all_labels = [], [], []
    for doc, entities in zip(examples["doc"], examples["entities"]):
        tok = tokenizer(
            doc, return_offsets_mapping=True,
            truncation=True, max_length=max_length,
            stride=stride, return_overflowing_tokens=True
        )
        for i in range(len(tok["input_ids"])):
            offsets = tok["offset_mapping"][i]
            labels  = ["O"] * len(offsets)
            # Mentions einzeichnen…
            for ent in entities:
                for mention in ent["mentions"]:
                    start = doc.find(mention)
                    end   = start + len(mention)
                    for idx,(o_start,o_end) in enumerate(offsets):
                        if o_start>=start and o_end<=end:
                            labels[idx] = ("B" if o_start==start else "I") + f"-{ent['type']}"
            all_input_ids.append(tok["input_ids"][i])
            all_attention_mask.append(tok["attention_mask"][i])
            all_labels.append([label2id.get(l,0) for l in labels])
    return {"input_ids": all_input_ids,
            "attention_mask": all_attention_mask,
            "labels": all_labels}

## 6. Prepare Data

In [66]:
cols_to_remove = ["domain","title","doc","entities","triples","label_set","entity_label_set"]
hf_train = hf_train.map(tokenize_and_align_labels, batched=True, remove_columns=cols_to_remove)
hf_dev   = hf_dev.map(  tokenize_and_align_labels, batched=True, remove_columns=cols_to_remove)
data_collator = DataCollatorForTokenClassification(tokenizer)


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

## 7. Eval-Metrik

In [67]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics_entity_only(pred):
    preds  = pred.predictions.argmax(-1).flatten()
    labels = pred.label_ids.flatten()
    # Nur echte Entity-Token (kein O, kein -100)
    mask = (labels != -100) & (labels != label2id["O"])
    if mask.sum() == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}
    p, r, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    acc = accuracy_score(labels[mask], preds[mask])
    return {"precision": p, "recall": r, "f1": f1, "accuracy": acc}


## 8. Full-Fine-Tuning (BERT)

In [14]:
# 8.1: Hyperparams (aus 17.1)
best_ft = {"learning_rate":4.3586e-05,"batch_size":16}

# 8.2: TrainingArguments
ft_args = TrainingArguments(
    output_dir="outputs/bert-ner-full-ft-opt",
    per_device_train_batch_size=best_ft["batch_size"],
    per_device_eval_batch_size=best_ft["batch_size"]*2,
    evaluation_strategy="steps", eval_steps=30,
    logging_steps=10,
    save_steps=100,
    max_steps=150,
    learning_rate=best_ft["learning_rate"],
    fp16=torch.cuda.is_available(),
)

# 8.3: Model & Trainer
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
trainer = Trainer(
    model=model,
    args=ft_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

# 8.4: Train, Eval & Save
trainer.train()
metrics = trainer.evaluate()
print("✅ Full-FT Dev-F1:", metrics["eval_f1"], "Accuracy:", metrics["eval_accuracy"])
trainer.save_model("outputs/bert-ner-full-ft-opt")
tokenizer.save_pretrained("outputs/bert-ner-full-ft-opt")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
30,0.7688,0.529565,0.000251,0.000251,0.000251,0.000251
60,0.609,0.448234,0.167671,0.167671,0.167671,0.167671
90,0.4403,0.409679,0.241717,0.241717,0.241717,0.241717
120,0.3911,0.426748,0.2874,0.2874,0.2874,0.2874
150,0.345,0.426722,0.310994,0.310994,0.310994,0.310994


✅ Full-FT Dev-F1: 0.31099397590361444 Accuracy: 0.31099397590361444


('outputs/bert-ner-full-ft-opt\\tokenizer_config.json',
 'outputs/bert-ner-full-ft-opt\\special_tokens_map.json',
 'outputs/bert-ner-full-ft-opt\\vocab.txt',
 'outputs/bert-ner-full-ft-opt\\added_tokens.json',
 'outputs/bert-ner-full-ft-opt\\tokenizer.json')

## 9. LoRA-Fine-Tuning (BERT)

In [97]:
# 9.1: Beste LoRA-Params
best_lora = { "learning_rate":2.22e-05, "r":16, "alpha":16, "dropout":0.0158, "batch_size":8 }

# 9.2: Adapter konfigurieren
from peft import LoraConfig, get_peft_model
lora_conf = LoraConfig(task_type="TOKEN_CLS", inference_mode=False, **best_lora)
base = AutoModelForTokenClassification.from_pretrained(model_name,
       num_labels=len(ner_labels), id2label=id2label, label2id=label2id)
lora_model = get_peft_model(base, lora_conf)

# 9.3: Trainer
lora_args = TrainingArguments(
    output_dir="outputs/bert-ner-lora-final",
    per_device_train_batch_size=best_lora["batch_size"],
    per_device_eval_batch_size=best_lora["batch_size"]*2,
    evaluation_strategy="steps", eval_steps=10,
    logging_steps=10,
    save_strategy="no",
    max_steps=100,
    learning_rate=best_lora["learning_rate"],
    fp16=torch.cuda.is_available(),
    save_only_model=True,
)
lora_trainer = Trainer(
    model=lora_model,
    args=lora_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)
lora_trainer.train()
print("✅ LoRA Dev-F1:", lora_trainer.evaluate()["eval_f1"])


TypeError: LoraConfig.__init__() got an unexpected keyword argument 'learning_rate'

## 10. Partial-Freeze (BERT)

In [98]:
# 10.1: Beste Freeze-Params
best_freeze = {"freeze_pct":0.5,"learning_rate":3.23e-05,"batch_size":16}
# 10.2: Modell laden + einfrieren
freeze_model = AutoModelForTokenClassification.from_pretrained(
   model_name, num_labels=len(ner_labels), id2label=id2label, label2id=label2id
)
total_layers = len([n for n,_ in freeze_model.named_parameters() 
                    if n.startswith("bert.encoder.layer.")])//2
cutoff = int(total_layers * best_freeze["freeze_pct"])
for name,param in freeze_model.named_parameters():
    if name.startswith("bert.encoder.layer.") and int(name.split(".")[3])<cutoff:
        param.requires_grad=False

# 10.3: Trainer
freeze_args = TrainingArguments(
    output_dir="outputs/bert-ner-freeze-final",
    per_device_train_batch_size=best_freeze["batch_size"],
    per_device_eval_batch_size=best_freeze["batch_size"]*2,
    evaluation_strategy="steps", eval_steps=10,
    logging_steps=10,
    save_strategy="no",
    max_steps=100,
    learning_rate=best_freeze["learning_rate"],
    fp16=torch.cuda.is_available(),
)
freeze_trainer = Trainer(
    model=freeze_model,
    args=freeze_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)
freeze_trainer.train()
print("✅ Freeze Dev-F1:", freeze_trainer.evaluate()["eval_f1"])


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  freeze_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,3.4899,3.430923,0.018323,0.018323,0.018323,0.018323
20,3.4078,3.342619,0.024598,0.024598,0.024598,0.024598
30,3.33,3.264952,0.028112,0.028112,0.028112,0.028112
40,3.248,3.197932,0.028865,0.028865,0.028865,0.028865
50,3.1985,3.140686,0.03012,0.03012,0.03012,0.03012
60,3.1493,3.093532,0.03238,0.03238,0.03238,0.03238
70,3.0998,3.056425,0.033384,0.033384,0.033384,0.033384
80,3.0924,3.02961,0.033133,0.033133,0.033133,0.033133
90,3.0529,3.013155,0.031878,0.031878,0.031878,0.031878
100,3.0436,3.007274,0.031878,0.031878,0.031878,0.031878


✅ Freeze Dev-F1: 0.03187751004016064


| Model | Method         | Dev-F1\_EI | Dev-F1\_EC |
| ----- | -------------- | ---------- | ---------- |
| BERT  | Zero-Shot      | 0.0271     | x.xxx      |
| BERT  | Full-FT (opt)  | 0.3753     | x.xxx      |
| BERT  | LoRA (opt)     | 0.0271     | x.xxx      |
| BERT  | Partial-Freeze | 0.0630     | x.xxx      |


## 11. Test-Set Inference

In [17]:
from transformers import pipeline, AutoTokenizer
import json

# Tokenizer & Pipeline
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
ner_pipe  = pipeline(
    "ner",
    model="outputs/bert-ner-full-ft-opt",
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0
)

# Inferenz auf test_docs und Umwandlung der Scores
ner_preds = []
for d in test_docs:
    ents = ner_pipe(d["document"])
    # Cast aller score-Felder zu Python float
    for ent in ents:
        ent["score"] = float(ent["score"])
    ner_preds.append({
        "id": d["id"],
        "entities": ents
    })

# Speichern der Predictions
with open("ner_test_full_ft_bert.json", "w", encoding="utf-8") as f:
    json.dump(ner_preds, f, ensure_ascii=False, indent=2)

print(f"✅ Test-Predictions gespeichert (insgesamt {len(ner_preds)} Dokumente).")


Device set to use cpu


✅ Test-Predictions gespeichert (insgesamt 248 Dokumente).


## 12. import the test file and evaluate it

In [25]:
import os, json
from transformers import pipeline, AutoTokenizer

# 1. Ordnerstruktur anlegen
os.makedirs("input/res", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("output",   exist_ok=True)

# 2. Tokenizer & Pipeline laden
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
ner_pipe  = pipeline(
    "ner",
    model="outputs/bert-ner-full-ft-opt",
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0  # 0 für GPU, -1 für CPU
)

# 3. Vorhersagen auf dev_docs
dev_preds = {}
for idx, doc in enumerate(dev_docs):
    ents = ner_pipe(doc["doc"])
    for ent in ents:
        ent["score"] = float(ent["score"])  # JSON-kompatibel machen
    dev_preds[str(idx)] = {
        "entities": ents,
        "triples":  []  # Falls du später RE predizierst, kannst du das füllen
    }

# 4. Gold-Daten (reference) vorbereiten
gt_dev = {}
for idx, doc in enumerate(dev_docs):
    gt_dev[str(idx)] = {
        "entities": doc["entities"],
        "triples":  doc["triples"]
    }

# 5. Abspeichern
with open("input/res/results.json", "w", encoding="utf-8") as f:
    json.dump(dev_preds, f, ensure_ascii=False, indent=2)

with open("input/ref/reference.json", "w", encoding="utf-8") as f:
    json.dump(gt_dev, f, ensure_ascii=False, indent=2)

print("✅ Alles vorbereitet:")
print("input/res:", os.listdir("input/res"))
print("input/ref:", os.listdir("input/ref"))


Device set to use cpu


✅ Alles vorbereitet:
input/res: ['results.json']
input/ref: ['reference.json']


In [43]:
!python evaluate_ner.py

F1_EI (Entity Identification): 0.0000
F1_EC (Entity Classification): 0.0000


## look at resluts.json in the input file for the inference, show screenshot

## 13: RE-Daten aus JSON zu train/dev/test Examples aufbereiten

In [68]:
examples = []

# Helferfunktion zur Erstellung von positiven und negativen RE-Beispielen
def extract_examples(docs, split):
    local_examples = []
    for doc in docs:
        if not doc.get("triples") or not doc.get("entities"):
            continue

        text = doc.get("doc") or doc.get("document")
        if not text:
            continue

        ents = [e["mentions"][0] for e in doc["entities"] if e.get("mentions")]
        true_pairs = {(t["head"], t["tail"]) for t in doc["triples"]}

        for triple in doc["triples"]:
            # Positives Beispiel
            local_examples.append({
                "split":    split,
                "sentence": text,
                "head":     triple["head"],
                "tail":     triple["tail"],
                "label":    triple["relation"],
            })

            # Negativbeispiel: zufällige Kombination, die kein Gold-Paar ist
            while True:
                h, t = random.sample(ents, 2)
                if (h, t) not in true_pairs:
                    local_examples.append({
                        "split":    split,
                        "sentence": text,
                        "head":     h,
                        "tail":     t,
                        "label":    "no_relation",
                    })
                    break
    return local_examples

# Jetzt mit bestehenden Variablen arbeiten
examples.extend(extract_examples(train_docs, "train"))
examples.extend(extract_examples(dev_docs, "dev"))
examples.extend(extract_examples(test_docs, "test"))

# In HuggingFace DatasetDict umwandeln
train_ex = [e for e in examples if e["split"] == "train"]
dev_ex   = [e for e in examples if e["split"] == "dev"]
test_ex  = [e for e in examples if e["split"] == "test"]

ds = DatasetDict({
    "train": Dataset.from_list(train_ex),
    "dev":   Dataset.from_list(dev_ex),
    "test":  Dataset.from_list(test_ex),
})

# Optional: Preview
print(ds)

DatasetDict({
    train: Dataset({
        features: ['split', 'sentence', 'head', 'tail', 'label'],
        num_rows: 1222
    })
    dev: Dataset({
        features: ['split', 'sentence', 'head', 'tail', 'label'],
        num_rows: 606
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


In [69]:
raw_re_ds = ds 

## 14: Tokenisierung & Label-Mapping für RE

In [70]:
# === Kapitel 14 (angepasst): Tokenisierung & Label-Mapping für RE ===

from transformers import AutoTokenizer
import torch



# Labels aus Originaldaten extrahieren
original_train_labels = [ex["label"] for ex in raw_re_ds["train"]]
original_dev_labels = [ex["label"] for ex in raw_re_ds["dev"]]
all_labels = sorted(set(original_train_labels + original_dev_labels))

# Label-Mapping
label2id_re = {lab: i for i, lab in enumerate(all_labels)}
id2label_re = {i: lab for lab, i in label2id_re.items()}


# 14.2: Tokenizer laden
tokenizer_re = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
max_length_re = 128

# 14.3: Tokenisierungsfunktion
def tokenize_re(example):
    encoded = tokenizer_re(
        example["head"],
        example["tail"] + " " + example["sentence"],
        truncation=True,
        max_length=max_length_re,
        padding="max_length"
    )
    encoded["labels"] = label2id_re[example["label"]]
    return encoded

# 14.4: Nur auf Splits anwenden, die NICHT leer sind und noch die originalen Spalten enthalten
for split in ds:
    if len(ds[split]) > 0 and "label" in ds[split].column_names:
        ds[split] = ds[split].map(tokenize_re, batched=False, remove_columns=ds[split].column_names)

# 14.5: PyTorch-Format setzen (für bereits gemappte Splits)
for split in ds:
    if len(ds[split]) > 0:
        ds[split].set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/1222 [00:00<?, ? examples/s]

Map:   0%|          | 0/606 [00:00<?, ? examples/s]

## 15. BERT RE Baseline

In [71]:
# === Kapitel 15: RE – Baseline-Training mit BERT (Full Fine-Tuning) ===

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score

# 15.1: Modell laden mit passender Labelanzahl
model_re = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re
)

# 15.2: Trainingsargumente definieren
training_args_re = TrainingArguments(
    output_dir="outputs/bert-re-baseline",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    fp16=torch.cuda.is_available()  # nur aktiv, wenn CUDA verfügbar
)

# 15.3: F1-Metrik definieren
def compute_metrics_re(p):
    preds = p.predictions.argmax(-1)
    return {"eval_f1": f1_score(p.label_ids, preds, average="macro")}

# 15.4: Trainer initialisieren
trainer_re = Trainer(
    model=model_re,
    args=training_args_re,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    compute_metrics=compute_metrics_re
)

# 15.5: Training starten
trainer_re.train()

# 15.6: Evaluieren
metrics_re = trainer_re.evaluate()
print("🔖 RE Baseline Dev-F1:", metrics_re["eval_f1"])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_re = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,2.4127,2.501379,0.021458
2,2.1397,2.415065,0.033357
3,1.7599,2.41956,0.033919


🔖 RE Baseline Dev-F1: 0.0339194749216301


## 16: FUll fine tuning Bert RE

In [81]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import f1_score

# 1. Modell mit Label-Mappings
model_re_ft = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id_re),
    id2label=id2label_re,
    label2id=label2id_re
)

# 2. Beste Hyperparameter (aus deinem Tuning)
best_ft = {
    "learning_rate": 4.3575643120387554e-05,
    "batch_size": 16
}

# 3. Trainingsargumente (→ mit Speicherpfad)
training_args_ft = TrainingArguments(
    output_dir="outputs/re-bert-final-ft",  # << output name geändert
    evaluation_strategy="steps",
    eval_steps=10,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",  # speichert am Ende jeder Epoche
    save_total_limit=1,     # nur letztes Modell behalten
    max_steps=100,
    per_device_train_batch_size=best_ft["batch_size"],
    per_device_eval_batch_size=best_ft["batch_size"] * 2,
    learning_rate=best_ft["learning_rate"],
    fp16=torch.cuda.is_available()
)

# 4. F1-Metrik
def compute_metrics_ft(p):
    preds = p.predictions.argmax(-1)
    return {"eval_f1": f1_score(p.label_ids, preds, average="macro")}

# 5. Trainer
trainer_re_ft = Trainer(
    model=model_re_ft,
    args=training_args_ft,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    tokenizer=tokenizer_re,
    compute_metrics=compute_metrics_ft
)

# 6. Training starten
trainer_re_ft.train()

# 7. Modell speichern (optional manuell – falls nicht automatisch)
trainer_re_ft.save_model("outputs/re-bert-final-ft")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_re_ft = Trainer(


Step,Training Loss,Validation Loss,F1
10,4.0015,3.48552,0.020168
20,3.0454,2.861245,0.021505
30,3.0195,2.796352,0.021505
40,2.7394,2.719007,0.021505
50,2.6019,2.673312,0.021505
60,2.5416,2.631195,0.021505
70,2.4901,2.616581,0.021505
80,2.3949,2.564026,0.021505
90,2.2013,2.555,0.021505
100,2.0005,2.549991,0.021505


## What Does a Micro-F1 Score of 0.5 Really Mean in RE?
In our relation extraction task, the "no_relation" label is assigned to randomly paired entity mentions that do not have a meaningful relationship. Since these negative examples are added for every positive example (1:1), the "no_relation" class becomes the most frequent class in the dataset.

This introduces a critical imbalance:
A model that simply predicts "no_relation" for every input can still achieve a micro-F1 score of ~0.5, even though it fails to capture any true relationships.

Therefore:

A micro-F1 of 0.5 does not indicate meaningful learning.

The macro-F1 score, which treats all classes equally, remains very low (e.g., ~0.02), reflecting the model's poor performance on actual relation classes.

In summary, micro-F1 in this context can be misleading and should always be interpreted alongside macro-F1 and qualitative examples.

## 🔍 Why F1 Scores Are Higher in NER Than in RE (Relation Extraction)

It is completely normal for your model to achieve **higher F1 scores in Named Entity Recognition (NER)** than in **Relation Extraction (RE)**. Here’s why:

| Aspect                  | NER                                                  | RE                                                       |
|-------------------------|------------------------------------------------------|-----------------------------------------------------------|
| **Task**                | Detect entities in a sentence                        | Identify semantic relationships between two entities      |
| **Complexity**          | Easier – Local information is sufficient             | Harder – Requires understanding of sentence-level context |
| **Negative Examples**   | Rare or implicit                                     | Abundant due to many "no_relation" pairs                  |
| **Model Behavior**      | Learns boundary/entity types well                    | Struggles to semantically distinguish true relations      |
| **Typical F1**          | ~0.3–0.8 depending on model/data                     | Often very low (<0.2) without optimization                |
| **Baseline**            | Random is near 0                                     | Random can reach ≈ 0.5 if "no_relation" is dominant       |

### 🔁 Summary:
NER is generally a simpler task for pre-trained models like BERT. In RE, the model must understand **complex dependencies** between multiple parts of the sentence and distinguish fine-grained relations from the dominant "no_relation" class.

Hence, even if your model achieves **F1 = 0.5 in NER** but only **F1 = 0.03 in RE**, that is not unusual — it simply reflects the **increased difficulty** of the RE task.


### 🔧 Hyperparameter Tuning (Method)

We used Optuna to tune the most important hyperparameters for each fine-tuning strategy:

- Full Fine-Tuning
- LoRA
- Partial Freezing

The tuning was performed independently for:
- Named Entity Recognition (NER)
- Relation Extraction (RE)

The same optimization loop was used for all experiments.

#### Example: Tuning Loop (used across all models)

```python
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    # ...


In [105]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
import numpy as np

# Load tokenizer and model
tokenizer_re = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("outputs/re-bert-final-ft")
model.eval()

# Load input samples (adjust path if needed)
with open("re_bert_input_samples.json", "r", encoding="utf-8") as f:
    test_inputs = json.load(f)

# Predict relations
results = []
for ex in test_inputs:
    encoded = tokenizer_re(
        ex["head"],
        ex["tail"] + " " + ex["sentence"],
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )

    with torch.no_grad():
        output = model(**encoded)
        probs = torch.nn.functional.softmax(output.logits, dim=-1).squeeze()
        pred_id = probs.argmax().item()
        pred_label = model.config.id2label[pred_id]
        score = probs[pred_id].item()

    results.append({
        "head": ex["head"],
        "tail": ex["tail"],
        "relation": pred_label,
        "score": round(score, 4)
    })

# Save results for visualization or evaluation
with open("re_bert_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("✅ Predictions saved to re_bert_results.json")


✅ Predictions saved to re_bert_results.json


In [106]:
with open("re_bert_input_samples.json", "r", encoding="utf-8") as f:
    test_inputs = json.load(f)
