## 1. Imports

In [1]:
import os
import json
import torch
import numpy as np
from pathlib import Path
from datasets import Dataset
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from peft import LoraConfig, get_peft_model





## 2. Data Paths

In [2]:
TRAIN_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\train")
DEV_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\dev")
TEST_DIR = Path(r"C:\Users\nmilo\OneDrive\Desktop\Master\Semester2\NLP\project\dataset\test")

assert TRAIN_DIR.exists(), f"Train directory not found: {TRAIN_DIR}"
assert DEV_DIR.exists(), f"Dev directory not found: {DEV_DIR}"
assert TEST_DIR.exists(), f"Test directory not found: {TEST_DIR}"

## 3. Load DocIE Data

In [3]:
def load_docie_docs(folder: Path, recursive: bool = False):
    """Load JSON documents from DocIE dataset."""
    docs = []
    pattern = "**/*.json" if recursive else "*.json"
    for file in folder.glob(pattern):
        data = json.loads(file.read_text(encoding="utf-8"))
        if isinstance(data, list):
            docs.extend(data)
        else:
            docs.append(data)
    return docs

train_docs = load_docie_docs(TRAIN_DIR)
dev_docs = load_docie_docs(DEV_DIR)
test_docs = load_docie_docs(TEST_DIR, recursive=True)

print(f"Loaded documents - Train: {len(train_docs)}, Dev: {len(dev_docs)}, Test: {len(test_docs)}")

Loaded documents - Train: 51, Dev: 23, Test: 248


## 4. Exploratory Data Analysis

In [4]:
# 4.1 Document lengths
lengths = [len(doc["doc"].split()) for doc in train_docs]
print(f"Document statistics - Avg tokens: {np.mean(lengths):.1f}, Max tokens: {np.max(lengths)}")

# 4.2 Entity distribution
entity_counter = Counter(ent["type"] for doc in train_docs for ent in doc["entities"])
print(f"\nTop 10 entity types:")
for entity_type, count in entity_counter.most_common(10):
    print(f"  {entity_type}: {count}")

Document statistics - Avg tokens: 919.1, Max tokens: 2560

Top 10 entity types:
  DATE: 647
  MISC: 417
  PERSON: 242
  ORG: 241
  CARDINAL: 224
  GPE: 157
  WORK_OF_ART: 65
  NORP: 59
  ORDINAL: 55
  QUANTITY: 42


## 5. Setup Label Mappings

In [5]:
# Get all entity types from train set
entity_types = train_docs[0]["entity_label_set"]

# Create BIO tags
ner_labels = ["O"]
for entity_type in entity_types:
    ner_labels.extend([f"B-{entity_type}", f"I-{entity_type}"])

label2id = {label: i for i, label in enumerate(ner_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Total NER labels: {len(ner_labels)}")

Total NER labels: 39


## 6. Initialize GPT-Neo Tokenizer

In [6]:
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# CRITICAL: Add padding token for GPT-Neo
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

print(f"Tokenizer vocab size: {len(tokenizer)}")

Tokenizer vocab size: 50258


## 7. Tokenization with Label Alignment

In [7]:
max_length = 512
stride = 128

def tokenize_and_align_labels(examples):
    """Tokenize documents and align NER labels with subword tokens."""
    all_input_ids = []
    all_attention_mask = []
    all_labels = []
    
    for doc, entities in zip(examples["doc"], examples["entities"]):
        # Tokenize with overflow handling
        tokenized = tokenizer(
            doc,
            return_offsets_mapping=True,
            truncation=True,
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
        )
        
        # Process each chunk
        for i in range(len(tokenized["input_ids"])):
            offsets = tokenized["offset_mapping"][i]
            input_ids = tokenized["input_ids"][i]
            attention_mask = tokenized["attention_mask"][i]
            
            # Initialize with "O" labels
            chunk_labels = ["O"] * len(offsets)
            
            # Map entity mentions to token labels
            for entity in entities:
                entity_type = entity["type"]
                for mention in entity["mentions"]:
                    start = doc.find(mention)
                    if start < 0:
                        continue
                    end = start + len(mention)
                    
                    # Label tokens that overlap with entity mention
                    for idx, (token_start, token_end) in enumerate(offsets):
                        if token_start >= start and token_end <= end:
                            prefix = "B" if token_start == start else "I"
                            chunk_labels[idx] = f"{prefix}-{entity_type}"
            
            # Convert labels to IDs
            label_ids = [label2id.get(label, label2id["O"]) for label in chunk_labels]
            
            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_labels.append(label_ids)
    
    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
        "labels": all_labels,
    }

## 8. Create Hugging Face Datasets

In [8]:
# Convert to HF Dataset
hf_train = Dataset.from_list(train_docs)
hf_dev = Dataset.from_list(dev_docs)

# Apply tokenization
columns_to_remove = ["domain", "title", "doc", "entities", "triples", "label_set", "entity_label_set"]

hf_train = hf_train.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=columns_to_remove,
)

hf_dev = hf_dev.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=columns_to_remove,
)

print(f"Tokenized datasets - Train: {len(hf_train)}, Dev: {len(hf_dev)}")

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Tokenized datasets - Train: 167, Dev: 75


## 9. Setup Training Components

In [9]:
# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metrics computation
def compute_metrics_entity_only(pred):
    """Compute metrics only on entity tokens (non-O labels)."""
    preds = pred.predictions.argmax(-1).flatten()
    labels = pred.label_ids.flatten()
    
    # Filter out non-entity labels and padding
    mask = (labels != label2id["O"]) & (labels != -100)
    
    if mask.sum() == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    
    return {"precision": precision, "recall": recall, "f1": f1}


## 10. Baseline: Full Fine-Tuning (3 epochs)

## 10.1 Train Baseline Model

In [10]:
def train_baseline():
    """Train GPT-Neo baseline for 3 epochs."""
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    
    # Resize embeddings to accommodate padding token
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    
    training_args = TrainingArguments(
        output_dir="outputs/gpt-neo-ner-baseline",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        learning_rate=3e-3,
        weight_decay=0.0,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    
    return trainer, metrics

baseline_trainer, baseline_metrics = train_baseline()
print(f"Baseline Dev F1: {baseline_metrics['eval_f1']:.4f}")

Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.606034,0.0,0.0,0.0
2,No log,0.448494,0.0,0.0,0.0
3,No log,0.435772,0.0,0.0,0.0


Baseline Dev F1: 0.0000


## 11. Hyperparameter Tuning with 100 Steps Budget

## 11.1 Full Fine-Tuning Hyperparameter Search

In [11]:
def ft_objective(trial):
    """Optuna objective for full fine-tuning."""
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    
    args = TrainingArguments(
        output_dir=f"tmp/gpt-neo-ft-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2,
        evaluation_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
        logging_steps=40,
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_ft = optuna.create_study(direction="maximize")
study_ft.optimize(ft_objective, n_trials=8)

print(f"Best Full-FT params: {study_ft.best_params}")
print(f"Best Full-FT Dev F1: {study_ft.best_value:.4f}")


[I 2025-05-28 01:16:48,897] A new study created in memory with name: no-name-fa03aecb-0a36-47ae-be4b-abde409aba82
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.622342,0.004813,0.004813,0.004813
40,1.328600,0.452824,0.029563,0.029563,0.029563
60,1.328600,0.412285,0.022001,0.022001,0.022001
80,0.630500,0.393966,0.033001,0.033001,0.033001
100,0.630500,0.388718,0.038157,0.038157,0.038157


[I 2025-05-28 01:25:14,051] Trial 0 finished with value: 0.03815744242007563 and parameters: {'learning_rate': 2.3360560900592628e-05, 'batch_size': 4}. Best is trial 0 with value: 0.03815744242007563.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.714098,0.005156,0.005156,0.005156
40,1.701400,0.521812,0.009282,0.009282,0.009282
60,1.701400,0.458327,0.012719,0.012719,0.012719
80,0.656000,0.435911,0.020626,0.020626,0.020626
100,0.656000,0.424649,0.025782,0.025782,0.025782


[I 2025-05-28 01:39:40,170] Trial 1 finished with value: 0.02578205568924029 and parameters: {'learning_rate': 1.57433061586488e-05, 'batch_size': 8}. Best is trial 0 with value: 0.03815744242007563.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.973589,0.007563,0.007563,0.007563
40,1.818500,0.589955,0.011344,0.011344,0.011344
60,1.818500,0.513352,0.020282,0.020282,0.020282
80,0.817100,0.484245,0.019594,0.019594,0.019594
100,0.817100,0.47647,0.017532,0.017532,0.017532


[I 2025-05-28 01:47:55,313] Trial 2 finished with value: 0.017531797868683398 and parameters: {'learning_rate': 1.1388188337387634e-05, 'batch_size': 4}. Best is trial 0 with value: 0.03815744242007563.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,1.114825,0.009282,0.009282,0.009282
40,2.075800,0.64571,0.004813,0.004813,0.004813
60,2.075800,0.529065,0.009969,0.009969,0.009969
80,0.802600,0.487641,0.011344,0.011344,0.011344
100,0.802600,0.478245,0.011688,0.011688,0.011688


[I 2025-05-28 02:13:50,491] Trial 3 finished with value: 0.011687865245788931 and parameters: {'learning_rate': 1.0286725513372162e-05, 'batch_size': 16}. Best is trial 0 with value: 0.03815744242007563.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.463056,0.022001,0.022001,0.022001
40,1.041300,0.363042,0.037814,0.037814,0.037814
60,1.041300,0.326878,0.135785,0.135785,0.135785
80,0.493500,0.314693,0.137848,0.137848,0.137848
100,0.493500,0.31292,0.139911,0.139911,0.139911


[I 2025-05-28 02:20:53,216] Trial 4 finished with value: 0.13991062220694397 and parameters: {'learning_rate': 4.8286874395325485e-05, 'batch_size': 4}. Best is trial 4 with value: 0.13991062220694397.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,1.217464,0.009282,0.009282,0.009282
40,2.185700,0.658668,0.005156,0.005156,0.005156
60,2.185700,0.561856,0.006531,0.006531,0.006531
80,0.853700,0.518426,0.009625,0.009625,0.009625
100,0.853700,0.509103,0.009969,0.009969,0.009969


[I 2025-05-28 02:27:52,057] Trial 5 finished with value: 0.009969061533172912 and parameters: {'learning_rate': 1.0504389705952048e-05, 'batch_size': 4}. Best is trial 4 with value: 0.13991062220694397.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.67329,0.004469,0.004469,0.004469
40,1.587500,0.490267,0.011688,0.011688,0.011688
60,1.587500,0.441822,0.01375,0.01375,0.01375
80,0.623100,0.417038,0.027157,0.027157,0.027157
100,0.623100,0.406989,0.030938,0.030938,0.030938


[I 2025-05-28 02:39:55,699] Trial 6 finished with value: 0.030938466827088347 and parameters: {'learning_rate': 1.7932173751818086e-05, 'batch_size': 8}. Best is trial 4 with value: 0.13991062220694397.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.615403,0.008938,0.008938,0.008938
40,1.354900,0.461433,0.016501,0.016501,0.016501
60,1.354900,0.403151,0.030251,0.030251,0.030251
80,0.614000,0.381995,0.036439,0.036439,0.036439
100,0.614000,0.37676,0.040564,0.040564,0.040564


[I 2025-05-28 03:01:25,562] Trial 7 finished with value: 0.040563767617738056 and parameters: {'learning_rate': 1.811829484599362e-05, 'batch_size': 16}. Best is trial 4 with value: 0.13991062220694397.


Best Full-FT params: {'learning_rate': 4.8286874395325485e-05, 'batch_size': 4}
Best Full-FT Dev F1: 0.1399


## 11.2 LoRA Hyperparameter Search

In [12]:
def lora_objective(trial):
    """Optuna objective for LoRA fine-tuning."""
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    r = trial.suggest_categorical("r", [4, 8, 16])
    alpha = trial.suggest_categorical("alpha", [16, 32])
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    
    lora_config = LoraConfig(
        task_type="TOKEN_CLS",
        inference_mode=False,
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
    )
    
    base_model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    base_model.resize_token_embeddings(len(tokenizer))
    base_model.config.pad_token_id = tokenizer.pad_token_id
    
    model = get_peft_model(base_model, lora_config)
    
    args = TrainingArguments(
        output_dir=f"tmp/gpt-neo-lora-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2,
        evaluation_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
        logging_steps=40,
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_lora = optuna.create_study(direction="maximize")
study_lora.optimize(lora_objective, n_trials=8)

print(f"Best LoRA params: {study_lora.best_params}")
print(f"Best LoRA Dev F1: {study_lora.best_value:.4f}")


[I 2025-05-28 03:01:25,576] A new study created in memory with name: no-name-9f4e01c9-d70b-457c-86a1-d6ebab550cb1
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,5.235757,0.025782,0.025782,0.025782
40,4.831700,4.993161,0.028876,0.028876,0.028876
60,4.831700,4.811079,0.029907,0.029907,0.029907
80,4.377000,4.698526,0.03197,0.03197,0.03197
100,4.377000,4.659576,0.032314,0.032314,0.032314


[I 2025-05-28 03:07:12,907] Trial 0 finished with value: 0.03231350979718116 and parameters: {'learning_rate': 1.699587963095526e-05, 'r': 16, 'alpha': 32, 'dropout': 0.1543889276946275, 'batch_size': 4}. Best is trial 0 with value: 0.03231350979718116.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,1.906203,0.039189,0.039189,0.039189
40,2.576400,0.616284,0.008594,0.008594,0.008594
60,2.576400,0.504432,0.022344,0.022344,0.022344
80,0.766700,0.462681,0.036782,0.036782,0.036782
100,0.766700,0.45114,0.036782,0.036782,0.036782


[I 2025-05-28 03:26:10,381] Trial 1 finished with value: 0.036782399449982815 and parameters: {'learning_rate': 0.0001346154675646194, 'r': 16, 'alpha': 32, 'dropout': 0.10921539303642946, 'batch_size': 16}. Best is trial 1 with value: 0.036782399449982815.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,1.331085,0.073565,0.073565,0.073565
40,2.104800,0.511995,0.030595,0.030595,0.030595
60,2.104800,0.461866,0.090753,0.090753,0.090753
80,0.692300,0.427077,0.089034,0.089034,0.089034
100,0.692300,0.420581,0.090409,0.090409,0.090409


[I 2025-05-28 03:31:56,903] Trial 2 finished with value: 0.0904090752836026 and parameters: {'learning_rate': 0.00025904814349906566, 'r': 8, 'alpha': 16, 'dropout': 0.20472740729579403, 'batch_size': 4}. Best is trial 2 with value: 0.0904090752836026.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,1.263208,0.021657,0.021657,0.021657
40,2.223900,0.551404,0.009282,0.009282,0.009282
60,2.223900,0.458954,0.038501,0.038501,0.038501
80,0.669600,0.435083,0.044689,0.044689,0.044689
100,0.669600,0.423088,0.053627,0.053627,0.053627


[I 2025-05-28 03:42:08,857] Trial 3 finished with value: 0.053626675833619804 and parameters: {'learning_rate': 0.00017770439231661823, 'r': 16, 'alpha': 32, 'dropout': 0.2981820339412351, 'batch_size': 8}. Best is trial 2 with value: 0.0904090752836026.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.478688,0.038501,0.038501,0.038501
40,4.440900,3.285784,0.063596,0.063596,0.063596
60,4.440900,2.341301,0.054314,0.054314,0.054314
80,2.360200,1.775057,0.036439,0.036439,0.036439
100,2.360200,1.589349,0.032314,0.032314,0.032314


[I 2025-05-28 03:52:18,219] Trial 4 finished with value: 0.03231350979718116 and parameters: {'learning_rate': 8.513578365221181e-05, 'r': 8, 'alpha': 16, 'dropout': 0.018202468861115106, 'batch_size': 8}. Best is trial 2 with value: 0.0904090752836026.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,5.208848,0.017188,0.017188,0.017188
40,5.150600,4.701285,0.030595,0.030595,0.030595
60,5.150600,4.312561,0.039189,0.039189,0.039189
80,4.217100,4.068164,0.04297,0.04297,0.04297
100,4.217100,3.982038,0.044689,0.044689,0.044689


[I 2025-05-28 04:02:30,266] Trial 5 finished with value: 0.0446888965280165 and parameters: {'learning_rate': 4.014320539354151e-05, 'r': 16, 'alpha': 16, 'dropout': 0.2094956589758433, 'batch_size': 8}. Best is trial 2 with value: 0.0904090752836026.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.223549,0.050533,0.050533,0.050533
40,4.166900,2.808394,0.070471,0.070471,0.070471
60,4.166900,1.73719,0.03747,0.03747,0.03747
80,1.841500,1.170613,0.021657,0.021657,0.021657
100,1.841500,1.006924,0.019251,0.019251,0.019251


[I 2025-05-28 04:21:14,320] Trial 6 finished with value: 0.019250601581299414 and parameters: {'learning_rate': 9.58673524408832e-05, 'r': 8, 'alpha': 16, 'dropout': 0.02212344112971184, 'batch_size': 16}. Best is trial 2 with value: 0.0904090752836026.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.727169,0.081128,0.081128,0.081128
40,3.784400,2.578005,0.11241,0.11241,0.11241
60,3.784400,1.74258,0.098316,0.098316,0.098316
80,1.851400,1.298755,0.071158,0.071158,0.071158
100,1.851400,1.166322,0.063939,0.063939,0.063939


[I 2025-05-28 04:31:27,284] Trial 7 finished with value: 0.06393949810931591 and parameters: {'learning_rate': 8.404875701450256e-05, 'r': 16, 'alpha': 16, 'dropout': 0.2597506010900016, 'batch_size': 8}. Best is trial 2 with value: 0.0904090752836026.


Best LoRA params: {'learning_rate': 0.00025904814349906566, 'r': 8, 'alpha': 16, 'dropout': 0.20472740729579403, 'batch_size': 4}
Best LoRA Dev F1: 0.0904


## 11.3 Partial Freezing Hyperparameter Search

In [13]:
def freeze_objective(trial):
    """Optuna objective for partial freezing."""
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    freeze_pct = trial.suggest_float("freeze_pct", 0.25, 0.75)
    
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    
    # Freeze lower layers
    total_layers = len([n for n, _ in model.named_parameters() if n.startswith("transformer.h.")])
    cutoff = int(total_layers * freeze_pct)
    
    for name, param in model.named_parameters():
        if name.startswith("transformer.h.") and int(name.split(".")[2]) < cutoff:
            param.requires_grad = False
    
    args = TrainingArguments(
        output_dir=f"tmp/gpt-neo-freeze-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2, 
        evaluation_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
        logging_steps=40,
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_freeze = optuna.create_study(direction="maximize")
study_freeze.optimize(freeze_objective, n_trials=8)

print(f"Best Freeze params: {study_freeze.best_params}")
print(f"Best Freeze Dev F1: {study_freeze.best_value:.4f}")

[I 2025-05-28 04:31:27,295] A new study created in memory with name: no-name-54b4691e-cf0d-468f-933f-69e97a77eadf
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.999034,0.017532,0.017532,0.017532
40,4.900000,4.366651,0.035064,0.035064,0.035064
60,4.900000,3.917424,0.046408,0.046408,0.046408
80,3.636100,3.649133,0.05122,0.05122,0.05122
100,3.636100,3.556528,0.052252,0.052252,0.052252


[I 2025-05-28 04:41:33,218] Trial 0 finished with value: 0.052251632863526985 and parameters: {'learning_rate': 4.68643894587759e-05, 'batch_size': 8, 'freeze_pct': 0.49605228016306246}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.276584,0.02647,0.02647,0.02647
40,3.694200,2.867229,0.029907,0.029907,0.029907
60,3.694200,2.589751,0.034032,0.034032,0.034032
80,2.841800,2.428768,0.035751,0.035751,0.035751
100,2.841800,2.374424,0.035407,0.035407,0.035407


[I 2025-05-28 04:59:41,298] Trial 1 finished with value: 0.03540735647989 and parameters: {'learning_rate': 3.137799220931297e-05, 'batch_size': 16, 'freeze_pct': 0.5162507103405969}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.811994,0.03197,0.03197,0.03197
40,4.333000,4.270221,0.03472,0.03472,0.03472
60,4.333000,3.886184,0.03747,0.03747,0.03747
80,3.255900,3.655025,0.039876,0.039876,0.039876
100,3.255900,3.575303,0.040564,0.040564,0.040564


[I 2025-05-28 05:17:49,246] Trial 2 finished with value: 0.040563767617738056 and parameters: {'learning_rate': 3.615326922303352e-05, 'batch_size': 16, 'freeze_pct': 0.6016283413192749}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,5.255941,0.02647,0.02647,0.02647
40,4.809300,5.051569,0.029563,0.029563,0.029563
60,4.809300,4.906227,0.030251,0.030251,0.030251
80,4.390100,4.818561,0.030938,0.030938,0.030938
100,4.390100,4.78824,0.031626,0.031626,0.031626


[I 2025-05-28 05:35:59,030] Trial 3 finished with value: 0.031625988312134756 and parameters: {'learning_rate': 1.3478567737864302e-05, 'batch_size': 16, 'freeze_pct': 0.4388738585587837}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.943456,0.029563,0.029563,0.029563
40,4.484700,4.486163,0.033689,0.033689,0.033689
60,4.484700,4.161949,0.036095,0.036095,0.036095
80,3.617000,3.969516,0.036782,0.036782,0.036782
100,3.617000,3.903218,0.037126,0.037126,0.037126


[I 2025-05-28 05:46:00,809] Trial 4 finished with value: 0.037126160192506016 and parameters: {'learning_rate': 3.306793054978061e-05, 'batch_size': 8, 'freeze_pct': 0.7400036459277188}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.062951,0.027157,0.027157,0.027157
40,3.462100,2.516263,0.034376,0.034376,0.034376
60,3.462100,2.160491,0.036439,0.036439,0.036439
80,2.347900,1.961669,0.036782,0.036782,0.036782
100,2.347900,1.896145,0.036782,0.036782,0.036782


[I 2025-05-28 06:04:03,772] Trial 5 finished with value: 0.036782399449982815 and parameters: {'learning_rate': 4.392241100313621e-05, 'batch_size': 16, 'freeze_pct': 0.604964613507464}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.882679,0.030938,0.030938,0.030938
40,4.422800,4.378869,0.03472,0.03472,0.03472
60,4.422800,4.022056,0.036782,0.036782,0.036782
80,3.466900,3.810516,0.037814,0.037814,0.037814
100,3.466900,3.737684,0.038845,0.038845,0.038845


[I 2025-05-28 06:14:07,152] Trial 6 finished with value: 0.038844963905122036 and parameters: {'learning_rate': 3.6501860444839506e-05, 'batch_size': 8, 'freeze_pct': 0.5958103899935909}. Best is trial 0 with value: 0.052251632863526985.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.466835,0.022688,0.022688,0.022688
40,3.916900,3.182568,0.02647,0.02647,0.02647
60,3.916900,2.987263,0.028532,0.028532,0.028532
80,3.358900,2.873738,0.02922,0.02922,0.02922
100,3.358900,2.835142,0.030251,0.030251,0.030251


[I 2025-05-28 06:19:51,224] Trial 7 finished with value: 0.030250945342041938 and parameters: {'learning_rate': 2.5752130433873785e-05, 'batch_size': 4, 'freeze_pct': 0.6114340820059213}. Best is trial 0 with value: 0.052251632863526985.


Best Freeze params: {'learning_rate': 4.68643894587759e-05, 'batch_size': 8, 'freeze_pct': 0.49605228016306246}
Best Freeze Dev F1: 0.0523


## 12. Final Training with Optimal Hyperparameters

## 12.1 Full Fine-Tuning with Best Parameters

In [14]:
best_ft_params = study_ft.best_params

ft_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
ft_model.resize_token_embeddings(len(tokenizer))
ft_model.config.pad_token_id = tokenizer.pad_token_id

ft_args = TrainingArguments(
    output_dir="outputs/gpt-neo-ner-ft-final",
    per_device_train_batch_size=best_ft_params["batch_size"],
    per_device_eval_batch_size=best_ft_params["batch_size"] * 2,
    evaluation_strategy="steps",
    eval_steps=40,
    save_strategy="epoch",
    max_steps=200,
    learning_rate=best_ft_params["learning_rate"],
    fp16=torch.cuda.is_available(),
    logging_steps=40,
)

ft_trainer = Trainer(
    model=ft_model,
    args=ft_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

ft_trainer.train()
ft_final_metrics = ft_trainer.evaluate()
print(f"Final Full-FT Dev F1: {ft_final_metrics['eval_f1']:.4f}")

Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  ft_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
40,1.0132,0.363463,0.043314,0.043314,0.043314
80,0.4799,0.301928,0.186662,0.186662,0.186662
120,0.3812,0.295022,0.2011,0.2011,0.2011
160,0.305,0.292476,0.215194,0.215194,0.215194
200,0.3,0.29522,0.214163,0.214163,0.214163


Final Full-FT Dev F1: 0.2142


## 12.2 LoRA with Best Parameters

In [15]:
best_lora_params = study_lora.best_params

lora_config = LoraConfig(
    task_type="TOKEN_CLS",
    inference_mode=False,
    r=best_lora_params["r"],
    lora_alpha=best_lora_params["alpha"],
    lora_dropout=best_lora_params["dropout"],
)

base_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
base_model.resize_token_embeddings(len(tokenizer))
base_model.config.pad_token_id = tokenizer.pad_token_id

lora_model = get_peft_model(base_model, lora_config)

lora_args = TrainingArguments(
    output_dir="outputs/gpt-neo-ner-lora-final",
    per_device_train_batch_size=best_lora_params["batch_size"],
    per_device_eval_batch_size=best_lora_params["batch_size"] * 2,
    evaluation_strategy="steps",
    eval_steps=40,
    save_strategy="epoch",
    max_steps=200,
    learning_rate=best_lora_params["learning_rate"],
    fp16=torch.cuda.is_available(),
    logging_steps=40,
)

lora_trainer = Trainer(
    model=lora_model,
    args=lora_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

lora_trainer.train()
lora_final_metrics = lora_trainer.evaluate()
print(f"Final LoRA Dev F1: {lora_final_metrics['eval_f1']:.4f}")

Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  lora_trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
40,1.4648,0.468565,0.072877,0.072877,0.072877
80,0.6116,0.363007,0.103128,0.103128,0.103128
120,0.4936,0.3378,0.094878,0.094878,0.094878
160,0.4336,0.325761,0.133035,0.133035,0.133035
200,0.4487,0.322332,0.12066,0.12066,0.12066




Final LoRA Dev F1: 0.1207


## 12.3 Partial Freezing with Best Parameters

In [16]:
best_freeze_params = study_freeze.best_params

freeze_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
freeze_model.resize_token_embeddings(len(tokenizer))
freeze_model.config.pad_token_id = tokenizer.pad_token_id

# Apply freezing
total_layers = len([n for n, _ in freeze_model.named_parameters() if n.startswith("transformer.h.")])
cutoff = int(total_layers * best_freeze_params["freeze_pct"])

for name, param in freeze_model.named_parameters():
    if name.startswith("transformer.h.") and int(name.split(".")[2]) < cutoff:
        param.requires_grad = False

freeze_args = TrainingArguments(
    output_dir="outputs/gpt-neo-ner-freeze-final",
    per_device_train_batch_size=best_freeze_params["batch_size"],
    per_device_eval_batch_size=best_freeze_params["batch_size"] * 2,
    evaluation_strategy="steps",
    eval_steps=40,
    save_strategy="epoch",
    max_steps=200,
    learning_rate=best_freeze_params["learning_rate"],
    fp16=torch.cuda.is_available(),
    logging_steps=40,
)

freeze_trainer = Trainer(
    model=freeze_model,
    args=freeze_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

freeze_trainer.train()
freeze_final_metrics = freeze_trainer.evaluate()
print(f"Final Freeze Dev F1: {freeze_final_metrics['eval_f1']:.4f}")


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  freeze_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
40,5.3746,4.242479,0.036439,0.036439,0.036439
80,3.6053,2.904651,0.050189,0.050189,0.050189
120,2.3735,2.022399,0.043314,0.043314,0.043314
160,1.6829,1.572234,0.036439,0.036439,0.036439
200,1.4039,1.439071,0.036095,0.036095,0.036095


Final Freeze Dev F1: 0.0361


## 13. Results Summary

## 13.1 Compile Results

In [17]:
results = {
    "GPT-Neo NER Results": {
        "Baseline (3 epochs)": {
            "Dev F1": baseline_metrics["eval_f1"],
            "Parameters": "Full model (~125M)"
        },
        "Full Fine-Tuning (200 steps)": {
            "Dev F1": ft_final_metrics["eval_f1"],
            "Best Params": best_ft_params,
            "Parameters": "Full model (~125M)"
        },
        "LoRA (200 steps)": {
            "Dev F1": lora_final_metrics["eval_f1"],
            "Best Params": best_lora_params,
            "Parameters": f"~{best_lora_params['r'] * 2 * 768 / 1e6:.2f}M trainable"
        },
        "Partial Freezing (200 steps)": {
            "Dev F1": freeze_final_metrics["eval_f1"],
            "Best Params": best_freeze_params,
            "Parameters": f"~{(1 - best_freeze_params['freeze_pct']) * 125:.1f}M trainable"
        }
    }
}

print("\n" + "="*50)
print("GPT-Neo NER RESULTS SUMMARY")
print("="*50)
for method, metrics in results["GPT-Neo NER Results"].items():
    print(f"\n{method}:")
    print(f"  Dev F1: {metrics['Dev F1']:.4f}")
    print(f"  Trainable Parameters: {metrics['Parameters']}")
    if "Best Params" in metrics:
        print(f"  Best Hyperparameters: {metrics['Best Params']}")


GPT-Neo NER RESULTS SUMMARY

Baseline (3 epochs):
  Dev F1: 0.0000
  Trainable Parameters: Full model (~125M)

Full Fine-Tuning (200 steps):
  Dev F1: 0.2142
  Trainable Parameters: Full model (~125M)
  Best Hyperparameters: {'learning_rate': 4.8286874395325485e-05, 'batch_size': 4}

LoRA (200 steps):
  Dev F1: 0.1207
  Trainable Parameters: ~0.01M trainable
  Best Hyperparameters: {'learning_rate': 0.00025904814349906566, 'r': 8, 'alpha': 16, 'dropout': 0.20472740729579403, 'batch_size': 4}

Partial Freezing (200 steps):
  Dev F1: 0.0361
  Trainable Parameters: ~63.0M trainable
  Best Hyperparameters: {'learning_rate': 4.68643894587759e-05, 'batch_size': 8, 'freeze_pct': 0.49605228016306246}


## 13.2 Save Results

In [18]:
import json

with open("outputs/gpt_neo_ner_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("\nResults saved to outputs/gpt_neo_ner_results.json")



Results saved to outputs/gpt_neo_ner_results.json
