## 1. Imports

In [6]:
import os
import json
import torch
import numpy as np
from pathlib import Path
from datasets import Dataset, DatasetDict, Features, Value, Sequence
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from peft import LoraConfig, get_peft_model


## 2. Data Paths

In [2]:
TRAIN_DIR = Path(r"/kaggle/input/nlp-augmentedset/aug_train")
DEV_DIR   = Path(r"/kaggle/input/nlp-augmentedset/aug_dev")
TEST_DIR  = Path(r"/kaggle/input/nlp-augmentedset/test")

assert TRAIN_DIR.exists(), f"Train directory not found: {TRAIN_DIR}"
assert DEV_DIR.exists(), f"Dev directory not found: {DEV_DIR}"
assert TEST_DIR.exists(), f"Test directory not found: {TEST_DIR}"

## 3. Load DocIE Data

In [3]:
def load_docie_docs(folder: Path, recursive: bool = False):
    """Load JSON documents from DocIE dataset."""
    docs = []
    pattern = "**/*.json" if recursive else "*.json"
    for file in folder.glob(pattern):
        data = json.loads(file.read_text(encoding="utf-8"))
        if isinstance(data, list):
            docs.extend(data)
        else:
            docs.append(data)
    return docs

train_docs = load_docie_docs(TRAIN_DIR)
dev_docs = load_docie_docs(DEV_DIR)
test_docs = load_docie_docs(TEST_DIR, recursive=True)

print(f"Loaded documents - Train: {len(train_docs)}, Dev: {len(dev_docs)}, Test: {len(test_docs)}")

Loaded documents - Train: 204, Dev: 88, Test: 248


## 4. Exploratory Data Analysis

In [7]:
# 4.1 Document lengths
lengths = [len(doc["doc"].split()) for doc in train_docs]
print(f"Document statistics - Avg tokens: {np.mean(lengths):.1f}, Max tokens: {np.max(lengths)}")

# 4.2 Entity distribution
entity_counter = Counter(ent["type"] for doc in train_docs for ent in doc["entities"])
print(f"\nTop 10 entity types:")
for entity_type, count in entity_counter.most_common(10):
    print(f"  {entity_type}: {count}")

Document statistics - Avg tokens: 839.2, Max tokens: 2571

Top 10 entity types:
  DATE: 2588
  MISC: 1668
  PERSON: 968
  ORG: 964
  CARDINAL: 896
  GPE: 628
  WORK_OF_ART: 260
  NORP: 236
  ORDINAL: 220
  QUANTITY: 168


## 5. Setup Label Mappings

In [8]:
# Get all entity types from train set
entity_types = train_docs[0]["entity_label_set"]

# Create BIO tags
ner_labels = ["O"]
for entity_type in entity_types:
    ner_labels.extend([f"B-{entity_type}", f"I-{entity_type}"])

label2id = {label: i for i, label in enumerate(ner_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Total NER labels: {len(ner_labels)}")

Total NER labels: 39


## 6. Initialize GPT-Neo Tokenizer

In [9]:
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# CRITICAL: Add padding token for GPT-Neo
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

print(f"Tokenizer vocab size: {len(tokenizer)}")

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Tokenizer vocab size: 50258


## 7. Tokenization with Label Alignment

In [10]:
max_length = 512
stride = 128

def tokenize_and_align_labels(examples):
    """Tokenize documents and align NER labels with subword tokens."""
    all_input_ids = []
    all_attention_mask = []
    all_labels = []
    
    for doc, entities in zip(examples["doc"], examples["entities"]):
        # Tokenize with overflow handling
        tokenized = tokenizer(
            doc,
            return_offsets_mapping=True,
            truncation=True,
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
        )
        
        # Process each chunk
        for i in range(len(tokenized["input_ids"])):
            offsets = tokenized["offset_mapping"][i]
            input_ids = tokenized["input_ids"][i]
            attention_mask = tokenized["attention_mask"][i]
            
            # Initialize with "O" labels
            chunk_labels = ["O"] * len(offsets)
            
            # Map entity mentions to token labels
            for entity in entities:
                entity_type = entity["type"]
                for mention in entity["mentions"]:
                    start = doc.find(mention)
                    if start < 0:
                        continue
                    end = start + len(mention)
                    
                    # Label tokens that overlap with entity mention
                    for idx, (token_start, token_end) in enumerate(offsets):
                        if token_start >= start and token_end <= end:
                            prefix = "B" if token_start == start else "I"
                            chunk_labels[idx] = f"{prefix}-{entity_type}"
            
            # Convert labels to IDs
            label_ids = [label2id.get(label, label2id["O"]) for label in chunk_labels]
            
            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_labels.append(label_ids)
    
    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
        "labels": all_labels,
    }

## 8. Create Hugging Face Datasets

In [None]:
# Convert to HF Dataset
hf_train = Dataset.from_list(train_docs)
hf_dev = Dataset.from_list(dev_docs)

# Define output features for the .map() function

output_features_for_map = Features({
    "input_ids": Sequence(Value("int32")),
    "attention_mask": Sequence(Value("int8")),
    "labels": Sequence(Value("int64")),
})

# Apply tokenization
# The .map() function will automatically remove columns that are not part of the new features
# if 'remove_columns' is not set, OR it will remove the specified ones AFTER processing.

# Columns that tokenize_and_align_labels consumes from the input
input_map_columns = ["doc", "entities"]
# Columns to remove AFTER the map operation (original columns that are not in output_features_for_map)
columns_to_remove_after_map = [
    col for col in hf_train.column_names if col not in output_features_for_map
]
# Ensure 'doc' and 'entities' are removed as they are processed into new features
if 'doc' not in columns_to_remove_after_map:
    columns_to_remove_after_map.append('doc')
if 'entities' not in columns_to_remove_after_map:
    columns_to_remove_after_map.append('entities')
columns_to_remove_after_map = list(set(columns_to_remove_after_map))


hf_train_tokenized = hf_train.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=100,
    remove_columns=columns_to_remove_after_map,
    features=output_features_for_map # Define the output structure
)

hf_dev_tokenized = hf_dev.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=100,
    remove_columns=columns_to_remove_after_map, # Adjust for dev if columns differ, usually same
    features=output_features_for_map # Define the output structure
)

hf_train = hf_train_tokenized
hf_dev = hf_dev_tokenized

print(f"Tokenized datasets - Train: {len(hf_train)}, Dev: {len(hf_dev)}")

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Tokenized datasets - Train: 627, Dev: 272


## 9. Setup Training Components

In [14]:
# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metrics computation
def compute_metrics_entity_only(pred):
    """Compute metrics only on entity tokens (non-O labels)."""
    preds = pred.predictions.argmax(-1).flatten()
    labels = pred.label_ids.flatten()
    
    # Filter out non-entity labels and padding
    mask = (labels != label2id["O"]) & (labels != -100)
    
    if mask.sum() == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average="micro"
    )
    
    return {"precision": precision, "recall": recall, "f1": f1}


## 10. Baseline: Full Fine-Tuning (3 epochs)

## 10.1 Train Baseline Model

In [16]:
def train_baseline():
    """Train GPT-Neo baseline for 3 epochs."""
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    
    # Resize embeddings to accommodate padding token
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    
    training_args = TrainingArguments(
        output_dir="outputs/gpt-neo-ner-baseline",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        learning_rate=3e-3,
        weight_decay=0.0,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    
    return trainer, metrics

baseline_trainer, baseline_metrics = train_baseline()
print(f"Baseline Dev F1: {baseline_metrics['eval_f1']:.4f}")

Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.457558,0.0,0.0,0.0
2,No log,0.400045,0.057704,0.057704,0.057704
3,No log,0.399712,0.004885,0.004885,0.004885




Baseline Dev F1: 0.0577


## 11. Hyperparameter Tuning with 100 Steps Budget

## 11.1 Full Fine-Tuning Hyperparameter Search

In [17]:
def ft_objective(trial):
    """Optuna objective for full fine-tuning."""
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    
    args = TrainingArguments(
        output_dir=f"tmp/gpt-neo-ft-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2,
        eval_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
        logging_steps=40,
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_ft = optuna.create_study(direction="maximize")
study_ft.optimize(ft_objective, n_trials=8)

print(f"Best Full-FT params: {study_ft.best_params}")
print(f"Best Full-FT Dev F1: {study_ft.best_value:.4f}")


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.658414,0.025443,0.025443,0.025443
40,1.523800,0.444792,0.058518,0.058518,0.058518
60,1.523800,0.384895,0.020558,0.020558,0.020558
80,0.573500,0.362635,0.02412,0.02412,0.02412
100,0.573500,0.355448,0.033279,0.033279,0.033279




[I 2025-05-29 16:20:09,552] Trial 0 finished with value: 0.033279055566863426 and parameters: {'learning_rate': 1.5596622305412592e-05, 'batch_size': 16}. Best is trial 0 with value: 0.033279055566863426.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.460811,0.005699,0.005699,0.005699
40,1.063700,0.35586,0.053023,0.053023,0.053023
60,1.063700,0.316568,0.116222,0.116222,0.116222
80,0.434200,0.302295,0.156015,0.156015,0.156015
100,0.434200,0.299728,0.175453,0.175453,0.175453




[I 2025-05-29 16:23:50,702] Trial 1 finished with value: 0.1754528801139833 and parameters: {'learning_rate': 3.200739245599043e-05, 'batch_size': 16}. Best is trial 1 with value: 0.1754528801139833.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.52876,0.002341,0.002341,0.002341
40,1.183600,0.401629,0.01608,0.01608,0.01608
60,1.183600,0.350814,0.061266,0.061266,0.061266
80,0.533200,0.337007,0.085589,0.085589,0.085589
100,0.533200,0.335211,0.080094,0.080094,0.080094




[I 2025-05-29 16:25:27,664] Trial 2 finished with value: 0.08009362914716059 and parameters: {'learning_rate': 2.6994081548659585e-05, 'batch_size': 4}. Best is trial 1 with value: 0.1754528801139833.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.458752,0.023611,0.023611,0.023611
40,1.123000,0.354063,0.034907,0.034907,0.034907
60,1.123000,0.321281,0.098412,0.098412,0.098412
80,0.417800,0.304699,0.143395,0.143395,0.143395
100,0.417800,0.305521,0.134541,0.134541,0.134541




[I 2025-05-29 16:27:44,679] Trial 3 finished with value: 0.13454101363728882 and parameters: {'learning_rate': 3.813637473006929e-05, 'batch_size': 8}. Best is trial 1 with value: 0.1754528801139833.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.421473,0.024323,0.024323,0.024323
40,1.014500,0.332058,0.097191,0.097191,0.097191
60,1.014500,0.305778,0.130979,0.130979,0.130979
80,0.397800,0.297594,0.157338,0.157338,0.157338
100,0.397800,0.294736,0.180236,0.180236,0.180236




[I 2025-05-29 16:31:25,368] Trial 4 finished with value: 0.1802361082841441 and parameters: {'learning_rate': 3.994900876547036e-05, 'batch_size': 16}. Best is trial 4 with value: 0.1802361082841441.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.496029,0.003155,0.003155,0.003155
40,1.122400,0.368362,0.040912,0.040912,0.040912
60,1.122400,0.325207,0.096784,0.096784,0.096784
80,0.452600,0.309977,0.132811,0.132811,0.132811
100,0.452600,0.30629,0.153063,0.153063,0.153063




[I 2025-05-29 16:35:06,452] Trial 5 finished with value: 0.15306330144514554 and parameters: {'learning_rate': 2.805721423233097e-05, 'batch_size': 16}. Best is trial 4 with value: 0.1802361082841441.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.867661,0.003867,0.003867,0.003867
40,1.681900,0.525399,0.003155,0.003155,0.003155
60,1.681900,0.447331,0.010279,0.010279,0.010279
80,0.660700,0.413107,0.015469,0.015469,0.015469
100,0.660700,0.404533,0.018828,0.018828,0.018828




[I 2025-05-29 16:37:24,161] Trial 6 finished with value: 0.01882760024424995 and parameters: {'learning_rate': 1.273275420070486e-05, 'batch_size': 8}. Best is trial 4 with value: 0.1802361082841441.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.976506,0.005496,0.005496,0.005496
40,1.848200,0.550722,0.004885,0.004885,0.004885
60,1.848200,0.451409,0.013332,0.013332,0.013332
80,0.715000,0.42238,0.013027,0.013027,0.013027
100,0.715000,0.415784,0.013943,0.013943,0.013943




[I 2025-05-29 16:39:01,011] Trial 7 finished with value: 0.01394260126195807 and parameters: {'learning_rate': 1.3076326876907804e-05, 'batch_size': 4}. Best is trial 4 with value: 0.1802361082841441.


Best Full-FT params: {'learning_rate': 3.994900876547036e-05, 'batch_size': 16}
Best Full-FT Dev F1: 0.1802


## 11.2 LoRA Hyperparameter Search

In [18]:
def lora_objective(trial):
    """Optuna objective for LoRA fine-tuning."""
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    r = trial.suggest_categorical("r", [4, 8, 16])
    alpha = trial.suggest_categorical("alpha", [16, 32])
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    
    lora_config = LoraConfig(
        task_type="TOKEN_CLS",
        inference_mode=False,
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
    )
    
    base_model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    base_model.resize_token_embeddings(len(tokenizer))
    base_model.config.pad_token_id = tokenizer.pad_token_id
    
    model = get_peft_model(base_model, lora_config)
    
    args = TrainingArguments(
        output_dir=f"tmp/gpt-neo-lora-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2,
        eval_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
        logging_steps=40,
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_lora = optuna.create_study(direction="maximize")
study_lora.optimize(lora_objective, n_trials=8)

print(f"Best LoRA params: {study_lora.best_params}")
print(f"Best LoRA Dev F1: {study_lora.best_value:.4f}")


[I 2025-05-29 16:39:01,020] A new study created in memory with name: no-name-798b3149-a545-4ce4-828e-aa799c05534e
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,5.570324,0.015266,0.015266,0.015266
40,5.323300,5.31819,0.017505,0.017505,0.017505
60,5.323300,5.12907,0.020354,0.020354,0.020354
80,4.867400,5.012,0.022695,0.022695,0.022695
100,4.867400,4.971854,0.023407,0.023407,0.023407




[I 2025-05-29 16:42:04,396] Trial 0 finished with value: 0.023407286790148585 and parameters: {'learning_rate': 1.8859282896255805e-05, 'r': 16, 'alpha': 16, 'dropout': 0.1599934449030607, 'batch_size': 16}. Best is trial 0 with value: 0.023407286790148585.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.382599,0.086302,0.086302,0.086302
40,0.910500,0.302471,0.129656,0.129656,0.129656
60,0.910500,0.281222,0.153674,0.153674,0.153674
80,0.360200,0.267198,0.168024,0.168024,0.168024
100,0.360200,0.264256,0.198962,0.198962,0.198962




[I 2025-05-29 16:44:01,006] Trial 1 finished with value: 0.19896193771626297 and parameters: {'learning_rate': 0.0006864436426020529, 'r': 16, 'alpha': 32, 'dropout': 0.13102133790278095, 'batch_size': 8}. Best is trial 1 with value: 0.19896193771626297.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,2.107285,0.026359,0.026359,0.026359
40,2.501400,0.64455,0.019744,0.019744,0.019744
60,2.501400,0.46542,0.031752,0.031752,0.031752
80,0.689500,0.422331,0.049155,0.049155,0.049155
100,0.689500,0.410419,0.047731,0.047731,0.047731




[I 2025-05-29 16:47:04,735] Trial 2 finished with value: 0.0477305108894769 and parameters: {'learning_rate': 0.00012983952880904096, 'r': 16, 'alpha': 32, 'dropout': 0.07319761422656419, 'batch_size': 16}. Best is trial 1 with value: 0.19896193771626297.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.709233,0.015469,0.015469,0.015469
40,1.592100,0.432424,0.042133,0.042133,0.042133
60,1.592100,0.376216,0.054651,0.054651,0.054651
80,0.561200,0.354376,0.068899,0.068899,0.068899
100,0.561200,0.350253,0.065337,0.065337,0.065337




[I 2025-05-29 16:48:30,035] Trial 3 finished with value: 0.06533686138815388 and parameters: {'learning_rate': 0.00024850857514170053, 'r': 4, 'alpha': 32, 'dropout': 0.09113061629333172, 'batch_size': 4}. Best is trial 1 with value: 0.19896193771626297.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,1.096762,0.021372,0.021372,0.021372
40,2.005200,0.473517,0.03562,0.03562,0.03562
60,2.005200,0.413774,0.043354,0.043354,0.043354
80,0.569100,0.390699,0.070832,0.070832,0.070832
100,0.569100,0.381135,0.060452,0.060452,0.060452




[I 2025-05-29 16:49:55,321] Trial 4 finished with value: 0.060451862405862 and parameters: {'learning_rate': 0.0003046668762171177, 'r': 16, 'alpha': 16, 'dropout': 0.04972267741305342, 'batch_size': 4}. Best is trial 1 with value: 0.19896193771626297.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,5.300737,0.017708,0.017708,0.017708
40,5.058400,4.8206,0.026359,0.026359,0.026359
60,5.058400,4.449072,0.032363,0.032363,0.032363
80,4.188400,4.214063,0.036841,0.036841,0.036841
100,4.188400,4.132644,0.038062,0.038062,0.038062




[I 2025-05-29 16:52:58,057] Trial 5 finished with value: 0.03806228373702422 and parameters: {'learning_rate': 3.491787560798341e-05, 'r': 8, 'alpha': 16, 'dropout': 0.202989670831165, 'batch_size': 16}. Best is trial 1 with value: 0.19896193771626297.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.952615,0.018013,0.018013,0.018013
40,3.890100,3.402635,0.023814,0.023814,0.023814
60,3.890100,2.986992,0.02585,0.02585,0.02585
80,2.925800,2.731315,0.025341,0.025341,0.025341
100,2.925800,2.644315,0.024934,0.024934,0.024934




[I 2025-05-29 16:56:00,993] Trial 6 finished with value: 0.024933848972114796 and parameters: {'learning_rate': 4.0562002433105186e-05, 'r': 8, 'alpha': 16, 'dropout': 0.19795698792925626, 'batch_size': 16}. Best is trial 1 with value: 0.19896193771626297.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,0.509303,0.030531,0.030531,0.030531
40,1.268100,0.357551,0.07816,0.07816,0.07816
60,1.268100,0.314594,0.098412,0.098412,0.098412
80,0.424200,0.297823,0.127621,0.127621,0.127621
100,0.424200,0.294599,0.136373,0.136373,0.136373




[I 2025-05-29 16:59:04,499] Trial 7 finished with value: 0.13637288825564828 and parameters: {'learning_rate': 0.0004178040450177898, 'r': 16, 'alpha': 16, 'dropout': 0.15023311250292662, 'batch_size': 16}. Best is trial 1 with value: 0.19896193771626297.


Best LoRA params: {'learning_rate': 0.0006864436426020529, 'r': 16, 'alpha': 32, 'dropout': 0.13102133790278095, 'batch_size': 8}
Best LoRA Dev F1: 0.1990


## 11.3 Partial Freezing Hyperparameter Search

In [19]:
def freeze_objective(trial):
    """Optuna objective for partial freezing."""
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    bs = trial.suggest_categorical("batch_size", [4, 8, 16])
    freeze_pct = trial.suggest_float("freeze_pct", 0.25, 0.75)
    
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(ner_labels),
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    
    # Freeze lower layers
    total_layers = len([n for n, _ in model.named_parameters() if n.startswith("transformer.h.")])
    cutoff = int(total_layers * freeze_pct)
    
    for name, param in model.named_parameters():
        if name.startswith("transformer.h.") and int(name.split(".")[2]) < cutoff:
            param.requires_grad = False
    
    args = TrainingArguments(
        output_dir=f"tmp/gpt-neo-freeze-{trial.number}",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2, 
        eval_strategy="steps",
        eval_steps=20,
        save_strategy="no",
        max_steps=100,
        learning_rate=lr,
        fp16=torch.cuda.is_available(),
        logging_steps=40,
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_train,
        eval_dataset=hf_dev,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_entity_only,
    )
    
    trainer.train()
    return trainer.evaluate()["eval_f1"]

study_freeze = optuna.create_study(direction="maximize")
study_freeze.optimize(freeze_objective, n_trials=8)

print(f"Best Freeze params: {study_freeze.best_params}")
print(f"Best Freeze Dev F1: {study_freeze.best_value:.4f}")

[I 2025-05-29 16:59:04,515] A new study created in memory with name: no-name-327b0011-996b-4004-9d16-5e4bfe956512
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,3.942537,0.016589,0.016589,0.016589
40,3.813800,3.429366,0.019947,0.019947,0.019947
60,3.813800,3.068757,0.022491,0.022491,0.022491
80,2.812900,2.85704,0.023611,0.023611,0.023611
100,2.812900,2.786121,0.02412,0.02412,0.02412




[I 2025-05-29 17:02:02,239] Trial 0 finished with value: 0.024119682475066146 and parameters: {'learning_rate': 3.2799657790412474e-05, 'batch_size': 16, 'freeze_pct': 0.5823635847865559}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.192076,0.015062,0.015062,0.015062
40,4.075700,3.859626,0.016283,0.016283,0.016283
60,4.075700,3.622548,0.018929,0.018929,0.018929
80,3.413400,3.481192,0.019744,0.019744,0.019744
100,3.413400,3.433424,0.019947,0.019947,0.019947




[I 2025-05-29 17:04:59,832] Trial 1 finished with value: 0.019947079177691836 and parameters: {'learning_rate': 2.0836059096300748e-05, 'batch_size': 16, 'freeze_pct': 0.7347109035053484}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.245159,0.015062,0.015062,0.015062
40,4.174000,3.947846,0.016589,0.016589,0.016589
60,4.174000,3.744111,0.018115,0.018115,0.018115
80,3.571000,3.620102,0.018828,0.018828,0.018828
100,3.571000,3.577064,0.019031,0.019031,0.019031




[I 2025-05-29 17:06:52,949] Trial 2 finished with value: 0.01903114186851211 and parameters: {'learning_rate': 1.9911060209753957e-05, 'batch_size': 8, 'freeze_pct': 0.39967121626549673}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.315043,0.018421,0.018421,0.018421
40,4.415400,3.922961,0.019845,0.019845,0.019845
60,4.415400,3.653962,0.021372,0.021372,0.021372
80,3.641400,3.490267,0.022084,0.022084,0.022084
100,3.641400,3.433566,0.021982,0.021982,0.021982




[I 2025-05-29 17:08:45,895] Trial 3 finished with value: 0.021982495420313455 and parameters: {'learning_rate': 2.5963730440346943e-05, 'batch_size': 8, 'freeze_pct': 0.40316731279059026}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.593669,0.016996,0.016996,0.016996
40,4.705000,4.412306,0.017912,0.017912,0.017912
60,4.705000,4.287029,0.018319,0.018319,0.018319
80,4.344400,4.210409,0.018726,0.018726,0.018726
100,4.344400,4.183796,0.018624,0.018624,0.018624




[I 2025-05-29 17:10:39,010] Trial 4 finished with value: 0.01862405861998779 and parameters: {'learning_rate': 1.1824810165667613e-05, 'batch_size': 8, 'freeze_pct': 0.2708083573380225}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.27182,0.018013,0.018013,0.018013
40,4.350200,3.853682,0.020049,0.020049,0.020049
60,4.350200,3.555363,0.02127,0.02127,0.02127
80,3.508600,3.376915,0.021575,0.021575,0.021575
100,3.508600,3.316618,0.021982,0.021982,0.021982




[I 2025-05-29 17:13:36,807] Trial 5 finished with value: 0.021982495420313455 and parameters: {'learning_rate': 2.5932819926246227e-05, 'batch_size': 16, 'freeze_pct': 0.26458001021709354}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.216228,0.01496,0.01496,0.01496
40,4.144300,3.89723,0.016283,0.016283,0.016283
60,4.144300,3.678923,0.018929,0.018929,0.018929
80,3.500000,3.546189,0.019133,0.019133,0.019133
100,3.500000,3.500149,0.019438,0.019438,0.019438




[I 2025-05-29 17:15:30,138] Trial 6 finished with value: 0.019438225117036433 and parameters: {'learning_rate': 2.1408496700657068e-05, 'batch_size': 8, 'freeze_pct': 0.5908479493424346}. Best is trial 0 with value: 0.024119682475066146.
Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
20,No log,4.405234,0.017708,0.017708,0.017708
40,4.501200,4.086965,0.019744,0.019744,0.019744
60,4.501200,3.857914,0.020151,0.020151,0.020151
80,3.940600,3.722417,0.020761,0.020761,0.020761
100,3.940600,3.677211,0.020965,0.020965,0.020965




[I 2025-05-29 17:16:53,226] Trial 7 finished with value: 0.020964787299002648 and parameters: {'learning_rate': 2.3563691462538734e-05, 'batch_size': 4, 'freeze_pct': 0.38411610730206336}. Best is trial 0 with value: 0.024119682475066146.


Best Freeze params: {'learning_rate': 3.2799657790412474e-05, 'batch_size': 16, 'freeze_pct': 0.5823635847865559}
Best Freeze Dev F1: 0.0241


## 12. Final Training with Optimal Hyperparameters

## 12.1 Full Fine-Tuning with Best Parameters

In [20]:
best_ft_params = study_ft.best_params

ft_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
ft_model.resize_token_embeddings(len(tokenizer))
ft_model.config.pad_token_id = tokenizer.pad_token_id

ft_args = TrainingArguments(
    output_dir="outputs/gpt-neo-ner-ft-final",
    per_device_train_batch_size=best_ft_params["batch_size"],
    per_device_eval_batch_size=best_ft_params["batch_size"] * 2,
    eval_strategy="steps",
    eval_steps=40,
    save_strategy="epoch",
    max_steps=200,
    learning_rate=best_ft_params["learning_rate"],
    fp16=torch.cuda.is_available(),
    logging_steps=40,
    report_to="none",
)

ft_trainer = Trainer(
    model=ft_model,
    args=ft_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

ft_trainer.train()
ft_final_metrics = ft_trainer.evaluate()
print(f"Final Full-FT Dev F1: {ft_final_metrics['eval_f1']:.4f}")

Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  ft_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
40,1.0719,0.328616,0.082434,0.082434,0.082434
80,0.3493,0.290784,0.184714,0.184714,0.184714
120,0.2268,0.308614,0.212192,0.212192,0.212192
160,0.1559,0.324837,0.231936,0.231936,0.231936
200,0.1215,0.337132,0.22471,0.22471,0.22471




Final Full-FT Dev F1: 0.2247


## 12.2 LoRA with Best Parameters

In [21]:
best_lora_params = study_lora.best_params

lora_config = LoraConfig(
    task_type="TOKEN_CLS",
    inference_mode=False,
    r=best_lora_params["r"],
    lora_alpha=best_lora_params["alpha"],
    lora_dropout=best_lora_params["dropout"],
)

base_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
base_model.resize_token_embeddings(len(tokenizer))
base_model.config.pad_token_id = tokenizer.pad_token_id

lora_model = get_peft_model(base_model, lora_config)

lora_args = TrainingArguments(
    output_dir="outputs/gpt-neo-ner-lora-final",
    per_device_train_batch_size=best_lora_params["batch_size"],
    per_device_eval_batch_size=best_lora_params["batch_size"] * 2,
    eval_strategy="steps",
    eval_steps=40,
    save_strategy="epoch",
    max_steps=200,
    learning_rate=best_lora_params["learning_rate"],
    fp16=torch.cuda.is_available(),
    logging_steps=40,
    report_to="none",
)

lora_trainer = Trainer(
    model=lora_model,
    args=lora_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

lora_trainer.train()
lora_final_metrics = lora_trainer.evaluate()
print(f"Final LoRA Dev F1: {lora_final_metrics['eval_f1']:.4f}")

Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  lora_trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Precision,Recall,F1
40,1.0175,0.314297,0.114085,0.114085,0.114085
80,0.349,0.262005,0.209953,0.209953,0.209953
120,0.2915,0.252488,0.227254,0.227254,0.227254
160,0.2643,0.257544,0.235905,0.235905,0.235905
200,0.2454,0.255932,0.232241,0.232241,0.232241




Final LoRA Dev F1: 0.2322


## 12.3 Partial Freezing with Best Parameters

In [23]:
import shutil
from pathlib import Path
import os

def remove_dir_if_exists(dir_path_str):
    dir_path = Path(dir_path_str)
    if dir_path.exists() and dir_path.is_dir():
        print(f"Removing directory: {dir_path}")
        shutil.rmtree(dir_path)
    else:
        print(f"Directory not found or not a directory: {dir_path}")

print("Attempting to clean up space...")

# --- Optuna Temporary Directories ---
# These are generally safe to remove if the study_*.optimize() calls for FT and LoRA are complete
# and you have stored study_ft.best_params and study_lora.best_params.
# Be cautious with tmp/gpt-neo-freeze-* if you might want to resume the freeze HPO.
# However, the error occurs in the FINAL freeze training, so HPO for freeze should be done.
print("\\nCleaning Optuna temporary directories...")
for i in range(8): # Assuming max 8 trials based on your n_trials=8
    remove_dir_if_exists(f"/kaggle/working/tmp/gpt-neo-ft-{i}")
    remove_dir_if_exists(f"/kaggle/working/tmp/gpt-neo-lora-{i}")
    remove_dir_if_exists(f"/kaggle/working/tmp/gpt-neo-freeze-{i}")


# --- Checkpoint Directories for COMPLETED Final Training Runs ---
# Only remove these if the corresponding Python variables holding metrics are populated
# (e.g., baseline_metrics, ft_final_metrics, lora_final_metrics)
# and you don't need the actual saved model files for later loading.
# The summary table uses the Python variables for metrics.

print("\\nCleaning completed final model output directories (checkpoints)...")
if 'baseline_metrics' in globals() and baseline_metrics:
    print("Baseline training complete, metrics available. Removing its output directory.")
    remove_dir_if_exists("/kaggle/working/outputs/gpt-neo-ner-baseline/")
else:
    print("Baseline metrics not found, skipping cleanup of its output directory.")

if 'ft_final_metrics' in globals() and ft_final_metrics:
    print("Full FT final training complete, metrics available. Removing its output directory.")
    remove_dir_if_exists("/kaggle/working/outputs/gpt-neo-ner-ft-final/")
else:
    print("Full FT final metrics not found, skipping cleanup of its output directory.")

if 'lora_final_metrics' in globals() and lora_final_metrics:
    print("LoRA final training complete, metrics available. Removing its output directory.")
    remove_dir_if_exists("/kaggle/working/outputs/gpt-neo-ner-lora-final/")
else:
    print("LoRA final metrics not found, skipping cleanup of its output directory.")

# DO NOT remove /kaggle/working/outputs/gpt-neo-ner-freeze-final/ yet, as that's the one currently failing.

print("\\nDisk space after cleanup attempt:")
!df -h /kaggle/working/

Attempting to clean up space...
\nCleaning Optuna temporary directories...
Removing directory: /kaggle/working/tmp/gpt-neo-ft-0
Removing directory: /kaggle/working/tmp/gpt-neo-lora-0
Removing directory: /kaggle/working/tmp/gpt-neo-freeze-0
Removing directory: /kaggle/working/tmp/gpt-neo-ft-1
Removing directory: /kaggle/working/tmp/gpt-neo-lora-1
Removing directory: /kaggle/working/tmp/gpt-neo-freeze-1
Removing directory: /kaggle/working/tmp/gpt-neo-ft-2
Removing directory: /kaggle/working/tmp/gpt-neo-lora-2
Removing directory: /kaggle/working/tmp/gpt-neo-freeze-2
Removing directory: /kaggle/working/tmp/gpt-neo-ft-3
Removing directory: /kaggle/working/tmp/gpt-neo-lora-3
Removing directory: /kaggle/working/tmp/gpt-neo-freeze-3
Removing directory: /kaggle/working/tmp/gpt-neo-ft-4
Removing directory: /kaggle/working/tmp/gpt-neo-lora-4
Removing directory: /kaggle/working/tmp/gpt-neo-freeze-4
Removing directory: /kaggle/working/tmp/gpt-neo-ft-5
Removing directory: /kaggle/working/tmp/gpt-neo

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
best_freeze_params = study_freeze.best_params

freeze_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id,
)
freeze_model.resize_token_embeddings(len(tokenizer))
freeze_model.config.pad_token_id = tokenizer.pad_token_id

# Apply freezing
total_layers = len([n for n, _ in freeze_model.named_parameters() if n.startswith("transformer.h.")])
cutoff = int(total_layers * best_freeze_params["freeze_pct"])

for name, param in freeze_model.named_parameters():
    if name.startswith("transformer.h.") and int(name.split(".")[2]) < cutoff:
        param.requires_grad = False

freeze_args = TrainingArguments(
    output_dir="outputs/gpt-neo-ner-freeze-final",
    per_device_train_batch_size=best_freeze_params["batch_size"],
    per_device_eval_batch_size=best_freeze_params["batch_size"] * 2,
    eval_strategy="steps",
    eval_steps=40,
    save_strategy="epoch",
    max_steps=200,
    learning_rate=best_freeze_params["learning_rate"],
    fp16=torch.cuda.is_available(),
    logging_steps=40,
    report_to="none",
)

freeze_trainer = Trainer(
    model=freeze_model,
    args=freeze_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_entity_only,
)

freeze_trainer.train()
freeze_final_metrics = freeze_trainer.evaluate()
print(f"Final Freeze Dev F1: {freeze_final_metrics['eval_f1']:.4f}")


Some weights of GPTNeoForTokenClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  freeze_trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
40,4.3341,4.01368,0.030328,0.030328,0.030328
80,2.9323,2.943712,0.023306,0.023306,0.023306
120,1.9595,2.22508,0.018624,0.018624,0.018624
160,1.4136,1.830544,0.01781,0.01781,0.01781
200,1.1946,1.707416,0.017301,0.017301,0.017301




Final Freeze Dev F1: 0.0173


## 13. Results Summary

## 13.1 Compile Results

In [25]:
results = {
    "GPT-Neo NER Results": {
        "Baseline (3 epochs)": {
            "Dev F1": baseline_metrics["eval_f1"],
            "Parameters": "Full model (~125M)"
        },
        "Full Fine-Tuning (200 steps)": {
            "Dev F1": ft_final_metrics["eval_f1"],
            "Best Params": best_ft_params,
            "Parameters": "Full model (~125M)"
        },
        "LoRA (200 steps)": {
            "Dev F1": lora_final_metrics["eval_f1"],
            "Best Params": best_lora_params,
            "Parameters": f"~{best_lora_params['r'] * 2 * 768 / 1e6:.2f}M trainable"
        },
        "Partial Freezing (200 steps)": {
            "Dev F1": freeze_final_metrics["eval_f1"],
            "Best Params": best_freeze_params,
            "Parameters": f"~{(1 - best_freeze_params['freeze_pct']) * 125:.1f}M trainable"
        }
    }
}

print("\n" + "="*50)
print("GPT-Neo NER RESULTS SUMMARY")
print("="*50)
for method, metrics in results["GPT-Neo NER Results"].items():
    print(f"\n{method}:")
    print(f"  Dev F1: {metrics['Dev F1']:.4f}")
    print(f"  Trainable Parameters: {metrics['Parameters']}")
    if "Best Params" in metrics:
        print(f"  Best Hyperparameters: {metrics['Best Params']}")


GPT-Neo NER RESULTS SUMMARY

Baseline (3 epochs):
  Dev F1: 0.0577
  Trainable Parameters: Full model (~125M)

Full Fine-Tuning (200 steps):
  Dev F1: 0.2247
  Trainable Parameters: Full model (~125M)
  Best Hyperparameters: {'learning_rate': 3.994900876547036e-05, 'batch_size': 16}

LoRA (200 steps):
  Dev F1: 0.2322
  Trainable Parameters: ~0.02M trainable
  Best Hyperparameters: {'learning_rate': 0.0006864436426020529, 'r': 16, 'alpha': 32, 'dropout': 0.13102133790278095, 'batch_size': 8}

Partial Freezing (200 steps):
  Dev F1: 0.0173
  Trainable Parameters: ~52.2M trainable
  Best Hyperparameters: {'learning_rate': 3.2799657790412474e-05, 'batch_size': 16, 'freeze_pct': 0.5823635847865559}


## 13.2 Save Results

In [26]:
import json

with open("outputs/gpt_neo_ner_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("\nResults saved to outputs/gpt_neo_ner_results.json")



Results saved to outputs/gpt_neo_ner_results.json
