In [2]:
!pip install torchcrf
!pip install transformers datasets seqeval pandas openpyxl
!pip install evaluate
!pip install tf-keras
# pip install numpy==1.24.3
!pip install --upgrade scipy tensorflow scikit-learn
!pip install tiktoken
!pip install sentencepiece
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable
Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Installing collected packages: torchcrf
Successfully installed torchcrf-1.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting openpyxl
  Downloading openpyxl-3.1.5-

In [6]:
import json
import pandas as pd
import numpy as np
import uuid
from datasets import Dataset, DatasetDict, load_dataset
import torch
from torch import nn
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertTokenizerFast,
    AdamW,
    get_scheduler,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

import uuid
import gc
import optuna
from transformers import EarlyStoppingCallback


In [None]:
# Global label mappings will be set by the dataset loader
id2label = {}
label2id = {}


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels from inputs
        labels = inputs.pop("labels")
        # Forward pass: get outputs and logits
        outputs = model(**inputs)
        logits = outputs["logits"]
        # Use your custom loss function (here CrossEntropyLoss is used)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

class NERDataset:
    def __init__(self, json_path: str):
        self.json_path = json_path
        self.dataset = None

    def create_dataset(self) -> DatasetDict:
        js = pd.read_json(self.json_path, encoding="utf-8")
        tokens_flat = []
        labels_flat = []
        sentence_ids_flat = []

        for _, row in js.iterrows():
            tokens = row["tokens"]
            labels = row["labels"]
            sentence_id = row["sentence_id"]  # This is an integer
            if len(tokens) == len(labels):
                tokens_flat.extend(tokens)
                labels_flat.extend(labels)
                sentence_ids_flat.extend([sentence_id] * len(tokens))
            else:
                print(f"Skipping sentence_id {sentence_id} due to mismatched lengths: {len(tokens)} tokens vs {len(labels)} labels")

        assert len(tokens_flat) == len(labels_flat) == len(sentence_ids_flat), "Mismatch in list lengths!"

        global id2label, label2id
        unique_labels = list(set(labels_flat))
        id2label = {idx: label for idx, label in enumerate(unique_labels)}
        label2id = {label: idx for idx, label in enumerate(unique_labels)}
        labels_numeric_flat = [label2id[label] for label in labels_flat]

        dataset_dict = {
            "tokens": tokens_flat,
            "ner_tags": labels_flat,
            "sentence_id": sentence_ids_flat,
            "labels_numeric": labels_numeric_flat
        }
        dataset = Dataset.from_dict(dataset_dict)
        dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
        self.dataset = DatasetDict({
            "train": dataset_split["train"],
            "test": dataset_split["test"]
        })

        print("Sample training data:", self.dataset["train"][0])
        print(f"Training set size: {len(self.dataset['train'])}")
        print(f"Test set size: {len(self.dataset['test'])}")
        return self.dataset


class TokenizerAligner:
    def __init__(self, model_name: str):
        try:
            if "roberta" in model_name.lower():
                # For RoBERTa models, add_prefix_space is required.
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        except ValueError as e:
            print(f"Fast tokenizer loading failed for {model_name} with error: {e}. Falling back to slow tokenizer.")
            if "roberta" in model_name.lower():
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True, use_fast=False)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    def tokenize_and_align_labels(self, dataset: DatasetDict) -> DatasetDict:
        def tokenize_fn(batch):
            def align_target(labels, word_ids):
                align_labels = []
                for word in word_ids:
                    if word is None:
                        label = -100  # For special tokens like [CLS] and [SEP]
                    else:
                        label = labels[word]
                    align_labels.append(label)
                return align_labels

            tokenized_inputs = self.tokenizer(
                batch["tokens"],
                truncation=True,
                padding=True,
                is_split_into_words=True,
                max_length=512,
                return_tensors="np"
            )
            labels_batch = batch["labels_numeric"]
            # Align labels with tokenized outputs (using word_ids from the first example)
            aligned_targets_batch = [align_target(labels_batch, tokenized_inputs.word_ids())]
            tokenized_inputs["labels"] = aligned_targets_batch
            return tokenized_inputs

        tokenized_datasets = dataset.map(tokenize_fn, batched=True, remove_columns=dataset['train'].column_names)
        tokenized_datasets.set_format("torch")
        print("Tokenization and alignment completed.")
        return tokenized_datasets



def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)  # Get the predicted labels
    accuracy = (predictions == labels).mean()  # Basic accuracy
    return {"eval_accuracy": accuracy}


class NERTrainer:
    def __init__(self, model_name: str, tokenizer, tokenized_datasets: DatasetDict):
        self.model_name = model_name
        self.tokenizer = tokenizer
        self.tokenized_datasets = tokenized_datasets
        self.data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        self.model.to(self.device)

    def train(self, hyperparams=None):
        if hyperparams is None:
            hyperparams = {
                "learning_rate": 3e-5,
                "per_device_train_batch_size": 8,
                "per_device_eval_batch_size": 8,
                "num_train_epochs": 100,
                "weight_decay": 0.01,
                "no_cuda": False,
                "output_dir": "fine_tuned_model",
                "gradient_accumulation_steps": 2,
                "optim": "adamw_torch",
                "lr_scheduler_type": "linear"
            }
        
        training_args = TrainingArguments(
            output_dir=hyperparams.get("output_dir", "fine_tuned_model"),
            evaluation_strategy="epoch",
            remove_unused_columns=False,
            learning_rate=hyperparams["learning_rate"],
            per_device_train_batch_size=hyperparams["per_device_train_batch_size"],
            per_device_eval_batch_size=hyperparams["per_device_eval_batch_size"],
            num_train_epochs=hyperparams["num_train_epochs"],
            weight_decay=hyperparams["weight_decay"],
            no_cuda=hyperparams["no_cuda"],
            gradient_accumulation_steps=hyperparams["gradient_accumulation_steps"],
            optim=hyperparams["optim"],
            lr_scheduler_type=hyperparams["lr_scheduler_type"],
            metric_for_best_model="eval_accuracy",
        )
    
        trainer = CustomTrainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets['train'],
            eval_dataset=self.tokenized_datasets['test'],
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator,
        )
    
        trainer.train()
        return trainer


def log_results_to_excel(results, excel_path="model_results.xlsx"):
    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        df = pd.DataFrame(columns=["Model", "Hyperparameters", "Accuracy"])

    new_data = pd.DataFrame(results, columns=["Model", "Hyperparameters", "Accuracy"])
    df = pd.concat([df, new_data], ignore_index=True)
    df.to_excel(excel_path, index=False)
    print(f"Results logged to {excel_path}")


def main():
    model_names = [
        "ProsusAI/finbert",
        "nbroad/ESG-BERT",
        "dbmdz/bert-large-cased-finetuned-conll03-english",
        "bert-large-cased",
        "dmis-lab/biobert-base-cased-v1.1",
    ]
    
    json_path = "Augmented_Annotated_JSON_1103.json"
    dataset_loader = NERDataset(json_path)
    dataset = dataset_loader.create_dataset()

    results = []

    for model_name in model_names:
        print(f"\n=== Training model: {model_name} ===")
        
        tokenizer_aligner = TokenizerAligner(model_name)
        tokenized_datasets = tokenizer_aligner.tokenize_and_align_labels(dataset)
        
        # Use Optuna for hyperparameter search
        def objective(trial):
            learning_rate = trial.suggest_categorical("learning_rate", [0.00001, 0.0001, 0.001])
            num_train_epochs = trial.suggest_categorical("num_train_epochs", [350, 400, 450, 500, 550, 600])
            per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32, 64])
            gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4, 8])
            optim_choice = trial.suggest_categorical("optim", ["adamw_torch", "adamw_hf"])
            weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
            
            trial_params = {
                "learning_rate": learning_rate,
                "per_device_train_batch_size": per_device_train_batch_size,
                "per_device_eval_batch_size": per_device_train_batch_size,  # Same as training batch size
                "num_train_epochs": num_train_epochs,
                "weight_decay": weight_decay,
                "no_cuda": False,
                "output_dir": "temp_model_" + uuid.uuid4().hex[:8],
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "optim": optim_choice,
                "lr_scheduler_type": "linear"
            }
            
            ner_trainer = NERTrainer(model_name, tokenizer_aligner.tokenizer, tokenized_datasets)
            trainer = ner_trainer.train(trial_params)
            eval_result = trainer.evaluate()
            accuracy = eval_result.get("eval_accuracy", 0)
            # We return 1 - accuracy to have a minimization objective.
            return accuracy

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)
        best_params = study.best_trial.params
        # Overriding output directory and ensuring eval batch size consistency.
        best_params["output_dir"] = "best_hyperparameters"
        best_params["per_device_eval_batch_size"] = best_params["per_device_train_batch_size"]
        
        print("Best hyperparameters found:", best_params)
        
        ner_trainer = NERTrainer(model_name, tokenizer_aligner.tokenizer, tokenized_datasets)
        trainer = ner_trainer.train(best_params)
        eval_result = trainer.evaluate()
        accuracy = eval_result.get("eval_accuracy", 0)
        results.append((model_name, json.dumps(best_params), accuracy))
        print(f"Final evaluation accuracy for {model_name}: {accuracy}")

        print(f'Model {model_name} trained. Saving results and starting to train another model.')

    log_results_to_excel(results)
        
        # final_model_name = "nl_thesis_" + model_name.split("/")[-1]
        # ner_trainer.model.save_pretrained(final_model_name)
        # tokenizer_aligner.tokenizer.save_pretrained(final_model_name)
        
        # try:
        #     ner_trainer.model.push_to_hub(final_model_name)
        #     tokenizer_aligner.tokenizer.push_to_hub(final_model_name)
        #     print(f"Model {final_model_name} pushed to Hugging Face Hub.")
        # except Exception as e:
        #     print("Error pushing model to Hugging Face Hub:", e)
        
        # # Clear cache after training each model
        # torch.cuda.empty_cache()
        # gc.collect()
    
    

if __name__ == "__main__":
    main()

Sample training data: {'tokens': 'goods', 'ner_tags': 'I-KPI', 'sentence_id': 58.32, 'labels_numeric': 42}
Training set size: 100857
Test set size: 25215

=== Training model: dbmdz/bert-large-cased-finetuned-conll03-english ===


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Map:   0%|          | 0/100857 [00:00<?, ? examples/s]

Map:   0%|          | 0/25215 [00:00<?, ? examples/s]

[I 2025-03-22 21:26:43,846] A new study created in memory with name: no-name-b1569cb4-962b-4c64-b6d0-a4e70be8544a


Tokenization and alignment completed.


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([43]) in the model instanti

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.710758,0.43119
2,No log,2.348292,0.431716
3,No log,2.116721,0.504357
4,No log,1.925434,0.49692
5,No log,1.782619,0.527494
6,No log,1.676702,0.537936
7,No log,1.558903,0.563852
8,No log,1.463937,0.566031
9,No log,1.366601,0.577073
10,No log,1.29369,0.60021


[I 2025-03-22 22:29:58,254] Trial 0 finished with value: 0.3233924278846154 and parameters: {'learning_rate': 7.71365876340364e-06, 'num_train_epochs': 500, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'optim': 'adamw_hf', 'weight_decay': 0.03128377522430548}. Best is trial 0 with value: 0.3233924278846154.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from th

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.947848,0.004207
2,No log,3.911206,0.004282
3,No log,3.875153,0.004657
4,No log,3.839263,0.004507
5,No log,3.803383,0.004507
6,No log,3.767152,0.004357
7,No log,3.730975,0.004357
8,No log,3.694676,0.004507
9,No log,3.658369,0.004883
10,No log,3.622309,0.006235


[I 2025-03-22 23:34:29,901] Trial 1 finished with value: 0.37439903846153844 and parameters: {'learning_rate': 1.0980604469293178e-06, 'num_train_epochs': 550, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4, 'optim': 'adamw_hf', 'weight_decay': 0.03742771077352043}. Best is trial 0 with value: 0.3233924278846154.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.808263,0.430814
2,No log,2.513024,0.430889
3,No log,2.303953,0.451322
4,No log,2.147233,0.50338
5,No log,2.067343,0.500526
6,No log,1.996776,0.500977
7,No log,1.921678,0.505108
8,No log,1.889361,0.499023
9,No log,1.816519,0.515475
10,No log,1.75933,0.525541


[I 2025-03-23 00:21:22,831] Trial 2 finished with value: 0.32692307692307687 and parameters: {'learning_rate': 2.868077711925507e-05, 'num_train_epochs': 400, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 8, 'optim': 'adamw_torch', 'weight_decay': 0.050774669362868466}. Best is trial 0 with value: 0.3233924278846154.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification f

Epoch,Training Loss,Validation Loss
