In [2]:
!pip install torchcrf
!pip install transformers datasets seqeval pandas openpyxl
!pip install evaluate
!pip install tf-keras
# pip install numpy==1.24.3
!pip install --upgrade scipy tensorflow scikit-learn
!pip install tiktoken
!pip install sentencepiece
!pip install tf-keras
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable
Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Installing collected packages: torchcrf
Successfully installed torchcrf-1.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting openpyxl
  Downloading openpyxl-3.1.5-

In [3]:
# pip show torchcrf

In [4]:
import os
import uuid
import json
import optuna
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments
)
from torchcrf import CRF  # Ensure you have installed torchcrf (pip install torchcrf)
import torch.nn.functional as F

ModuleNotFoundError: No module named 'optuna'

In [3]:


# Global label mappings will be set by the dataset loader
id2label = {}
label2id = {}



class CustomLossTrainer(Trainer):
    
    def compute_loss(self, model, inputs, return_outputs=False):
        # Use your custom loss function here
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs["logits"]
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        Overriding prediction_step so that if predictions are not torch.Tensor
        (e.g. lists from CRF decode), we simply return them as is without padding.
        """
        has_labels = "labels" in inputs
        labels = inputs.get("labels") if has_labels else None

        # Prepare inputs
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs.get("loss", None)
            predictions = outputs.get("predictions", None)

        # If predictions are not torch.Tensor (for example, a list of lists), then
        # return them directly without trying to pad across processes.
        if predictions is not None and not isinstance(predictions, torch.Tensor):
            return (loss, predictions, labels)
        else:
            # Otherwise, use the default behavior.
            return (loss, self.accelerator.pad_across_processes(predictions, dim=1, pad_index=-100), labels)

#############################################
# Custom Dataset Loader
#############################################
class NERDataset:
    def __init__(self, json_path: str):
        self.json_path = json_path
        self.dataset = None

    def create_dataset(self) -> DatasetDict:
        js = pd.read_json(self.json_path, encoding="utf-8")
        tokens_flat = []
        labels_flat = []
        sentence_ids_flat = []

        for _, row in js.iterrows():
            tokens = row["tokens"]
            labels = row["labels"]
            sentence_id = row["sentence_id"]  # This is an integer
            if len(tokens) == len(labels):
                tokens_flat.append(tokens)  # Keep sentences as lists
                labels_flat.append(labels)
                sentence_ids_flat.append(sentence_id)
            else:
                print(f"Skipping sentence_id {sentence_id} due to mismatched lengths: {len(tokens)} tokens vs {len(labels)} labels")

        # Check that all lists have the same number of sentences
        assert len(tokens_flat) == len(labels_flat) == len(sentence_ids_flat), "Mismatch in list lengths!"

        global id2label, label2id
        unique_labels = list(set([label for sent_labels in labels_flat for label in sent_labels]))
        id2label = {idx: label for idx, label in enumerate(unique_labels)}
        label2id = {label: idx for idx, label in enumerate(unique_labels)}
        
        # Convert labels to numeric lists per sentence
        labels_numeric = [[label2id[label] for label in sent_labels] for sent_labels in labels_flat]

        dataset_dict = {
            "tokens": tokens_flat,
            "ner_tags": labels_flat,
            "sentence_id": sentence_ids_flat,
            "labels_numeric": labels_numeric
        }
        dataset = Dataset.from_dict(dataset_dict)
        dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
        self.dataset = DatasetDict({
            "train": dataset_split["train"],
            "test": dataset_split["test"]
        })

        print("Sample training data:", self.dataset["train"][0])
        print(f"Training set size: {len(self.dataset['train'])}")
        print(f"Test set size: {len(self.dataset['test'])}")
        return self.dataset

#############################################
# Tokenizer and Label Aligner
#############################################
class TokenizerAligner:
    def __init__(self, model_name: str):
        try:
            if "roberta" in model_name.lower():
                # For RoBERTa models, add_prefix_space is required.
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        except ValueError as e:
            print(f"Fast tokenizer loading failed for {model_name} with error: {e}. Falling back to slow tokenizer.")
            if "roberta" in model_name.lower():
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True, use_fast=False)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    def tokenize_and_align_labels(self, dataset: DatasetDict) -> DatasetDict:
        def tokenize_fn(batch):
            tokenized_inputs = self.tokenizer(
                batch["tokens"],
                truncation=True,
                padding=True,
                is_split_into_words=True,
                max_length=512,
                return_tensors="np"
            )
            # Align labels: for each sentence in the batch
            all_aligned_labels = []
            for i in range(len(batch["tokens"])):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                labels = batch["labels_numeric"][i]
                aligned_labels = []
                for word_idx in word_ids:
                    if word_idx is None:
                        aligned_labels.append(-100)  # For special tokens
                    else:
                        aligned_labels.append(labels[word_idx])
                all_aligned_labels.append(aligned_labels)
            tokenized_inputs["labels"] = all_aligned_labels
            return tokenized_inputs

        tokenized_datasets = dataset.map(tokenize_fn, batched=True, remove_columns=dataset["train"].column_names)
        tokenized_datasets.set_format("torch")
        print("Tokenization and alignment completed.")
        return tokenized_datasets

#############################################
# Custom CRF-based Model
#############################################
class CRFNER(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.hidden2tag = nn.Linear(self.base_model.config.hidden_size, num_labels)
        # TorchCRF does not support batch_first so we remove that flag
        self.crf = CRF(num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        if token_type_ids is not None and hasattr(self.base_model, "embeddings") and hasattr(self.base_model.embeddings, "token_type_embeddings"):
            outputs = self.base_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        else:
            outputs = self.base_model(input_ids, attention_mask=attention_mask)
        
        sequence_output = self.dropout(outputs.last_hidden_state)  # (batch, seq_len, hidden)
        emissions = self.hidden2tag(sequence_output)              # (batch, seq_len, num_labels)
        
        # Transpose emissions to (seq_len, batch, num_labels) for TorchCRF
        emissions = emissions.transpose(0, 1)
        mask = attention_mask.byte().transpose(0, 1) if attention_mask is not None else None
    
        if labels is not None:
            labels = torch.where(labels == -100, torch.tensor(0, device=labels.device), labels)
            labels = labels.transpose(0, 1)
            log_likelihood = self.crf(emissions, labels, mask=mask)
            loss = -log_likelihood.mean()
            return {"loss": loss, "logits": emissions.transpose(0, 1)}
        else:
            predictions = self.crf.decode(emissions, mask=mask)
            return {"logits": emissions.transpose(0, 1), "predictions": predictions}

#############################################
# Trainer and Metrics Functions
#############################################


def to_scalar(x):
    arr = np.array(x)
    if arr.size == 1:
        return int(arr.item())
    else:
        return int(arr.flatten()[0])

def compute_metrics(eval_pred):
    predictions, label_ids = eval_pred
    if isinstance(predictions, dict):
        predictions = predictions.get("predictions", predictions.get("logits"))
    
    if isinstance(label_ids, torch.Tensor):
        label_ids = label_ids.tolist()
    
    total_tokens = 0
    correct_tokens = 0
    
    for pred_seq, true_seq in zip(predictions, label_ids):
        for p, t in zip(pred_seq, true_seq):
            if to_scalar(t) != -100:
                total_tokens += 1
                if to_scalar(p) == to_scalar(t):
                    correct_tokens += 1
                    
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    return {"eval_accuracy": accuracy}

#############################################
# NER Trainer Class (with hyperparameter overrides)
#############################################
class NERTrainer:
    def __init__(self, model_name: str, tokenizer, tokenized_datasets, use_crf=True):
        self.model_name = model_name
        self.tokenizer = tokenizer
        self.tokenized_datasets = tokenized_datasets
        self.data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if use_crf:
            num_labels = len(id2label)
            self.model = CRFNER(model_name, num_labels)
        else:
            self.model = AutoModelForTokenClassification.from_pretrained(
                model_name,
                id2label=id2label,
                label2id=label2id,
                ignore_mismatched_sizes=True
            )
        self.model.to(self.device)

    def train(self, training_args_overrides=None):
        default_args = {
            "output_dir": "fine_tuned_model",
            "evaluation_strategy": "epoch",
            "remove_unused_columns": False,
            "learning_rate": 3e-5,
            "per_device_train_batch_size": 8,
            "per_device_eval_batch_size": 8,
            "num_train_epochs": 100,
            "weight_decay": 0.01,
            "no_cuda": False,
            "gradient_accumulation_steps": 1,
            "optim": "adamw_torch"
        }
        if training_args_overrides:
            default_args.update(training_args_overrides)
        training_args = TrainingArguments(**default_args)
        
        trainer = CustomLossTrainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets['train'],
            eval_dataset=self.tokenized_datasets['test'],
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator,
        )
        trainer.train()
        return trainer

#############################################
# Function to Log Results to Excel
#############################################
def log_results_to_excel(results, excel_path="model_results_crf.xlsx"):
    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        df = pd.DataFrame(columns=["Model", "Hyperparameters", "Accuracy"])

    new_data = pd.DataFrame(results, columns=["Model", "Hyperparameters", "Accuracy"])
    df = pd.concat([df, new_data], ignore_index=True)
    df.to_excel(excel_path, index=False)
    print(f"Results logged to {excel_path}")

#############################################
# Main Function with Optuna Hyperparameter Tuning
#############################################
def main():
    # List of models to evaluate
    model_names = [
        "bert-large-uncased",
        "dmis-lab/biobert-base-cased-v1.1",
        "ProsusAI/finbert",
        "nbroad/ESG-BERT",
        "dbmdz/bert-large-cased-finetuned-conll03-english"    ]
    
    json_path = "Augmented_Annotated_JSON_1103.json"
    dataset_loader = NERDataset(json_path)
    dataset = dataset_loader.create_dataset()

    results = []

    for model_name in model_names:
        print(f"\n=== Training model with CRF layer: {model_name} ===")
        
        # Initialize tokenizer and aligner for current model
        tokenizer_aligner = TokenizerAligner(model_name)
        tokenized_datasets = tokenizer_aligner.tokenize_and_align_labels(dataset)
        
        # -------------------------------
        # Hyperparameter Optimization via Optuna
        # -------------------------------
        def objective(trial):
            learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-4)
            per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32, 64])
            num_train_epochs = trial.suggest_categorical("num_train_epochs", [350, 400, 450, 500, 550, 600])
            weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
            gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4, 8])
            optim_choice = trial.suggest_categorical("optim", ["adamw_torch", "adamw_hf", "adamw_apex_fused", "adamw_torch_fused"])

            overrides = {
                "learning_rate": learning_rate,
                "per_device_train_batch_size": per_device_train_batch_size,
                "num_train_epochs": num_train_epochs,
                "weight_decay": weight_decay,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "optim": optim_choice,
                "output_dir": f"tmp_{model_name.replace('/', '_')}_{str(uuid.uuid4())}"
            }
            ner_trainer_hp = NERTrainer(model_name, tokenizer_aligner.tokenizer, tokenized_datasets, use_crf=True)
            trainer_hp = ner_trainer_hp.train(training_args_overrides=overrides)
            eval_results = trainer_hp.evaluate()
            accuracy = eval_results.get("eval_accuracy", 0.0)
            print("Optuna eval accuracy:", accuracy)
            return accuracy  # We maximize accuracy

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=5)
        best_params = study.best_trial.params
        print("Best hyperparameters for", model_name, ":", best_params)
        
        best_overrides = {
            "learning_rate": best_params["learning_rate"],
            "per_device_train_batch_size": best_params["per_device_train_batch_size"],
            "num_train_epochs": best_params["num_train_epochs"],
            "weight_decay": best_params["weight_decay"],
            "gradient_accumulation_steps": best_params["gradient_accumulation_steps"],
            "optim": best_params["optim"],
            "output_dir": f"final_{model_name.replace('/', '_')}"
        }
        
        # -------------------------------
        # Final Training with Best Hyperparameters
        # -------------------------------
        print(f"Retraining final model for {model_name} with best hyperparameters...")
        ner_trainer_final = NERTrainer(model_name, tokenizer_aligner.tokenizer, tokenized_datasets, use_crf=True)
        trainer_final = ner_trainer_final.train(training_args_overrides=best_overrides)
        eval_results = trainer_final.evaluate()
        accuracy = eval_results.get("eval_accuracy", 0.0)
        print(f"Final eval accuracy for {model_name}: {accuracy}")
        results.append((model_name, json.dumps(best_overrides), accuracy))

        final_model_name = "nl_thesis_crf" + model_name.split("/")[-1]
        ner_trainer_final.model.save_pretrained(final_model_name)
        tokenizer_aligner.tokenizer.save_pretrained(final_model_name)
        
        try:
            ner_trainer_final.model.push_to_hub(final_model_name)
            tokenizer_aligner.tokenizer.push_to_hub(final_model_name)
            print(f"Model {final_model_name} pushed to Hugging Face Hub.")
        except Exception as e:
            print("Error pushing model to Hugging Face Hub:", e)
    
    log_results_to_excel(results)

if __name__ == "__main__":
    main()


NameError: name 'DatasetDict' is not defined