In [None]:
import os
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AdamW
from datasets import Dataset, ClassLabel
from seqeval.metrics import f1_score, precision_score, recall_score
from seqeval.scheme import IOB2
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Tokenize a document and align character-level entity spans to BIO token tags
def generate_bio_tags(text, labels, tokenizer):
    """Tokenize ``text`` and derive one BIO tag per token from character spans.

    Args:
        text: Raw document text.
        labels: Iterable of entity dicts. Each dict carries character offsets
            under ``"start"`` / ``"end"`` and a ``"labels"`` list whose first
            item is the entity type.
        tokenizer: Callable tokenizer that accepts
            ``return_offsets_mapping=True`` (assumed to be a Hugging Face
            fast tokenizer -- TODO confirm).

    Returns:
        A ``(input_ids, bio_tags)`` tuple of two equally long lists.
    """
    # Tokenize once; ids and offsets come from the same encoding.
    encoding = tokenizer(text, return_offsets_mapping=True)
    tokens = encoding["input_ids"]
    token_offsets = encoding["offset_mapping"]
    bio_tags = ["O"] * len(tokens)

    for entity in labels:
        start = entity["start"]
        end = entity["end"]
        entity_label = entity["labels"][0]

        inside = False  # becomes True once the entity's first token is tagged
        for i, (tok_start, tok_end) in enumerate(token_offsets):
            # Zero-width offsets belong to special tokens such as [CLS]/[SEP];
            # they never carry entity text and must stay "O".
            if tok_start == tok_end:
                continue
            if tok_start >= end:
                break  # past the entity; real-token offsets are increasing
            if tok_end <= start:
                continue  # token ends before the entity begins
            # Token overlaps the [start, end) character span.
            bio_tags[i] = ("I-" if inside else "B-") + entity_label
            inside = True

    return tokens, bio_tags


def chunk_tokens_and_labels(tokenized_text, bio_tags, chunk_size, tokenizer):
    """Split token ids and their BIO tags into fixed-size, padded chunks.

    A chunk boundary is moved backwards so that it does not fall in front of
    an ``I-`` tag (i.e. inside an entity). If an entity is longer than
    ``chunk_size`` that is impossible, so the chunk is cut at the hard limit
    instead; this guarantees forward progress.

    Args:
        tokenized_text: List of token ids.
        bio_tags: List of BIO tag strings, parallel to ``tokenized_text``.
        chunk_size: Number of tokens per returned chunk (must be positive).
        tokenizer: Object exposing ``pad_token_id``, used to pad short chunks.

    Returns:
        ``(text_chunks, label_chunks)``: two parallel lists of lists, each
        inner list exactly ``chunk_size`` long. Padding positions are labelled
        ``"O"``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text_chunks = []
    label_chunks = []

    padding_token = tokenizer.pad_token_id  # The token used for padding
    padding_label = "O"  # Default label for padding

    total = len(tokenized_text)
    start_idx = 0

    while start_idx < total:
        hard_end = min(start_idx + chunk_size, total)
        end_idx = hard_end

        # Back off so the next chunk does not begin in the middle of an entity.
        if end_idx < total:
            while end_idx > start_idx and bio_tags[end_idx].startswith("I-"):
                end_idx -= 1
            # The entity spans the whole window: backing off would yield an
            # empty chunk and never advance, so split at the hard limit.
            if end_idx == start_idx:
                end_idx = hard_end

        # Slices are copies, so padding below never mutates the inputs.
        chunk_text = tokenized_text[start_idx:end_idx]
        chunk_labels = bio_tags[start_idx:end_idx]

        # Pad chunks that are shorter than chunk_size.
        padding_length = chunk_size - len(chunk_text)
        if padding_length > 0:
            chunk_text += [padding_token] * padding_length
            chunk_labels += [padding_label] * padding_length

        text_chunks.append(chunk_text)
        label_chunks.append(chunk_labels)

        start_idx = end_idx

    return text_chunks, label_chunks


def prepare_sentences(json_data, tokenizer, base_dir="balance_context", chunk_size=128):
    """Build chunked training rows from annotated documents.

    Args:
        json_data: Iterable of dicts. ``item["text"]`` is a file path relative
            to ``base_dir``; ``item["label"]`` is the list of entity
            annotations passed on to ``generate_bio_tags``.
        tokenizer: Tokenizer forwarded to the tagging/chunking helpers and
            used to decode each chunk back to text.
        base_dir: Directory that contains the referenced text files.
        chunk_size: Number of tokens per chunk.

    Returns:
        A list of ``{"text": str, "labels": list[str]}`` dicts, one per chunk.
        Items without a ``"label"`` key are reported and skipped.
    """
    sentences = []

    for item in json_data:
        file_path = item["text"]  # The file path to the text file
        if "label" not in item:
            # Name the offending item so the skip can be traced in the data.
            print(f"Missing 'label' key for {file_path!r}; skipping")
            continue
        labels = item["label"]

        # Decode explicitly as UTF-8 so non-ASCII text does not depend on the
        # platform's default encoding.
        with open(f'{base_dir}/{file_path}', 'r', encoding='utf-8') as txt_file:
            text = txt_file.read()

        tokens, bio_tags = generate_bio_tags(text, labels, tokenizer)
        text_rows, label_rows = chunk_tokens_and_labels(tokens, bio_tags, chunk_size, tokenizer)

        # NOTE(review): decode() is applied to padded id chunks, so the text
        # presumably contains literal pad/special tokens -- confirm this is
        # what the later re-tokenization step expects.
        for text_row, label_row in zip(text_rows, label_rows):
            sentences.append({'text': tokenizer.decode(text_row), 'labels': label_row})

    return sentences


def encode_labels(labels, num_classes, device):
    """One-hot encode a 2-D tensor of class indices.

    Args:
        labels: Tensor indexed as ``labels[i, j]`` holding class ids in
            ``[0, num_classes)``.
        num_classes: Size of the one-hot (last) dimension of the result.
        device: Device on which the result is allocated.

    Returns:
        Float tensor of shape ``(labels.size(0), labels.size(1), num_classes)``
        with a single 1 per ``(i, j)`` position.
    """
    encoded_labels = torch.zeros(labels.size(0), labels.size(1), num_classes, device=device)
    # One vectorized scatter replaces the per-element Python double loop.
    index = labels.to(device=device, dtype=torch.long).unsqueeze(-1)
    encoded_labels.scatter_(2, index, 1.0)
    return encoded_labels


def compute_metrics(pred):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``
            arrays. Assumes ``predictions`` holds per-token class scores with
            the class axis last (e.g. ``(batch, seq_len, num_labels)``) --
            TODO confirm against the Trainer in use.

    Returns:
        Dict with ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``.
    """
    # Class axis -> predicted class id per token.
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids

    # Only collapse labels when they are one-hot encoded, i.e. carry one more
    # axis than the predicted ids. Arg-maxing plain id arrays would reduce
    # over the sequence axis and score meaningless position indices.
    if labels.ndim == preds.ndim + 1:
        labels = labels.argmax(-1)

    # Score every token individually.
    preds = preds.reshape(-1)
    labels = labels.reshape(-1)

    # -100 is the conventional "ignore" label for padded/special positions.
    keep = labels != -100
    preds = preds[keep]
    labels = labels[keep]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)

    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
# Label inventory for the NER task; a label's list position is its class id.
classmap = ClassLabel(
    num_classes=11,
    names=[
        'B-ASSET', 'I-ASSET', 'B-DEBT', 'I-DEBT', 'B-EQUITY', 'I-EQUITY',
        'B-DEPOSIT', 'I-DEPOSIT', 'B-MISC', 'I-MISC', 'O'
    ]
)
# id -> name, used to decode predictions.
id2label = {i: classmap.int2str(i) for i in range(classmap.num_classes)}
# name -> id. This must be the inverse of id2label, not a second copy of it.
label2id = {c: classmap.str2int(c) for c in classmap.names}


def model_init():
    """Return a freshly loaded token-classification model.

    The CUDA cache is emptied first. The checkpoint is then loaded with the
    module-level ``id2label`` / ``label2id`` mappings, and
    ``ignore_mismatched_sizes=True`` is passed to ``from_pretrained``.
    """
    torch.cuda.empty_cache()

    checkpoint = "KB/bert-base-swedish-cased-ner"
    load_options = dict(
        id2label=id2label,
        label2id=label2id,
        finetuning_task="ner",
        ignore_mismatched_sizes=True,
        return_dict=True,
    )
    model = AutoModelForTokenClassification.from_pretrained(checkpoint, **load_options)
    return model


# Load the tokenizer from the same checkpoint name that model_init() loads.
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")


# NOTE(review): no CustomTrainer class is defined in the visible code; the stock
# Trainer is used below, so one-hot encoded labels are not applied anywhere.



# Baseline training configuration (starting point; values are meant to be tuned)
training_args = TrainingArguments(
    output_dir="models/ner_model_output1",
    # --- optimisation ---
    learning_rate=2e-5,  # worth experimenting with
    weight_decay=0.01,  # raise if the model overfits
    warmup_steps=100,  # scale with the total number of steps
    num_train_epochs=5,  # adjust based on performance
    per_device_train_batch_size=32,  # try smaller or larger sizes
    # --- evaluation / checkpointing: once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): presumably only consulted when the strategies above are
    # "steps" -- confirm against the TrainingArguments documentation.
    eval_steps=300,
    save_steps=300,
    load_best_model_at_end=True,  # reload the best checkpoint (validation loss) after training
    # --- logging ---
    logging_steps=50,  # frequent logs help monitor training
    # lr_scheduler_type='linear'  # Apply a learning rate scheduler
)
# The class mapping (classmap) is defined earlier in this file.

# The id2label and label2id mappings are built from classmap there as well.


# The model is created by model_init(), which is handed to the Trainer.

# Handle for the CUDA device; no other visible code reads this variable.
device = torch.device("cuda")
# model.to(device)

# Load the annotation export. Decode explicitly as UTF-8 so non-ASCII text is
# read the same way regardless of the platform's default encoding.
with open('context_balance.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Turn every annotated document into fixed-size rows of text + BIO tags
sentences = prepare_sentences(json_data, tokenizer)

# Create a dataset from the collected sentences
ds = Dataset.from_pandas(pd.DataFrame(data=sentences))
ds.set_format("torch")
# 60 % train / 40 % test. No seed is passed, so the split differs between runs.
datasetDict = ds.train_test_split(test_size=0.4)

# Tokenize the text with proper truncation and padding
ds_train = datasetDict.map(lambda x: tokenizer(x["text"], truncation=True, max_length=128, padding='max_length'))
# Use the classmap to convert label names to class indices
ds_train = ds_train.map(lambda y: {"labels": classmap.str2int(y["labels"])})

from transformers import DataCollatorForTokenClassification

# Collator built for token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
import optuna

def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Returns a dict keyed by training-argument name. Each value is drawn with
    the matching ``trial.suggest_*`` call, in the order listed below.
    """
    batch_sizes = [8, 16, 32, 64, 128]
    schedulers = ["linear", "cosine", "polynomial"]

    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", batch_sizes
    )
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    space["warmup_steps"] = trial.suggest_int("warmup_steps", 0, 1000)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 1, 10)
    space["lr_scheduler_type"] = trial.suggest_categorical("lr_scheduler_type", schedulers)
    # Add more parameters as needed
    return space

# Set up the Trainer. model_init (rather than a model instance) lets the
# Trainer build a fresh model whenever it needs one.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=ds_train['train'],
    eval_dataset=ds_train['test'],
    tokenizer=tokenizer,
    # Use the token-classification collator that was built for this purpose;
    # it was previously created but never handed to the Trainer.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


import warnings

warnings.filterwarnings("ignore")
# Optional hyperparameter search (disabled). Enabling it defines best_trial.
#best_trial = trainer.hyperparameter_search(
#    direction="maximize",
#    backend="optuna",
#    hp_space=optuna_hp_space,
#    n_trials=20
#)

# Train the model
#trainer.train()




In [None]:
# best_trial is only assigned by the hyperparameter search, which is commented
# out above; guard the lookup so this cell does not fail with a NameError.
if "best_trial" in globals():
    print(best_trial.hyperparameters)
else:
    print("best_trial is not defined: run trainer.hyperparameter_search(...) first")

torch.cuda.empty_cache()

In [None]:


# Final training configuration with hand-picked hyperparameter values
training_args = TrainingArguments(
    output_dir="models/ner_model_output",
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5.799352882984339e-05,
    warmup_steps=100,
    weight_decay=0.16,  # increase if overfitting is observed
    # --- evaluation / checkpointing: once per epoch ---
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # --- logging ---
    logging_steps=50,  # frequent logs help monitor training
    #**best_trial.hyperparameters
)


# Set up the Trainer with the final training arguments
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=ds_train['train'],
    eval_dataset=ds_train['test'],
    tokenizer=tokenizer,
    # Use the token-classification collator that was built for this purpose;
    # it was previously created but never handed to the Trainer.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


import warnings

warnings.filterwarnings("ignore")

# Run the fine-tuning loop
trainer.train()




In [None]:
# Save the trainer's current model under models/random.
trainer.save_model("models/random")
