In [None]:
import json
from datasets import Dataset, ClassLabel
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, \
    AutoModelForTokenClassification, AutoConfig
import os



# Tokenizer belonging to the pretrained Swedish BERT NER checkpoint
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")

# Entity classes used by the annotations; 'O' marks tokens outside any entity.
# ClassLabel provides the name <-> integer conversion used below.
classmap = ClassLabel(
    names=['O', 'ASSET', 'DEBT', 'EQUITY', 'DEPOSIT', 'MISC'],
    num_classes=6,
)

# Lookup tables in both directions (label name -> id, id -> label name)
label2id = {name: classmap.str2int(name) for name in classmap.names}
id2label = {idx: name for name, idx in label2id.items()}


def label_data(jsondata, tokenizer, label2id, data_dir='test_set', encoding='utf-8'):
    """Tokenize every annotated document and project its span labels onto tokens.

    Args:
        jsondata: Iterable of annotation records. Each record has a 'text' key
            holding a file name relative to ``data_dir`` and, optionally, a
            'label' key with a list of spans. Each span provides 'start' and
            'end' character offsets and a 'labels' list, of which only the
            first entry is used.
        tokenizer: Callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, return_attention_mask=True)`` whose
            result exposes 'input_ids', 'offset_mapping' and 'attention_mask'.
        label2id: Mapping from label name to integer id; must contain 'O'.
        data_dir: Directory that holds the text files. Defaults to 'test_set',
            the location that was previously hard-coded.
        encoding: Encoding used to read the text files. Defaults to UTF-8 so
            the decoded text (and therefore the character offsets) does not
            depend on the platform's locale. NOTE(review): assumes the files
            are stored as UTF-8; pass another encoding if they are not.

    Returns:
        Tuple ``(token_list, label_list, attention_mask_list)`` with one entry
        per record: the input ids, the per-token label ids and the attention
        mask.

    Raises:
        KeyError: If a record lacks 'text', or a span label that overlaps a
            token is missing from ``label2id``.
        OSError: If a referenced text file cannot be opened.
    """
    token_list = []
    label_list = []
    attention_mask_list = []

    for item in jsondata:
        spans = item.get('label') or []  # a missing or empty label list means "no entities"
        document_path = os.path.join(data_dir, item['text'])
        with open(document_path, 'r', encoding=encoding) as file:
            text = file.read()

        encoded = tokenizer(text, return_offsets_mapping=True, return_attention_mask=True)
        tokens = encoded['input_ids']
        offset_mapping = encoded['offset_mapping']

        # Every token starts as 'O'; overlapping spans overwrite it below
        token_labels = [label2id['O']] * len(tokens)

        for span in spans:
            span_start = span['start']
            span_end = span['end']
            entity_label = span['labels'][0]
            for idx, (start, end) in enumerate(offset_mapping):
                # Half-open interval overlap between the token and the span
                if start < span_end and end > span_start:
                    token_labels[idx] = label2id[entity_label]

        token_list.append(tokens)
        label_list.append(token_labels)
        attention_mask_list.append(encoded['attention_mask'])

    return token_list, label_list, attention_mask_list


def process_json_file(filename, tokenizer, label2id):
    """Load an annotation JSON file and turn it into a model-ready DataFrame.

    Args:
        filename: Path to the JSON file containing the annotation records.
        tokenizer: Tokenizer forwarded unchanged to ``label_data``.
        label2id: Label-name -> id mapping forwarded unchanged to ``label_data``.

    Returns:
        pandas.DataFrame with one row per record and the columns
        'input_ids', 'labels' and 'attention_mask'.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII file names and text survive on every system.
    with open(filename, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    token_list, label_list, attention_mask_list = label_data(json_data, tokenizer, label2id)
    return pd.DataFrame({
        'input_ids': token_list,
        'labels': label_list,
        'attention_mask': attention_mask_list
    })


# Annotation export describing the training documents
json_file_path = 'context_balance.json'
processed_data_2 = process_json_file(json_file_path, tokenizer, label2id)

from collections import Counter

# Count how often each label id occurs across all documents (class balance check)
label_counter2 = Counter()
for labels in processed_data_2['labels']:
    label_counter2.update(labels)

for label, count in label_counter2.items():
    print(f"Label '{id2label[label]}' appears {count} times.")


# Build the dataset from the frame created above. The previous code referenced
# `processed_data`, which is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
dataset = Dataset.from_pandas(processed_data_2)
dataset.set_format('torch')
# A fixed seed keeps the 80/20 split identical between runs, so trials and
# re-runs are evaluated on the same held-out documents.
datasetDict = dataset.train_test_split(test_size=0.2, seed=42)

print("dataset prepared")





In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(p):
    """Compute macro precision/recall/F1 and accuracy over non-ignored tokens.

    Args:
        p: Prediction object exposing ``predictions`` (scores whose axis 2 is
            reduced with argmax to get predicted ids) and ``label_ids`` (true
            label ids, where -100 marks positions to ignore).

    Returns:
        dict with the keys 'precision', 'recall', 'f1' and 'accuracy'.
    """
    label_ids = np.asarray(p.label_ids)
    predicted_ids = np.argmax(p.predictions, axis=2)

    # Boolean mask selects, in row-major order, every position whose true label
    # is not the ignore index -100
    keep = label_ids != -100
    true_labels_flat = label_ids[keep].tolist()
    predictions_flat = predicted_ids[keep].tolist()

    macro = {"average": 'macro', "zero_division": 0}
    return {
        "precision": precision_score(true_labels_flat, predictions_flat, **macro),
        "recall": recall_score(true_labels_flat, predictions_flat, **macro),
        "f1": f1_score(true_labels_flat, predictions_flat, **macro),
        "accuracy": accuracy_score(true_labels_flat, predictions_flat),
    }


In [None]:
import torch
import gc
import warnings
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoConfig
from datasets import load_metric
import optuna

# Batch collator for token classification, built on the shared tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Start the search from a clean slate: drop cached CUDA allocations, then
# run a garbage-collection pass
torch.cuda.empty_cache()
gc.collect()



def objective(trial):
    """Optuna objective: fine-tune the token classifier with sampled hyperparameters.

    Samples learning rate, train batch size, epoch count, warmup steps, weight
    decay and scheduler type from ``trial``, trains a freshly initialised model
    on ``datasetDict['train']`` and evaluates it on ``datasetDict['test']``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The ``eval_f1`` entry of ``trainer.evaluate()``, i.e. the 'f1' value
        produced by ``compute_metrics``.
    """
    # memory_summary() requires a CUDA context and raises on CPU-only machines,
    # so only print it when a GPU is actually available.
    if torch.cuda.is_available():
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    # Sample hyperparameters using the trial object
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 5e-3, log=True)
    batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])
    num_train_epochs = trial.suggest_int('num_train_epochs', 1, 10)
    warmup_steps = trial.suggest_int('warmup_steps', 0, 500)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "polynomial"])

    def model_init():
        """Build a fresh model whose classification head matches ``label2id``."""
        torch.cuda.empty_cache()
        gc.collect()
        model_name = "KB/bert-base-swedish-cased-ner"

        # Point the checkpoint's config at this notebook's label set
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = len(label2id)
        config.id2label = id2label
        config.label2id = label2id

        # ignore_mismatched_sizes lets the checkpoint load even though its
        # classification head may have a different number of labels
        return AutoModelForTokenClassification.from_pretrained(
            model_name, config=config, ignore_mismatched_sizes=True
        )

    training_args = TrainingArguments(
        output_dir='./models/opti',
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        lr_scheduler_type=lr_scheduler_type,
        logging_dir='./logs',
        logging_steps=10
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=datasetDict['train'],
        eval_dataset=datasetDict['test'],
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    try:
        # Train and evaluate the model
        trainer.train()
        eval_results = trainer.evaluate()
        # Named eval_f1 so it does not shadow sklearn's f1_score function name
        eval_f1 = eval_results['eval_f1']
    finally:
        # Release the trainer and cached GPU memory even if training or
        # evaluation raised, so the next trial starts clean
        del trainer
        torch.cuda.empty_cache()
        gc.collect()

    return eval_f1

# Search for the hyperparameters that maximise the value returned by objective()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, gc_after_trial=True)

# Header and parameters on separate lines
print("Best trial:", study.best_trial.params, sep="\n")


In [None]:
all_trials = study.trials

# Rank trials by objective value, best (highest F1) first. Trials without a
# value (e.g. failed or interrupted ones) report None, which cannot be compared
# with floats and would make sorted() raise TypeError, so they are left out.
sorted_trials = sorted(
    (t for t in all_trials if t.value is not None),
    key=lambda t: t.value,
    reverse=True,
)

# Display sorted trials
for trial in sorted_trials:
    print(f"Trial {trial.number} F1-score: {trial.value}, Parameters: {trial.params}")

In [None]:
model_name = "KB/bert-base-swedish-cased-ner"

# Point the checkpoint's config at this notebook's label set
num_labels = len(label2id)
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.id2label = id2label
config.label2id = label2id

# ignore_mismatched_sizes lets the checkpoint load even though its
# classification head may have a different number of labels
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

# Hyperparameters are written out explicitly (rather than **study.best_params)
# so this cell can run without repeating the Optuna search.
training_args = TrainingArguments(
    output_dir='./models/nah',
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The search maximised eval_f1, so pick the best checkpoint by the same
    # metric; without this the Trainer falls back to the evaluation loss.
    metric_for_best_model="f1",
    learning_rate=8.118900895030543e-05,
    per_device_train_batch_size=8,
    num_train_epochs=9,
    warmup_steps=361,
    weight_decay=0.21913316520609985,
    lr_scheduler_type="cosine"
)

# Passing the tokenizer makes the Trainer save it next to the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=datasetDict['train'],
    eval_dataset=datasetDict['test'],
    compute_metrics=compute_metrics
)
# Start training
trainer.train()

In [None]:
# Persist the trained model to ./models/finished, the directory the later cells reload from
trainer.save_model("./models/finished")

In [None]:
model_path = './models/finished/'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Sample balance-sheet text used as a smoke test for the saved model.
# The opening triple quote was broken (`""\n`), which made this cell a syntax error.
test_line = """
B al ans räkn ing Not 2016-12-31 2016-04-30
EGET KAPITAL OCH SKULDER

Eget kapital

Bundet eget kapital

Aktiekapital 1080000 1000000
Reservfond 3300 3300
Summa bundet eget kapital 1083300 1003300
Fritt eget kapital

Överkursfond 11920000 0
Balanserat resultat 14725416 8713489
Årets resultat -21226857 6011927
Summa fritt eget kapital 5418559 14725416
Summa eget kapital 6501859 15 '728716
Långfristiga skulder |

Övriga skulder till kreditinstitut 0 355564
Skulder till koncernföretag 12900941 22243045
Övriga skulder 79264641 0
Summa långfristiga skulder 20865582 22598609
Kortfristiga skulder

Leverantörsskulder 484794 360235
Skatteskulder 0 101316
SMS-lån 10000 10000
Övriga skulder 186722 71386
Upplupna kostnader och förutbetalda intäkter 185566 65000
Summa kortfristiga skulder 857082 597937
SUMMA EGET KAPITAL OCH SKULDER 28224523 38925262

"""
# Truncate to the model's position limit so an over-long page cannot index
# past the position embeddings.
tokenized_input = tokenizer(
    test_line,
    return_tensors="pt",
    truncation=True,
    max_length=model.config.max_position_embeddings,
)

# Prediction: highest-scoring class per token, without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_input)
    predictions = outputs.logits.argmax(dim=-1)

# Convert indices to labels using the mapping stored with the reloaded model,
# so this cell does not depend on the training cells having been run
predicted_label_strings = [model.config.id2label[idx] for idx in predictions[0].tolist()]

# Print tokens with their labels
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])
print("Predicted NER tags:")
for token, label in zip(tokens, predicted_label_strings):
    print(f"{token}: {label}")


In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

def compute_metrics(p):
    """Return the token-level confusion matrix over non-ignored positions.

    NOTE: this definition replaces the earlier ``compute_metrics`` (precision /
    recall / F1) for every cell executed after it.

    Args:
        p: Prediction object exposing ``predictions`` (scores whose last axis
            enumerates the classes) and ``label_ids`` (true label ids, where
            -100 marks positions to ignore).

    Returns:
        dict with the single key 'confusion_matrix'. Rows are true classes and
        columns predicted classes, both ordered by class id 0..num_classes-1.
    """
    scores = np.asarray(p.predictions)
    label_ids = np.asarray(p.label_ids)
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose true label is not the ignore index -100
    keep = label_ids != -100
    true_labels_flat = label_ids[keep].tolist()
    predictions_flat = predicted_ids[keep].tolist()

    # Pin the label set to all class ids. Without it the matrix only contains
    # classes that happen to occur in the data, so its shape and the meaning
    # of each row/column would change from one dataset to the next.
    class_ids = list(range(scores.shape[-1]))
    cm = confusion_matrix(true_labels_flat, predictions_flat, labels=class_ids)

    return {
        "confusion_matrix": cm
    }


In [None]:
test_json_file_path = 'modified-test-set-new.json'
model_path = './models/finished/'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Own name for the test frame so the training frame is not silently overwritten
processed_test_data = process_json_file(test_json_file_path, tokenizer, label2id)

print(tokenizer.pad_token_id)
test_dataset = Dataset.from_pandas(processed_test_data)

# Format the dataset that is actually evaluated; the previous code called
# set_format on `dataset` (the training data) by mistake.
test_dataset.set_format('torch')

# Collator rebuilt from the reloaded tokenizer so this cell only depends on the
# saved model directory
data_collator = DataCollatorForTokenClassification(tokenizer)

# Now prompt the model to do predictions on the whole test dataset and compute
# metrics with the compute_metrics() currently in scope
trainer = Trainer(model=model,
                  compute_metrics=compute_metrics,
                  data_collator=data_collator)
predictions = trainer.predict(test_dataset)
#predictions = trainer.predict(datasetDict['train'])
#predictions = trainer.predict(datasetDict['test'])
metrics = predictions.metrics






In [None]:
# Show every metric returned by trainer.predict() for the test set
print(metrics)

In [None]:
# Show only the confusion matrix entry of the test metrics
print(metrics['test_confusion_matrix'])

In [None]:
# Store the confusion matrix on disk as confusion_matrix.npy
np.save('confusion_matrix.npy', metrics['test_confusion_matrix'])