In [None]:
import pandas as pd
import sklearn
import os
from simpletransformers.ner import NERArgs, NERModel
import torch
import csv

In [None]:
# Run configuration shared by the cells below.

# Model family and the model name handed to NERModel.
model_type = "bert"
model_name = "dkleczek/bert-base-polish-cased-v1"

# Tag set handed to NERModel.
# NOTE(review): presumably "B" marks "no punctuation" and the rest are the
# punctuation marks to restore -- confirm against the training data.
labels = [
    "B",
    ":",
    ";",
    ",",
    ".",
    "-",
    "...",
    "?",
    "!",
]

# Input files (despite the *_dir suffix these are file paths, read as TSV).
train_data_dir = 'train.tsv'
eval_data_dir = 'train_a.tsv'

# Run counter embedded in the output location.
number = 1
# model_name contains a "/", so this relative path has a nested component.
output_dir = f"model_dir_{model_name}_{number}"

In [None]:
# Load the training and evaluation sets into DataFrames
def _read_tsv(path):
    """Read a tab-separated file with a header row into a DataFrame.

    Quoting is disabled (csv.QUOTE_NONE), so quote characters in the text are
    kept as ordinary characters. Malformed rows are skipped
    (on_bad_lines='skip') instead of aborting the whole load.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=0,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
        quotechar='"',
    )


try:
    train_data = _read_tsv(train_data_dir)
    evaluate_data = _read_tsv(eval_data_dir)
except Exception as e:
    print("Error reading the data:", e)
    # Re-raise so the notebook stops here. Swallowing the error would leave
    # train_data / evaluate_data undefined (or stale from an earlier run) and
    # the failure would only surface later, in the training cell.
    raise

In [None]:
# Configure the model arguments
model_args = NERArgs()

# Every option is applied as a plain attribute on the NERArgs instance.
# NOTE(review): the early_stopping_* options presumably only take effect when
# early stopping and evaluation during training are enabled -- confirm.
# NOTE(review): use_cuda is also passed to the NERModel constructor; confirm
# whether NERArgs itself reads this attribute.
_arg_overrides = {
    "model_type": model_type,
    "model_name": model_name,
    "early_stopping_metric": "f1_weighted",
    "early_stopping_metric_minimize": False,  # higher metric value is better
    "early_stopping_patience": 10,
    "train_batch_size": 12,
    "output_dir": output_dir,
    "best_model_dir": os.path.join(output_dir, "best_model"),
    "num_train_epochs": 100,
    "use_cuda": torch.cuda.is_available(),
}
for _option, _value in _arg_overrides.items():
    setattr(model_args, _option, _value)

In [None]:
# Instantiate the NER model from the configuration defined in earlier cells.
# The GPU is used only when torch reports that CUDA is available.
model = NERModel(
    model_type,
    model_name,
    labels=labels,
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

In [ ]:
# Train the model on train data, evaluating on the held-out set as it goes
def _f1_weighted(true_labels, predicted_labels):
    """Token-level weighted F1 over per-sentence label sequences.

    Args:
        true_labels: list of sentences, each a list of gold label strings.
        predicted_labels: list of sentences, each a list of predicted label
            strings, aligned position-by-position with ``true_labels``.

    Returns:
        The support-weighted average F1 across all labels, as a float.
    """
    # Function-scope import: a bare `import sklearn` does not guarantee that
    # the `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import f1_score

    # sklearn scores flat label sequences, so the per-sentence nesting is removed.
    flat_true = [tag for sentence in true_labels for tag in sentence]
    flat_pred = [tag for sentence in predicted_labels for tag in sentence]
    return f1_score(flat_true, flat_pred, average="weighted")


# Per the simpletransformers docs, the evaluation set is passed as `eval_data`
# and any other keyword argument is treated as an extra metric callable keyed
# by its name. The metric is therefore registered as `f1_weighted`, the name
# configured as the early-stopping metric. Early stopping and evaluation
# during training must be switched on explicitly for the patience / metric /
# best-model settings to have any effect.
model.train_model(
    train_data,
    output_dir=output_dir,
    eval_data=evaluate_data,
    args={"evaluate_during_training": True, "use_early_stopping": True},
    f1_weighted=_f1_weighted,
)

In [None]:
# Evaluate the model on test-A data
result, model_outputs, wrong_predictions = model.eval_model(evaluate_data)
print("Evaluation Results:", result)

# Persist one "metric: value" line per entry of the result mapping. The
# encoding is explicit (UTF-8), like the other file I/O in this notebook, so
# the output does not depend on the platform default.
with open(f"{model_type}_evaluation_result.txt", 'w', encoding='utf-8') as file:
    file.writelines(f'{key}: {value}\n' for key, value in result.items())

print("Results saved successfully!")

In [None]:
# Predictions on test-D data
input_path = 'in.tsv'

# Each non-blank line is tab-separated; the text to label is the second column.
to_predict = []
with open(input_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        stripped = line.strip()
        if not stripped:
            continue  # blank lines carry no text
        columns = stripped.split('\t')
        if len(columns) < 2:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(
                f"{input_path}:{line_number}: expected at least 2 "
                f"tab-separated columns, found {len(columns)}"
            )
        to_predict.append(columns[1])

predictions, model_outputs_pred = model.predict(to_predict, split_on_space=True)

# Output file is named after the model type, like the evaluation-results file.
output_path = f"{model_type}_predictions_output.txt"

with open(output_path, 'w', encoding='utf-8') as file:
    for text, prediction in zip(to_predict, predictions):
        file.write(f"Text: {text}\n")
        file.write("Predictions:\n")
        # zip() stops at the shorter sequence, so nothing is written for
        # tokens that have no prediction.
        # NOTE(review): simpletransformers documents predict() as returning one
        # {word: label} mapping per token, so each line written here is
        # presumably that mapping's repr rather than a bare label -- confirm
        # the intended output format.
        for _token, entry in zip(text.split(), prediction):
            file.write(f"{entry}\n")
        file.write("\n")

print("Predictions saved successfully!")