Set up the environment with SimpleTransformers:

In [None]:
!pip install tokenizers==0.9.4
!pip install simpletransformers --upgrade simpletransformers

Mount Google Drive and switch to the folder that holds the input files:

In [None]:
# Colab helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount the Drive under /content/drive (Colab may ask for authorization).
drive.mount('/content/drive')

# Here you need to configure the path where you saved the files in your Drive.
# The fold folders used later are given as relative paths, so they are
# resolved against the directory selected here.
%cd /content/drive/MyDrive/TCC

Configure the parameters for the run:

In [None]:
from simpletransformers.ner import NERModel, NERArgs
import time

# Keep these settings identical to the configuration used in the Jupyter notebook.

# Number of cross-validation folds.
n_folds = 10

# True  -> entity identification task (plain B/I/O tags)
# False -> entity classification task (tags carry the entity category)
just_identify = False

if just_identify:
  dst_folder = "folds_CoNLL_identification/"
  labels = ["B", "I", "O"]
else:
  dst_folder = "folds_CoNLL_classification/"

  # Every category contributes a "B-<category>" / "I-<category>" pair, in this
  # order; the outside tag "O" closes the list.
  entity_types = (
    "individuo",
    "local",
    "organizacao",
    "obra",
    "acontecimento",
    "tempo",
    "valor",
    "abstraccao",
    "outro",
  )
  labels = [
    f"{prefix}-{entity}"
    for entity in entity_types
    for prefix in ("B", "I")
  ]
  labels.append("O")

# Options object passed to each NERModel created for the folds.
model_args = NERArgs()

# Tag set of the selected task.
model_args.labels_list = labels

# Sequence length limit used by the model.
model_args.max_seq_length = 512

# Permit writing into an output directory that already exists, so the
# notebook can be re-run (and the folds can follow one another).
model_args.overwrite_output_dir = True

Run the task with cross-validation:

In [None]:
for i in range(n_folds):
  # Timer covering model creation, training and evaluation of this fold.
  start_time = time.time()

  print(f"------------------------------------- FOLD {i} --------------------------------------")

  # Train/test CoNLL files of this fold, inside the folder of the selected task.
  train_path = f"{dst_folder}CoNLL_train_fold_{i}_v2.txt"
  test_path = f"{dst_folder}CoNLL_test_fold_{i}_notype_v2.txt"

  # A new model is built from the pretrained checkpoint on every fold.
  # CUDA is enabled by default (recommended): configure the Colab runtime to
  # use a GPU, or pass "use_cuda=False" as an extra argument to NERModel.
  model = NERModel(
    'bert',
    'neuralmind/bert-base-portuguese-cased',
    args=model_args,
  )

  model.train_model(train_path)

  result, model_outputs, predictions = model.eval_model(test_path)

  end_time = time.time()

  print(f"--- {end_time - start_time} seconds ---")

  # Store the predictions of this fold as text, one file per fold.
  prediction_path = f"{dst_folder}prediction_fold_{i}.txt"
  with open(prediction_path, "w+", encoding="utf-8") as prediction_file:
    prediction_file.write(f"{predictions}\n")

  print("\n")