## Adaptando dataset ao Flair
O Flair recebe o dataset dividido em train, test e dev para treinar o modelo. Portanto inicialmente tratarei o dataset para adequá-lo ao Flair.
As células abaixo embaralham, dividem e salvam o dataset em arquivos txt.

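A implementação lê o dataset do arquivo `./ner_dataset.csv`, no diretório de trabalho do notebook. Para usar um arquivo em outro local (por exemplo, no Google Drive montado), atualize o caminho passado a `pd.read_csv` na célula de carregamento.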

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Load the token-level NER dataset; the CSV is read as Latin-1.
df = pd.read_csv('./ner_dataset.csv', encoding='Latin-1')
# Forward-fill: every missing cell takes the value of the nearest filled cell
# above it in the same column. DataFrame.ffill() is the non-deprecated spelling
# of fillna(method='ffill') and fills identically.
# NOTE(review): the fill applies to every column, so any token that read_csv
# parses as missing (e.g. a literal "NA") would silently inherit the previous
# row's value -- confirm the word/tag columns contain no such cells.
df = df.ffill()
# Move 'Sentence #' into the index as an extra level (append=True keeps the
# original row index) so groupby('Sentence #') can group on it while it stays
# out of the data columns.
df = df.set_index('Sentence #', append=True)

In [3]:
# Shuffles the dataset while keeping the rows of each sentence together.
def shuffle_preserving_sentences(df, seed=None):
  """Return the sentences of ``df`` in random order, one DataFrame per sentence.

  Rows are grouped by 'Sentence #'; only the order of the sentences is
  shuffled, the rows inside each sentence are returned as one group.

  Args:
    df: DataFrame in which 'Sentence #' can be used as a groupby key
      (a column or an index level name).
    seed: Optional seed for a private ``random.Random`` generator, making the
      shuffle reproducible. With the default ``None`` the module-level
      ``random`` generator is used, so existing callers behave as before.

  Returns:
    A list with one DataFrame per sentence, in shuffled order.
  """
  sentence_groupby = df.groupby('Sentence #')

  # Shuffle the group keys rather than the rows, so sentences stay intact.
  sentence_keys = list(sentence_groupby.groups.keys())
  rng = random if seed is None else random.Random(seed)
  rng.shuffle(sentence_keys)

  return [sentence_groupby.get_group(key) for key in sentence_keys]

In [4]:
def write_sentences_to_file(groupby, file):
  for _, group in groupby:
    group.to_csv(file, index=False, header=False, sep=' ', encoding='Latin-1', lineterminator='\n')
    file.write('\n')


In [5]:
def split_dataset(df, train_ratio, test_ratio):
  """Shuffle the sentences of ``df`` and split them into train/test/dev frames.

  The split is made on whole sentences: the first ``int(n * train_ratio)``
  shuffled sentences go to train, the next ``int(n * test_ratio)`` to test and
  all remaining sentences to dev.

  Args:
    df: DataFrame accepted by ``shuffle_preserving_sentences``.
    train_ratio: Fraction of sentences for the training split.
    test_ratio: Fraction of sentences for the test split.

  Returns:
    Tuple ``(train_data, test_data, dev_data)`` of DataFrames. A split that
    receives no sentence is returned as a zero-row slice of ``df``.

  Raises:
    ValueError: If a ratio is negative or the two ratios sum to more than 1.
  """
  # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
  if train_ratio < 0 or test_ratio < 0 or train_ratio + test_ratio > 1 + 1e-9:
    raise ValueError(
        'train_ratio and test_ratio must be non-negative and sum to at most 1, '
        f'got train_ratio={train_ratio}, test_ratio={test_ratio}')

  sentences_dfs = shuffle_preserving_sentences(df)

  total_size = len(sentences_dfs)
  train_size = int(total_size * train_ratio)
  test_size = int(total_size * test_ratio)
  test_end = train_size + test_size

  def _concat(sentence_frames):
    # pd.concat raises ValueError on an empty list, so an empty split is
    # returned as a zero-row slice that keeps df's columns and index levels.
    return pd.concat(sentence_frames) if sentence_frames else df.iloc[0:0]

  train_data = _concat(sentences_dfs[:train_size])
  test_data = _concat(sentences_dfs[train_size:test_end])
  dev_data = _concat(sentences_dfs[test_end:])

  return train_data, test_data, dev_data


In [6]:
# 80% of the sentences go to train, 10% to test; the remainder becomes dev.
train_ratio, test_ratio = 0.8, 0.1
train_df, test_df, dev_df = split_dataset(df, train_ratio, test_ratio)

# Group every split by sentence so each sentence is written as one block.
train_df_groupby = train_df.groupby('Sentence #')
test_df_groupby = test_df.groupby('Sentence #')
dev_df_groupby = dev_df.groupby('Sentence #')

# One output file per split, written with the same encoding as the source CSV.
for split_path, split_groupby in (
    ('./train.txt', train_df_groupby),
    ('./test.txt', test_df_groupby),
    ('./dev.txt', dev_df_groupby),
):
  with open(split_path, 'w', encoding='Latin-1') as f:
    write_sentences_to_file(split_groupby, f)

## Carregando dataset

In [7]:
pip install flair




In [8]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Folder holding the train/test/dev files written earlier in this notebook.
data_folder = './'
# Column index -> annotation layer for each space-separated line of those files.
columns = {0: 'text', 1: 'pos', 2: 'ner'}

corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
    encoding='Latin-1',
)

2023-07-28 17:49:01,669 Reading data from .
2023-07-28 17:49:01,669 Train: train.txt
2023-07-28 17:49:01,669 Dev: dev.txt
2023-07-28 17:49:01,670 Test: test.txt


In [10]:
# Report how many sentences ended up in each split of the corpus.
print(f"Train size: {len(corpus.train)}")
print(f"Test size: {len(corpus.test)}")
print(f"Dev size: {len(corpus.dev)}")

Train size: 38367
Test size: 4795
Dev size: 4797


In [11]:
# Show the first training sentence once per annotation layer (NER, then POS)
# as a sanity check that both columns were read.
first_train_sentence = corpus.train[0]
for layer in ('ner', 'pos'):
    print(first_train_sentence.to_tagged_string(layer))

Sentence[24]: "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country ." → ["London"/geo, "Iraq"/geo, "British"/gpe]
Sentence[24]: "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country ." → ["Thousands"/NNS, "of"/IN, "demonstrators"/NNS, "have"/VBP, "marched"/VBN, "through"/IN, "London"/NNP, "to"/TO, "protest"/VB, "the"/DT, "war"/NN, "in"/IN, "Iraq"/NNP, "and"/CC, "demand"/VB, "the"/DT, "withdrawal"/NN, "of"/IN, "British"/JJ, "troops"/NNS, "from"/IN, "that"/DT, "country"/NN, "."/.]


## Treinando modelo


In [12]:
from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [13]:
# Annotation layer the tagger will learn to predict (named entity recognition).
label_type = 'ner'

# Collect the label vocabulary for that layer from the corpus, without adding
# an unknown-label entry (add_unk=False).
label_dict = corpus.make_label_dictionary(
    label_type=label_type,
    add_unk=False,
)
print(f"\nLabels Dictionary: {label_dict}")

2023-07-28 17:49:20,315 Computing label dictionary. Progress:


38367it [00:00, 62611.81it/s]

2023-07-28 17:49:20,934 Dictionary created for label 'ner' with 8 values: geo (seen 29942 times), tim (seen 16305 times), org (seen 16180 times), per (seen 13672 times), gpe (seen 12756 times), art (seen 310 times), eve (seen 244 times), nat (seen 168 times)

Labels Dictionary: Dictionary with 8 tags: geo, tim, org, per, gpe, art, eve, nat





Modelo NER com transformers

In [14]:
# Word embeddings taken from the 'xlm-roberta-large' transformer, with
# fine_tune=True so the transformer is trained together with the tagger.
# NOTE(review): per Flair's documentation, layers="-1" should select only the
# last layer, subtoken_pooling="first" the first subtoken of each word, and
# use_context=True surrounding-sentence context -- confirm against the
# installed Flair version.
embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
                                       layers="-1",
                                       subtoken_pooling="first",
                                       fine_tune=True,
                                       use_context=True,
                                       )

# Sequence tagger over those embeddings for the 'ner' layer, using the label
# dictionary built from the corpus; CRF, RNN and embedding reprojection are
# all switched off.
# NOTE(review): hidden_size=256 presumably has no effect while use_rnn=False
# -- verify.
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type='ner',
                        use_crf=False,
                        use_rnn=False,
                        reproject_embeddings=False,
                        )

trainer = ModelTrainer(tagger, corpus)

# Fine-tune with a small learning rate and mini-batches of 4 sentences;
# the first argument is the path handed to the trainer for its output.
# The commented-out mini_batch_chunk_size option is left disabled.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt at
# about 1% progress, so do not assume a finished model exists at that path.
trainer.fine_tune('resources/taggers/sota-ner-flert',
                  learning_rate=5.0e-6,
                  mini_batch_size=4,
                  #mini_batch_chunk_size=1,
                  )

2023-07-28 17:49:52,904 SequenceTagger predicts: Dictionary with 33 tags: O, S-geo, B-geo, E-geo, I-geo, S-tim, B-tim, E-tim, I-tim, S-org, B-org, E-org, I-org, S-per, B-per, E-per, I-per, S-gpe, B-gpe, E-gpe, I-gpe, S-art, B-art, E-art, I-art, S-eve, B-eve, E-eve, I-eve, S-nat, B-nat, E-nat, I-nat
2023-07-28 17:49:52,911 ----------------------------------------------------------------------------------------------------
2023-07-28 17:49:52,912 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250003, 1024)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x XLMRobertaLayer(
    

  1%|          | 7/1199 [00:28<1:20:23,  4.05s/it]


KeyboardInterrupt: 

Modelo NER com Flair embeddings

In [None]:
# Embeddings to stack: 'glove' word embeddings followed by the 'news-forward'
# and 'news-backward' Flair embeddings, in that order.
embedding_types = [WordEmbeddings('glove')]
embedding_types += [
    FlairEmbeddings(model_name)
    for model_name in ('news-forward', 'news-backward')
]

# Note: this rebinds `embeddings`, the name also used by the transformer cell.
embeddings = StackedEmbeddings(embeddings=embedding_types)
