In [2]:
import pandas as pd
from enum import Enum
import re
import random
import spacy
from spacy.util import minibatch
from spacy.training import Example
from spacy.scorer import Scorer
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [3]:
prqt = pd.read_parquet('./data/parquet/MultL/test_multilingual.parquet', engine='fastparquet')

In [4]:
def map_tag(tag_id):
    if tag_id in range(1, 3):
        return "PER"
    elif tag_id in range(3, 5):
        return "ORG"
    elif tag_id in range(5, 7):
        return "LOC"
    elif tag_id in range(7, 9):
        return "MIST"
    else:
        return "O"

In [5]:
def convert_tokens_to_spacy_format(tokens, tags):
    text = ""
    entities = []
    token_start = 0

    for i, (token, tag_id) in enumerate(zip(tokens, tags)):
        tag = map_tag(tag_id)

        # Calcula os offsets
        start = len(text)
        text += token
        end = len(text)

        if i < len(tokens) - 1:
            text += " "

        if tag != "O":
            # Se for entidade, adiciona aos spans
            entities.append((start, end, tag))

    return (text.strip(), {"entities": entities})

In [6]:
data_spacy = [
    convert_tokens_to_spacy_format(row["tokens"], row["ner_tags"])
    for _, row in prqt.iterrows()
]

for item in data_spacy[:5]:
    print(item)


("On this occasion he failed to gain the support of the South Wales Miners ' Federation and had to stand down .", {'entities': [(54, 59, 'ORG'), (60, 65, 'ORG'), (66, 72, 'ORG'), (73, 74, 'ORG'), (75, 85, 'ORG')]})
("On both these occasions he was backed by the South Wales Miners ' Federation , but he was not successful .", {'entities': [(45, 50, 'ORG'), (51, 56, 'ORG'), (57, 63, 'ORG'), (64, 65, 'ORG'), (66, 76, 'ORG')]})
('He also appeared as himself in the 1996 film " Eddie " .', {'entities': [(47, 52, 'MIST')]})
('The Colorado Rockies were created as an expansion franchise in 1993 and Coors Field opened in 1995 .', {'entities': [(4, 12, 'ORG'), (13, 20, 'ORG'), (72, 77, 'LOC'), (78, 83, 'LOC')]})
('He kept busy recording demo tapes at his home and working various jobs , including a position as a contracted security guard at the La Valencia Hotel in La Jolla .', {'entities': [(132, 134, 'LOC'), (135, 143, 'LOC'), (144, 149, 'LOC'), (153, 155, 'LOC'), (156, 161, 'LOC')]})


In [7]:
TRAIN_DATA = data_spacy[:int(len(data_spacy) * 0.90)]
DEV_DATA = data_spacy[:int(len(data_spacy) * 0.10)]

In [8]:
def create_docbin(data, file_path):
    nlp = spacy.blank("pt")
    db = DocBin()

    print(f"Gerando dados para {file_path}...")
    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is not None:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

    db.to_disk(file_path)
    print(f"Dados salvos em '{file_path}'")

In [9]:
create_docbin(TRAIN_DATA, "train_data.spacy")
create_docbin(DEV_DATA, "dev_data.spacy")

Gerando dados para train_data.spacy...


100%|██████████| 28237/28237 [00:03<00:00, 9265.63it/s] 


Dados salvos em 'train_data.spacy'
Gerando dados para dev_data.spacy...


100%|██████████| 3137/3137 [00:00<00:00, 11197.18it/s]

Dados salvos em 'dev_data.spacy'





In [None]:
import spacy

nlp = spacy.load("./model/model-best")

doc = nlp("Nossa Senhora Aparecida")

for ent in doc.ents:
    print(ent.text, ent.label_)

if "textcat" in nlp.pipe_names:
    print(doc.cats)


Nossa LOC
Senhora LOC
Aparecida LOC


In [None]:
!python -m spacy train ./config.cfg --output ./model --paths.train ./traning/train_data.spacy --paths.dev ./traning/dev_data.spacy