In [None]:
from typing import Callable

import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [5]:
# Load the training split from parquet using the fastparquet engine.
prqt = pd.read_parquet(
    '../data/parquet/MultL/train_pt_multilingual.parquet',
    engine='fastparquet',
)

In [8]:
def map_tag(tag_id):
    """Translate a numeric tag id into a coarse entity label.

    Ids 1-2 map to "PER", 3-4 to "ORG", 5-6 to "LOC" and 7-8 to "MIST";
    every other value maps to "O".
    """
    # NOTE(review): "MIST" may be a typo of "MISC" -- confirm before changing,
    # since the label string ends up in the generated training data.
    label_by_ids = (
        (range(1, 3), "PER"),
        (range(3, 5), "ORG"),
        (range(5, 7), "LOC"),
        (range(7, 9), "MIST"),
    )
    for ids, label in label_by_ids:
        if tag_id in ids:
            return label
    return "O"

def map_tag_MultL(tag_id):
    """Return the entity label for a tag id of the multilingual dataset.

    Ids 1-2 -> "PER", 3-4 -> "ORG", 5-6 -> "LOC", 7-8 -> "MIST";
    anything else -> "O".
    """
    # NOTE(review): "MIST" may be a typo of "MISC" -- confirm before changing.
    groups = (
        (range(1, 3), "PER"),
        (range(3, 5), "ORG"),
        (range(5, 7), "LOC"),
        (range(7, 9), "MIST"),
    )
    # First matching group wins; "O" is the fallback when none matches.
    return next((label for ids, label in groups if tag_id in ids), "O")

def map_tag_LeNER(tag_id: int) -> str:
    """Return the coarse entity label for a LeNER tag id.

    Ids 1-2 -> "ORG", 3-4 -> "PER", 7-8 -> "LOC", 5-6 and 9-12 -> "MIST";
    0 and any other value -> "O".
    """
    if tag_id == 0:
        return "O"

    # NOTE(review): "MIST" may be a typo of "MISC" -- confirm before changing.
    label_ranges = (
        ("ORG", (range(1, 3),)),
        ("PER", (range(3, 5),)),
        ("LOC", (range(7, 9),)),
        ("MIST", (range(5, 7), range(9, 13))),
    )
    for label, ranges in label_ranges:
        if any(tag_id in ids for ids in ranges):
            return label
    return "O"

def map_tag_HAREM(tag_id: int) -> str:
    """Return the coarse entity label for a HAREM tag id.

    Ids 1-2 -> "PER", 3-4 -> "ORG", 5-6 -> "LOC", 7-20 -> "MIST";
    0 and any other value -> "O".
    """
    if tag_id == 0:
        return "O"

    # NOTE(review): "MIST" may be a typo of "MISC" -- confirm before changing.
    buckets = (
        (range(1, 3), "PER"),
        (range(3, 5), "ORG"),
        (range(5, 7), "LOC"),
        (range(7, 21), "MIST"),
    )
    return next((label for ids, label in buckets if tag_id in ids), "O")

In [None]:
def convert_tokens_to_spacy_format(tokens, tags):
    """Build a ``(text, {"entities": [...]})`` example using ``map_tag``.

    Thin wrapper around ``convert_tokens_to_spacy_format_by_dataset`` so the
    token-joining and offset logic lives in a single place instead of being
    duplicated line for line.

    Args:
        tokens: Sequence of token strings.
        tags: Sequence of numeric tag ids, parallel to ``tokens``.

    Returns:
        Whatever ``convert_tokens_to_spacy_format_by_dataset`` returns for
        these tokens and tags with ``map_tag`` as the mapping.
    """
    return convert_tokens_to_spacy_format_by_dataset(tokens, tags, map_tag)

def convert_tokens_to_spacy_format_by_dataset(tokens, tags, mapping: Callable[[int], str]):
    """Build a ``(text, {"entities": [...]})`` example from tokens and tag ids.

    Tokens are joined with single spaces. Every token whose mapped label is
    not "O" contributes one ``(start, end, label)`` character span.

    Args:
        tokens: Sequence of token strings.
        tags: Sequence of tag ids, parallel to ``tokens``. If the lengths
            differ, the surplus items of the longer sequence are ignored.
        mapping: Callable that turns a tag id into a label string; the label
            "O" means "not an entity".

    Returns:
        A tuple ``(text, {"entities": [(start, end, label), ...]})`` where
        ``text[start:end]`` is exactly the tagged token.
    """
    # NOTE(review): each tagged token becomes its own span; consecutive tokens
    # with the same label are not merged -- confirm this is intended.
    pieces = []
    entities = []
    cursor = 0  # character offset at which the next token will start

    for token, tag_id in zip(tokens, tags):
        label = mapping(tag_id)

        if pieces:
            cursor += 1  # account for the single-space separator
        start = cursor
        cursor += len(token)
        pieces.append(token)

        if label != "O":
            entities.append((start, cursor, label))

    # The text is deliberately not stripped: trimming leading whitespace would
    # shift the text under the offsets recorded above and misalign every span.
    return (" ".join(pieces), {"entities": entities})

In [11]:
# Convert every row to a (text, annotations) example. The two columns are
# iterated directly instead of using iterrows(), which builds a Series per
# row and is much slower on large frames.
data_spacy = [
    convert_tokens_to_spacy_format_by_dataset(tokens, tags, map_tag_MultL)
    for tokens, tags in zip(prqt["tokens"], prqt["ner_tags"])
]

In [13]:
# NOTE(review): looks like a leftover scratch cell (list concatenation check);
# `a` and `b` are not referenced elsewhere in the visible notebook.
a, b = [1], [2]

a + b

[1, 2]

In [22]:
# Sequential (unshuffled) split: the first 90% of the examples go to
# training and the remaining 10% to dev.
n = len(data_spacy)
split = int(n * 0.9)
train_data, dev_data = data_spacy[:split], data_spacy[split:]

In [23]:
def create_docbin(data, file_path, lang: str = "pt"):
    """Serialize annotated examples into a spaCy ``DocBin`` file.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations["entities"]`` is an iterable of
            ``(start, end, label)`` character spans.
        file_path: Destination path handed to ``DocBin.to_disk``.
        lang: Language code for the blank pipeline used to tokenize the
            text. Defaults to "pt".

    Side effects:
        Writes the DocBin to ``file_path`` and prints progress messages.
        Prints a warning with the number of entity spans that had to be
        dropped, if any.
    """
    nlp = spacy.blank(lang)
    db = DocBin()
    skipped = 0  # spans that could not be turned into a Span object

    print(f"Gerando dados para {file_path}...")
    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            # char_span gives None when the offsets cannot be mapped onto
            # the doc's tokens; such spans are dropped but counted.
            span = doc.char_span(start, end, label=label)
            if span is None:
                skipped += 1
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

    db.to_disk(file_path)
    if skipped:
        # Report dropped spans instead of discarding them silently.
        print(f"Aviso: {skipped} entidade(s) ignorada(s) por não alinharem com os tokens")
    print(f"Dados salvos em '{file_path}'")

In [24]:
# Write both splits to disk. The split lists are left intact so this cell can
# be re-run without overwriting the files with empty DocBins; clearing them
# would not release the examples anyway, since data_spacy still references
# every one of them.
create_docbin(train_data, "../spacy/train.spacy")
create_docbin(dev_data, "../spacy/dev.spacy")



Gerando dados para ../spacy/train.spacy...


100%|██████████| 217512/217512 [00:14<00:00, 15138.02it/s]


Dados salvos em '../spacy/train.spacy'
Gerando dados para ../spacy/dev.spacy...


100%|██████████| 24168/24168 [00:01<00:00, 13080.19it/s]


Dados salvos em '../spacy/dev.spacy'
