In [45]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [46]:
# MultL corpus: the training split is loaded as-is, while the test and
# validation splits are stacked into a single development frame.
prqt_MultL_train = pd.read_parquet('../data/parquet/MultL/train_pt_MultL.parquet')

prqt_MultL_test = pd.read_parquet('../data/parquet/MultL/test_pt_MultL.parquet')
prqt_MultL_val = pd.read_parquet('../data/parquet/MultL/val_pt_MultL.parquet')
prqt_MultL_dev = pd.concat(
    (prqt_MultL_test, prqt_MultL_val),
    ignore_index=True,  # rebuild a clean 0..n-1 index for the merged frame
)

In [47]:
# LeNER-Br corpus: the training split is loaded as-is, while the test and
# dev splits are stacked into a single development frame.
prqt_LeNER_train = pd.read_parquet('../data/parquet/LeNER-Br/train_pt_LeNER-Br.parquet')

prqt_LeNER_test = pd.read_parquet('../data/parquet/LeNER-Br/test_pt_LeNER-Br.parquet')
prqt_LeNER_val = pd.read_parquet('../data/parquet/LeNER-Br/dev_pt_LeNER-Br.parquet')
prqt_LeNER_dev = pd.concat(
    (prqt_LeNER_test, prqt_LeNER_val),
    ignore_index=True,  # rebuild a clean 0..n-1 index for the merged frame
)

In [48]:
def get_meaning_label_HAREM(text):
    """Translate a HAREM category name into the label set used in this notebook.

    Args:
        text: Category name as stored in the HAREM annotations.

    Returns:
        'LOC' for 'LOCAL', 'ORG' for 'ORGANIZACAO', 'PER' for 'PESSOA',
        and 'MIST' for every other value.
    """
    known_labels = {
        'LOCAL': 'LOC',
        'ORGANIZACAO': 'ORG',
        'PESSOA': 'PER',
    }
    # Anything outside the three known categories falls into the catch-all.
    return known_labels.get(text, 'MIST')
    
def list_to_entity_dict_HAREM(list_items: list):
    """Turn HAREM annotation records into ``(start, end, label)`` tuples.

    Args:
        list_items: Iterable of mappings, each providing the keys
            'start_offset', 'end_offset' and 'label'.

    Returns:
        A list with one ``(start_offset, end_offset, mapped_label)`` tuple per
        input record, in the same order, where the label has been passed
        through ``get_meaning_label_HAREM``.
    """
    return [
        (
            record['start_offset'],
            record['end_offset'],
            get_meaning_label_HAREM(record['label']),
        )
        for record in list_items
    ]

In [49]:
# Load the three HAREM parquet files.
mini_HAREM = pd.read_parquet('../data/parquet/HAREM/mini_HAREM.parquet')
first_HAREM_val = pd.read_parquet('../data/parquet/HAREM/primeiro_HAREM.parquet')
second_HAREM_train = pd.read_parquet('../data/parquet/HAREM/segundo_HAREM.parquet')

# Stack them into one frame and rewrite the 'entities' column as lists of
# (start, end, label) tuples.
all_HAREM = pd.concat(
    (mini_HAREM, first_HAREM_val, second_HAREM_train),
    ignore_index=True,
).assign(entities=lambda frame: frame['entities'].apply(list_to_entity_dict_HAREM))

# Hold out a reproducible 10% sample for development; the rest is training data.
dev_HAREM = all_HAREM.sample(frac=0.1, random_state=42)
train_HAREM = all_HAREM.drop(index=dev_HAREM.index)

In [50]:
def map_tag_MultL(tag_id):
    """Map a MultL numeric tag id to the label set used in this notebook.

    Ids 1-2 -> "PER", 3-4 -> "ORG", 5-6 -> "LOC", 7-8 -> "MIST"; every other
    value maps to "O".
    NOTE(review): each pair of ids presumably holds the begin/inside variants
    of one class -- confirm against the dataset's tag list.
    """
    label_by_ids = (
        (range(1, 3), "PER"),
        (range(3, 5), "ORG"),
        (range(5, 7), "LOC"),
        (range(7, 9), "MIST"),
    )
    for ids, label in label_by_ids:
        if tag_id in ids:
            return label
    return "O"

def map_tag_LeNER(tag_id):
    """Map a LeNER-Br numeric tag id to the label set used in this notebook.

    Id 0 -> "O", 1-2 -> "ORG", 3-4 -> "PER", 7-8 -> "LOC", 5-6 and 9-12 ->
    "MIST"; every other value maps to "O".
    NOTE(review): each pair of ids presumably holds the begin/inside variants
    of one class -- confirm against the dataset's tag list.
    """
    if tag_id == 0:
        return "O"

    label_by_ids = (
        (range(1, 3), "ORG"),
        (range(3, 5), "PER"),
        (range(7, 9), "LOC"),
        # Classes without a dedicated label are folded into the catch-all.
        (range(5, 7), "MIST"),
        (range(9, 13), "MIST"),
    )
    for ids, label in label_by_ids:
        if tag_id in ids:
            return label
    return "O"
    
def convert_tokens_to_spacy_format_by_dataset(tokens, tags, mapping: callable):
    """Build a spaCy-style ``(text, annotations)`` pair from tokens and tag ids.

    The tokens are joined with single spaces and the result is stripped of
    surrounding whitespace. Every token whose mapped label is not "O" yields
    one ``(start, end, label)`` character span.

    Args:
        tokens: Sequence of token strings.
        tags: Sequence of tag ids, paired with ``tokens`` by position (pairing
            stops at the shorter of the two).
        mapping: Callable turning one tag id into a label string; "O" means
            "not an entity".

    Returns:
        ``(text, {"entities": [(start, end, label), ...]})`` where the offsets
        index into the returned (stripped) ``text``.

    NOTE(review): each tagged token becomes its own span, so a multi-token
    entity is emitted as several adjacent spans rather than a single one.
    """
    pieces = []
    raw_spans = []
    cursor = 0  # length of the joined text built so far

    for token, tag_id in zip(tokens, tags):
        label = mapping(tag_id)

        if pieces:
            cursor += 1  # account for the single-space separator
        start = cursor
        cursor += len(token)
        pieces.append(token)

        if label != "O":
            raw_spans.append((start, cursor, label))

    joined = " ".join(pieces)
    text = joined.strip()
    # Stripping removes leading whitespace (e.g. an empty or blank first
    # token), so offsets computed on the joined text must be shifted left by
    # the same amount to keep pointing at the right characters.
    shift = len(joined) - len(joined.lstrip())

    entities = []
    for start, end, label in raw_spans:
        start = max(start - shift, 0)
        end = min(end - shift, len(text))
        # Spans that collapse to nothing (empty/blank tokens) carry no text.
        if start < end:
            entities.append((start, end, label))

    return (text, {"entities": entities})

In [51]:
# Convert every MultL row (token list + tag-id list) into a
# (text, {"entities": [...]}) example, for the dev and train frames.
data_spacy_dev_MultL = [
    convert_tokens_to_spacy_format_by_dataset(tokens, tags, map_tag_MultL)
    for tokens, tags in zip(prqt_MultL_dev["tokens"], prqt_MultL_dev["ner_tags"])
]

data_spacy_train_MultL = [
    convert_tokens_to_spacy_format_by_dataset(tokens, tags, map_tag_MultL)
    for tokens, tags in zip(prqt_MultL_train["tokens"], prqt_MultL_train["ner_tags"])
]

In [52]:
# Convert every LeNER-Br row (token list + tag-id list) into a
# (text, {"entities": [...]}) example, for the dev and train frames.
data_spacy_dev_LeNER = [
    convert_tokens_to_spacy_format_by_dataset(tokens, tags, map_tag_LeNER)
    for tokens, tags in zip(prqt_LeNER_dev["tokens"], prqt_LeNER_dev["ner_tags"])
]

data_spacy_train_LeNER = [
    convert_tokens_to_spacy_format_by_dataset(tokens, tags, map_tag_LeNER)
    for tokens, tags in zip(prqt_LeNER_train["tokens"], prqt_LeNER_train["ner_tags"])
]

In [53]:
# HAREM rows already carry the raw text (column "phase") and character-offset
# spans (column "entities"), so they only need to be paired up.
data_spacy_dev_HAREM = [
    (sentence, {"entities": spans})
    for sentence, spans in zip(dev_HAREM["phase"], dev_HAREM["entities"])
]

data_spacy_train_HAREM = [
    (sentence, {"entities": spans})
    for sentence, spans in zip(train_HAREM["phase"], train_HAREM["entities"])
]

In [54]:
# Merge the three corpora (HAREM, LeNER-Br, MultL -- in that order) into one
# training list and one development list.
train_data = [
    *data_spacy_train_HAREM,
    *data_spacy_train_LeNER,
    *data_spacy_train_MultL,
]
dev_data = [
    *data_spacy_dev_HAREM,
    *data_spacy_dev_LeNER,
    *data_spacy_dev_MultL,
]

In [55]:
def create_docbin(data, file_path):
    """Serialize annotated examples into a spaCy ``DocBin`` file.

    Args:
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            pairs, with character offsets into ``text``.
        file_path: Destination path handed to ``DocBin.to_disk``.

    Spans whose offsets do not line up with the tokenizer's token boundaries
    cannot be turned into a ``Span`` (``Doc.char_span`` returns ``None``) and
    are skipped. Overlapping spans are reduced with
    ``spacy.util.filter_spans`` because ``doc.ents`` does not accept
    overlapping entities. Both kinds of loss are counted and printed so they
    do not go unnoticed.
    """
    nlp = spacy.blank("pt")
    db = DocBin()
    misaligned = 0
    overlapping = 0

    print(f"Gerando dados para {file_path}...")
    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # Offsets fall inside a token (or outside the text).
                misaligned += 1
            else:
                spans.append(span)

        # Keep a non-overlapping subset so the assignment below cannot fail.
        kept = spacy.util.filter_spans(spans)
        overlapping += len(spans) - len(kept)
        doc.ents = kept
        db.add(doc)

    db.to_disk(file_path)
    if misaligned or overlapping:
        print(
            f"Entidades descartadas: {misaligned} desalinhadas, "
            f"{overlapping} sobrepostas"
        )
    print(f"Dados salvos em '{file_path}'")

In [56]:
# The lists are emptied after export to release memory, so running this cell
# a second time would overwrite both files with empty DocBins. Refuse to do
# that: the earlier cells must be re-run first to rebuild the data.
if not train_data or not dev_data:
    raise RuntimeError(
        "train_data/dev_data estão vazios; execute novamente as células "
        "anteriores antes de exportar."
    )

create_docbin(train_data, "../spacy/train.spacy")
train_data.clear()
create_docbin(dev_data, "../spacy/dev.spacy")
dev_data.clear()



Gerando dados para ../spacy/train.spacy...


100%|██████████| 88734/88734 [00:08<00:00, 10158.21it/s]


Dados salvos em '../spacy/train.spacy'
Gerando dados para ../spacy/dev.spacy...


100%|██████████| 22834/22834 [00:02<00:00, 9394.67it/s] 


Dados salvos em '../spacy/dev.spacy'
