In [1]:
import json
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# LeNER-Br

In [16]:
# LeNER-Br dataset splits to convert; each name is interpolated into the
# raw ``.conll`` input path and the ``.parquet`` output path.
list_typs = [
    'dev',
    'test',
    'train',
]

In [None]:
def ler_conll(arquivo):
    """Read a whitespace-separated CoNLL-style file into parallel lists.

    Each non-blank line is split on whitespace; the first field is taken as
    the token and the second as its tag (any further fields are ignored).
    Lines with fewer than two fields are skipped without ending the current
    sentence. A blank line closes the current sentence, and consecutive blank
    lines do not produce empty sentences.

    Args:
        arquivo: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(tokens_list, tags_list)`` of two lists with one entry per
        sentence; each entry is the list of tokens (respectively tags) of
        that sentence, in file order.
    """
    sentencas = []  # finished sentences, each a list of (token, tag) pairs
    atual = []      # pairs of the sentence currently being read

    with open(arquivo, 'r', encoding='utf-8') as f:
        for bruta in f:
            campos = bruta.split()

            if not campos:
                # Blank line: sentence boundary (ignored when nothing is pending).
                if atual:
                    sentencas.append(atual)
                    atual = []
            elif len(campos) >= 2:
                atual.append((campos[0], campos[1]))

    # The file may end without a trailing blank line.
    if atual:
        sentencas.append(atual)

    tokens_list = [[token for token, _ in sentenca] for sentenca in sentencas]
    tags_list = [[tag for _, tag in sentenca] for sentenca in sentencas]
    return tokens_list, tags_list

In [None]:
def tags_para_numeros(tags_list, tag2id=None):
    """Convert string tags to integer ids and print the mapping used.

    When ``tag2id`` is not given, the mapping is built from the distinct tags
    found in ``tags_list``: tags are sorted and numbered from 0. Because the
    numbering depends on which tags are present, two inputs with different
    tag sets get different ids for the same tag; pass a previously built
    ``tag2id`` to encode several datasets consistently.

    Args:
        tags_list: List of sentences, each a list of tag strings.
        tag2id: Optional existing ``{tag: id}`` mapping to reuse. It is used
            as-is and is not modified.

    Returns:
        A tuple ``(tags_ids, tag2id)`` where ``tags_ids`` mirrors the shape of
        ``tags_list`` with every tag replaced by its id, and ``tag2id`` is the
        mapping that was applied.

    Raises:
        KeyError: If ``tag2id`` was supplied and a tag in ``tags_list`` is
            missing from it.
    """
    if tag2id is None:
        tags_unicos = {tag for sent in tags_list for tag in sent}
        tag2id = {tag: i for i, tag in enumerate(sorted(tags_unicos))}

    tags_ids = [[tag2id[tag] for tag in sent] for sent in tags_list]

    print("Mapeamento de tags:")
    for tag, id_num in sorted(tag2id.items()):
        print(f"  {tag} → {id_num}")

    return tags_ids, tag2id

In [None]:
def salvar_parquet(tokens_list, tags_ids, arquivo_saida):
    """Write sentences and their tag ids to a two-column Parquet file.

    The file has a ``tokens`` column (list of UTF-8 strings) and a
    ``ner_tags`` column (list of 32-bit integers), one row per sentence.

    Args:
        tokens_list: List of sentences, each a list of token strings.
        tags_ids: List of sentences, each a list of integer tag ids; it must
            have the same number of sentences as ``tokens_list``.
        arquivo_saida: Destination handed to ``pq.write_table``. When it is a
            ``str`` or ``Path``, missing parent directories are created first.
    """
    tokens_array = pa.array(tokens_list, type=pa.list_(pa.utf8()))
    tags_array = pa.array(tags_ids, type=pa.list_(pa.int32()))

    table = pa.Table.from_arrays(
        [tokens_array, tags_array],
        names=["tokens", "ner_tags"]
    )

    # write_table does not create directories; do it here so a fresh checkout
    # without the output tree does not fail. Non-path destinations (e.g. an
    # already open file object) are passed through untouched.
    if isinstance(arquivo_saida, (str, Path)):
        Path(arquivo_saida).parent.mkdir(parents=True, exist_ok=True)

    pq.write_table(table, arquivo_saida)

In [None]:
# Convert every LeNER-Br split to Parquet using ONE tag → id mapping.
#
# tags_para_numeros numbers the tags it is given, so calling it once per
# split would assign different ids to the same tag whenever a split lacks
# some tag. All splits are therefore read first, encoded together, and the
# encoded sentences are then sliced back into their original splits.
splits = {
    typ: ler_conll(f"../data/raw/LeNER-Br/{typ}_pt_LeNER-Br.conll")
    for typ in list_typs
}

todas_tags = [sent for _, tags_split in splits.values() for sent in tags_split]
todos_ids, tag2id = tags_para_numeros(todas_tags)

inicio = 0
for typ, (tokens_list, tags_list) in splits.items():
    # Encoded sentences come back in the order they were concatenated.
    fim = inicio + len(tags_list)
    tags_ids = todos_ids[inicio:fim]
    inicio = fim

    salvar_parquet(tokens_list, tags_ids, f"../data/parquet/LeNER-Br/{typ}_pt_LeNER-Br.parquet")

# HAREM

In [11]:
# HAREM collections to convert; each name is the stem of a ``.json`` file
# under ../data/raw/HAREM and of the ``.parquet`` file written for it.
typs_HAREM = [
    'mini_HAREM',
    'primeiro_HAREM',
    'segundo_HAREM',
]

In [12]:
# Convert each HAREM JSON file to Parquet: load the records into a DataFrame,
# drop the ``doc_id`` column, rename ``doc_text`` and write the result with
# all remaining columns unchanged.
for typ in typs_HAREM:
    with open(f'../data/raw/HAREM/{typ}.json', 'r', encoding='utf-8') as arquivo:
        data = json.load(arquivo)

    # NOTE(review): 'phase' may be a typo for 'phrase'; it is kept as-is
    # because readers of the Parquet files may depend on this column name.
    df = (
        pd.DataFrame(data)
        .drop(columns=['doc_id'])
        .rename(columns={'doc_text': 'phase'})
    )

    # to_parquet does not create directories, so make sure the target exists.
    saida = Path(f'../data/parquet/HAREM/{typ}.parquet')
    saida.parent.mkdir(parents=True, exist_ok=True)

    df.to_parquet(saida, index=False)