In [10]:
import sys
import os
from glob import glob
from datasets import Dataset, Features, Value, Sequence
from transformers import AutoTokenizer
import torch

In [11]:
# Make the parent of the current working directory importable so the
# project-local packages imported below (vlsp, utils) can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:  # guard keeps the cell idempotent on re-run
    sys.path.append(project_root)

try:
    from vlsp import read_and_process
    from utils.ner_utils import tokenize_and_align
except ImportError as e:
    # Re-raise with context rather than print + sys.exit(1): the traceback is
    # preserved and the kernel is not asked to exit from inside a cell.
    raise ImportError(
        f"{e} (looked under {project_root!r}; check the notebook's working directory)",
        name=e.name,
        path=e.path,
    ) from e

In [12]:
# Load the tokenizer from a local directory (relative to the working directory).
tokenizer = AutoTokenizer.from_pretrained("../tokenizer/trained_tokenizer/tokenizer-50k")

# Sub-directory of the formatted data to process; the same value is reused
# as the output folder name when the dataset is saved.
data_type = "test"

# Collect every .txt file below the selected directory, at any depth.
# Sorting makes the file order deterministic across runs.
data_dir = os.path.join("../vlsp/data/formatted_data/2016", data_type)
text_files = sorted(glob(os.path.join(data_dir, "**", "*.txt"), recursive=True))

# glob returns an empty list when nothing matches; fail here with a clear
# message instead of handing an empty file list to the reader.
if not text_files:
    raise FileNotFoundError(
        f"No .txt files found under {data_dir!r}; check data_type and the working directory."
    )

# NOTE(review): remove_tags=False presumably keeps the tag annotations that the
# later alignment step needs -- confirm against read_and_process.
df = read_and_process.read_format_and_return_df(text_files, remove_tags=False)

In [13]:
# Wrap the Spark DataFrame as a Hugging Face dataset and request torch-formatted output.
ds = Dataset.from_spark(df, split="train").with_format("torch")

# Mapping handed to the alignment function together with the tokenizer.
label_map = read_and_process.get_label_map()

# Tokenize and align in batches across 16 worker processes.
ds = ds.map(
    tokenize_and_align,
    batched=True,
    num_proc=16,
    fn_kwargs={"tokenizer": tokenizer, "label_map": label_map},
)
print(ds)

Map (num_proc=16): 100%|██████████| 2831/2831 [00:02<00:00, 1310.92 examples/s] 


Dataset({
    features: ['words', 'tags', 'input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 2831
})


In [14]:
# Drop the raw "words" and "tags" columns, leaving only the tokenized features.
# Only columns that are still present are removed, so re-running this cell is
# a no-op instead of an error about missing columns.
columns_to_drop = [name for name in ("words", "tags") if name in ds.column_names]
if columns_to_drop:
    ds = ds.remove_columns(columns_to_drop)

In [15]:
# Display the dataset summary (feature names and row count) to check the remaining columns.
ds

Dataset({
    features: ['input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 2831
})

In [16]:
# Persist the processed dataset in a folder named after the selected data_type.
output_dir = f"../data_all/data_extra/data_ner/{data_type}"
ds.save_to_disk(output_dir)

Saving the dataset (1/1 shards): 100%|██████████| 2831/2831 [00:00<00:00, 270695.88 examples/s]
