In [20]:
import pandas as pd
from dataset import read_ner_file
import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset
import numpy as np 
import os
import matplotlib.pyplot as plt 
from transformers import AdamW
from tqdm import tqdm 

from datasets import Dataset as trDataset
from datasets import load_dataset

from transformers import AutoTokenizer

In [21]:
# Switch off the `tokenizers` library's own parallelism through its environment
# flag; this has to be in place before the tokenizer is used.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tag vocabulary: a B-/I- pair per entity type plus the single 'O' tag,
# numbered consecutively from 0 to 20. This is the only hand-written table.
LABEL_2_ID = {
    'B-PATIENT_ID': 0,
    'I-PATIENT_ID': 1,
    'B-NAME': 2,
    'I-NAME': 3,
    'B-AGE': 4,
    'I-AGE': 5,
    'B-GENDER': 6,
    'I-GENDER': 7,
    'B-JOB': 8,
    'I-JOB': 9,
    'B-LOCATION': 10,
    'I-LOCATION': 11,
    'B-ORGANIZATION': 12,
    'I-ORGANIZATION': 13,
    'B-SYMPTOM_AND_DISEASE': 14,
    'I-SYMPTOM_AND_DISEASE': 15,
    'B-TRANSPORTATION': 16,
    'I-TRANSPORTATION': 17,
    'B-DATE': 18,
    'I-DATE': 19,
    'O': 20,
}

# Inverse lookup (id -> tag). Derived from LABEL_2_ID rather than typed out a
# second time, so adding or renumbering a tag cannot leave the two out of sync.
ID_2_LABEL = {index: label for label, index in LABEL_2_ID.items()}

In [22]:
# Load each split in one step: parse the .conll file, wrap the parsed result in
# a DataFrame, then let pandas pick its preferred (nullable) dtypes.
df_train = pd.DataFrame(
    data=read_ner_file("./data/syllable/train_syllable.conll")
).convert_dtypes()

df_test = pd.DataFrame(
    data=read_ner_file("./data/syllable/test_syllable.conll")
).convert_dtypes()

df_eval = pd.DataFrame(
    data=read_ner_file("./data/syllable/dev_syllable.conll")
).convert_dtypes()

In [23]:
# Column that the per-label counting loop below scans.
# NOTE(review): the loop searches this column for LABEL_2_ID keys, so it
# presumably holds the tag sequences rather than surface tokens — confirm
# against read_ner_file.
tokens = df_train["tokens"]

def get_token_type_count(tokens: pd.Series, classname):
    """Count the entries of ``tokens`` that contain ``classname``.

    Every entry is tested once with ``classname in entry``. An entry that
    holds the label several times therefore still contributes 1: the result
    is the number of matching rows, not the number of individual occurrences.

    Args:
        tokens: Series whose entries support the ``in`` operator
            (assumed to be one tag sequence per sentence — TODO confirm).
        classname: Label to look for.

    Returns:
        Number of entries containing ``classname``; 0 for an empty Series.
    """
    # `in` already yields a bool, and summing a boolean mask counts its True
    # rows, so no ternary, `== True` filter or `.count()` is needed.
    contains_label = tokens.apply(lambda entry: classname in entry)
    return int(contains_label.sum())

# Gather the per-label counts first, then report them in LABEL_2_ID order.
label_counts = {
    label: get_token_type_count(tokens=tokens, classname=label)
    for label in LABEL_2_ID
}

for key, count in label_counts.items():
    print(key)
    print(f"Token type: {key} has {count} occurences")

# Grand total over all labels.
total = sum(label_counts.values())
print(total)

B-PATIENT_ID
Token type: B-PATIENT_ID has 1960 occurences
I-PATIENT_ID
Token type: I-PATIENT_ID has 6 occurences
B-NAME
Token type: B-NAME has 288 occurences
I-NAME
Token type: I-NAME has 44 occurences
B-AGE
Token type: B-AGE has 611 occurences
I-AGE
Token type: I-AGE has 2 occurences
B-GENDER
Token type: B-GENDER has 503 occurences
I-GENDER
Token type: I-GENDER has 13 occurences
B-JOB
Token type: B-JOB has 196 occurences
I-JOB
Token type: I-JOB has 194 occurences
B-LOCATION
Token type: B-LOCATION has 2926 occurences
I-LOCATION
Token type: I-LOCATION has 2851 occurences
B-ORGANIZATION
Token type: B-ORGANIZATION has 983 occurences
I-ORGANIZATION
Token type: I-ORGANIZATION has 974 occurences
B-SYMPTOM_AND_DISEASE
Token type: B-SYMPTOM_AND_DISEASE has 618 occurences
I-SYMPTOM_AND_DISEASE
Token type: I-SYMPTOM_AND_DISEASE has 536 occurences
B-TRANSPORTATION
Token type: B-TRANSPORTATION has 213 occurences
I-TRANSPORTATION
Token type: I-TRANSPORTATION has 54 occurences
B-DATE
Token type: B-D

In [24]:
# Directory of the trained tokenizer. Set the TOKENIZER_DIR environment variable
# to point elsewhere; without it the original hard-coded location is used, so
# existing runs behave exactly as before.
tokenizer_dir = os.environ.get(
    "TOKENIZER_DIR",
    "/home/hyle/Documents/vscode/NLPDataCollection/NLPDataCollection/tokenizer/trained_tokenizer/tokenizer-50k",
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [25]:
# Wrap each DataFrame as a `datasets.Dataset`, tagging it with its split name.
train_dataset = trDataset.from_pandas(df_train, split="train")
test_dataset = trDataset.from_pandas(df_test, split="test")
# The dev frame was previously tagged "train" (copy-paste slip); label it as the
# validation split so its metadata does not claim to be training data.
val_dataset = trDataset.from_pandas(df_eval, split="validation")

In [26]:
# Raise the tokenizer's length cap to one million.
# NOTE(review): presumably so long inputs are encoded in full without a
# max-length warning — confirm.
tokenizer.model_max_length = 1_000_000

In [27]:
def convert_to_tokens(sample, tokenizer):
    """Run ``tokenizer`` over the pre-split words stored in ``sample``.

    Args:
        sample: Mapping with a ``"words"`` entry; that entry is handed to the
            tokenizer as-is.
        tokenizer: Callable invoked with ``truncation=False`` and
            ``is_split_into_words=True``.

    Returns:
        The tokenizer's result, unchanged.
    """
    return tokenizer(sample["words"], is_split_into_words=True, truncation=False)

In [28]:
# Apply convert_to_tokens to every split with identical settings: batched
# calls, the shared tokenizer, and one worker per reported CPU.
train_dataset, test_dataset, val_dataset = (
    split.map(
        convert_to_tokens,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=os.cpu_count(),
    )
    for split in (train_dataset, test_dataset, val_dataset)
)

Map (num_proc=16): 100%|██████████| 5028/5028 [00:04<00:00, 1220.70 examples/s]
Map (num_proc=16): 100%|██████████| 3000/3000 [00:02<00:00, 1269.73 examples/s]
Map (num_proc=16): 100%|██████████| 2000/2000 [00:02<00:00, 932.88 examples/s] 


In [29]:
# Drop the two source columns from every split.
# NOTE(review): "tokens" is the column the label-count cell scans for tag names,
# so removing it here appears to leave the datasets without tags — confirm
# that this is intended.
train_dataset, test_dataset, val_dataset = (
    split.remove_columns(["words", "tokens"])
    for split in (train_dataset, test_dataset, val_dataset)
)

In [30]:
# Materialise each split as a pandas DataFrame.
save_df_train, save_df_test, save_df_val = (
    split.to_pandas()
    for split in (train_dataset, test_dataset, val_dataset)
)

In [31]:
# Rebuild each split from its DataFrame and write it to disk.
# `save_to_disk` returns nothing, so build and save in two steps: assigning its
# result (as before) left train/test/val_dataset bound to None afterwards.
train_dataset = trDataset.from_pandas(save_df_train)
train_dataset.save_to_disk("./data/tokenized_dataset_train")

test_dataset = trDataset.from_pandas(save_df_test)
test_dataset.save_to_disk("./data/tokenized_dataset_test")

val_dataset = trDataset.from_pandas(save_df_val)
val_dataset.save_to_disk("./data/tokenized_dataset_val")

Saving the dataset (0/1 shards):   0%|          | 0/5028 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 5028/5028 [00:00<00:00, 867822.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 563674.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2000/2000 [00:00<00:00, 578884.00 examples/s]
