In [None]:
import pandas as pd
from dataset import read_ner_file
import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset
import numpy as np 
import os
import matplotlib.pyplot as plt 
from transformers import AdamW
from tqdm import tqdm 
import torcheval 

In [None]:
# Turn off the Hugging Face `tokenizers` library's internal parallelism
# (the library reads this environment variable).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label vocabulary: each entity type has a "B-" and an "I-" tag, plus a single
# "O" tag. The id of a label is its position in the list below.
LABEL_2_ID = {
    label: label_id
    for label_id, label in enumerate([
        "B-PATIENT_ID", "I-PATIENT_ID",
        "B-NAME", "I-NAME",
        "B-AGE", "I-AGE",
        "B-GENDER", "I-GENDER",
        "B-JOB", "I-JOB",
        "B-LOCATION", "I-LOCATION",
        "B-ORGANIZATION", "I-ORGANIZATION",
        "B-SYMPTOM_AND_DISEASE", "I-SYMPTOM_AND_DISEASE",
        "B-TRANSPORTATION", "I-TRANSPORTATION",
        "B-DATE", "I-DATE",
        "O",
    ])
}

# Inverse lookup (id -> label), derived from LABEL_2_ID so the two mappings
# cannot drift apart.
ID_2_LABEL = {label_id: label for label, label_id in LABEL_2_ID.items()}

In [None]:
# Load the syllable-level CoNLL splits and wrap each in a DataFrame.
# NOTE(review): the exact structure returned by `read_ner_file` is not visible
# here; later cells read the "words" and "tokens" columns of these frames.
df_train = pd.DataFrame(
    data=read_ner_file("./data/syllable/train_syllable.conll")
).convert_dtypes()

df_test = pd.DataFrame(
    data=read_ner_file("./data/syllable/test_syllable.conll")
).convert_dtypes()

In [None]:
def converter(tokens):
    """Translate a sequence of label strings into their integer ids.

    Args:
        tokens: Iterable of label names; each must be a key of ``LABEL_2_ID``.

    Returns:
        A new list holding the ``LABEL_2_ID`` value for every input element,
        in the same order.

    Raises:
        KeyError: If an element is not a key of ``LABEL_2_ID``.
    """
    return [LABEL_2_ID[token] for token in tokens]

# Replace each row's label names in the "tokens" column with integer ids.
# NOTE: this overwrites the column in place, so running the cell a second time
# feeds integers to `converter`, which are not keys of LABEL_2_ID.
df_train["tokens"] = df_train["tokens"].apply(converter)
df_test["tokens"] = df_test["tokens"].apply(converter)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Start from the pretrained masked-LM checkpoint.
model = AutoModelForMaskedLM.from_pretrained("uitnlp/visobert")

# Swap the LM head's output projection for a fresh linear layer that emits one
# logit per NER label. The input width is read from the layer being replaced
# rather than hard-coded, so the cell keeps working for checkpoints with a
# different hidden size.
# NOTE(review): only the final projection is replaced; the rest of `lm_head`
# is kept as loaded from the checkpoint.
model.lm_head.decoder = nn.Linear(
    in_features=model.lm_head.decoder.in_features,
    out_features=len(ID_2_LABEL),
    bias=True,
)
model = model.to(device)

# Freeze every parameter under `model.base_model`; the newly created
# projection layer keeps its default `requires_grad=True`.
model.base_model.requires_grad_(False)

In [None]:
# Load the tokenizer published under the same "uitnlp/visobert" checkpoint name
# that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("uitnlp/visobert")

In [None]:
# Tokenize both splits. Each row of the "words" column is passed as an
# already-split sentence (`is_split_into_words=True`); the result is requested
# as padded/truncated PyTorch tensors.
train_tokens = tokenizer(
    df_train["words"].to_list(),
    is_split_into_words=True,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
test_tokens = tokenizer(
    df_test["words"].to_list(),
    is_split_into_words=True,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [None]:
def align_tokens(tokens, df_type, label_all_tokens=True):
    """Attach per-token label ids to a tokenized batch.

    For every sequence ``i`` the word index of each token is taken from
    ``tokens.word_ids(batch_index=i)`` and used to look up that word's label
    in the ``i``-th entry of ``df_type["tokens"]``.

    Args:
        tokens: Tokenizer output exposing ``word_ids(batch_index=...)`` and
            item assignment. It is modified in place.
        df_type: Container whose ``"tokens"`` entry yields one sequence of
            word-level label ids per sentence, in batch order.
        label_all_tokens: If True, every token of a word receives the word's
            label. If False, only the first token of each word does and the
            remaining tokens get ``-100``.

    Returns:
        The same ``tokens`` object, with ``tokens["labels"]`` set to a list of
        per-sequence label lists (plain Python lists). Tokens that belong to
        no word (word id ``None``) are labelled ``-100``.
    """
    ignore_id = -100
    batch_labels = []

    for row, word_labels in enumerate(df_type["tokens"]):
        last_word = None
        row_labels = []

        for current_word in tokens.word_ids(batch_index=row):
            if current_word is None:
                # Token does not map to any input word.
                tag = ignore_id
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of a word, and only first pieces are labelled.
                tag = ignore_id
            else:
                # First piece of a word, or any piece when labelling all tokens.
                tag = word_labels[current_word]

            row_labels.append(tag)
            last_word = current_word

        batch_labels.append(row_labels)

    tokens["labels"] = batch_labels
    return tokens

# Add a "labels" entry to each tokenized split. `align_tokens` mutates its
# first argument and returns that same object, so the names are simply rebound.
train_tokens = align_tokens(tokens=train_tokens, df_type=df_train)
test_tokens = align_tokens(tokens=test_tokens, df_type=df_test)