In [1]:
import gc
import json
from importlib import reload

import numpy as np

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

In [2]:
# tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
# longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [3]:
# Word-level label strings that tokenize_row counts towards `target_num`.
# NOTE(review): 'O' is a member of this set while 'I-USERNAME' and 'I-EMAIL'
# are not, so words labelled 'O' are counted as targets too — confirm that
# this is intended.
target_cols = {
    'O',
    'B-NAME_STUDENT', 'I-NAME_STUDENT',
    'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
    'B-USERNAME',
    'B-ID_NUM', 'I-ID_NUM',
    'B-URL_PERSONAL', 'I-URL_PERSONAL',
    'B-EMAIL',
    'B-PHONE_NUM', 'I-PHONE_NUM',
}

# Integer class id of every BIO tag: 'O' is 0, followed by consecutive
# B-/I- pairs for each entity type (ids 1..14).
label2num = {
    tag: class_id
    for class_id, tag in enumerate((
        'O',
        'B-NAME_STUDENT', 'I-NAME_STUDENT',
        'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
        'B-USERNAME', 'I-USERNAME',
        'B-ID_NUM', 'I-ID_NUM',
        'B-URL_PERSONAL', 'I-URL_PERSONAL',
        'B-EMAIL', 'I-EMAIL',
        'B-PHONE_NUM', 'I-PHONE_NUM',
    ))
}


# Module-level tokenizer loaded from the local 'saved_models/tokenizer'
# directory; tokenize_row below calls it for every document.
tokenizer = AutoTokenizer.from_pretrained('saved_models/tokenizer')
def tokenize_row(example):
    """Tokenize one document and project its word-level labels onto sub-word tokens.

    The document text is rebuilt by concatenating ``example["tokens"]`` and
    inserting a single space after every word whose ``trailing_whitespace``
    flag is truthy. Each character of the rebuilt text carries the label of
    the word it came from (inserted spaces carry ``"O"``). The text is then
    run through the module-level ``tokenizer`` (truncated to 2048 tokens) and
    every sub-word token receives the label of its first non-space character.

    Args:
        example: mapping with parallel sequences ``"tokens"``, ``"labels"``
            and ``"trailing_whitespace"``.

    Returns:
        dict with keys:
            ``input_ids``, ``attention_mask``, ``offset_mapping`` — taken
                from the tokenizer output;
            ``labels`` — one ``label2num`` id per entry of ``input_ids``;
            ``length`` — ``len(input_ids)``;
            ``target_num`` — number of words whose label is in ``target_cols``;
            ``group`` — 1 if ``target_num > 0`` else 0;
            ``token_map`` — for every character of the rebuilt text, the index
                of the source word, or -1 for an inserted space.
    """
    pieces = []
    char_labels = []
    token_map = []
    target_num = 0

    words = zip(example["tokens"], example["labels"], example["trailing_whitespace"])
    for idx, (t, l, ws) in enumerate(words):
        pieces.append(t)
        char_labels.extend([l] * len(t))
        token_map.extend([idx] * len(t))

        if l in target_cols:
            target_num += 1

        if ws:
            pieces.append(" ")
            char_labels.append("O")
            token_map.append(-1)

    text = "".join(pieces)
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=2048)  # Adjust max_length if needed

    outside = label2num["O"]
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # A (0, 0) offset is treated as a token without source text and is
        # labelled "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(outside)
            continue

        # When the token starts on whitespace, the label is read from the
        # following character instead.
        if start_idx < len(text) and text[start_idx].isspace():
            start_idx += 1

        # Always emit exactly one label per token. The previous bare
        # `except: continue` dropped the token's label whenever the lookup
        # failed (index past the end of the text, or a label missing from
        # label2num), which left `labels` shorter than `input_ids` and
        # shifted every following label. Fall back to "O" instead.
        if start_idx < len(char_labels):
            token_labels.append(label2num.get(char_labels[start_idx], outside))
        else:
            token_labels.append(outside)

    return {
        "input_ids": tokenized.input_ids,
        "attention_mask": tokenized.attention_mask,
        "offset_mapping": tokenized.offset_mapping,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0,
        "token_map": token_map,
    }

In [4]:
def fix_label_disbalance(data, seed=None):
    """Oversample rows that contain rare labels and return them in shuffled order.

    For every label other than 0, the rows containing that label are repeated
    ``count_o // n_rows_with_label`` times (at least once), where ``count_o``
    is the number of rows that contain label 0. A row that contains several
    non-zero labels is repeated once per label group. Rows whose labels are
    all 0 are not included in the result.

    Args:
        data: sequence of rows; each row is indexable by ``'labels'`` and
            that value must be accepted by ``np.unique``.
        seed: optional seed for the shuffle. ``None`` (the default) shuffles
            with the global ``np.random`` state, as before.

    Returns:
        A new list of rows (references to the input rows, not copies).
        Empty when ``data`` is empty or no row has a non-zero label.
    """
    # label value -> indices of the rows in which it occurs
    label_idxs = {}
    for i, row in enumerate(data):
        for lab in np.unique(row['labels']):
            # .item() turns the numpy scalar into a plain Python key
            label_idxs.setdefault(lab.item(), []).append(i)

    # pop with a default: empty input or data without label 0 used to raise
    # KeyError here.
    count_o = len(label_idxs.pop(0, []))

    idxs = []
    for row_ids in label_idxs.values():
        # max(1, ...) keeps a label's rows at least once even when fewer rows
        # contain label 0 than contain this label.
        scale = max(1, count_o // len(row_ids))
        idxs += row_ids * scale

    if not idxs:
        return []

    if seed is None:
        np.random.shuffle(idxs)
    else:
        np.random.default_rng(seed).shuffle(idxs)

    return [data[i] for i in idxs]

In [5]:
# encoding tokens and labels
# Every sequence is right-padded so that its length is a multiple of this.
PAD_MULTIPLE = 512

with open('data/mixtral-8x7b-v1.json', 'r', encoding='utf-8') as f:
    data_1 = json.load(f)
with open('data/train.json', 'r', encoding='utf-8') as f:
    data_2 = json.load(f)
data = data_1 + data_2

tokenized_data = []
for doc in data:
    row = tokenize_row(doc)
    # Number of padding positions needed to reach the next multiple of
    # PAD_MULTIPLE (0 when the length already is one).
    pad_size = -len(row['input_ids']) % PAD_MULTIPLE

    # NOTE(review): padding uses token id 0 and label id 0 ('O' in label2num);
    # confirm that 0 is the pad token id of the tokenizer in use.
    row['input_ids'] = torch.LongTensor(row['input_ids'] + [0] * pad_size).reshape(1, -1)
    row['attention_mask'] = torch.LongTensor(row['attention_mask'] + [0] * pad_size).reshape(1, -1)
    row['labels'] = torch.LongTensor(row['labels'] + [0] * pad_size)
    tokenized_data.append(row)

# Sequential split: the first 85% of the documents (in file order, data_1
# before data_2) become the training set, the remainder the validation set.
N = len(tokenized_data)
train_size = 0.85
n = int(N * train_size)
train, valid = tokenized_data[:n], tokenized_data[n:]

# Release the raw documents. data_1 and data_2 reference the same document
# objects as `data`, so clearing only `data` would leave all of them alive
# and gc.collect() would reclaim nothing.
data = data_1 = data_2 = None
# Not assigned anywhere else in the visible notebook; kept so the names stay
# defined for any later cell that may read them.
data_tokens = None
data_labels = None
gc.collect()

0

In [6]:
# Replace the training split with its oversampled, shuffled version.
# NOTE(review): this rebinds `train` to a function of itself, so running the
# cell a second time resamples the already-resampled list; re-run the cell
# that builds `train` first.
train = fix_label_disbalance(train)

In [7]:
# Reload nn_module before importing from it so that the BiLSTM_CRF name bound
# below comes from the freshly reloaded module.
import nn_module
reload(nn_module)
from nn_module import BiLSTM_CRF

# Use the GPU when torch reports one, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# One output class per entry of label2num; the meaning of embedding_size and
# hidden_size is defined in nn_module, which is not visible here.
model = BiLSTM_CRF(embedding_size=768, hidden_size=128, nclasses=len(label2num))

In [8]:
# Train on the oversampled training split; `valid` is not passed to fit here.
# The recorded output of this cell ends in a KeyboardInterrupt at 0/10 epochs.
model.fit(
    train, 
    nepochs=10,
    lr=1e-3,
    device=device
)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/83541 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
# Bundle the model weights and the state of the optimizer held in
# `model.optim` into one checkpoint and write it to disk.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': model.optim.state_dict(),
}
torch.save(checkpoint, 'saved_models/long_bilstm.pt')