In [1]:
import gc
import json
from importlib import reload

import torch
import numpy as np
import pandas as pd 
from tqdm.auto import tqdm
from sklearn.metrics import balanced_accuracy_score, f1_score, confusion_matrix

from nn_module import BiLSTM_CRF

In [2]:
def data_label_split(data, label, train_size=0.8):
    randidx = np.arange(len(data))
    data_train, data_test = train_test_split(data, randidx, train_size)
    label_train, label_test = train_test_split(label, randidx, train_size)

    return data_train, data_test, label_train, label_test

def train_test_split(data, randidx, train_size):
    N = len(data)
    return [data[i] for i in randidx[:int(train_size*N)]], [data[i] for i in randidx[int(train_size*N):]]

def shuffle_data_label_lists(data, label):
    randidx = np.arange(len(data))
    np.random.shuffle(randidx)
    return [data[i] for i in randidx], [label[i] for i in randidx]

def embedding(data, token2emb, label2num):
    tokens_lst = []
    labels_lst = []

    for doc in tqdm(data):
        tokens = map(lambda x: x.replace('\t', '').replace('\n', '').replace(' ', ''), doc['tokens'])
        emb_tokens = torch.zeros(len(doc['tokens']), 300)
        for i, token in enumerate(tokens):
            if token in token2emb:
                emb_tokens[i] = torch.FloatTensor(token2emb[token])
            else:
                for it in token:
                    emb_tokens[i] += torch.FloatTensor(token2emb[it] if it in token2emb else [0]*300)

        tokens_lst.append(torch.FloatTensor(emb_tokens))
        labels_lst.append(torch.LongTensor([label2num[it] if it in label2num else 0 for it in doc['labels']]))
        
    return tokens_lst, labels_lst

def batch_split(X, Y, batch_size=1000):
    x_batched = []
    y_batched = []

    n = len(X)
    pointer = 0
    while pointer + batch_size < n:
        x_batched.append(X[pointer:pointer+batch_size])
        y_batched.append(Y[pointer:pointer+batch_size])
        pointer += batch_size 
    
    x_batched.append(X[pointer:])
    y_batched.append(Y[pointer:])

    return x_batched, y_batched

def fix_label_disbalance(tokens, labels):
    label_idxs = {}
    for i in range(len(labels)):
        unique_labels = np.unique(labels[i])
        for lab in unique_labels:
            if lab in label_idxs:
                label_idxs[lab].append(i)
            else:
                label_idxs[lab] = [i]

    idxs = []
    count_o = len(label_idxs[0])
    del label_idxs[0]

    for it in label_idxs:
        scale = count_o // len(label_idxs[it])
        idxs += label_idxs[it]*scale

    np.random.shuffle(idxs)
    tokens = [tokens[i] for i in idxs]
    labels = [labels[i] for i in idxs]

    return tokens, labels

In [3]:
# create table for name and address
label2num = {
    'O': 0,
    'B-NAME_STUDENT': 1, 
    'I-NAME_STUDENT': 2, 
    'B-STREET_ADDRESS': 3, 
    'I-STREET_ADDRESS': 4, 
    'B-USERNAME': 5,
    'I-USERNAME': 6, 
    'B-ID_NUM': 7, 
    'I-ID_NUM': 8, 
    'B-URL_PERSONAL': 9,
    'I-URL_PERSONAL': 10,
    'B-EMAIL': 11,
    'I-EMAIL': 12,
    'B-PHONE_NUM': 13,
    'I-PHONE_NUM': 14,
    '<START>': 15, 
    '<STOP>': 16,
}

num2label = {
    0: 'O',
    1: 'B-NAME_STUDENT',  
    2: 'I-NAME_STUDENT',  
    3: 'B-STREET_ADDRESS',  
    4: 'I-STREET_ADDRESS',  
    5: 'B-USERNAME', 
    6: 'I-USERNAME',  
    7: 'B-ID_NUM',  
    8: 'I-ID_NUM', 
    9: 'B-URL_PERSONAL',
    10: 'I-URL_PERSONAL',
    11: 'B-EMAIL',
    12: 'I-EMAIL',
    13: 'B-PHONE_NUM',
    14: 'I-PHONE_NUM',
    15: '<START>', 
    16: '<STOP>',
}

In [4]:
# load fastext
token2emb = {}
with open('wiki-news-300d-1M.vec', 'r', encoding='utf-8') as f:
    next(f)
    for it in tqdm(f):
        row = it.split(' ')
        token2emb[row[0]] = list(map(float, row[1:]))

0it [00:00, ?it/s]

In [5]:
# encoding tokens and labels
with open('data/mixtral-8x7b-v1.json', 'r', encoding='utf-8') as f:
    data_1 = json.load(f)
with open('data/train.json', 'r', encoding='utf-8') as f:
    data_2 = json.load(f)
data = data_1 + data_2

data_tokens = []
data_labels = []
for doc in tqdm(data):
    row_tokens = []
    row_labels = []

    for i, (token, label) in enumerate(zip(doc['tokens'], doc['labels'])):
        emb_tokens = torch.zeros(300)
        
        if token in token2emb:
            emb_tokens = torch.FloatTensor(token2emb[token])
        else:
            for it in token:
                emb_tokens += torch.FloatTensor(token2emb[it] if it in token2emb else [0]*300)

        row_tokens.append(emb_tokens.unsqueeze(0))
        row_labels.append(label2num[label])

        if token.endswith('\n'):
            data_tokens.append(torch.cat(row_tokens))
            data_labels.append(torch.LongTensor(row_labels))

            row_tokens = []
            row_labels = []

data = []
token2emb = None
gc.collect()

N = len(data_tokens)
np.random.seed(123)
data_idx = np.arange(N)
np.random.shuffle(data_idx)
data_tokens = [data_tokens[i] for i in data_idx]
data_labels = [data_labels[i] for i in data_idx]

train_size = 0.85
n = int(N*train_size)
train_tokens, valid_tokens = data_tokens[:n], data_tokens[n:]
train_labels, valid_labels = data_labels[:n], data_labels[n:]

  0%|          | 0/9162 [00:00<?, ?it/s]

In [33]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

import nn_module    
reload(nn_module)
from nn_module import BiLSTM_CRF

# fit lstm
model = BiLSTM_CRF(
    embedding_dim=300,
    hidden_size=32,
    nclasses=17,
    label2num=label2num,
    device=device,
)

# checkpoint = torch.load('saved_models/bi_lstm.pt')
# model.load_state_dict(checkpoint['model_state_dict'])
# model.optim.load_state_dict(checkpoint['optimizer_state_dict'])

model.fit(
    train_X=train_tokens,
    train_Y=train_labels,
    valid_X=valid_tokens,
    valid_Y=valid_labels,
    nepochs=1,
    lr=1e-3,
    device=device
)

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [26]:
with torch.no_grad():
    score, predict = model.forward(train_tokens[0], device=device)

torch.Size([5, 19])


In [27]:
predict

tensor([1, 7, 5, 7, 5])

In [12]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': model.optim.state_dict(),
    }, 'saved_models/bi_lstm.pt')

In [6]:
token2emb = {}

with open('wiki-news-300d-1M.vec', 'r', encoding='utf-8') as f:
    next(f)
    for it in tqdm(f):
        row = it.split(' ')
        token2emb[row[0]] = list(map(float, row[1:]))

valid_tokens, valid_labels = embedding(valid_data, token2emb, label2num)

# clearing token2emb memory
token2emb = None
import gc
gc.collect()

0it [00:00, ?it/s]

  0%|          | 0/2749 [00:00<?, ?it/s]

39

In [10]:
device = 'cpu'
model.to(device)

valid_predict = []
with torch.no_grad():
    for batch_X in tqdm(valid_tokens):
        predict = model.forward(batch_X, device=device)[1]
        valid_predict += predict

  0%|          | 0/2749 [00:00<?, ?it/s]

In [11]:
real = torch.cat(valid_labels)
valid_predict = torch.FloatTensor(valid_predict)

In [18]:
f1_score(real[valid_predict != 0], valid_predict[valid_predict != 0], average='micro')

0.9706940488682356

In [10]:
with torch.no_grad():
    valid_predict = []
    for batch_X in valid_tokens:
        predict = model(batch_X.to(device)).cpu()
        valid_predict.append(torch.argmax(predict, dim=1))
    valid_predict = torch.cat(valid_predict)
    valid_real = torch.cat(valid_labels)
    print(f'BA: {balanced_accuracy_score(valid_real, valid_predict):.3f}')

BA: 0.994


In [16]:
columns = [
    'O',
    'NAME_STUDENT',
    'STREET_ADDRESS',
    'USERNAME',
    'ID_NUM',
]

pd.DataFrame(confusion_matrix(valid_real, valid_predict), columns=columns)

Unnamed: 0,O,NAME_STUDENT,STREET_ADDRESS,USERNAME,ID_NUM
0,2096363,433,10,22,16
1,48,4734,2,3,0
2,0,0,6565,0,0
3,9,0,0,732,0
4,5,0,0,0,1036


In [8]:
with torch.no_grad():
    train_predict = []
    train_proba = []
    for batch_X in train_tokens:
        predict = model(batch_X.to(device)).cpu()
        train_proba.append(predict[:, 1])
        train_predict.append(torch.argmax(predict, dim=1))
    
    train_predict = torch.cat(train_predict)
    train_proba = torch.cat(train_proba)
    train_real = torch.cat(train_labels)

    print(f'BA: {balanced_accuracy_score(train_real, train_predict):.3f}')
    print(f'F1: {f1_score(train_real, train_predict, average="micro"):.3f}')

with torch.no_grad():
    valid_predict = []
    valid_proba = []
    for batch_X in valid_tokens:
        predict = model(batch_X.to(device)).cpu()
        valid_proba.append(predict[:, 1])
        valid_predict.append(torch.argmax(predict, dim=1))

    valid_predict = torch.cat(valid_predict)
    valid_proba = torch.cat(valid_proba)
    valid_real = torch.cat(valid_labels)

    print(f'BA: {balanced_accuracy_score(valid_real, valid_predict):.3f}')
    print(f'F1: {f1_score(valid_real, valid_predict, average="micro"):.3f}')

BA: 1.000
F1: 1.000


In [11]:
columns = [
    'O',
    'NAME_STUDENT',
    'STREET_ADDRESS',
    # 'USERNAME',
    # 'ID_NUM',
]

pd.DataFrame(confusion_matrix(valid_real, valid_predict), columns=columns)

Unnamed: 0,O,NAME_STUDENT,STREET_ADDRESS
0,1671929,130,12
1,103,3815,1
2,0,0,5364


In [9]:
token2emb = []
train_tokens = []
train_labels = []
test_tokens = []
test_labels = []

import gc
gc.collect()

1119

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BiLSTM_CRF(
    embedding_dim=300,
    hidden_size=32,
    nclasses=11,
    label2num=label2num,
    device=device,
)

checkpoint = torch.load('saved_models/bi_lstm.pt')
model.load_state_dict(checkpoint['model_state_dict'])
model.optim.load_state_dict(checkpoint['optimizer_state_dict'])
model.eval()
model.to(device)
print()




In [5]:
token2emb = {}
with open('wiki-news-300d-1M.vec', 'r', encoding='utf-8') as f:
    next(f)
    for it in tqdm(f):
        row = it.split(' ')
        token2emb[row[0]] = list(map(float, row[1:]))

0it [00:00, ?it/s]

In [6]:
def check_name(x):
    flag = False
    flag = x[0].isupper()
    for it in x[1:]:
        flag *= it.islower()
    return flag

def get_predict_table(data, token2emb, num2label, label2num):
    data_tokens, _ = embedding(data, token2emb, label2num)
    predict_table = [[], [], [], [], []] # document, token_idx, token, predict, real
    for i in tqdm(range(len(data))):
        with torch.no_grad():
            score, predict = model.forward(data_tokens[i].to(device), device=device)
            predict = [num2label[it] for it in predict]
            predict = ['O' if len(set(x)) < 2 else y for x, y in zip(data[i]['tokens'], predict)]
            predict_table[0] += [data[i]['document']]*len(predict)
            predict_table[1] += list(range(len(predict)))
            predict_table[2] += data[i]['tokens']
            predict_table[3] += predict
            predict_table[4] += data[i]['labels'] 
        
    predict_table = [[predict_table[0][i], predict_table[1][i], predict_table[2][i], predict_table[3][i], predict_table[4][i]] for i in range(len(predict_table[0]))]
    predict_table = pd.DataFrame(predict_table, columns=['document', 'token_i', 'token', 'predict', 'label'])
    predict_table = predict_table.loc[(predict_table.label != 'O') & (predict_table.token != "\n") & (predict_table.token != "\n\n") & (predict_table.token != "\t")].reset_index(drop=True)

    return predict_table.sort_values('document').reset_index(drop=True)

In [7]:
import gc
with open('data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

predict_table = get_predict_table(data, token2emb, num2label, label2num)
gc.collect()

  0%|          | 0/6807 [00:00<?, ?it/s]

  0%|          | 0/6807 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
predict_table.head()

Unnamed: 0,document,token_i,token,predict,label,prefix
0,7,9,Nathalie,NAME_STUDENT,B-NAME_STUDENT,B-
1,7,10,Sylla,NAME_STUDENT,I-NAME_STUDENT,B-
2,7,482,Nathalie,NAME_STUDENT,B-NAME_STUDENT,B-
3,7,483,Sylla,NAME_STUDENT,I-NAME_STUDENT,B-
4,7,741,Nathalie,NAME_STUDENT,B-NAME_STUDENT,B-


In [10]:
label = predict_table['label'].str.slice(2).values 
predict = predict_table['predict'].values 
print(f'F1 score: {f1_score(label, predict, average="micro")}')

F1 score: 0.9199853854585314


In [11]:
for i in range(1, predict_table.shape[0]):
    if (predict_table.at[i-1, 'predict'] == predict_table.at[i, 'predict']) & (predict_table.at[i-1, 'document'] == predict_table.at[i, 'document']) & (predict_table.at[i, 'token_i'] - predict_table.at[i-1, 'token_i'] == 1):
        predict_table.at[i, 'prefix'] = 'I-'
predict_table['predict'] = predict_table['prefix'] + predict_table['predict']

print(f'F1 score: {f1_score(predict_table.label, predict_table.predict, average="micro")}')

F1 score: 0.7822433321154548


In [12]:
predict_table

Unnamed: 0,document,token_i,token,predict,label,prefix
0,7,9,Nathalie,B-NAME_STUDENT,B-NAME_STUDENT,B-
1,7,10,Sylla,I-NAME_STUDENT,I-NAME_STUDENT,I-
2,7,482,Nathalie,B-NAME_STUDENT,B-NAME_STUDENT,B-
3,7,483,Sylla,I-NAME_STUDENT,I-NAME_STUDENT,I-
4,7,741,Nathalie,B-NAME_STUDENT,B-NAME_STUDENT,B-
...,...,...,...,...,...,...
2732,15717,365,IV-8322,B-ID_NUM,B-ID_NUM,B-
2733,15717,964,IV-8322,B-ID_NUM,B-ID_NUM,B-
2734,19280,55,30407059,B-ID_NUM,I-ID_NUM,B-
2735,19280,54,Z.S.,B-ID_NUM,B-ID_NUM,B-


In [13]:
predict_table.loc[predict_table.predict != predict_table.label].groupby('predict', as_index=False).label.count()

Unnamed: 0,predict,label
0,B-ID_NUM,1
1,B-NAME_STUDENT,366
2,B-O,211
3,B-STREET_ADDRESS,10
4,B-USERNAME,1
5,I-O,7


In [12]:
predict_table

Unnamed: 0,document,token_i,token,predict,label,prefix,upper_start
0,7,9,Nathalie,B-NAME_STUDENT,B-NAME_STUDENT,B-,1
1,7,10,Sylla,I-NAME_STUDENT,I-NAME_STUDENT,I-,1
2,7,482,Nathalie,B-NAME_STUDENT,B-NAME_STUDENT,B-,1
3,7,483,Sylla,I-NAME_STUDENT,I-NAME_STUDENT,I-,1
4,7,741,Nathalie,B-NAME_STUDENT,B-NAME_STUDENT,B-,1
...,...,...,...,...,...,...,...
2639,22147,457,Francesca,B-NAME_STUDENT,O,B-,1
2640,22147,1408,Melanie,B-NAME_STUDENT,O,B-,1
2641,22181,738,Portman,B-NAME_STUDENT,O,B-,1
2642,22181,736,Natalie,B-NAME_STUDENT,O,B-,1
