In [1]:
import gc
import json
import pickle
from importlib import reload

import torch
import numpy as np
import pandas as pd 
from tqdm.auto import tqdm
from sklearn.metrics import balanced_accuracy_score, f1_score, confusion_matrix

from nn_module import BiLSTM_CRF

In [2]:
def data_label_split(data, label, train_size=0.8):
    randidx = np.arange(len(data))
    data_train, data_test = train_test_split(data, randidx, train_size)
    label_train, label_test = train_test_split(label, randidx, train_size)

    return data_train, data_test, label_train, label_test

def train_test_split(data, randidx, train_size):
    N = len(data)
    return [data[i] for i in randidx[:int(train_size*N)]], [data[i] for i in randidx[int(train_size*N):]]

def shuffle_data_label_lists(data, label):
    randidx = np.arange(len(data))
    np.random.shuffle(randidx)
    return [data[i] for i in randidx], [label[i] for i in randidx]

def batch_split(X, Y, batch_size=1000):
    x_batched = []
    y_batched = []

    n = len(X)
    pointer = 0
    while pointer + batch_size < n:
        x_batched.append(X[pointer:pointer+batch_size])
        y_batched.append(Y[pointer:pointer+batch_size])
        pointer += batch_size 
    
    x_batched.append(X[pointer:])
    y_batched.append(Y[pointer:])

    return x_batched, y_batched

def fix_label_disbalance(tokens, labels):
    label_idxs = {}
    for i in range(len(labels)):
        unique_labels = np.unique(labels[i])
        for lab in unique_labels:
            if lab in label_idxs:
                label_idxs[lab].append(i)
            else:
                label_idxs[lab] = [i]

    idxs = []
    count_o = len(label_idxs[0])
    del label_idxs[0]

    for it in label_idxs:
        scale = count_o // len(label_idxs[it])
        idxs += label_idxs[it]*scale

    np.random.shuffle(idxs)
    tokens = [tokens[i] for i in idxs]
    labels = [labels[i] for i in idxs]

    return tokens, labels

In [3]:
# create table for name and address
label2num = {
    'O': 0,
    'B-NAME_STUDENT': 1, 
    'I-NAME_STUDENT': 2, 
    'B-STREET_ADDRESS': 3, 
    'I-STREET_ADDRESS': 4, 
    'B-USERNAME': 5,
    'I-USERNAME': 6, 
    'B-ID_NUM': 7, 
    'I-ID_NUM': 8, 
    'B-URL_PERSONAL': 9,
    'I-URL_PERSONAL': 10,
    'B-EMAIL': 11,
    'I-EMAIL': 12,
    'B-PHONE_NUM': 13,
    'I-PHONE_NUM': 14,
}

num2label = {
    0: 'O',
    1: 'B-NAME_STUDENT',  
    2: 'I-NAME_STUDENT',  
    3: 'B-STREET_ADDRESS',  
    4: 'I-STREET_ADDRESS',  
    5: 'B-USERNAME', 
    6: 'I-USERNAME',  
    7: 'B-ID_NUM',  
    8: 'I-ID_NUM', 
    9: 'B-URL_PERSONAL',
    10: 'I-URL_PERSONAL',
    11: 'B-EMAIL',
    12: 'I-EMAIL',
    13: 'B-PHONE_NUM',
    14: 'I-PHONE_NUM',
}

In [4]:
# load fastext
token2emb = {}
with open('wiki-news-300d-1M.vec', 'r', encoding='utf-8') as f:
    next(f)
    for it in tqdm(f):
        row = it.split(' ')
        token2emb[row[0]] = list(map(float, row[1:]))

# encoding tokens and labels
with open('data/mixtral-8x7b-v1.json', 'r', encoding='utf-8') as f:
    data_1 = json.load(f)
with open('data/train.json', 'r', encoding='utf-8') as f:
    data_2 = json.load(f)
data = data_1 + data_2

data_tokens = []
data_labels = []
for doc in tqdm(data):
    row_tokens = []
    row_labels = []

    for i, (token, label) in enumerate(zip(doc['tokens'], doc['labels'])):
        emb_tokens = torch.zeros(303)
        
        if token in token2emb:
            emb_tokens[:300] = torch.FloatTensor(token2emb[token])
        else:
            for it in token:
                emb_tokens[:300] += torch.FloatTensor(token2emb[it] if it in token2emb else [0]*300)
        
        up_c = 0 
        num_c = 0
        for let in token:
            if let.isdigit():
                num_c += 1
            if let.isupper():
                up_c += 1

        emb_tokens[300] = token[0].isupper()
        emb_tokens[301] = up_c / len(token)
        emb_tokens[302] = num_c / len(token)

        row_tokens.append(emb_tokens.unsqueeze(0))
        row_labels.append(label2num[label])

    data_tokens.append(torch.cat(row_tokens))
    data_labels.append(torch.LongTensor(row_labels))

data = []
token2emb = None
gc.collect()

N = len(data_tokens)
np.random.seed(123)
data_idx = np.arange(N)
np.random.shuffle(data_idx)
data_tokens = [data_tokens[i] for i in data_idx]
data_labels = [data_labels[i] for i in data_idx]

with open('data/train_pool.pkl', 'wb') as f:
    pickle.dump([
        data_tokens,
        data_labels,
    ], f)

0it [00:00, ?it/s]

  0%|          | 0/9162 [00:00<?, ?it/s]

In [4]:
with open('data/train_pool.pkl', 'rb') as f:
    data_tokens, data_labels = pickle.load(f)

N = len(data_tokens)
train_size = 0.85
n = int(N*train_size)
train_tokens, valid_tokens = data_tokens[:n], data_tokens[n:]
train_labels, valid_labels = data_labels[:n], data_labels[n:]

data_tokens = None
data_labels = None
gc.collect()

train_tokens, train_labels = fix_label_disbalance(train_tokens, train_labels)
# valid_tokens, valid_labels = fix_label_disbalance(valid_tokens, valid_labels)

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

import nn_module    
reload(nn_module)
from nn_module import BiLSTM_CRF

# fit lstm
model = BiLSTM_CRF(
    embedding_params={
        'emb_size': 300,
        'ffwd_size': 3,
        'res_size': 100
    },
    hidden_size=128,
    nclasses=15,
    device=device,
).to(device)

# checkpoint = torch.load('saved_models/bi_lstm.pt')
# model.load_state_dict(checkpoint['model_state_dict'])
# model.optim.load_state_dict(checkpoint['optimizer_state_dict'])

model.fit(
    train_X=train_tokens,
    train_Y=train_labels,
    valid_X=valid_tokens,
    valid_Y=valid_labels,
    nepochs=10,
    lr=1e-3,
    device=device
)

  0%|          | 0/10 [00:00<?, ?it/s]

Fscore micro 0.970
Fscore micro 0.969
Fscore micro 0.966
Fscore micro 0.965
Fscore micro 0.970


In [28]:
with torch.no_grad():
    predict_valid = []
    real = []
    for batch_X, batch_Y in zip(valid_tokens, valid_labels):

        predict = torch.argmax(model.forward(batch_X.to(device)), dim=1).cpu()
        # mask = (predict != 0) | (batch_Y != 0)
        predict_valid.append(predict)
        real.append(batch_Y)

    predict_valid = torch.cat(predict_valid)
    real = torch.cat(real)

print(f'Fscore micro {f1_score(real, predict_valid, average="micro"):.3f}')

Fscore micro 1.000


In [29]:
predict_df = pd.DataFrame([num2label[it.item()] for it in predict_valid], columns=['predict'])
predict_df[['prefix', 'predict']] = predict_df.predict.str.split('-', expand=True)
predict_df.loc[predict_df['predict'].isna(), 'predict'] = 'O'
predict_df['label'] = [num2label[it.item()] for it in real]

In [30]:
cut_df = predict_df.loc[((predict_df.predict != 'O') | (predict_df.label != 'O'))].reset_index(drop=True)
cut_df['predict'] = cut_df['prefix'] + '-' + cut_df['predict']
print(f'BA: {balanced_accuracy_score(cut_df.label, cut_df.predict):.3}')
print(f'F1 score: {f1_score(cut_df.label, cut_df.predict, average="micro"):.3}')

BA: 0.886
F1 score: 0.97




In [None]:
BA: 0.906
F1 score: 0.972

In [9]:
for i in range(1, predict_df.shape[0]):
    if (predict_df.at[i-1, 'predict'] != predict_df.at[i, 'predict']) & (predict_df.at[i, 'prefix'] == 'I'):
        predict_df['prefix'] = 'B'
    
    elif (predict_df.at[i-1, 'predict'] == predict_df.at[i, 'predict']) & (predict_df.at[i, 'prefix'] == 'B'):
        predict_df.at[i, 'prefix'] = 'I'
    
cut_df = predict_df.loc[((predict_df.predict != 'O') | (predict_df.label != 'O'))].reset_index(drop=True)
cut_df['predict'] = cut_df['prefix'] + '-' + cut_df['predict']
print(f'BA: {balanced_accuracy_score(cut_df.label, cut_df.predict):.3}')
print(f'F1 score: {f1_score(cut_df.label, cut_df.predict, average="micro"):.3}')

BA: 0.871
F1 score: 0.919




In [34]:
pd.DataFrame(confusion_matrix(real, predict_valid),
    columns=[
        'O',
        'B-NAME_STUDENT',
        'I-NAME_STUDENT',
        'B-STREET_ADDRESS',
        'I-STREET_ADDRESS',
        'B-USERNAME',
        'B-ID_NUM',
        'I-ID_NUM',
        'B-URL_PERSONAL',
        'B-EMAIL',
        'B-PHONE_NUM',
        'I-PHONE_NUM',
    ]            
)

Unnamed: 0,O,B-NAME_STUDENT,I-NAME_STUDENT,B-STREET_ADDRESS,I-STREET_ADDRESS,B-USERNAME,B-ID_NUM,I-ID_NUM,B-URL_PERSONAL,B-EMAIL,B-PHONE_NUM,I-PHONE_NUM
0,11390391,310,288,0,0,81,0,0,63,30,0,208
1,263,32704,18,0,0,0,0,0,0,0,0,0
2,408,272,41183,0,0,82,0,0,0,0,0,0
3,0,0,0,13616,0,0,0,0,0,0,0,0
4,0,0,0,0,116460,0,0,0,0,0,0,0
5,139,0,0,0,0,13939,0,0,0,0,0,0
6,42,0,0,0,0,104,14115,0,0,0,46,0
7,274,0,0,0,0,0,104,10195,0,0,0,0
8,28,0,0,0,0,0,0,0,18918,0,0,0
9,0,0,0,0,0,0,0,0,0,13931,0,0


In [37]:
predict_valid_str = [num2label[it.item()] for it in predict_valid]

In [None]:
predict_valid_str

In [18]:
predict_valid_sh, real_sh = predict_valid[predict_valid != real], real[predict_valid != real]

tensor([[0, 8],
        [1, 2],
        [0, 2],
        ...,
        [0, 5],
        [0, 2],
        [0, 7]])

In [21]:
df = torch.cat([predict_valid.unsqueeze(1), real.unsqueeze(1)], dim=1)
df = (pd.DataFrame(df, columns=['predict', 'real'])
    .assign(predict = lambda _df: _df.predict.apply(lambda x: num2label[x]))
    .assign(real = lambda _df: _df.real.apply(lambda x: num2label[x]))
)

df['Falses'] = df.predict.apply(str) + '-' + df.real.apply(str)
df.groupby('Falses', as_index=False).predict.count().sort_values('predict', ascending=False).reset_index(drop=True)

Unnamed: 0,Falses,predict
0,O-I-NAME_STUDENT,408
1,B-NAME_STUDENT-O,310
2,I-NAME_STUDENT-O,288
3,O-I-ID_NUM,274
4,B-NAME_STUDENT-I-NAME_STUDENT,272
5,O-B-NAME_STUDENT,263
6,I-PHONE_NUM-O,208
7,O-B-USERNAME,139
8,B-ID_NUM-I-ID_NUM,104
9,B-USERNAME-B-ID_NUM,104


In [22]:
df

Unnamed: 0,predict,real,Falses
0,O,I-ID_NUM,O-I-ID_NUM
1,B-NAME_STUDENT,I-NAME_STUDENT,B-NAME_STUDENT-I-NAME_STUDENT
2,O,I-NAME_STUDENT,O-I-NAME_STUDENT
3,B-USERNAME,I-NAME_STUDENT,B-USERNAME-I-NAME_STUDENT
4,O,I-NAME_STUDENT,O-I-NAME_STUDENT
...,...,...,...
2781,B-NAME_STUDENT,O,B-NAME_STUDENT-O
2782,O,B-USERNAME,O-B-USERNAME
2783,O,B-USERNAME,O-B-USERNAME
2784,O,I-NAME_STUDENT,O-I-NAME_STUDENT


In [27]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': model.optim.state_dict(),
    }, 'saved_models/bi_lstm.pt')

In [9]:
token2emb = []
train_tokens = []
train_labels = []
test_tokens = []
test_labels = []

import gc
gc.collect()

5715

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BiLSTM_CRF(
    embedding_dim=300,
    hidden_size=32,
    nclasses=15,
    device=device,
)

checkpoint = torch.load('saved_models/bi_lstm.pt')
model.load_state_dict(checkpoint['model_state_dict'])
model.optim.load_state_dict(checkpoint['optimizer_state_dict'])
model.eval()
model.to(device)
print()




In [5]:
token2emb = {}
with open('wiki-news-300d-1M.vec', 'r', encoding='utf-8') as f:
    next(f)
    for it in tqdm(f):
        row = it.split(' ')
        token2emb[row[0]] = list(map(float, row[1:]))

0it [00:00, ?it/s]

In [6]:
def check_name(x):
    flag = False
    flag = x[0].isupper()
    for it in x[1:]:
        flag *= it.islower()
    return flag

def get_predict_table(data, token2emb, num2label):
    predict_table = [[], [], [], [], []] # document, token_idx, token, predict, real
    for i in tqdm(range(len(data))):
        with torch.no_grad():
            batch_X = []

            for token in data[i]['tokens']:
                emb_tokens = torch.zeros(300)
                if token in token2emb:
                    emb_tokens = torch.FloatTensor(token2emb[token])
                else:
                    for it in token:
                        emb_tokens += torch.FloatTensor(token2emb[it] if it in token2emb else [0]*300)
                batch_X.append(emb_tokens.unsqueeze(0))

            batch_X = torch.cat(batch_X)

            predict =  torch.argmax(model.forward(batch_X.to(device)), dim=1).cpu()
            predict = [num2label[it.item()] for it in predict]
            predict = ['O' if len(set(x)) < 2 else y for x, y in zip(data[i]['tokens'], predict)]
            predict_table[0] += [data[i]['document']]*len(predict)
            predict_table[1] += list(range(len(predict)))
            predict_table[2] += data[i]['tokens']
            predict_table[3] += predict
            predict_table[4] += data[i]['labels'] 
        
    predict_table = [[predict_table[0][i], predict_table[1][i], predict_table[2][i], predict_table[3][i], predict_table[4][i]] for i in range(len(predict_table[0]))]
    predict_table = pd.DataFrame(predict_table, columns=['document', 'token_i', 'token', 'predict', 'label'])
    predict_table = predict_table.loc[(predict_table.label != 'O') & (predict_table.token != "\n") & (predict_table.token != "\n\n") & (predict_table.token != "\t")].reset_index(drop=True)

    return predict_table.sort_values('document').reset_index(drop=True)

In [7]:
import gc
with open('data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

predict_table = get_predict_table(data, token2emb, num2label)
gc.collect()

  0%|          | 0/6807 [00:00<?, ?it/s]

0

In [8]:
label = predict_table['label'].str.slice(2).values 
predict = predict_table['predict'].str.slice(2).values 
print(f'F1 score: {f1_score(label, predict, average="micro")}')

F1 score: 0.9459261965655827


In [12]:
label = predict_table['label'].values 
predict = predict_table['predict'].values 
print(f'F1 score: {f1_score(label, predict, average="micro")}')

F1 score: 0.9444647424187066
