In [1]:
import json
from importlib import reload

import torch
import numpy as np
import pandas as pd 
from tqdm.auto import tqdm
from sklearn.metrics import balanced_accuracy_score

from lstm_model import BiLSTM

In [2]:
def data_label_split(data, label, train_size=0.7):
    randidx = np.arange(len(data))
    data_train, data_test = train_test_split(data, randidx, train_size)
    label_train, label_test = train_test_split(label, randidx, train_size)

    return data_train, data_test, label_train, label_test

def train_test_split(data, randidx, train_size):
    N = len(data)
    return [data[i] for i in randidx[:int(train_size*N)]], [data[i] for i in randidx[int(train_size*N):]]

def shuffle_data_label_lists(data, label):
    randidx = np.arange(len(data))
    np.random.shuffle(randidx)
    return [data[i] for i in randidx], [label[i] for i in randidx]

In [3]:
import torch 
from torch import nn 
from tqdm.auto import tqdm 

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, nclasses, device) -> None:
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim).to(device)
        self.lstm_model = nn.LSTM(embedding_dim, hidden_size//2, bidirectional=True).to(device)
        self.ffwd_lay = nn.Linear(hidden_size, nclasses).to(device)
        self.softmax = nn.Softmax(dim=2).to(device)

        self.optim = torch.optim.Adam(self.parameters(), lr=1e-2)
        self.criterion = nn.CrossEntropyLoss()


    def forward(self, batch):
        out = self.embedding(batch) # L x vocab_size -> L x embedding_dim
        out = self.lstm_model(out)[0] # L x hidden_size
        out = self.ffwd_lay(out) # L x nclasses
        return self.softmax(out)


    def fit(self, train_data, nepochs, lr, device):
        self.train()
        self.to(device)

        for g in self.optim.param_groups:
            g['lr'] = lr    
        
        for ep in tqdm(range(nepochs)):
            eploss = 0

            for batch in train_data:
                batch_X, batch_Y = batch[:,:,0], batch[:,:,1]
                predict = self.forward(batch_X.to(device))
                
                self.optim.zero_grad()
                loss = self.criterion(predict.swapaxes(1,2), batch_Y.to(device))
                loss.backward()
                self.optim.step()

                eploss += loss.item()
            
            printbool = ep % (nepochs//10) == 0 if nepochs > 10 else True
            if printbool:
                print(f'Train loss: {eploss/len(train_data):.3f}')

In [21]:
# encoding tokens and labels
with open('data/mixtral-8x7b-v1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

unique_tokens, unique_labels = set(), set()
for doc_i, doc in enumerate(tqdm(data)):
    unique_tokens |= set(np.unique(doc['tokens']))
    unique_labels |= set(np.unique(doc['labels']))

token2num = dict(zip(unique_tokens, range(1, len(unique_tokens)+1)))
label2num = {
    'O': 0,
    'B-URL_PERSONAL': 0, 
    'I-URL_PERSONAL': 0, 
    'B-ID_NUM': 0, 
    'I-ID_NUM': 0, 
    'B-EMAIL': 0, 
    'I-EMAIL': 0,
    'B-NAME_STUDENT': 1, 
    'I-NAME_STUDENT': 1, 
    'B-PHONE_NUM': 0, 
    'I-PHONE_NUM': 0, 
    'B-USERNAME': 0,
    'I-USERNAME': 0, 
    'B-STREET_ADDRESS': 0, 
    'I-STREET_ADDRESS': 0, 
}
num2token = {}
for it in token2num:
    num2token[token2num[it]] = it


# load data and split by sentences
sentences = []
cur_sentence = []
sentences_labels = []
cur_sentences_labels = []

max_len = 200
for doc_i, doc in enumerate(tqdm(data)):
    for token, label in zip(data[doc_i]['tokens'], data[doc_i]['labels']):
        cur_sentence.append(token2num[token])
        cur_sentences_labels.append(label2num[label])

        if (token == '.') | (token.endswith('\n')) | (token == '?') | (token == '!'):   
            # if sum(cur_sentences_labels) > 0:
            if len(cur_sentence) < max_len:
                sentences.append(torch.LongTensor(cur_sentence))
                sentences_labels.append(torch.LongTensor(cur_sentences_labels))

            cur_sentences_labels = []
            cur_sentence = []
    
    if sum(cur_sentences_labels) > 0:
        sentences.append(cur_sentence)
        sentences_labels.append(cur_sentences_labels)

    cur_sentences_labels = []
    cur_sentence = []


# create train and test df 
name_sentences_labels = []
name_sentences = []

username_sentences_labels = []
username_sentences = []

o_sentences_labels = []
o_sentences = []

for i, it in enumerate(sentences):
    if 1 in sentences_labels[i]:
        name_sentences_labels.append(sentences_labels[i])
        name_sentences.append(sentences[i])
    if 2 in sentences_labels[i]:
        username_sentences.append(sentences[i])
        username_sentences_labels.append(sentences_labels[i])
    else:
        o_sentences.append(sentences[i])
        o_sentences_labels.append(sentences_labels[i])

  0%|          | 0/2355 [00:00<?, ?it/s]

  0%|          | 0/2355 [00:00<?, ?it/s]

In [29]:
from torch.nn.utils.rnn import pack_padded_sequence
import torch.nn.functional as F

In [37]:
seq_len = list(map(len, o_sentences))
max_len = max(seq_len)
o_sentences_ten = torch.cat([F.pad(o_sentences[i], (0, max_len-seq_len[i])).reshape(1,-1) for i in range(len(o_sentences))])
o_sentences_ten = pack_padded_sequence(o_sentences_ten, seq_len, batch_first=True, enforce_sorted=False)

In [5]:
name_sentences_train, name_sentences_test, name_sentences_labels_train, name_sentences_labels_test = data_label_split(name_sentences, name_sentences_labels, train_size=0.8)
user_sentences_train, user_sentences_test, user_sentences_labels_train, user_sentences_labels_test = data_label_split(username_sentences, username_sentences_labels, train_size=0.8)
o_sentences_train, o_sentences_test, o_sentences_labels_train, o_sentences_labels_test = data_label_split(o_sentences, o_sentences_labels, train_size=0.8)

# sentences_train = o_sentences_train + name_sentences_train*280 + user_sentences_train*40_000
# sentences_labels_train = o_sentences_labels_train + name_sentences_labels_train*280 + user_sentences_labels_train*40_000

# sentences_test = o_sentences_test + name_sentences_test*280 + user_sentences_test*40_000
# sentences_labels_test = o_sentences_labels_test + name_sentences_labels_test*280 + user_sentences_labels_test*40_000

sentences_train = o_sentences_train + name_sentences_train*280
sentences_labels_train = o_sentences_labels_train + name_sentences_labels_train*280

sentences_test = o_sentences_test + name_sentences_test*280
sentences_labels_test = o_sentences_labels_test + name_sentences_labels_test*280

sentences_train, sentences_labels_train = shuffle_data_label_lists(sentences_train, sentences_labels_train)
sentences_test, sentences_labels_test = shuffle_data_label_lists(sentences_test, sentences_labels_test)

In [6]:
sentences_train = torch.cat(sentences_train, dim=0)
sentences_labels_train = torch.cat(sentences_labels_train, dim=0)

sentences_test = torch.cat(sentences_test, dim=0)
sentences_labels_test = torch.cat(sentences_labels_test, dim=0)

In [11]:
from torch.utils.data import DataLoader

train_data = torch.cat((sentences_train.unsqueeze(2), sentences_labels_train.unsqueeze(2)), dim=2)
train_data = DataLoader(train_data, batch_size=100, shuffle=True)

test_data = torch.cat((sentences_test.unsqueeze(2), sentences_labels_test.unsqueeze(2)), dim=2)
test_data = DataLoader(test_data, batch_size=100, shuffle=True)

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fit lstm
model = BiLSTM(
    vocab_size=len(token2num)+1,
    embedding_dim=32,
    hidden_size=16,
    nclasses=2,
    device=device
)

In [13]:
# fit bi-lstm
model.fit(
    train_data=train_data,
    nepochs=10,
    lr=1e-2,
    device=device
)

  0%|          | 0/10 [00:00<?, ?it/s]

Train loss: 0.314
Train loss: 0.313
Train loss: 0.313


KeyboardInterrupt: 

In [14]:
# get train test metrics
predict_train_label = []
train_label = []
with torch.no_grad():
    for batch in train_data:
        batch_X, batch_Y = batch[:,:,0], batch[:,:,1]
        predict = torch.argmax(model.forward(batch_X.to(device)).cpu(), dim=2).reshape(-1)
        predict_train_label.append(predict)
        train_label.append(batch_Y.reshape(-1))

train_label = torch.cat(train_label)
predict_train_label = torch.cat(predict_train_label)
print(f'Train BA: {balanced_accuracy_score(train_label, predict_train_label):.3f}')

predict_test_label = []
test_label = []
with torch.no_grad():
    for batch in test_data:
        batch_X, batch_Y = batch[:,:,0], batch[:,:,1]
        predict = torch.argmax(model.forward(batch_X.to(device)).cpu(), dim=2).reshape(-1)
        predict_test_label.append(predict)
        test_label.append(batch_Y.reshape(-1))

test_label = torch.cat(test_label)
predict_test_label = torch.cat(predict_test_label)
print(f'Test BA: {balanced_accuracy_score(test_label, predict_test_label):.3f}')

Train BA: 0.991
Test BA: 0.785


In [15]:
from sklearn.metrics import confusion_matrix
print('Train')
cfmatrix = confusion_matrix(train_label, predict_train_label)
print(cfmatrix)

print('Test')
cfmatrix = confusion_matrix(test_label, predict_test_label)
print(cfmatrix)

Train
[[102999686      2693]
 [     9470    546551]]
Test
[[25796045      235]
 [   58319    77201]]


In [69]:
# split all df on sentences
sentences_info = []
cur_sentences_info = []

sentences = []
cur_sentence = []
sentences_labels = []
cur_sentences_labels = []

for doc_i, doc in enumerate(tqdm(data)):
    sentence_document = data[doc_i]['document']
    for token_i, token, label in zip(range(len(data[doc_i]['tokens'])), data[doc_i]['tokens'], data[doc_i]['labels']):
        cur_sentence.append(token2num[token])
        cur_sentences_labels.append(label2num[label])
        cur_sentences_info.append([sentence_document, token_i])

        if (token == '.') | (token.endswith('\n')) | (token == '?') | (token == '!'):   
            # if sum(cur_sentences_labels) > 0:
            sentences_info.append(torch.LongTensor(cur_sentences_info))
            sentences.append(torch.LongTensor(cur_sentence))
            sentences_labels.append(torch.LongTensor(cur_sentences_labels))

            cur_sentences_info = []
            cur_sentences_labels = []
            cur_sentence = []
    
    if sum(cur_sentences_labels) > 0:
        sentences_info.append(torch.LongTensor(cur_sentences_info))
        sentences.append(torch.LongTensor(cur_sentence))
        sentences_labels.append(torch.LongTensor(cur_sentences_labels))

    cur_sentences_info = []
    cur_sentences_labels = []
    cur_sentence = []


# get bi-lstm-predict and create result table
results = []

for i in tqdm(range(len(sentences))):
    predict = torch.argmax(model.forward(sentences[i].unsqueeze(0).to(device)).cpu(), dim=2)[0]

    for j in range(1, len(predict)):
        if (predict[j-1] == 1) & (predict[j] == 1):
            predict[j] = 3
        elif ((predict[j-1] == 2) | (predict[j-1] == 4)) & (predict[j] == 2):
            predict[j] = 4
    
    if (predict > 0).sum() > 0:
        results.append(torch.cat((
            sentences_info[i][predict > 0],
            sentences[i][predict > 0].reshape(-1,1),
            predict[predict > 0].reshape(-1,1)
        ), dim=1))

  0%|          | 0/6807 [00:00<?, ?it/s]

  0%|          | 0/282905 [00:00<?, ?it/s]

In [70]:
results = torch.cat(results)
results = pd.DataFrame(results, columns=['document', 'token_i', 'token', 'label'])
results['token'] = results.token.apply(lambda x: num2token[x])

In [72]:
results.loc[results['label'] == 1]

Unnamed: 0,document,token_i,token,label
0,7,9,Nathalie,1
2,7,69,Éditions,1
3,7,482,Nathalie,1
5,7,741,Nathalie,1
7,10,0,Diego,1
...,...,...,...,...
12337,22679,236,purists,1
12338,22681,545,spikes,1
12339,22687,52,auditing,1
12340,22687,115,auditing,1


In [33]:
real_df = []
for doc_i, doc in enumerate(data):
    document_i = doc['document']
    labels = doc['labels']
    tokens = doc['tokens']
    for i, lab in enumerate(labels):
        if 'USERNAME' in lab:
            real_df.append([doc_i, document_i, i, tokens[i]])

In [34]:
real_df = pd.DataFrame(real_df, columns=['doc_i', 'document_i', 'row_i', 'token'])

In [36]:
for i in real_df['doc_i'].values:
    with open(f'text_{i}.txt', 'w', encoding='utf-8') as f:
        f.write(data[i]['full_text'])

In [37]:
real_df

Unnamed: 0,doc_i,document_i,row_i,token
0,114,3351,61,castanedagabriel
1,194,4462,4,fdixon
2,194,4462,582,fdixon
3,300,5716,1,meyermichelle
4,478,7786,623,jacob59
5,550,8642,8,holmespatrick
