In [27]:
import pandas as pd
from pathlib import Path
import regex as re
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from models.lstm.functions import encode, f_score

# The end of preprocessing

In [28]:
data_root = Path('../../data/NER/processed/')

In [29]:
data = pd.read_csv(data_root / 'tokens_labels_lstm.csv', sep = ';')

In [30]:
data.head()

Unnamed: 0,token,label
0,молох,1
1,сознаю,0
2,ясукава,1
3,материалами,0
4,рокуджоджи,1


In [31]:
max_len = len(max(data['token'], key=lambda i: len(i)))
data['encoded'] = data['token'].apply(lambda x: np.array(encode(x, max_len)))
data.head()

Unnamed: 0,token,label,encoded
0,молох,1,"[13, 15, 12, 15, 22, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,сознаю,0,"[18, 15, 8, 14, 1, 31, 1, 1, 1, 1, 1, 1, 1, 1]"
2,ясукава,1,"[32, 18, 20, 11, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1]"
3,материалами,0,"[13, 1, 19, 6, 17, 9, 1, 12, 1, 13, 9, 1, 1, 1]"
4,рокуджоджи,1,"[17, 15, 11, 20, 5, 7, 15, 5, 7, 9, 1, 1, 1, 1]"


# Training

In [32]:
X = list(data['encoded'])
y = list(data['label'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [33]:
from torch.utils.data import DataLoader
from models.lstm.token_dataset import TokensDataset

In [34]:
train_ds = TokensDataset(X_train, y_train)
test_ds = TokensDataset(X_test, y_test)

In [35]:
batch_size = 1000
vocab_size = 33
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True)
emb_dim = 16

In [36]:
# import numpy as np
# import sklearn
# from sklearn.metrics import precision_recall_curve

# sigmoid = nn.Sigmoid()

# def f_score(preds, labels):
#     preds2 = []
#     assert all(map(lambda preds: preds.size(1) == 2, preds))
#     to_probs = lambda preds: sigmoid(preds)[:, 0].cpu().detach().numpy()
#     to_labels = lambda labels: (1 - labels).cpu().detach().numpy()
    
#     preds = list(map(to_probs, preds))
#     labels = list(map(to_labels, labels))
    
#     preds = [item for subl in preds for item in subl]
#     labels = [item for subl in labels for item in subl]
#     precision, recall, thresholds = precision_recall_curve(labels, preds)
#     fscore = (2 * precision * recall) / (precision + recall)
#     ix = np.argmax(fscore)
#     return fscore[ix]

In [37]:
from tensorboardX import SummaryWriter
writer = SummaryWriter()

In [38]:
from tqdm import tqdm 
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        
        print('\n\n')
        print('------- EPOCH', i, '--------' )
        
        train_prebs_history = []
        train_labels_history = []
        train_accuracy_history = []
        
        test_preds_history = []
        test_labels_history = []
        
        for batch_idx, (data, label) in tqdm(enumerate(train_dl), total = len(train_dl)):

            optimizer.zero_grad()
            
            data = data.long()
            
            preds = model.forward(data)
            
            loss = nn.CrossEntropyLoss()

            output = loss(preds, label)
            
            output.backward()
            optimizer.step()

            train_prebs_history.append(preds)
            train_labels_history.append(label)
            
        print('--------TRAIN----------')
        print(f_score(train_prebs_history, train_labels_history))
        train_accuracy = (preds.argmax(dim=1) == label).float().mean()
        writer.add_scalar('Accuracy/train', train_accuracy, i)
        writer.add_scalar('Loss/train', output, i)
        writer.add_scalar('F1_score/train', f_score(train_prebs_history, train_labels_history), i) 
        
        
        for batch_idx, (data, label) in tqdm(enumerate(test_dl), total = len(test_dl)):
            
            test_preds = model.forward(data.long())
            test_preds_history.append(test_preds)
            test_labels_history.append(label)
            
            test_loss = loss(test_preds, label)
        
        print('-----------TEST----------')
        print(f_score(test_preds_history, test_labels_history))
        test_accuracy = (test_preds.argmax(dim=1) == label).float().mean()
        writer.add_scalar('Accuracy/test', test_accuracy, i)
        writer.add_scalar('Loss/test', test_loss, i)
        writer.add_scalar('F1_score/test', f_score(test_preds_history, test_labels_history), i)
            

            

In [39]:
from models.lstm.model import LSTM_fixed_len

In [40]:
model_fixed = LSTM_fixed_len(vocab_size, emb_dim, 128)

In [41]:
train_model(model_fixed, epochs=30, lr=0.01)

  0%|                                                                                           | 0/53 [00:00<?, ?it/s]




------- EPOCH 0 --------


100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [00:12<00:00,  4.33it/s]
  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

--------TRAIN----------
0.7282757152739593


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00, 11.79it/s]
  0%|                                                                                           | 0/53 [00:00<?, ?it/s]

-----------TEST----------
0.826239438864977



------- EPOCH 1 --------


 25%|████████████████████                                                              | 13/53 [00:03<00:10,  3.70it/s]


KeyboardInterrupt: 

In [16]:
model_fixed.save('./weights/model_lstm_fixed.pt')

In [17]:
import os
def load_model(model_path):
    path = os.path.abspath(model_path)
    model = LSTM_fixed_len()
    model.load_state_dict(torch.load(path))
    model.eval()
    return model 

In [18]:
loaded_model = load_model('./weights/model_lstm_fixed.pt')

In [19]:
loaded_model.prediction('юки')

tensor([[-0.3767, -0.0162]], grad_fn=<AddmmBackward>)