In [2]:
import pandas as pd
from pathlib import Path
import regex as re
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from models.lstm.functions import encode, f_score

# The end of preprocessing

In [3]:
data_root = Path('../../data/NER/processed/')

In [4]:
data = pd.read_csv(data_root / 'tokens_labels_lstm.csv', sep = ';')

In [5]:
data.head()

Unnamed: 0,token,label
0,случившееся,0
1,шейд,1
2,бессолини,1
3,сможете,0
4,такакура,1


# Training

In [6]:
X = list(data['token'])
y = list(data['label'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
from torch.utils.data import DataLoader
from models.lstm.token_dataset import TokensDataset

In [9]:
train_ds = TokensDataset(X_train, y_train)
test_ds = TokensDataset(X_test, y_test)

In [14]:
batch_size = 1000
vocab_size = 33
max_len = len(max(data['token'], key=lambda i: len(i)))
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn = TokensDataset.collate_fn)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True, collate_fn = TokensDataset.collate_fn)
emb_dim = 16

In [15]:
from tensorboardX import SummaryWriter
writer = SummaryWriter()

In [20]:
from tqdm import tqdm 
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        
        print('\n\n')
        print('------- EPOCH', i, '--------' )
        
        train_prebs_history = []
        train_labels_history = []
        train_accuracy_history = []
        
        test_preds_history = []
        test_labels_history = []
        
        for batch_idx, (data, label) in tqdm(enumerate(train_dl), total = len(train_dl)):

            optimizer.zero_grad()
            
            preds = model.forward(data)
            
            loss = nn.CrossEntropyLoss()

            output = loss(preds, label)
            
            output.backward()
            optimizer.step()

            train_prebs_history.append(preds)
            train_labels_history.append(label)
            
        print('--------TRAIN----------')
        
        print(f_score(train_prebs_history, train_labels_history))
        train_accuracy = (preds.argmax(dim=1) == label).float().mean()
        writer.add_scalar('Accuracy/train', train_accuracy, i)
        writer.add_scalar('Loss/train', output, i)
        writer.add_scalar('F1_score/train', f_score(train_prebs_history, train_labels_history), i) 
        
        
        for batch_idx, (data, label) in tqdm(enumerate(test_dl), total = len(test_dl)):
            
            test_preds = model.forward(data.long())
            test_preds_history.append(test_preds)
            test_labels_history.append(label)
            
            test_loss = loss(test_preds, label)
        
        print('-----------TEST----------')
        print(f_score(test_preds_history, test_labels_history))
        test_accuracy = (test_preds.argmax(dim=1) == label).float().mean()
        writer.add_scalar('Accuracy/test', test_accuracy, i)
        writer.add_scalar('Loss/test', test_loss, i)
        writer.add_scalar('F1_score/test', f_score(test_preds_history, test_labels_history), i)
            

            

In [21]:
from models.lstm.model import LSTMFixedLen

In [22]:
model_fixed = LSTMFixedLen(vocab_size, emb_dim, 128, max_len)

In [23]:
train_model(model_fixed, epochs=30, lr=0.01)

  0%|                                                                                           | 0/53 [00:00<?, ?it/s]




------- EPOCH 0 --------


100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [00:12<00:00,  4.38it/s]
  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

--------TRAIN----------
0.7447467876039304


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00, 11.85it/s]
  0%|                                                                                           | 0/53 [00:00<?, ?it/s]

-----------TEST----------
0.8265858056056385



------- EPOCH 1 --------


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:12<00:00,  4.31it/s]


KeyboardInterrupt: 

In [None]:
model_fixed.save('./weights/model_lstm_fixed.pt')

In [16]:
loaded_model = LSTMFixedLen().load('./weights/model_lstm_fixed.pt')

In [18]:
loaded_model.prediction('юки')

tensor([[0.0549, 0.0859]], grad_fn=<AddmmBackward>)