In [1]:
import pandas as pd
from pathlib import Path
import regex as re
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from models.lstm.functions import encode, f_score

# The end of preprocessing

In [2]:
# Directory holding the processed NER data, given relative to the notebook's
# working directory (the kernel must be started from this notebook's folder).
data_root = Path('../../data/NER/processed/')

In [3]:
# Load the token/label table; the file is semicolon-separated.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, so the
# CSV apparently carries a saved index as its first column. Only 'token' and
# 'label' are used further down.
data = pd.read_csv(data_root / 'tokens_labels_lstm.csv', sep = ';')

In [4]:
# Quick look at the first rows: one token per row with an integer label.
data.head()

Unnamed: 0,token,label
0,ханц,1
1,саду,0
2,мономочи,1
3,распороть,0
4,баснословных,0


# Training

In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces the same train/test
# partition; without it every run trains and evaluates on different rows and
# the logged metrics are not comparable between runs.
SPLIT_SEED = 42

X = list(data['token'])
y = list(data['label'])

# Hold out 20% of the tokens for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [6]:
from torch.utils.data import DataLoader
from models.lstm.token_dataset import TokensDataset

In [7]:
# Wrap each split in the project's dataset type (train first, then test) so
# that a DataLoader can batch it.
train_ds, test_ds = (
    TokensDataset(tokens, labels)
    for tokens, labels in ((X_train, y_train), (X_test, y_test))
)

In [8]:
# --- hyper-parameters -------------------------------------------------------
batch_size = 1000
# NOTE(review): presumably the size of the character alphabet produced by
# `encode` (33 matches the Russian alphabet) -- confirm against the encoder.
vocab_size = 33
emb_dim = 16
# Length, in characters, of the longest token in the whole table.
max_len = max(len(token) for token in data['token'])

# --- loaders ----------------------------------------------------------------
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=TokensDataset.collate_fn)
# Evaluation does not benefit from shuffling; a fixed order keeps per-epoch
# test numbers comparable from one epoch (and one run) to the next.
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=TokensDataset.collate_fn)

In [9]:
from tensorboardX import SummaryWriter
# Notebook-level TensorBoard writer; train_model() logs its scalars through it.
# Created with default arguments, so the log location is the library default.
writer = SummaryWriter()

In [12]:
from tqdm import tqdm 
def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` with Adam and log per-epoch metrics to TensorBoard.

    Uses the notebook-level ``train_dl``, ``test_dl`` and ``writer`` objects
    and the imported ``f_score`` helper. After every epoch the scalars
    ``Accuracy``, ``Loss`` and ``F1_score`` are written under the ``/train``
    and ``/test`` tags. Loss and accuracy are averaged over every sample of
    the epoch rather than taken from the last batch.

    Args:
        model: module to optimise; only parameters with ``requires_grad`` set
            are handed to the optimiser.
        epochs: number of full passes over ``train_dl``.
        lr: learning rate passed to ``torch.optim.Adam``.

    Returns:
        None. The model is updated in place and is left in the same
        train/eval mode it had when the function was called.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    # One criterion for the whole run; it is stateless, so there is no reason
    # to rebuild it for every batch.
    criterion = nn.CrossEntropyLoss()

    was_training = model.training
    try:
        for epoch in range(epochs):
            print('\n\n')
            print('------- EPOCH', epoch, '--------')

            train_loss, train_accuracy, train_f1 = _epoch_pass(
                model, train_dl, criterion, optimizer
            )
            print('--------TRAIN----------')
            print(train_f1)
            writer.add_scalar('Accuracy/train', train_accuracy, epoch)
            writer.add_scalar('Loss/train', train_loss, epoch)
            writer.add_scalar('F1_score/train', train_f1, epoch)

            test_loss, test_accuracy, test_f1 = _epoch_pass(
                model, test_dl, criterion
            )
            print('-----------TEST----------')
            print(test_f1)
            writer.add_scalar('Accuracy/test', test_accuracy, epoch)
            writer.add_scalar('Loss/test', test_loss, epoch)
            writer.add_scalar('F1_score/test', test_f1, epoch)
    finally:
        # Do not leak the eval/train switch made by the last pass to the caller.
        model.train(was_training)


def _epoch_pass(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader``; weights are updated only if ``optimizer`` is given.

    Assumes the model returns class scores of shape (batch, n_classes) and
    that labels are class indices of shape (batch,) -- this is what the
    ``argmax(dim=1) == label`` comparison requires.

    Returns:
        Tuple ``(mean_loss, accuracy, f1)``. Mean loss and accuracy are
        weighted by batch size over the whole pass; ``f1`` is whatever
        ``f_score`` returns for the collected predictions and labels.
    """
    is_training = optimizer is not None
    # Switch layers such as dropout/batch-norm to the right mode for this pass.
    model.train(is_training)

    preds_history, labels_history = [], []
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    # No autograd bookkeeping is needed while evaluating.
    with torch.set_grad_enabled(is_training):
        for batch, label in tqdm(loader, total=len(loader)):
            if is_training:
                optimizer.zero_grad()

            # Same integer cast in both passes (a no-op for int64 input).
            preds = model(batch.long())
            batch_loss = criterion(preds, label)

            if is_training:
                batch_loss.backward()
                optimizer.step()

            # Keep only the values: storing the attached tensor would keep
            # every batch's autograd graph alive until the end of the epoch.
            preds = preds.detach()
            preds_history.append(preds)
            labels_history.append(label)

            n_batch = label.numel()
            loss_sum += batch_loss.item() * n_batch
            n_correct += (preds.argmax(dim=1) == label).sum().item()
            n_seen += n_batch

    denom = max(n_seen, 1)  # guard against an empty loader
    return loss_sum / denom, n_correct / denom, f_score(preds_history, labels_history)
            

            

In [9]:
from models.lstm.model import LSTMFixedLen

In [10]:
# Build the fixed-length LSTM classifier from the values configured above.
# NOTE(review): the third positional argument (128) is a bare literal --
# presumably the LSTM hidden size; confirm against models/lstm/model.py.
model_fixed = LSTMFixedLen(vocab_size, emb_dim, 128, max_len)

In [13]:
# Train for 30 epochs with a learning rate of 0.01.
# NOTE(review): the output recorded for this cell is
# "NameError: name 'train_model' is not defined", i.e. it was last executed in
# a kernel where the cell defining train_model had not been run. Re-run the
# notebook top to bottom before trusting the saved weights.
train_model(model_fixed, epochs=30, lr=0.01)

NameError: name 'train_model' is not defined

In [14]:
# Persist the trained model through the project's own save() method.
# NOTE(review): the path is relative to the working directory; whether save()
# creates ./weights/ when it is missing is not visible here -- confirm.
model_fixed.save('./weights/model_lstm_fixed.pt')

In [11]:
# Reload the saved model into a fresh object for the inference checks below.
# NOTE(review): the constructor is called without arguments here, whereas the
# training instance was built with four -- presumably the class has defaults
# or load() restores the configuration; confirm they match the saved weights.
loaded_model = LSTMFixedLen().load('./weights/model_lstm_fixed.pt')

In [12]:
# Edge-case probe: classify the empty string.
# NOTE(review): the recorded result is 1, the same value returned for a real
# name below -- worth checking how prediction() handles empty input.
loaded_model.prediction('')

1

In [13]:
# Sanity probe: classify a single token that is a name.
loaded_model.prediction('наруто')

1

In [21]:
# End-to-end probe: pull the tokens classified as names out of a sentence.
# NOTE(review): the recorded result also contains 'и' and 'лет', which are
# ordinary words, so the model produces false positives on this input.
loaded_model.extract_names('привет меня зовут наруто и мне много лет')

['наруто', 'и', 'лет']

In [22]:
# Scratch check: str.split(' ') yields a list of the whitespace-separated words.
# NOTE(review): exploratory cell; `a` is reused for a different type further down.
a = 'привет меня зовут майя'
type(a.split(' '))

list

In [24]:
import numpy
# Scratch check: a detached tensor converts to a NumPy array, and argmax on it
# gives the position of the largest element.
a = torch.tensor([1, 2]).detach().numpy()
numpy.argmax(a)

1