In [1]:
from statistics import mean, stdev
from transformers import BertTokenizer
from tqdm.notebook import tqdm
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt
# from datasets.reader import read_conll
from datasets.conll import ConllBertDataset



# Load data

In [2]:
manga_path = '../../data/NER/processed/comments/augmented_10/'

In [3]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-conversational')
train_ds = ConllBertDataset.from_file(manga_path + 'train.txt', tokenizer)
test_ds = ConllBertDataset.from_file(manga_path + 'test.txt', tokenizer)

HBox(children=(FloatProgress(value=0.0, max=4206.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1051.0), HTML(value='')))




# Create model

In [4]:
import torch
import torch.nn as nn

from models import BertLstm
from sklearn.metrics import roc_auc_score

In [5]:
acc = lambda preds, labels: ((preds.argmax(dim=1) == labels).sum() / labels.size(0)).item()
sigmoid = nn.Sigmoid()

def auc(preds, labels):
    assert all(map(lambda preds: preds.size(1) == 2, preds))
    to_probs = lambda preds: sigmoid(preds)[:, 0].cpu().detach().numpy()
    to_labels = lambda labels: (1 - labels).cpu().detach().numpy()
    
    preds = list(map(to_probs, preds))
    labels = list(map(to_labels, labels))
    
    preds = [item for subl in preds for item in subl]
    labels = [item for subl in labels for item in subl]
    
    auc = roc_auc_score(labels, preds)
    return auc

# Train

In [6]:
device = torch.device('cuda')
labels_n = max(train_ds.labels_n, test_ds.labels_n)

simple_model = BertLstm(labels_n).to(device)
opt = torch.optim.Adam(simple_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

writer = SummaryWriter()#'runs/16_epoches_30_augs_uncased')

In [8]:
loss_interval = 50

for e in range(10):
    print('Epoch', e+1) 
    
    losses = []
    probs_epoch = []
    labels_epoch = []
    opt.zero_grad()
    for i in tqdm(range(1, len(train_ds))):
        if i % loss_interval == 0:
            opt.step()
            opt.zero_grad()
        words, labels = train_ds[i]
        words = torch.LongTensor(words).unsqueeze(0).to(device)
        labels = torch.LongTensor(list(map(lambda label: train_ds.ne2ix[label], labels))).to(device)

        preds = simple_model(words)
        loss = criterion(preds, labels)
        loss.backward()
        
        step = e*len(train_ds)+i
        writer.add_scalar('Loss/Train', loss.item(), step)
        writer.add_scalar('Accuracy/Train', acc(preds, labels), step)
        probs_epoch.append(preds)
        labels_epoch.append(labels)
    writer.add_scalar('AUC_epoch/Train', auc(probs_epoch, labels_epoch), e) 
            
    losses = []
    probs_epoch = []
    labels_epoch = []
    with torch.no_grad():
        for i in tqdm(range(1, len(test_ds))):
            words, labels = train_ds[i]
            words = torch.LongTensor(words).unsqueeze(0).to(device)
            labels = torch.LongTensor(list(map(lambda label: test_ds.ne2ix[label], labels))).to(device)

            preds = simple_model(words)
            loss = criterion(preds, labels)
            step = e*len(test_ds)+i
            writer.add_scalar('Loss/Test', loss.item(), step)
            writer.add_scalar('Accuracy/Test', acc(preds, labels), step)
            probs_epoch.append(preds)
            labels_epoch.append(labels)
    writer.add_scalar('AUC_epoch/Test', auc(probs_epoch, labels_epoch), e) 
    

Epoch 1


HBox(children=(FloatProgress(value=0.0, max=4205.0), HTML(value='')))

ValueError: Expected input batch_size (13) to match target batch_size (11).

In [None]:
simple_model.save('./weights/model.pt')