In [1]:
!pip install nekograd nerus deli razdel > pip_log.txt
from nekograd.model import CoreModel
import numpy as np
import torch
import pandas as pd
from deli.interface import load_json, save_json
from nerus import load_nerus
from tqdm import tqdm
from pathlib import Path
from typing import Tuple
from torch.utils.data import random_split, DataLoader
import razdel

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.79.0 which is incompatible.
tensorflow 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.1 which is incompatible.
tensorflow-serving-api 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.1 which is incompatible.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.
grpc-google-iam-v1 0.12.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.
googleapis-common-protos 1.58.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.
google-cloud

In [2]:
NERUS_PATH = Path("/kaggle/input/gzip-nerus/nerus_lenta.conllu.gz")
NERUS_IDS = load_json(NERUS_PATH.parent / "nerus_ids.json")
dataset = load_nerus(NERUS_PATH)

In [3]:
len(NERUS_IDS)

739346

In [4]:
N_DOCS = 10000
dataset_sents = []
dataset_tags = []
ctr = 0
for doc in tqdm(load_nerus(NERUS_PATH), total=N_DOCS):
    if ctr == N_DOCS:
        break
    for sent in doc.sents:   
        dataset_sents.append(sent.text)
        dataset_tags.append([x.pos for x in sent.tokens])
    ctr += 1
train_sents, train_tags = dataset_sents[:int(len(dataset_sents) * 0.75)], dataset_tags[:int(len(dataset_sents) * 0.75)]
val_sents, val_tags = dataset_sents[int(len(dataset_sents) * 0.75):int(len(dataset_sents) * 0.85)], dataset_tags[int(len(dataset_sents) * 0.75):int(len(dataset_sents) * 0.85)]
test_sents, test_tags = dataset_sents[int(len(dataset_sents) * 0.85):], dataset_tags[int(len(dataset_sents) * 0.85):]
print('Train size:', len(train_sents))
print('Val size:', len(val_sents))
print('Test size:', len(test_sents))

100%|██████████| 10000/10000 [00:23<00:00, 417.44it/s]

Train size: 89448
Val size: 11927
Test size: 17890





In [5]:
def build_vocabulary(sents, tokenizer, drop_alpha=0):

    token_ctr = {}
    for sent in tqdm(train_sents):
        for token in tokenizer(sent):
            token_ctr[token] = token_ctr.get(token, 0) + 1

    most_rare_tokens = list({k: v for k, v in sorted(token_ctr.items(), key=lambda item: item[1])}.keys())
    most_rare_tokens = most_rare_tokens[:int(len(most_rare_tokens) * drop_alpha)] # drop_alpha% самых редких слов отбрасываем, чтобы иметь возможность обучаться с [UNK]
    
    token_to_idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3}
    for sent in tqdm(train_sents):
        for token in tokenizer(sent):
            if token not in most_rare_tokens:
                token_to_idx[token] = token_to_idx.get(token, token_to_idx.__len__())
                
    return token_to_idx

In [6]:
from typing import List
class RazdelTokenizer:
    def __call__(self, sentence: str):
        return [w.text for w in razdel.tokenize(sentence)]
    def tokenize_corpus(self, corpus: List[str]):
        return [self(sentence) for sentence in corpus]

In [7]:
token_to_idx = build_vocabulary(train_sents, RazdelTokenizer(), 0.05)
tag_to_idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2}
for tags in train_tags:
    for tag in tags:
        tag_to_idx[tag] = tag_to_idx.get(tag, tag_to_idx.__len__())
print('WordToken number:', len(token_to_idx))
print('TagToken number:', len(tag_to_idx))

100%|██████████| 89448/89448 [00:14<00:00, 5971.14it/s]
100%|██████████| 89448/89448 [03:32<00:00, 421.47it/s]


WordToken number: 119919
TagToken number: 20


In [8]:
class Tokenizer(object):
    def __init__(self, token_to_idx, tag_to_idx, tokenizer, max_length=20):
        self.token_to_idx = token_to_idx
        self.tag_to_idx = tag_to_idx
        self.idx_to_tag = {v: k for k, v in tag_to_idx.items()}
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def encode_sent(self, sent):
        tokenized_sent = self.tokenizer(sent)
        if len(tokenized_sent) < self.max_length:
            tokenized_sent = ['[CLS]']+tokenized_sent+['[SEP]'] + ['[PAD]']*(self.max_length-len(tokenized_sent))
        else:
            tokenized_sent = ['[CLS]']+tokenized_sent[:self.max_length]+['[SEP]'] 
        indexed_sent = [self.token_to_idx.get(t, self.token_to_idx['[UNK]']) for t in tokenized_sent]
        return indexed_sent
    
    def encode_tags(self, tags):
        if len(tags) < self.max_length:
            tags = ['[CLS]']+tags+['[SEP]'] + ['[PAD]']*(self.max_length-len(tags))
        else:
            tags = ['[CLS]']+tags[:self.max_length]+['[SEP]'] 
        indexed_tags = [self.tag_to_idx[t] for t in tags]
        return indexed_tags
    
    def decode_tags(self, idxs):
        tags = [self.idx_to_tag[idx] for idx in idxs]
        return tags


In [9]:
class NERUSDataset(torch.utils.data.Dataset):
    def __init__(self, sents, tag_sents, tokenizer):
        assert isinstance(sents[0], str) and isinstance(tag_sents[0], list)
        assert len(sents) == len(tag_sents)
        
        self.X = torch.tensor([tokenizer.encode_sent(sent) for sent in sents])
        self.y = torch.tensor([tokenizer.encode_tags(tags) for tags in tag_sents])

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return x, y

In [10]:
import pytorch_lightning as pl
class NERUSDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer, batch_size: int = 128):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
    def prepare_data(self):
        pass
    def setup(self, stage: str):
        if stage == "fit":
            self.train_dataset = NERUSDataset(train_sents, train_tags, self.tokenizer)
            self.val_dataset = NERUSDataset(val_sents, val_tags, self.tokenizer)
        elif stage == "test":
            self.test_dataset = NERUSDataset(test_sents, test_tags, self.tokenizer)
        else:
            raise ValueError(f"Unknown stage: {stage}")
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, self.batch_size, shuffle=True)
    def val_dataloader(self):
        return  DataLoader(self.val_dataset, self.batch_size // 2, shuffle=False)
    def test_dataloader(self):
        return DataLoader(self.test_dataset, self.batch_size // 2, shuffle=False)

In [11]:
import torch
class POSPredictor(torch.nn.Module):    
    def __init__(self, tokenizer, vocab_dim, output_dim, emb_dim=10, hidden_dim=10, 
                 num_layers=1, bidirectional=False, p=0.7):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)
        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, 
                                     bidirectional=bidirectional, 
                                     batch_first=True, dropout=p)
        self.linear = torch.nn.Linear((bidirectional + 1)*hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(p)
        self.output_dim = output_dim
        
    def forward(self, x):
        x = self.dropout(self.embedding(x))
        out, (h, c) = self.lstm(x)  

        return self.linear(out).transpose(1, 2)

In [12]:
tokenizer = Tokenizer(token_to_idx, tag_to_idx, RazdelTokenizer(), 32)
datamodule = NERUSDataModule(tokenizer, 512)

In [13]:
config = dict()
config['vocab_dim'] = len(token_to_idx)
config['output_dim'] = len(tag_to_idx)
config['emb_dim'] = 64
config['hidden_dim'] = 128
config['num_layers'] = 1
config['bidirectional'] = False
config['p'] = 0.4

In [14]:
from functools import wraps
def to_tensor(metric):
    @wraps(metric)
    def wrapper(y, x, *args, **kwargs):
        return metric(*map(torch.from_numpy, (y, x)), *args, **kwargs)
    return wrapper

from nekograd.metrics.utils import swap_args, argmax
from cytoolz.functoolz import compose

import torchmetrics.functional as FM

metric_decorator = compose(argmax(1), swap_args, to_tensor)

metrics = {"accuracy": lambda y, x: metric_decorator(FM.accuracy)(y, x, num_classes=config['output_dim'],
                                                                  ignore_index=token_to_idx['[PAD]'], task="multiclass")}

criterion = lambda x, y: torch.nn.functional.cross_entropy(x, y, ignore_index=token_to_idx['[PAD]'])

In [15]:
from nekograd.model import CoreModel

class Model(CoreModel):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), 1e-3)
        lr_scheduler = {
            "scheduler": torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 1),
            "name": "lr_scheduler",
            "interval": "epoch",
        }

        return [optimizer], [lr_scheduler]

In [16]:
architecture = POSPredictor(tokenizer, **config)

model = Model(architecture, criterion, metrics)

trainer = pl.Trainer(max_epochs=10)

trainer.fit(model, datamodule=datamodule)
print(trainer.test(model, datamodule=datamodule))

  "num_layers={}".format(dropout, num_layers))


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Testing: 0it [00:00, ?it/s]

[{'test/accuracy': 0.9281967282295227}]
