In [None]:
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import defaultdict
from sklearn.model_selection import train_test_split
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from pymystem3 import Mystem

In [None]:
# Input file paths, relative to the notebook's working directory.
# Presumably the unlabelled text to run inference on — TODO confirm.
DATASET_NAME = "data/dataset_40163_1.txt"
# Training text; tokenised line by line with read().
TRAIN_DATA_NAME = "data/train_sentences_extended.txt"
# Entity annotations; opened by read_labels().
NES_NAME = "data/train_nes_extended.txt"

In [None]:
# Shared Mystem instance used to lemmatise tokens.
stem = Mystem()

# Class index -> entity tag. Index 0 is not listed: it is the label given to
# positions that are not the start of an annotated entity.
i2t = dict(enumerate(("PERSON", "ORG"), start=1))
# Inverse mapping, entity tag -> class index.
t2i = {tag: index for index, tag in i2t.items()}

In [None]:
def read(name):
    """Read a UTF-8 text file and tokenise every line.

    Each line is split on single non-word characters with a capturing group,
    so the separators themselves (punctuation, whitespace, the trailing
    newline) are kept as tokens and empty strings appear between adjacent
    separators. Concatenating the tokens of a line reproduces the line.

    Args:
        name: Path of the file to read.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    splitter = re.compile(r'(\W)')

    with open(name, encoding='utf-8') as input_file:
        return [splitter.split(line) for line in input_file]

In [None]:
def read_labels(name=None):
    """Parse the entity annotation file into one label list per line.

    Every line holds whitespace-separated ``start length tag`` triples; the
    literal token ``EOL`` ends the line's annotations.

    Args:
        name: Path of the annotation file. Defaults to ``NES_NAME``.

    Returns:
        A list with one entry per line of the file. Each entry is a list of
        ``(start, length, tag)`` tuples (empty when the line has no entities),
        so the result lines up index-for-index with the lines it annotates.
        The previous version flattened all lines into a single list, which
        lost that alignment.
    """
    path = NES_NAME if name is None else name
    res = []

    with open(path, encoding='utf-8') as input_file:
        for line in input_file:
            entities = []
            start, length = None, None

            for w in line.split():
                if w == "EOL":
                    break

                # Fields arrive in the order start, length, tag.
                if start is None:
                    start = int(w)
                elif length is None:
                    length = int(w)
                else:
                    entities.append((start, length, w))
                    start, length = None, None

            # Appended even when empty so the line index is preserved.
            res.append(entities)

    return res


In [None]:
def build_voc(data):
    """Build the lemma -> index vocabulary from tokenised sentences.

    Args:
        data: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A ``defaultdict(int)`` mapping every distinct lemma key to a 1-based
        index in order of first appearance. Because of the ``int`` default
        factory, looking up a key that is not present yields 0.
    """
    voc = defaultdict(int)

    for s in data:
        for w in s:
            # Mystem.lemmatize returns a list of lemma strings; a list is
            # unhashable and cannot be used as a dict key, so it is joined
            # into one string (surrounding whitespace removed).
            key = "".join(stem.lemmatize(w.lower())).strip()

            # Membership test does not trigger the defaultdict factory.
            if key not in voc:
                voc[key] = len(voc) + 1

    return voc

In [None]:
# Tokenised training text: one token list per line of TRAIN_DATA_NAME.
X = read(TRAIN_DATA_NAME)
# Entity annotations parsed from NES_NAME.
Y = read_labels()
# Vocabulary built from the training tokens.
voc = build_voc(X)
# Padding index: one past the largest index build_voc assigns (1..len(voc)).
pad_value = len(voc) + 1

In [None]:
def pad(data):
    """Encode tokenised sentences as a padded batch of vocabulary indices.

    Args:
        data: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A batch-first integer tensor with one row per sentence; rows shorter
        than the longest sentence are right-padded with ``pad_value``. Tokens
        whose lemma key is not in ``voc`` are encoded as 0.
    """
    num_data = [
        torch.tensor(
            # The original called an undefined `stemmer`; the shared Mystem
            # instance `stem` is used instead, with its lemma list joined into
            # a single string key. `.get` avoids inserting unknown keys into
            # the defaultdict as a side effect of the lookup.
            [voc.get("".join(stem.lemmatize(word.lower())).strip(), 0) for word in s],
            dtype=torch.long,
        )
        for s in data
    ]
    return pad_sequence(num_data, batch_first=True, padding_value=pad_value)

In [None]:
def pos(data):
    """Compute the span of every token in every sentence.

    Args:
        data: Iterable of sentences, each an iterable of sized tokens.

    Returns:
        One list per sentence holding an ``(offset, length)`` pair per token,
        where ``offset`` is the total length of the tokens before it.
    """
    spans_per_sentence = []

    for sentence in data:
        spans, offset = [], 0
        for token in sentence:
            size = len(token)
            spans.append((offset, size))
            offset += size
        spans_per_sentence.append(spans)

    return spans_per_sentence

In [None]:
# Padded index tensor and per-token (offset, length) spans for the training text.
X_padded, X_pos = pad(X), pos(X)

In [None]:
def make_dataset():
    """Pair every padded training sentence with per-position class labels.

    A position receives the class index of the next pending annotation when
    the token at that position starts at the annotation's start offset; every
    other position, including padding, receives 0.

    Assumes each element of ``Y`` is the list of ``(start, length, tag)``
    tuples belonging to the sentence at the same index — TODO confirm against
    ``read_labels``.

    Returns:
        A numpy array with one ``[indices, labels]`` pair per sentence.
    """
    res = []

    for s, spans, tags in zip(X_padded, X_pos, Y):
        y = []
        ti = 0

        # `s` is padded, so it can be longer than `spans`; positions past the
        # real tokens fall through to label 0.
        for pi in range(len(s)):
            if pi < len(spans) and ti < len(tags) and spans[pi][0] == tags[ti][0]:
                y.append(t2i[tags[ti][2]])
                ti += 1
            else:
                y.append(0)

        # The original appended an undefined name `sent` here (NameError).
        res.append([s.numpy(), y])

    return np.array(res)

In [None]:
# Array of [token indices, labels] pairs, one per training sentence.
dataset = make_dataset()

In [None]:
# Hold out 10% of the samples for validation; the fixed seed keeps the split reproducible.
# NOTE(review): the name `train` is rebound by the `def train()` cell further down.
train, val = train_test_split(dataset, test_size=0.1, random_state=42)

In [None]:
class Model(nn.Module):
    """Embedding -> LSTM -> dropout -> linear tagger with 3 output classes."""

    def __init__(self, vs):
        """Create the layers.

        Args:
            vs: Number of rows in the embedding table.
        """
        super().__init__()
        embedding_dim, hidden_dim, num_classes = 64, 128, 3
        self.word_embeddings = nn.Embedding(vs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, batch):
        """Return per-position log-probabilities over the classes.

        Args:
            batch: Integer index tensor, batch first (the LSTM is created
                with ``batch_first=True``).

        Returns:
            Log-softmax over dimension 2 of the per-position class scores.
        """
        embedded = self.word_embeddings(batch)
        hidden_states, _ = self.lstm(embedded)
        scores = self.fc2(self.dropout(hidden_states))
        return F.log_softmax(scores, dim=2)

In [None]:
# Embedding table size; it must exceed the largest index fed to the model,
# which is pad_value (len(voc) + 1 when the vocabulary was built).
vocab_size = len(voc) + 3
epoch_cnt = 300
batch_size = 256

# `device` was used below (and in later cells) without ever being defined.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Model(vocab_size)
model = model.float()
model = model.to(device)
# The model emits log-probabilities, hence negative log-likelihood loss.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [None]:
def train(train_data=None, val_data=None):
    """Train the global ``model`` and record the losses.

    Args:
        train_data: Optional training samples, laid out so that
            ``batch[:, 0]`` holds token indices and ``batch[:, 1]`` labels.
        val_data: Optional validation samples in the same layout.
            When either is omitted, the split is recomputed from ``dataset``
            with the same arguments as the split cell. Relying on the global
            ``train`` is not possible: defining this function rebinds that
            name, so it no longer refers to the training samples.
            Keep these arguments in sync with the split cell.

    Returns:
        ``(loss_train, loss_val)``: the training loss of every batch and the
        mean validation loss of every epoch, both as plain floats.
    """
    if train_data is None or val_data is None:
        split_train, split_val = train_test_split(dataset, test_size=0.1, random_state=42)
        train_data = split_train if train_data is None else train_data
        val_data = split_val if val_data is None else val_data

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size)
    loss_train, loss_val = [], []

    # epoch_cnt is an int; iterating it directly raises TypeError.
    for _ in range(epoch_cnt):
        model.train()  # dropout active while optimising
        for batch_data in train_loader:
            x, y = batch_data[:, 0].to(device), batch_data[:, 1].to(device).reshape(-1)
            optimizer.zero_grad()
            output = model(x.long()).view(-1, 3)
            loss = loss_function(output, y.long())
            # Keep a float, not the tensor, so no device memory is retained.
            loss_train.append(loss.item())
            loss.backward()
            clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

        model.eval()  # dropout off so the validation loss is deterministic
        with torch.no_grad():
            loss_values = []
            for batch_data in val_loader:
                x, y = batch_data[:, 0].to(device), batch_data[:, 1].to(device).reshape(-1)
                output = model(x.long()).view(-1, 3)
                loss = loss_function(output, y.long())
                loss_values.append(loss.item())

            loss_val.append(np.mean(np.array(loss_values)))

    return loss_train, loss_val


In [None]:
# Run training; keeps the per-batch training losses and per-epoch validation losses.
loss_train, loss_val = train()

In [None]:
# `read_input` is not defined in the visible cells. The input file is loaded
# with the same tokeniser as the training text, using the otherwise unused
# DATASET_NAME path.
test = read(DATASET_NAME)
test_padded = pad(test)
test_pos = pos(test)

In [None]:
# Switch dropout off: in training mode the predictions would be randomly
# perturbed and differ from run to run.
model.eval()

with torch.no_grad():
    test_loader = DataLoader(test_padded, batch_size=batch_size)
    batch_predictions = []

    for batch_data in test_loader:
        x = batch_data.to(device)
        output = model(x.long())
        # Index of the highest log-probability along the class dimension.
        _, ansx = output.max(dim=2)
        batch_predictions.append(ansx.cpu().numpy())

# One row of predicted class indices per input sentence. Concatenating once
# avoids re-copying the array per batch; an empty input yields an empty array
# instead of None.
if batch_predictions:
    ans = np.concatenate(batch_predictions, axis=0)
else:
    ans = np.empty((0, 0), dtype=np.int64)

In [None]:
# One output line per input sentence: a "start length TAG " group for every
# token predicted as an entity, followed by "EOL".
with open("data/output.txt", "w", encoding="utf-8") as output_file:
    # Loop names differ from the global `pos` function on purpose: a
    # top-level loop variable called `pos` would overwrite it.
    for spans, tags in zip(test_pos, ans):
        # `tags` covers the padded length; zip stops at the real tokens.
        for (start, length), tag in zip(spans, tags):
            tag = int(tag)
            if tag in i2t:
                # The trailing space separates this group from the next one
                # and from "EOL"; without it the fields run together.
                output_file.write(f"{start} {length} {i2t[tag]} ")
        output_file.write("EOL\n")