In [None]:
import re
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
# nltk.download('punkt')
import numpy as np
from copy import deepcopy

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')

## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

In [None]:
# Load the surname/nationality table and normalise surnames to lower case.
surnames = pd.read_csv("./data/surnames.csv").assign(
    surname=lambda frame: frame["surname"].str.lower()
)

# Integer-encode the nationality labels; `encoder` is kept to map class ids back to names.
encoder = LabelEncoder()
y = encoder.fit_transform(surnames["nationality"])
surnames.head(2)

Unnamed: 0,surname,nationality
0,woodford,English
1,coté,French


In [None]:
class Vocab:
    """Character-level vocabulary built from a pandas Series of strings.

    Attributes:
        max_seq_len: length of the longest string in ``data``.
        tokens: set of all distinct characters seen in ``data``.
        idx_to_token: index -> token mapping; index 0 is reserved for ``"<PAD>"``.
        token_to_idx: inverse mapping, token -> index.
        vocab_len: number of entries including ``"<PAD>"``.
    """

    def __init__(self, data):
        self.max_seq_len = data.str.len().max()
        self.tokens = set()
        for item in data:
            self.tokens.update(item)
        # Sort before numbering: iteration order of a set of strings can differ
        # between interpreter runs, which would silently change every index.
        self.idx_to_token = dict(enumerate(sorted(self.tokens), 1))
        self.idx_to_token[0] = "<PAD>"
        self.token_to_idx = {token: idx for idx, token in self.idx_to_token.items()}
        self.vocab_len = len(self.idx_to_token)

In [None]:
class SurnamesDataset(Dataset):
    """Dataset of (encoded surname, label) pairs.

    Args:
        X: pandas Series of surnames (accessed positionally through ``.iloc``).
        y: integer class labels, converted to a ``LongTensor``.
        vocab: vocabulary providing ``max_seq_len`` and ``token_to_idx``.
    """

    def __init__(self, X, y, vocab: "Vocab"):
        self.X = X
        self.y = torch.LongTensor(y)
        self.vocab = vocab

    def vectorize(self, surname):
        """Encode ``surname`` as a fixed-length tensor of token indices.

        The result has length ``vocab.max_seq_len``; unused trailing positions keep
        the value 0. Surnames longer than ``max_seq_len`` are truncated instead of
        raising ``IndexError``. A character missing from the vocabulary still raises
        ``KeyError``.
        """
        max_len = int(self.vocab.max_seq_len)
        surname_t = torch.zeros(max_len, dtype=torch.long)
        for i, token in enumerate(surname[:max_len]):
            surname_t[i] = self.vocab.token_to_idx[token]
        return surname_t

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.vectorize(self.X.iloc[idx]), self.y[idx]

In [None]:
# Fixed seed so the train/test partition is the same after every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    surnames.surname, y, test_size=0.2, random_state=1
)

In [None]:
# The character vocabulary is built from every surname, so both splits share one index space.
vocab = Vocab(surnames["surname"])
trainset = SurnamesDataset(X_train, y_train, vocab)
testset = SurnamesDataset(X_test, y_test, vocab)

# Reshuffle the training batches every epoch; the evaluation order stays fixed.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
testloader = DataLoader(testset, batch_size=32, shuffle=False)

1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий. 


In [None]:
class RNN(nn.Module):
    """Minimal Elman RNN that unrolls a single ``nn.RNNCell`` over the time axis."""

    def __init__(self, input_size, hidden_size):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnncell = nn.RNNCell(input_size=input_size, hidden_size=hidden_size)

    def forward(self, x, h=None):
        """Run the cell over every time step of ``x``.

        Args:
            x: input tensor of shape (batch_size, seq_len, feature_size).
            h: optional initial hidden state of shape (batch_size, hidden_size);
               zeros are used when it is omitted.

        Returns:
            Tuple ``(h, hs)``: the hidden state after the last step,
            shape (batch_size, hidden_size), and all hidden states,
            shape (batch_size, seq_len, hidden_size).
        """
        batch_size, seq_len = x.size(0), x.size(1)
        # `h is None` rather than `not h`: truth-testing a multi-element tensor raises.
        if h is None:
            # new_zeros keeps the state on the same device and dtype as the input.
            h = x.new_zeros(batch_size, self.hidden_size)

        # h_t = RNNCell(x_t, h_{t-1}); collect every state and stack once at the end
        # instead of writing in place into a pre-allocated buffer.
        states = []
        for step in range(seq_len):
            h = self.rnncell(x[:, step, :], h)
            states.append(h)

        if states:
            hs = torch.stack(states, dim=1)
        else:
            hs = x.new_zeros(batch_size, 0, self.hidden_size)
        return h, hs

In [None]:
class MyModel(nn.Module):
    """Surname classifier: embedding -> custom ``RNN`` -> linear layer over the final hidden state."""

    def __init__(self, input_size, hidden_size, vocab_len, pad_idx, num_classes):
        super().__init__()
        self.rnncell = RNN(input_size, hidden_size)
        self.emb = nn.Embedding(vocab_len, input_size, padding_idx=pad_idx)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        # RNN.forward returns (last hidden state, all hidden states); only the first is used.
        last_hidden, _ = self.rnncell(embedded)
        return self.fc(last_hidden)

In [None]:
# One output logit per distinct encoded label in `y`.
num_classes = len(np.unique(y))

# input_size is the embedding width fed to the recurrent layer;
# pad_idx=0 is the index Vocab assigns to "<PAD>".
model = MyModel(
    input_size=64,
    hidden_size=16,
    vocab_len=vocab.vocab_len,
    pad_idx=0,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
def model_train(model, optimizer, criterion, train_loader, test_loader, n_epochs=10, epoch_step=5):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    Args:
        model: module mapping an input batch to class logits of shape (batch, classes).
        optimizer: optimizer already bound to the parameters of ``model``.
        criterion: loss taking ``(logits, labels)``.
        train_loader: iterable of ``(X_batch, y_batch)`` used for the weight updates.
        test_loader: iterable of ``(X_batch, y_batch)`` used for validation only.
        n_epochs: number of passes over ``train_loader``.
        epoch_step: metrics are printed for every epoch divisible by this value.

    Returns:
        Tuple ``(y_true, y_pred)``: validation labels and predicted class indices from
        the last epoch. Two empty long tensors are returned when ``n_epochs`` is 0.

    Both loaders must yield at least one batch, otherwise ``torch.cat`` raises.
    """

    def _epoch_metrics(logit_batches, label_batches):
        """Join the per-batch results once and return (labels, classes, accuracy, loss)."""
        logits = torch.cat(logit_batches)
        labels = torch.cat(label_batches)
        classes = logits.argmax(dim=1)
        accuracy = (classes == labels).float().mean().item()
        return labels, classes, accuracy, criterion(logits, labels).item()

    y_true = torch.empty(0, dtype=torch.long)
    y_classes = torch.empty(0, dtype=torch.long)
    for epoch in range(n_epochs):
        model.train()
        # Batches are collected in lists and concatenated once per epoch; concatenating
        # inside the loop copies the whole accumulated tensor on every step.
        logit_batches, label_batches = [], []
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            # detach: the stored logits are only used for reporting.
            logit_batches.append(predictions.detach())
            label_batches.append(y_batch)
        with torch.no_grad():
            _, _, train_acc, train_loss = _epoch_metrics(logit_batches, label_batches)

        model.eval()
        with torch.no_grad():
            logit_batches, label_batches = [], []
            for X_batch, y_batch in test_loader:
                logit_batches.append(model(X_batch))
                label_batches.append(y_batch)
            y_true, y_classes, val_acc, val_loss = _epoch_metrics(logit_batches, label_batches)

        if epoch % epoch_step == 0:
            print(f'#{epoch} Training loss: {train_loss:.4f} training_acc:'
                  f' {train_acc:.4f} val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    return y_true, y_classes

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch; the returned
# (validation labels, predicted classes) pair from the last epoch is shown as the cell output.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.3823 training_acc: 0.2185 val_loss: 2.2437 val_acc: 0.2709
#5 Training loss: 1.9053 training_acc: 0.3810 val_loss: 1.9093 val_acc: 0.3862
#10 Training loss: 1.8277 training_acc: 0.4109 val_loss: 1.8682 val_acc: 0.4089
#15 Training loss: 1.7919 training_acc: 0.4309 val_loss: 1.8063 val_acc: 0.4376
#20 Training loss: 1.7583 training_acc: 0.4844 val_loss: 1.7800 val_acc: 0.4977
#25 Training loss: 1.5869 training_acc: 0.5770 val_loss: 1.6078 val_acc: 0.5633
#30 Training loss: 1.4984 training_acc: 0.6116 val_loss: 1.5442 val_acc: 0.5920
#35 Training loss: 1.4650 training_acc: 0.6200 val_loss: 1.5256 val_acc: 0.6047
#40 Training loss: 1.4503 training_acc: 0.6240 val_loss: 1.5202 val_acc: 0.5984
#45 Training loss: 1.4413 training_acc: 0.6249 val_loss: 1.5157 val_acc: 0.5915
#50 Training loss: 1.4950 training_acc: 0.6011 val_loss: 1.5358 val_acc: 0.5815


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([14,  9, 14,  ...,  9,  0,  4]))

In [None]:
def predict_surname(surname):
    """Return the nationality label that the global ``model`` predicts for ``surname``.

    The surname is lower-cased, encoded with ``trainset.vectorize`` and passed to the
    model as a batch of one; the arg-max class id is decoded with the global ``encoder``.
    """
    encoded = trainset.vectorize(surname.lower()).view(1, -1)
    class_idx = model(encoded).argmax().view(-1)
    return encoder.inverse_transform(class_idx)[0]

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

# nn.RNN

In [None]:
class MyModel(nn.Module):
    """Surname classifier: embedding -> ``nn.RNN`` -> linear layer over the last time step."""

    def __init__(self, input_size, hidden_size, vocab_len, pad_idx, num_classes):
        super().__init__()
        self.rnncell = nn.RNN(input_size, hidden_size, batch_first=True)
        self.emb = nn.Embedding(vocab_len, input_size, padding_idx=pad_idx)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        sequence_out, _ = self.rnncell(embedded)
        # With batch_first=True the time axis is dimension 1; keep only its last position.
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# Fresh model, loss and optimizer for the most recently defined MyModel class;
# pad_idx=0 is the index Vocab assigns to "<PAD>".
model = MyModel(
    input_size=64,
    hidden_size=16,
    vocab_len=vocab.vocab_len,
    pad_idx=0,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.2085 training_acc: 0.3560 val_loss: 2.0051 val_acc: 0.4144
#5 Training loss: 1.7456 training_acc: 0.5326 val_loss: 1.7672 val_acc: 0.5287
#10 Training loss: 1.6349 training_acc: 0.5552 val_loss: 1.6716 val_acc: 0.5455
#15 Training loss: 1.5670 training_acc: 0.5795 val_loss: 1.6227 val_acc: 0.5679
#20 Training loss: 1.5351 training_acc: 0.5907 val_loss: 1.5771 val_acc: 0.5760
#25 Training loss: 1.4831 training_acc: 0.6105 val_loss: 1.5595 val_acc: 0.5934
#30 Training loss: 1.4723 training_acc: 0.6149 val_loss: 1.5341 val_acc: 0.5965
#35 Training loss: 1.4276 training_acc: 0.6284 val_loss: 1.5019 val_acc: 0.6097
#40 Training loss: 1.4296 training_acc: 0.6273 val_loss: 1.5072 val_acc: 0.6029
#45 Training loss: 1.4183 training_acc: 0.6298 val_loss: 1.4824 val_acc: 0.6107
#50 Training loss: 1.3886 training_acc: 0.6354 val_loss: 1.4803 val_acc: 0.6043


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([ 9,  9, 14,  ..., 10,  0,  4]))

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

# nn.LSTM

In [None]:
class MyModel(nn.Module):
    """Surname classifier: embedding -> ``nn.LSTM`` -> linear layer over the last time step."""

    def __init__(self, input_size, hidden_size, vocab_len, pad_idx, num_classes):
        super().__init__()
        self.rnncell = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.emb = nn.Embedding(vocab_len, input_size, padding_idx=pad_idx)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        # The LSTM also returns the (hidden, cell) state pair, which is not needed here.
        sequence_out, _ = self.rnncell(embedded)
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# Fresh model, loss and optimizer for the most recently defined MyModel class;
# pad_idx=0 is the index Vocab assigns to "<PAD>".
model = MyModel(
    input_size=64,
    hidden_size=16,
    vocab_len=vocab.vocab_len,
    pad_idx=0,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.2987 training_acc: 0.2880 val_loss: 1.9933 val_acc: 0.4048
#5 Training loss: 1.4157 training_acc: 0.6191 val_loss: 1.4490 val_acc: 0.6066
#10 Training loss: 1.2682 training_acc: 0.6492 val_loss: 1.3316 val_acc: 0.6421
#15 Training loss: 1.1781 training_acc: 0.6792 val_loss: 1.2981 val_acc: 0.6430
#20 Training loss: 1.1234 training_acc: 0.6870 val_loss: 1.2352 val_acc: 0.6571
#25 Training loss: 1.0821 training_acc: 0.6973 val_loss: 1.2120 val_acc: 0.6699
#30 Training loss: 1.0328 training_acc: 0.7088 val_loss: 1.1901 val_acc: 0.6740
#35 Training loss: 1.0009 training_acc: 0.7166 val_loss: 1.1659 val_acc: 0.6771
#40 Training loss: 0.9811 training_acc: 0.7199 val_loss: 1.1499 val_acc: 0.6799
#45 Training loss: 0.9419 training_acc: 0.7317 val_loss: 1.1386 val_acc: 0.6876
#50 Training loss: 0.9234 training_acc: 0.7350 val_loss: 1.1220 val_acc: 0.6853


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([ 6,  9, 14,  ..., 10,  0,  4]))

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

# nn.GRU

In [None]:
class MyModel(nn.Module):
    """Surname classifier: embedding -> ``nn.GRU`` -> linear layer over the last time step."""

    def __init__(self, input_size, hidden_size, vocab_len, pad_idx, num_classes):
        super().__init__()
        self.rnncell = nn.GRU(input_size, hidden_size, batch_first=True)
        self.emb = nn.Embedding(vocab_len, input_size, padding_idx=pad_idx)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        sequence_out, _ = self.rnncell(embedded)
        # With batch_first=True the time axis is dimension 1; keep only its last position.
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# Fresh model, loss and optimizer for the most recently defined MyModel class;
# pad_idx=0 is the index Vocab assigns to "<PAD>".
model = MyModel(
    input_size=64,
    hidden_size=16,
    vocab_len=vocab.vocab_len,
    pad_idx=0,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.2792 training_acc: 0.2974 val_loss: 1.9990 val_acc: 0.3980
#5 Training loss: 1.4117 training_acc: 0.6223 val_loss: 1.4257 val_acc: 0.6120
#10 Training loss: 1.2042 training_acc: 0.6758 val_loss: 1.2715 val_acc: 0.6512
#15 Training loss: 1.1010 training_acc: 0.7007 val_loss: 1.1925 val_acc: 0.6648
#20 Training loss: 1.0387 training_acc: 0.7129 val_loss: 1.1567 val_acc: 0.6767
#25 Training loss: 1.0063 training_acc: 0.7194 val_loss: 1.1241 val_acc: 0.6821
#30 Training loss: 0.9768 training_acc: 0.7245 val_loss: 1.1037 val_acc: 0.6840
#35 Training loss: 0.9659 training_acc: 0.7283 val_loss: 1.1017 val_acc: 0.6890
#40 Training loss: 0.9306 training_acc: 0.7332 val_loss: 1.0889 val_acc: 0.6826
#45 Training loss: 0.9163 training_acc: 0.7370 val_loss: 1.0883 val_acc: 0.6890
#50 Training loss: 0.9094 training_acc: 0.7378 val_loss: 1.0943 val_acc: 0.6876


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([ 2,  9, 14,  ..., 10,  0,  4]))

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

In [None]:
# Parse the GloVe text file line by line: the first field is the token,
# the remaining fields are the components of its vector.
weights = {}
with open('./data/glove.6B.50d.txt', encoding="utf8") as file:
    for line in file:
        fields = line.split()
        weights[fields[0]] = torch.tensor([float(value) for value in fields[1:]])

In [None]:
# Trainable embedding table initialised from the pretrained vectors where one exists;
# tokens without a pretrained vector keep their random initialisation and are reported.
embedding = nn.Embedding(vocab.vocab_len, 50, padding_idx=0)
with torch.no_grad():
    for token in vocab.tokens:
        if token in weights:
            embedding.weight[vocab.token_to_idx[token]] = weights[token]
        else:
            print(f"для {token} нет эмбеддинга")
embedding.weight

для ù нет эмбеддинга
для ż нет эмбеддинга
для ì нет эмбеддинга
для ń нет эмбеддинга


Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2376,  0.9327,  0.6747,  ..., -0.4449,  0.2803,  1.0710],
        [ 0.3818,  1.7446,  1.2457,  ...,  0.1289, -0.1232,  0.5867],
        ...,
        [-0.9432,  1.6448,  1.2020,  ..., -0.6479,  0.6564,  1.1344],
        [ 0.2171,  0.4651, -0.4676,  ..., -0.0438,  0.4101,  0.1796],
        [ 0.7383,  0.6545,  1.0873,  ..., -0.1680,  0.6562,  1.1014]],
       requires_grad=True)

In [None]:
# Accented characters without a pretrained vector, mapped to their plain counterparts.
minidict = {"ż": "z", "ì": "i", "ń": "n", "ù": "u"}


def костыль(char_):
    """Return ``char_`` with every character listed in ``minidict`` replaced by its plain form."""
    return "".join(str(minidict.get(symbol, symbol)) for symbol in char_)

In [None]:
# Frozen embedding table filled from the pretrained vectors. Accented characters that
# have no vector of their own borrow the vector of their plain counterpart.
embedding2 = nn.Embedding(
    num_embeddings=vocab.vocab_len,
    embedding_dim=50,
    padding_idx=0
)
embedding2.weight.requires_grad = False
with torch.no_grad():
    for token in vocab.tokens:
        # The normalised form is used only to look up the vector; the row that gets
        # written must be the one of the original token, otherwise the accented
        # character keeps its random initialisation.
        lookup_token = костыль(token)
        if lookup_token in weights:
            embedding2.weight[vocab.token_to_idx[token]] = weights[lookup_token]
        else:
            print(f"для {token} нет эмбеддинга")
embedding2.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2376,  0.9327,  0.6747,  ..., -0.4449,  0.2803,  1.0710],
        [ 0.3818,  1.7446,  1.2457,  ...,  0.1289, -0.1232,  0.5867],
        ...,
        [-0.9432,  1.6448,  1.2020,  ..., -0.6479,  0.6564,  1.1344],
        [ 0.2171,  0.4651, -0.4676,  ..., -0.0438,  0.4101,  0.1796],
        [ 0.7383,  0.6545,  1.0873,  ..., -0.1680,  0.6562,  1.1014]])

# nn.RNN

In [None]:
class MyModel(nn.Module):
    """Surname classifier built on a ready-made embedding: emb -> ``nn.RNN`` -> linear."""

    def __init__(self, emb, hidden_size, num_classes):
        super().__init__()
        self.emb = emb
        # The recurrent input width is the second dimension of the embedding matrix.
        embedding_width = emb.weight.size(1)
        self.rnncell = nn.RNN(embedding_width, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        sequence_out, _ = self.rnncell(embedded)
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# A deep copy of `embedding` is handed to the model, so the optimizer below updates
# the copy and the module-level `embedding` keeps its pretrained values.
model = MyModel(
    emb=deepcopy(embedding),
    hidden_size=16,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.3355 training_acc: 0.2556 val_loss: 2.2431 val_acc: 0.2709
#5 Training loss: 1.8937 training_acc: 0.3786 val_loss: 1.9002 val_acc: 0.3953
#10 Training loss: 1.8273 training_acc: 0.4010 val_loss: 1.8776 val_acc: 0.4039
#15 Training loss: 1.7800 training_acc: 0.4285 val_loss: 1.8229 val_acc: 0.4267
#20 Training loss: 1.7563 training_acc: 0.4402 val_loss: 1.8027 val_acc: 0.4349
#25 Training loss: 1.7133 training_acc: 0.4553 val_loss: 1.7803 val_acc: 0.4349
#30 Training loss: 1.5535 training_acc: 0.5850 val_loss: 1.5948 val_acc: 0.5692
#35 Training loss: 1.5075 training_acc: 0.6028 val_loss: 1.5275 val_acc: 0.5943
#40 Training loss: 1.4649 training_acc: 0.6119 val_loss: 1.5274 val_acc: 0.5911
#45 Training loss: 1.4244 training_acc: 0.6252 val_loss: 1.4815 val_acc: 0.6011
#50 Training loss: 1.4472 training_acc: 0.6149 val_loss: 1.4783 val_acc: 0.6029


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([14,  9, 14,  ..., 10, 10, 14]))

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

In [None]:
# Same architecture, this time with a private copy of `embedding2`
# (its weight was created with requires_grad = False).
model = MyModel(
    emb=deepcopy(embedding2),
    hidden_size=16,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.3161 training_acc: 0.2441 val_loss: 2.2431 val_acc: 0.2709
#5 Training loss: 2.2082 training_acc: 0.2714 val_loss: 2.2038 val_acc: 0.2723
#10 Training loss: 1.7931 training_acc: 0.4567 val_loss: 1.7935 val_acc: 0.4745
#15 Training loss: 1.6127 training_acc: 0.5512 val_loss: 1.6536 val_acc: 0.5369
#20 Training loss: 1.5537 training_acc: 0.5647 val_loss: 1.5818 val_acc: 0.5578
#25 Training loss: 1.5050 training_acc: 0.5860 val_loss: 1.5437 val_acc: 0.5692
#30 Training loss: 1.4834 training_acc: 0.5937 val_loss: 1.5191 val_acc: 0.5801
#35 Training loss: 1.4689 training_acc: 0.5965 val_loss: 1.5031 val_acc: 0.5797
#40 Training loss: 1.4417 training_acc: 0.6063 val_loss: 1.4756 val_acc: 0.5865
#45 Training loss: 1.4298 training_acc: 0.6085 val_loss: 1.4618 val_acc: 0.5915
#50 Training loss: 1.4240 training_acc: 0.6117 val_loss: 1.4919 val_acc: 0.5902


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([ 4, 10, 14,  ..., 10,  0,  4]))

# nn.LSTM

In [None]:
class MyModel(nn.Module):
    """Surname classifier built on a ready-made embedding: emb -> ``nn.LSTM`` -> linear."""

    def __init__(self, emb, hidden_size, num_classes):
        super().__init__()
        self.emb = emb
        # The recurrent input width is the second dimension of the embedding matrix.
        embedding_width = emb.weight.size(1)
        self.rnncell = nn.LSTM(embedding_width, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        # The LSTM also returns the (hidden, cell) state pair, which is not needed here.
        sequence_out, _ = self.rnncell(embedded)
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# Pass a private copy of the embedding, as the other pretrained-embedding models do,
# so that no two models ever share one embedding module object.
model = MyModel(
    emb=deepcopy(embedding2),
    hidden_size=16,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.3721 training_acc: 0.2375 val_loss: 2.2310 val_acc: 0.2709
#5 Training loss: 1.5366 training_acc: 0.5718 val_loss: 1.5287 val_acc: 0.5783
#10 Training loss: 1.3457 training_acc: 0.6293 val_loss: 1.3482 val_acc: 0.6234
#15 Training loss: 1.2410 training_acc: 0.6695 val_loss: 1.2766 val_acc: 0.6462
#20 Training loss: 1.1749 training_acc: 0.6869 val_loss: 1.2141 val_acc: 0.6676
#25 Training loss: 1.1281 training_acc: 0.6935 val_loss: 1.1734 val_acc: 0.6749
#30 Training loss: 1.0839 training_acc: 0.6987 val_loss: 1.1353 val_acc: 0.6799
#35 Training loss: 1.0488 training_acc: 0.7078 val_loss: 1.1069 val_acc: 0.6885
#40 Training loss: 1.0214 training_acc: 0.7127 val_loss: 1.0904 val_acc: 0.6926
#45 Training loss: 0.9904 training_acc: 0.7226 val_loss: 1.0730 val_acc: 0.6935
#50 Training loss: 0.9704 training_acc: 0.7286 val_loss: 1.0807 val_acc: 0.6944


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([14,  9, 14,  ..., 10,  0,  4]))

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

# nn.GRU

In [None]:
class MyModel(nn.Module):
    """Surname classifier built on a ready-made embedding: emb -> ``nn.GRU`` -> linear."""

    def __init__(self, emb, hidden_size, num_classes):
        super().__init__()
        self.emb = emb
        # The recurrent input width is the second dimension of the embedding matrix.
        embedding_width = emb.weight.size(1)
        self.rnncell = nn.GRU(embedding_width, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        sequence_out, _ = self.rnncell(embedded)
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# Pass a private copy of the embedding, as the other pretrained-embedding models do,
# so that no two models ever share one embedding module object.
model = MyModel(
    emb=deepcopy(embedding2),
    hidden_size=16,
    num_classes=num_classes
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the current `model` for 51 epochs, printing metrics every 5th epoch.
model_train(model, optimizer, criterion, trainloader, testloader, n_epochs=51, epoch_step=5)

#0 Training loss: 2.3386 training_acc: 0.2575 val_loss: 2.1376 val_acc: 0.2723
#5 Training loss: 1.3976 training_acc: 0.5986 val_loss: 1.3987 val_acc: 0.5934
#10 Training loss: 1.2280 training_acc: 0.6556 val_loss: 1.2775 val_acc: 0.6343
#15 Training loss: 1.1440 training_acc: 0.6802 val_loss: 1.2063 val_acc: 0.6571
#20 Training loss: 1.0864 training_acc: 0.7028 val_loss: 1.1502 val_acc: 0.6817
#25 Training loss: 1.0392 training_acc: 0.7128 val_loss: 1.1233 val_acc: 0.6899
#30 Training loss: 1.0025 training_acc: 0.7203 val_loss: 1.1043 val_acc: 0.6899
#35 Training loss: 0.9713 training_acc: 0.7273 val_loss: 1.0823 val_acc: 0.6981
#40 Training loss: 0.9502 training_acc: 0.7344 val_loss: 1.0659 val_acc: 0.7017
#45 Training loss: 0.9266 training_acc: 0.7407 val_loss: 1.0598 val_acc: 0.7022
#50 Training loss: 0.9106 training_acc: 0.7442 val_loss: 1.0394 val_acc: 0.7058


(tensor([ 2,  9, 14,  ..., 10,  0,  4]),
 tensor([ 2,  9, 14,  ..., 10,  0,  4]))

In [None]:
# Smoke test: classify one surname with the model currently bound to the global `model`.
predict_surname("Snow")

'English'

## 2. Классификация обзоров на фильмы (RNN)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

2.2. Обучите классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding` 
    - подберите адекватную размерность вектора эмбеддинга;
    - модуль `nn.Embedding` обучается

  * Используйте рекуррентные слои (`nn.RNN`, `nn.LSTM`, `nn.GRU`)


2.3 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [None]:
def _read_reviews(path, label):
    """Read the review file at ``path`` into a frame and tag every row with ``label``."""
    # '%-%' is used as the separator so that ordinary review text is not split into columns.
    frame = pd.read_csv(path, sep='%-%', header=None, engine="python")
    frame["type"] = label
    return frame


# NOTE(review): the first rows shown for the "positive" file read like negative
# reviews - confirm that the two data files are not swapped.
positive = _read_reviews("./data/positive_reviews.txt", "positive")
negative = _read_reviews("./data/negative_reviews.txt", "negative")
df = pd.concat((positive, negative), ignore_index=True)
df.columns = ["review", "type"]
df.head(2)

Unnamed: 0,review,type
0,"simplistic , silly and tedious .",positive
1,"it's so laddish and juvenile , only teenage bo...",positive


In [None]:
# Encode the string sentiment labels in df.type as integer class ids;
# `encoder2` is kept to decode predictions later.
encoder2 = LabelEncoder()
y = encoder2.fit_transform(df.type)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
puttern = re.compile("^[a-z]+$")
# Set copy of the stop-word list: the membership test runs once per token,
# and scanning a list each time is needlessly slow.
_stopword_set = frozenset(stopwords)


def preprocess(text):
    """Turn a raw review into a list of lemmatised word tokens.

    The text is lower-cased and stripped, tokenised with ``nltk.word_tokenize``,
    and only tokens that consist solely of the letters a-z and are not English
    stop words are kept; each kept token is lemmatised.

    Args:
        text: review text as a string.

    Returns:
        List of lemmatised tokens (possibly empty).
    """
    text = text.lower().strip()
    return [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if puttern.search(word) and word not in _stopword_set
    ]

In [None]:
# One list of preprocessed word tokens per review.
X = df.review.apply(preprocess)

In [None]:
# Number of distinct labels, and a seeded 80/20 split of the tokenised reviews.
n_classes = np.unique(y).size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
class Vocab:
    """Word-level vocabulary built from a pandas Series of token lists.

    Attributes:
        max_seq_len: length of the longest token list in ``data``.
        idx_to_token: index -> token mapping; 0 is ``"<PAD>"``, 1 is ``"<UNK>"``,
            data tokens start at 2.
        token_to_idx: inverse mapping, token -> index.
        vocab_len: number of entries including the two special tokens.
    """

    def __init__(self, data):
        self.max_seq_len = data.apply(len).max()
        tokens = set()
        for item in data:
            tokens.update(item)
        # Sort before numbering: iteration order of a set of strings can differ
        # between interpreter runs, which would silently change every index.
        self.idx_to_token = dict(enumerate(sorted(tokens), 2))
        self.idx_to_token[0] = "<PAD>"
        self.idx_to_token[1] = "<UNK>"
        self.token_to_idx = {token: idx for idx, token in self.idx_to_token.items()}
        self.vocab_len = len(self.idx_to_token)

In [None]:
class ReviewsDataset(Dataset):
    """Dataset of (encoded review, label) pairs.

    ``X`` is a pandas Series of token lists (accessed positionally through ``.iloc``),
    ``y`` holds integer labels and ``vocab`` provides ``max_seq_len`` and ``token_to_idx``.
    """

    def __init__(self, X, y, vocab: Vocab):
        self.X = X
        self.y = torch.LongTensor(y)
        self.vocab = vocab

    def vectorize(self, review):
        """Encode a token list as a left-padded tensor of length ``vocab.max_seq_len``.

        Only the first ``max_seq_len`` tokens are kept. Tokens missing from the
        vocabulary are replaced by the ``"<UNK>"`` index, and the padding value 0
        is placed in front so the real tokens occupy the final positions.
        """
        max_len = int(self.vocab.max_seq_len)
        lookup = self.vocab.token_to_idx
        indices = [
            lookup[token] if token in lookup else lookup["<UNK>"]
            for token in review[:max_len]
        ]
        padding = [0] * (max_len - len(indices))
        return torch.tensor(padding + indices, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.vectorize(self.X.iloc[idx]), self.y[idx]

In [None]:
# The vocabulary is built from the training split only; tokens that appear only in
# the test split are mapped to "<UNK>" by ReviewsDataset.vectorize.
vocab = Vocab(X_train)
train_dataset = ReviewsDataset(X_train, y_train, vocab)
test_dataset = ReviewsDataset(X_test, y_test, vocab)
# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [None]:
class MyModel(nn.Module):
    """Review classifier: trainable embedding -> (stacked) ``nn.RNN`` -> linear layer."""

    def __init__(self, vocab_len, emb_dim, hidden_size, num_classes, n_layers=1):
        super().__init__()
        self.emb = nn.Embedding(vocab_len, emb_dim, padding_idx=0)
        self.rnncell = nn.RNN(emb_dim, hidden_size, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of token-index sequences to class logits."""
        embedded = self.emb(x)
        sequence_out, _ = self.rnncell(embedded)
        # Classify from the output at the last position of the time axis.
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

In [None]:
# The number of output classes comes from the label count computed when the data was
# split (`n_classes`) instead of a hard-coded literal.
model = MyModel(
    vocab.vocab_len,
    emb_dim=64,
    hidden_size=16,
    num_classes=n_classes,
    n_layers=2,
)

criterion = nn.CrossEntropyLoss()
# weight_decay adds L2 regularisation to every parameter handed to Adam.
optimizer = optim.Adam(params=model.parameters(), weight_decay=0.01)

In [None]:
# Train the review classifier; keep the last epoch's validation labels and predicted
# classes for the confusion matrix.
y_true, y_pred = model_train(model, optimizer, criterion, train_loader, test_loader, n_epochs=101, epoch_step=10)

#0 Training loss: 0.6971 training_acc: 0.5081 val_loss: 0.6918 val_acc: 0.5246
#5 Training loss: 0.6891 training_acc: 0.5416 val_loss: 0.6917 val_acc: 0.5180
#10 Training loss: 0.6881 training_acc: 0.5460 val_loss: 0.6916 val_acc: 0.5157
#15 Training loss: 0.6857 training_acc: 0.5488 val_loss: 0.6913 val_acc: 0.5204
#20 Training loss: 0.6781 training_acc: 0.5683 val_loss: 0.6916 val_acc: 0.5218
#25 Training loss: 0.6704 training_acc: 0.5881 val_loss: 0.6925 val_acc: 0.5340
#30 Training loss: 0.6517 training_acc: 0.6199 val_loss: 0.7125 val_acc: 0.5518
#35 Training loss: 0.6052 training_acc: 0.6730 val_loss: 0.6787 val_acc: 0.5968
#40 Training loss: 0.5181 training_acc: 0.7477 val_loss: 0.6441 val_acc: 0.6512
#45 Training loss: 0.3994 training_acc: 0.8220 val_loss: 0.6274 val_acc: 0.6906
#50 Training loss: 0.2704 training_acc: 0.8931 val_loss: 0.6980 val_acc: 0.6910


In [None]:
# Confusion matrix over the validation labels and predictions returned by model_train.
confusion_matrix(y_true, y_pred)

array([[843, 224],
       [435, 631]], dtype=int64)

In [None]:
def test_review(review):
    """Print the class predicted for one preprocessed review together with its probability.

    ``review`` is a list of tokens; it is encoded with ``train_dataset.vectorize``,
    passed to the global ``model`` as a batch of one, and the top-1 softmax class is
    decoded with the global ``encoder2``.
    """
    encoded = train_dataset.vectorize(review).unsqueeze(0)
    probabilities = model(encoded).softmax(1)
    top_prob, top_class = probabilities.topk(k=1, dim=1)
    top_prob = top_prob.cpu().detach().view(-1)
    labels = encoder2.inverse_transform(top_class.cpu().detach().view(-1))
    summary = ", ".join(f"{label}:{prob:.2f}" for label, prob in zip(labels, top_prob))
    print(f"{review} --- {summary}")

In [None]:
# Two hand-written reviews for a manual check: the first is meant to be clearly
# positive, the second clearly negative.
reviews_ = [
    "The last time I had lunch here, I really liked the soup. Excellent. I'll come again.",
    "It feels like I dined on frogs. I will not visit this institution again."
]

In [None]:
# Run each sample review through the same preprocessing as the training data
# and print the predicted class with its probability.
for review in reviews_:
    tokens = preprocess(review)
    test_review(tokens)

['last', 'time', 'lunch', 'really', 'liked', 'soup', 'excellent', 'come'] --- positive:0.98
['feel', 'like', 'dined', 'frog', 'visit', 'institution'] --- positive:0.60
