<img src="https://s8.hostingkartinok.com/uploads/images/2018/08/308b49fcfbc619d629fe4604bceb67ac.jpg" width=500, height=450>
<h3 style="text-align: center;"><b>Физтех-Школа Прикладной математики и информатики (ФПМИ) МФТИ</b></h3>

---

# Задание 3

## Классификация текстов

В этом задании вам предстоит попробовать несколько методов, используемых в задаче классификации, а также понять насколько хорошо модель понимает смысл слов и какие слова в примере влияют на результат.

In [83]:
import pandas as pd
import numpy as np
import torch

from torchtext import datasets

from torchtext.vocab import Vectors, GloVe

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.autonotebook import tqdm

In [2]:
pip install -q torchtext==0.4

In [2]:
import torchtext
torchtext.__version__

'0.4.0'

In [3]:
from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator

В этом задании мы будем использовать библиотеку torchtext. Она довольна проста в использовании и поможет нам сконцентрироваться на задаче, а не на написании Dataloader-а.

In [4]:
TEXT = Field(sequential=True, lower=True, include_lengths=True)  # Поле текста
LABEL = LabelField(dtype=torch.float)  # Поле метки

In [5]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Датасет на котором мы будем проводить эксперементы это комментарии к фильмам из сайта IMDB.

In [6]:
train, test = datasets.IMDB.splits(TEXT, LABEL)  # загрузим датасет
train, valid = train.split(random_state=random.seed(SEED))  # разобьем на части

In [7]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test), 
    batch_size = 64,
    sort_within_batch = True,
    device = device)

## RNN

Для начала попробуем использовать рекурентные нейронные сети. На семинаре вы познакомились с GRU, вы можете также попробовать LSTM. Можно использовать для классификации как hidden_state, так и output последнего токена.

In [9]:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = torch.nn.GRU(embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)  # YOUR CODE GOES HERE
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2 , output_dim)
        )  
        
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        text_lengths = text_lengths.cpu().long()

        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        # cell arg for LSTM, remove for GRU
        packed_output, hidden = self.rnn(packed_embedded)

        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)  
        
        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1) # YOUR CODE GOES HERE
                
        #hidden = [batch size, hid dim * num directions] or [batch_size, hid dim * num directions]
            
        return self.fc(output[-1, :, :]).reshape(-1), hidden
        #return output[-1, :, :], hidden

In [16]:
import numpy as np

def train_RNN(model, loss_func, opt, train_iter, valid_iter, max_epochs):
    min_loss = np.inf
    cur_patience = 0

    for epoch in range(1, max_epochs + 1):
        train_loss = 0.0
        model.train()
        pbar_t = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
        pbar_t.set_description(f"Epoch {epoch}")
        for it, batch in pbar_t: 

            opt.zero_grad()
            (text, text_l), labels = batch
            pred, hidden = model(text, text_l)
            loss = loss_func(pred, labels)
            loss.backward()
            opt.step()
            train_loss += loss.item()
        train_loss /= len(train_iter)

        val_loss = 0.0
        model.eval()
        pbar_v = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
        pbar_v.set_description(f"Epoch {epoch}")
        for it, batch in pbar_v:

            (text, text_l), labels = batch
            pred, hidden = model(text, text_l)
            val_loss += loss_func(pred, labels).item()


        val_loss /= len(valid_iter)
        if val_loss < min_loss:
           min_loss = val_loss
           best_model = model.state_dict()
        else:
           cur_patience += 1
           if cur_patience == patience:
              cur_patience = 0
              break
    
        print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
    model.load_state_dict(best_model)

Поиграйтесь с гиперпараметрами

In [35]:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 1
bidirectional = True
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3

In [36]:
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX
    )
model = model.to(device)

  "num_layers={}".format(dropout, num_layers))


In [37]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

max_epochs = 20

In [38]:
train_RNN(model, loss_func, opt, train_iter, valid_iter, max_epochs)

HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 1, Training Loss: 0.6920342780377743, Validation Loss: 0.6876797489190506


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 2, Training Loss: 0.685691155873946, Validation Loss: 0.6821977945707612


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 3, Training Loss: 0.6788777430562207, Validation Loss: 0.6967750232098466


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 4, Training Loss: 0.663794336419036, Validation Loss: 0.6577189812215708


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 5, Training Loss: 0.6473730784045518, Validation Loss: 0.6471321840407485


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 6, Training Loss: 0.6396356680749977, Validation Loss: 0.6425690486774607


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 7, Training Loss: 0.6238581617600727, Validation Loss: 0.6413605144973529


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 8, Training Loss: 0.6145165343462986, Validation Loss: 0.6459622820050029


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Обучите сетку! Используйте любые вам удобные инструменты, Catalyst, PyTorch Lightning или свои велосипеды.

In [39]:
from sklearn.metrics import f1_score

In [40]:
answers = {'predictions': np.array([]), 'targets': np.array([])}

model.eval()
for batch in test_iter:
    (text, text_l), labels = batch
    pred, h = model(text, text_l)
    pred = nn.Sigmoid()(pred).detach().cpu().numpy()
    predictions = np.where(pred>.5, 1, 0)
    answers['predictions'] = np.append(answers['predictions'], predictions) 
    answers['targets'] = np.append(answers['targets'], labels.detach().cpu().numpy()) 

Посчитайте f1-score вашего классификатора на тестовом датасете.

**Ответ**:

In [41]:
f1_score(answers['targets'], answers['predictions'])

0.7188774058974116

Если уменьшить количество слоев, то качество получается лучше, при двух словя F1_score ~ 0.689

## CNN

![](https://www.researchgate.net/publication/333752473/figure/fig1/AS:769346934673412@1560438011375/Standard-CNN-on-text-classification.png)

Для классификации текстов также часто используют сверточные нейронные сети. Идея в том, что как правило сентимент содержат словосочетания из двух-трех слов, например "очень хороший фильм" или "невероятная скука". Проходясь сверткой по этим словам мы получим какой-то большой скор и выхватим его с помощью MaxPool. Далее идет обычная полносвязная сетка. Важный момент: свертки применяются не последовательно, а параллельно. Давайте попробуем!

In [46]:
TEXT = Field(sequential=True, lower=True, batch_first=True)  # batch_first тк мы используем conv  
LABEL = LabelField(batch_first=True, dtype=torch.float)

train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(trn)
LABEL.build_vocab(trn)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [47]:
train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

Вы можете использовать Conv2d с `in_channels=1, kernel_size=(kernel_sizes[0], emb_dim))` или Conv1d c `in_channels=emb_dim, kernel_size=kernel_size[0]`. Но хорошенько подумайте над shape в обоих случаях.

In [48]:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout=0.5,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Sequential(
            nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[0], padding=1, stride=2),
            nn.BatchNorm1d(out_channels)) # YOUR CODE GOES HERE
        
        self.conv_1 = nn.Sequential(
            nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[1], padding=1, stride=2),
            nn.BatchNorm1d(out_channels))  # YOUR CODE GOES HERE
        
        self.conv_2 = nn.Sequential(
            nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[2], padding=1, stride=2),
            nn.BatchNorm1d(out_channels))  # YOUR CODE GOES HERE
        
        self.fc = nn.Sequential(
            nn.Linear(len(kernel_sizes) * out_channels, 1)
        )
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        embedded = embedded.permute(0, 2, 1) # may be reshape here

        conved_0 = F.relu(self.conv_0(embedded))  # may be reshape here
        conved_1 = F.relu(self.conv_1(embedded))  # may be reshape here
        conved_2 = F.relu(self.conv_2(embedded))  # may be reshape here
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))

        return self.fc(cat).reshape(-1)

In [49]:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels=64
dropout = 0.2
dim = 300

model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout)

In [50]:
model.to(device)
None

In [51]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [52]:
max_epochs = 30

Обучите!

In [53]:
import numpy as np

min_loss = np.inf
cur_patience = 0
patience=3

for epoch in range(1, max_epochs + 1):
    # TRAINING 
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
        
        opt.zero_grad()
        inputs, targets = batch
        predictions = model(inputs)
        loss = loss_func(predictions, targets)
        loss.backward()
        train_loss += loss.item()
        opt.step()

    train_loss /= len(train_iter)

    # VALIDATION  
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")

    for it, batch in pbar:
        inputs, targets = batch
        predictions = model(inputs)
        loss = loss_func(predictions, targets)
        val_loss += loss.item()

    val_loss /= len(val_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 1, Training Loss: 0.7651349562798103, Validation Loss: 0.6588816603024801


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 2, Training Loss: 0.6334536658586377, Validation Loss: 0.5676853199799855


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 3, Training Loss: 0.542757039957673, Validation Loss: 0.49866224428017936


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 4, Training Loss: 0.45936765788245376, Validation Loss: 0.4330736011266708


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 5, Training Loss: 0.37663135498109523, Validation Loss: 0.3970426271359126


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 6, Training Loss: 0.2976094051434176, Validation Loss: 0.3659355541070302


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 7, Training Loss: 0.24593976314050436, Validation Loss: 0.35147182643413544


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 8, Training Loss: 0.17915722956187533, Validation Loss: 0.33517371912797295


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 9, Training Loss: 0.1402345925678302, Validation Loss: 0.3301479950547218


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 10, Training Loss: 0.10884179938992444, Validation Loss: 0.32583688348531725


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 11, Training Loss: 0.08451209190118052, Validation Loss: 0.3272054602702459


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 12, Training Loss: 0.06163704864354464, Validation Loss: 0.32701600591341656


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

<All keys matched successfully>

In [54]:
from sklearn.metrics import f1_score

In [55]:
answers = {'predictions': np.array([]), 'targets': np.array([])}

model.eval()
for batch in test_iter:
    input_embedd, targets = batch
    pred = model(input_embedd)
    pred = nn.Sigmoid()(pred).detach().cpu().numpy()
    predictions = np.where(pred>.5, 1, 0)
    answers['predictions'] = np.append(answers['predictions'], predictions) 
    answers['targets'] = np.append(answers['targets'], targets.detach().cpu().numpy()) 

Посчитайте f1-score вашего классификатора.

**Ответ**:

In [56]:
f1_score(answers['targets'], answers['predictions'])

0.8621615402316484

## Интерпретируемость

Посмотрим, куда смотрит наша модель. Достаточно запустить код ниже.

In [42]:
!pip install -q captum

[K     |████████████████████████████████| 4.4MB 6.3MB/s 
[?25h

In [58]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [59]:
def forward_with_softmax(inp):
    logits = model(inp)
    return torch.softmax(logits, 0)[0][1]

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))


# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    model.eval()
    text = [tok for tok in TEXT.tokenize(sentence)]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=5000, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [60]:
interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 0.94 ) , delta:  tensor([0.0003], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.01 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.42 ) , delta:  tensor([0.0003], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([8.5251e-06], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.15 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)


Попробуйте добавить свои примеры!

In [61]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.94),pos,1.64,It was a fantastic performance ! pad
,,,,
pos,neg (0.01),pos,1.32,Best film ever pad pad pad pad
,,,,
pos,neg (0.42),pos,1.55,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.6,It was a horrible movie pad pad
,,,,
neg,neg (0.00),pos,0.08,I've never watched something as bad pad
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.94),pos,1.64,It was a fantastic performance ! pad
,,,,
pos,neg (0.01),pos,1.32,Best film ever pad pad pad pad
,,,,
pos,neg (0.42),pos,1.55,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.6,It was a horrible movie pad pad
,,,,
neg,neg (0.00),pos,0.08,I've never watched something as bad pad
,,,,


## Эмбэдинги слов

Вы ведь не забыли, как мы можем применить знания о word2vec и GloVe. Давайте попробуем!

In [84]:
TEXT.build_vocab(trn, vectors='glove.840B.300d')# YOUR CODE GOES HERE
# подсказка: один из импортов пока не использовался, быть может он нужен в строке выше :)
LABEL.build_vocab(trn)

word_embeddings = TEXT.vocab.vectors

kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300


100%|█████████▉| 2195587/2196017 [04:48<00:00, 7379.21it/s][A

In [85]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

In [86]:
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=64,
            kernel_sizes=kernel_sizes, dropout=dropout)

word_embeddings = TEXT.vocab.vectors

prev_shape = model.embedding.weight.shape

model.embedding.weight = nn.Parameter(word_embeddings, requires_grad=True) # инициализируйте эмбэдинги

assert prev_shape == model.embedding.weight.shape
model.to(device)

opt = torch.optim.Adam(model.parameters())

Вы знаете, что делать.

In [87]:
import numpy as np

min_loss = np.inf

cur_patience = 0
patience=3

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 

        opt.zero_grad()
        inputs, targets = batch
        predictions = model(inputs)
        loss = loss_func(predictions, targets)
        loss.backward()
        train_loss += loss.item()
        opt.step()

    train_loss /= len(train_iter)

    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")

    for it, batch in pbar:
        inputs, targets = batch
        predictions = model(inputs)
        loss = loss_func(predictions, targets)
        val_loss += loss.item()

    val_loss /= len(val_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 1, Training Loss: 0.727304629383296, Validation Loss: 0.35073896249135333


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 2, Training Loss: 0.370015519161294, Validation Loss: 0.3193125486373901


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 3, Training Loss: 0.2612980617223865, Validation Loss: 0.29683961073557535


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 4, Training Loss: 0.17839275376640096, Validation Loss: 0.2879015564918518


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 5, Training Loss: 0.11801812093514595, Validation Loss: 0.28879838486512505


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 6, Training Loss: 0.07000768124839685, Validation Loss: 0.30371296107769014


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

<All keys matched successfully>

In [88]:
from sklearn.metrics import f1_score

In [89]:
answers = {'predictions': np.array([]), 'targets': np.array([])}

model.eval()
for batch in test_iter:
    input_embedd, targets = batch
    pred = model(input_embedd)
    pred = nn.Sigmoid()(pred).detach().cpu().numpy()
    predictions = np.where(pred>.5, 1, 0)
    answers['predictions'] = np.append(answers['predictions'], predictions) 
    answers['targets'] = np.append(answers['targets'], targets.detach().cpu().numpy()) 

Посчитайте f1-score вашего классификатора.

**Ответ**:

In [90]:
f1_score(answers['targets'], answers['predictions'])

0.8674875045714983

Проверим насколько все хорошо!

In [91]:
PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)
vis_data_records_ig = []

interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 0.98 ) , delta:  tensor([6.4950e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([8.9722e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.29 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([3.5691e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.02 ) , delta:  tensor([3.5394e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)


In [92]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.98),pos,1.7,It was a fantastic performance ! pad
,,,,
pos,neg (0.00),pos,1.51,Best film ever pad pad pad pad
,,,,
pos,neg (0.29),pos,1.6,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.5,It was a horrible movie pad pad
,,,,
neg,neg (0.02),pos,1.24,I've never watched something as bad pad
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.98),pos,1.7,It was a fantastic performance ! pad
,,,,
pos,neg (0.00),pos,1.51,Best film ever pad pad pad pad
,,,,
pos,neg (0.29),pos,1.6,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.5,It was a horrible movie pad pad
,,,,
neg,neg (0.02),pos,1.24,I've never watched something as bad pad
,,,,
