<a href="https://colab.research.google.com/github/MatienkoAndrew/notebooks/blob/main/%22_%5Bhomework%5Dclassification_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://s8.hostingkartinok.com/uploads/images/2018/08/308b49fcfbc619d629fe4604bceb67ac.jpg" width=500, height=450>
<h3 style="text-align: center;"><b>Физтех-Школа Прикладной математики и информатики (ФПМИ) МФТИ</b></h3>

---

# Задание 3

## Классификация текстов

В этом задании вам предстоит попробовать несколько методов, используемых в задаче классификации, а также понять насколько хорошо модель понимает смысл слов и какие слова в примере влияют на результат.

In [1]:
##-- if GPU
! pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [2]:
##-- if CPU
# ! pip install torch==1.7.0
# ! pip install torchtext==0.9.0

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.autonotebook import tqdm

# import pytorch_lightning as pl

В этом задании мы будем использовать библиотеку torchtext. Она довольна проста в использовании и поможет нам сконцентрироваться на задаче, а не на написании Dataloader-а.

Датасет на котором мы будем проводить эксперементы это комментарии к фильмам из сайта IMDB.

## enable_reproducibility

In [4]:
import os
import random

import numpy as np
import torch


SEED = 1234


def enable_reproducibility(
        seed=SEED, raise_if_no_deterministic=True,
        cudnn_deterministic=True, disable_cudnn_benchmarking=True):
    # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
    torch.use_deterministic_algorithms(raise_if_no_deterministic)

    # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"
    
    torch.backends.cudnn.benchmark = not disable_cudnn_benchmarking
    torch.backends.cudnn.deterministic = cudnn_deterministic

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [5]:
enable_reproducibility()

## Get dataset

In [6]:
import shelve
import time

import torchtext
from torch.utils.data import random_split

start = time.time()
with shelve.open('imdb_dataset_fast_cache') as imdb_dataset_fast_cache:
    if any(split not in imdb_dataset_fast_cache for split in ('train', 'valid', 'test') ):
        print("Loading dataset from slow torchtext files...")
        train_valid, test = torchtext.datasets.IMDB(split=('train', 'test'))
        train_valid, test = list(train_valid), list(test)
        # default value of the argument split_ratio in torchtext.legacy.data.Data.split()
        split_ratio = 0.7
        num_train = int(len(train_valid) * split_ratio)  
        train, valid = random_split(train_valid, [num_train, len(train_valid) - num_train])
        train = list(train)
        valid = list(valid)
        imdb_dataset_fast_cache['train'] = train
        imdb_dataset_fast_cache['valid'] = valid
        imdb_dataset_fast_cache['test'] = test
        print("Dataset cached.")
    else:
        train = imdb_dataset_fast_cache['train']
        valid = imdb_dataset_fast_cache['valid']
        test = imdb_dataset_fast_cache['test']
        print("Dataset loaded from cache.")
print(f"Dataset downloaded. Time spent: {time.time() - start}")

Dataset loaded from cache.
Dataset downloaded. Time spent: 0.27844905853271484


## tokenize

In [7]:
from collections import Counter
from itertools import chain

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

# get tokenizer as used in torchtext.legacy.data.Field by default (string.split)
tokenizer = get_tokenizer(None) 
counter = Counter(chain.from_iterable(tokenizer(line) for _, line in train))
vocab = Vocab(counter)
PAD_TOKEN = '<pad>'  # default special padding token in Vocab

In [8]:
labels = set([label for (label, _) in chain(train, valid)])
num_classes = len(labels)
num_classes, labels

(2, {'neg', 'pos'})

In [9]:
def label_transform(label):
    if label == 'pos':
        return 1
    elif label == 'neg':
        return 0
    raise ValueError(f"unknown label {label}")

def label_inverse_transform(idx):
    if idx == 1:
        return 'pos'
    elif idx == 0:
        return 'neg'
    raise ValueError(f"unknown idx {idx}")

def text_transform(text, lower=True):
    if lower:
        text = text.lower()
    return [vocab[token] for token in tokenizer(text)]

## collate_batch

In [10]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    label_list, text_list, texts_lengths = [], [], []
    for (label, text) in batch:
        label_list.append(label_transform(label))
        token_indices = text_transform(text)
        texts_lengths.append(len(token_indices))
        processed_text = torch.tensor(token_indices)
        text_list.append(processed_text)
    return torch.tensor(label_list), pad_sequence(text_list, batch_first=True, padding_value=vocab[PAD_TOKEN]), texts_lengths

## BatchSamplerMimickingBucketIterator

In [11]:
import math
import random

from torch.utils.data import DataLoader, Sampler


class BatchSamplerMimickingBucketIterator(Sampler):
    def __init__(self, raw_dataset_list, tokenizer, batch_size, drop_last=False, pool_size_multiplier=1, decreasing_order_within_batch=True):
        self._batch_size = batch_size
        self._drop_last = drop_last
        self._pool_size_multiplier = pool_size_multiplier
        self._indices_and_lengths = [(i, len(tokenizer(text))) for i, (_, text) in enumerate(raw_dataset_list)]
        self._decreasing_order_within_batch = decreasing_order_within_batch
    
    def __len__(self):
        round_ = math.floor if self._drop_last else math.ceil
        return round_(len(self._indices_and_lengths) / self._batch_size)
    
    def __iter__(self):
        batch_size = self._batch_size
        drop_last = self._drop_last
        pool_size = batch_size * self._pool_size_multiplier
        indices = self._indices_and_lengths
        reverse = self._decreasing_order_within_batch
        random.shuffle(indices)
        pooled_indices = []
        # create pool of indices with similar lengths
        for i in range(0, len(indices), batch_size * pool_size):
            pooled_indices.extend(sorted(indices[i:i + batch_size * pool_size], key=lambda x: x[1], reverse=reverse))

        pooled_indices = [x[0] for x in pooled_indices]

        # yield indices for current batch
        last_index = len(pooled_indices) - len(pooled_indices) % batch_size
        for i in range(0, len(pooled_indices), batch_size):
            if drop_last and i == last_index:
                break
            yield pooled_indices[i:i + batch_size]

## DataLoader

In [12]:
from torch.utils.data import DataLoader

batch_size = 64
# 8 * 100 is taken from here:
# https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb
pool_size_multiplier = 8 * 100 // batch_size

batch_sampler = BatchSamplerMimickingBucketIterator(train, tokenizer, batch_size, pool_size_multiplier=pool_size_multiplier, drop_last=True)
train_loader = DataLoader(train, batch_sampler=batch_sampler, collate_fn=collate_batch)
valid_loader = DataLoader(valid, batch_size=batch_size, collate_fn=collate_batch)
test_loader = DataLoader(test, batch_size=batch_size, collate_fn=collate_batch)

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [14]:
device

'cuda'

## RNN

Для начала попробуем использовать рекурентные нейронные сети. На семинаре вы познакомились с GRU, вы можете также попробовать LSTM. Можно использовать для классификации как hidden_state, так и output последнего токена.

### RNNBaseline

In [15]:
from torch import nn

class RNNBaseline(nn.Module):
    def __init__(
            self, vocab_size, embedding_dim, hidden_dim, output_dim,
            n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)
        self.fc = nn.Linear((bidirectional + 1) * hidden_dim, output_dim)

    def forward(self, texts, texts_lengths):
        embedded = self.embedding(texts)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, texts_lengths, batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed_embedded)
        features = torch.hstack((hidden[-2], hidden[-1]))
        return self.fc(features)

### training

In [16]:
from copy import deepcopy

import numpy as np
from tqdm.notebook import tqdm

def training(model, train_loader, valid_loader, patience):
    min_loss = np.inf
    cur_patience = 0
    for epoch in range(1, max_epochs + 1):
        train_loss = 0.0
        model.train()
        pbar = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
        pbar.set_description(f"epoch {epoch}, training")
        for it, batch in pbar: 
            labels, texts, texts_lengths = batch
            labels, texts = labels.to(device), texts.to(device)
            if labels.ndim == 1:
                labels = labels.unsqueeze(1)
            opt.zero_grad()
            output = model(texts, texts_lengths)
            labels = labels.type_as(output)
            loss = loss_func(output, labels)
            loss.backward()
            opt.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        val_loss = 0.0
        model.eval()
        pbar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=True)
        pbar.set_description(f"epoch {epoch}, validation")
        with torch.no_grad():
            for it, batch in pbar:
                labels, texts, texts_lengths = batch
                labels, texts = labels.to(device), texts.to(device)
                if labels.ndim == 1:
                    labels = labels.unsqueeze(1)
                output = model(texts, texts_lengths)
                labels = labels.type_as(output)
                loss = loss_func(output, labels)
                val_loss += loss.item()
        val_loss /= len(valid_loader)
        spam = False
        if val_loss < min_loss:
            min_loss = val_loss
            best_model_state_dict = deepcopy(model.state_dict())
            cur_patience = 0
        else:
            cur_patience += 1
            if cur_patience > patience:
                spam = True
        print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
        if spam:
            print(f"Patience is over. Training stopped after {patience + 1} epochs "
                  "without decreasing validation loss.")
            break
    return best_model_state_dict

### Hyperparams

In [17]:
# enable_reproducibility()

vocab_size = len(vocab)
pad_idx = vocab[PAD_TOKEN]
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.2
patience = 3

model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=pad_idx
)
model = model.to(device)

opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

max_epochs = 20

### fit

In [18]:
# %%time
# enable_reproducibility(raise_if_no_deterministic=False)
# best_model_state_dict = training(model, train_loader, valid_loader, patience)
# enable_reproducibility()

### Metrics

In [19]:
from sklearn.metrics import f1_score as sk_f1_score

@torch.no_grad()
def testing(model, test_loader, device):
    all_results = []
    all_labels = []
    model.eval()
    for labels, texts, texts_lengths in tqdm(test_loader, desc="testing"):
        all_labels.append(labels)
        texts = texts.to(device)
        all_results.append(model(texts, texts_lengths))
    all_results = torch.cat(all_results)
    all_labels = torch.cat(all_labels).view(all_results.shape)
    return all_results, all_labels

def binary_predict(input, output_type=torch.long):
    return (torch.sigmoid(input) > 0.5).type(output_type)

def f1_score(y_pred, y_true):
    y_pred = y_pred.cpu().numpy()
    y_true = y_true.cpu().numpy()
    return sk_f1_score(y_true, y_pred)

In [20]:
# model.load_state_dict(best_model_state_dict)
# outputs, labels = testing(model, test_loader, device)
# preds = binary_predict(outputs)
# print(f"f1-score of the RNNBaseline: {f1_score(preds, labels)}")

Посчитайте f1-score вашего классификатора на тестовом датасете.

**Ответ**: 0.8066

---

## CNN

![](https://www.researchgate.net/publication/333752473/figure/fig1/AS:769346934673412@1560438011375/Standard-CNN-on-text-classification.png)

Для классификации текстов также часто используют сверточные нейронные сети. Идея в том, что как правило сентимент содержат словосочетания из двух-трех слов, например "очень хороший фильм" или "невероятная скука". Проходясь сверткой по этим словам мы получим какой-то большой скор и выхватим его с помощью MaxPool. Далее идет обычная полносвязная сетка. Важный момент: свертки применяются не последовательно, а параллельно. Давайте попробуем!

Вы можете использовать Conv2d с `in_channels=1, kernel_size=(kernel_sizes[0], emb_dim))` или Conv1d c `in_channels=emb_dim, kernel_size=kernel_size[0]`. Но хорошенько подумайте над shape в обоих случаях.

### CNN

In [21]:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout=0.5,
    ):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv1d(in_channels=emb_dim,
                                out_channels=out_channels,
                                kernel_size=kernel_sizes[0])

        self.conv_1 = nn.Conv1d(in_channels=emb_dim,
                                out_channels=out_channels,
                                kernel_size=kernel_sizes[1])
        
        self.conv_2 = nn.Conv1d(in_channels=emb_dim,
                                out_channels=out_channels,
                                kernel_size=kernel_sizes[2])
        
        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, text, *_):
        ## ~ [batch_size x seq_length x embed_size]: 64 предложения x 2098 слов в предложении x 300 размер эмбединга
        embedded = self.embedding(text)
        ## ~ [batch_size x embed_size x embed_size]
        embedded_permute = embedded.permute(0, 2, 1)  # may be reshape here
        conved_0 = F.relu(self.conv_0(embedded_permute))  # may be reshape here
        conved_1 = F.relu(self.conv_1(embedded_permute))  # may be reshape here
        conved_2 = F.relu(self.conv_2(embedded_permute))  # may be reshape here
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))

        # print(f"text.shape={text.shape}")
        # print(f"embedded.shape={embedded.shape}")
        # print(f"embedded_permute.shape={embedded_permute.shape}")
        # print(f"conved_0.shape={conved_0.shape}")
        # print(f"conved_1.shape={conved_1.shape}")
        # print(f"conved_2.shape={conved_2.shape}")
        # print(f"pooled_0.shape={pooled_0.shape}")
        # print(f"pooled_1.shape={pooled_1.shape}")
        # print(f"pooled_2.shape={pooled_2.shape}")
        # print(f"cat.shape={cat.shape}")
        # print('-------------------------------')
            
        return self.fc(cat)

### HYPERPARAMS

In [22]:
kernel_sizes = [3, 4, 5]
vocab_size = len(vocab)
out_channels=64
dropout = 0.5
dim = 300
max_epochs=20

model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout)

model.to(device)

opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()
model.to(device)

CNN(
  (embedding): Embedding(223801, 300)
  (conv_0): Conv1d(300, 64, kernel_size=(3,), stride=(1,))
  (conv_1): Conv1d(300, 64, kernel_size=(4,), stride=(1,))
  (conv_2): Conv1d(300, 64, kernel_size=(5,), stride=(1,))
  (fc): Linear(in_features=192, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

### FIT

In [23]:
! pip install torchsummaryX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from torchsummaryX import summary as summaryx

# sample input tensor
input_size = (64, 2098) ##-- ~ [batch_size x seq_length]: 64 предложения (размер батча) x 2098 слов в предложении
x_sample = torch.zeros(input_size, dtype=torch.long, device=device)

summaryx(model, x_sample)

              Kernel Shape     Output Shape    Params  Mult-Adds
Layer                                                           
0_embedding  [300, 223801]  [64, 2098, 300]  67.1403M   67.1403M
1_conv_0      [300, 64, 3]   [64, 64, 2096]   57.664k  120.7296M
2_conv_1      [300, 64, 4]   [64, 64, 2095]   76.864k   160.896M
3_conv_2      [300, 64, 5]   [64, 64, 2094]   96.064k   201.024M
4_dropout                -        [64, 192]         -          -
5_fc              [192, 1]          [64, 1]     193.0      192.0
--------------------------------------------------------------------
                           Totals
Total params           67.371085M
Trainable params       67.371085M
Non-trainable params          0.0
Mult-Adds             549.790092M


  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embedding,"[300, 223801]","[64, 2098, 300]",67140300.0,67140300.0
1_conv_0,"[300, 64, 3]","[64, 64, 2096]",57664.0,120729600.0
2_conv_1,"[300, 64, 4]","[64, 64, 2095]",76864.0,160896000.0
3_conv_2,"[300, 64, 5]","[64, 64, 2094]",96064.0,201024000.0
4_dropout,-,"[64, 192]",,
5_fc,"[192, 1]","[64, 1]",193.0,192.0


In [25]:
%%time
enable_reproducibility(raise_if_no_deterministic=False)
best_model_state_dict = training(model, train_loader, valid_loader, patience)
enable_reproducibility()

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6345128034934019, Validation Loss: 0.4849814227071859


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.49413217522285796, Validation Loss: 0.4131424442186194


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.4141363228087897, Validation Loss: 0.3749266096343428


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.3470385466521476, Validation Loss: 0.36521806593163536


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.28061647230124737, Validation Loss: 0.3833679090869629


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.20395518285341752, Validation Loss: 0.38653888932224045


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 0.1411144500018367, Validation Loss: 0.41770208323911084


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 8, Training Loss: 0.09571035475139217, Validation Loss: 0.49331608225228424
Patience is over. Training stopped after 4 epochs without decreasing validation loss.
CPU times: user 2min 15s, sys: 1.22 s, total: 2min 16s
Wall time: 2min 19s


### Metrics

In [26]:
model.load_state_dict(best_model_state_dict)
outputs, labels = testing(model, test_loader, device)
preds = binary_predict(outputs)
print(f"f1-score of the CNNBaseline: {f1_score(preds, labels)}")

testing:   0%|          | 0/391 [00:00<?, ?it/s]

f1-score of the CNNBaseline: 0.8409940985786717


Посчитайте f1-score вашего классификатора.

**Ответ**: 0.84

## Интерпретируемость

Посмотрим, куда смотрит наша модель. Достаточно запустить код ниже.

In [None]:
!pip install -q captum

In [None]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [None]:
def forward_with_softmax(inp):
    logits = model(inp)
    return torch.softmax(logits, 0)[0][1]

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))


# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    model.eval()
    text = [tok for tok in TEXT.tokenize(sentence)]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=5000, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [None]:
interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

Попробуйте добавить свои примеры!

In [None]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

## Эмбеддинги слов

Вы ведь не забыли, как мы можем применить знания о word2vec и GloVe. Давайте попробуем!

In [None]:
TEXT.build_vocab(trn, vectors=)# YOUR CODE GOES HERE
# подсказка: один из импортов пока не использовался, быть может он нужен в строке выше :)
LABEL.build_vocab(trn)

word_embeddings = TEXT.vocab.vectors

kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300

In [None]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

device = "cuda" if torch.cuda.is_available() else "cpu"
        return self.fc(hidden)
train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

In [None]:
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=64,
            kernel_sizes=kernel_sizes, dropout=dropout)

word_embeddings = TEXT.vocab.vectors

prev_shape = model.embedding.weight.shape

model.embedding.weight = # инициализируйте эмбэдинги

assert prev_shape == model.embedding.weight.shape
model.to(device)

opt = torch.optim.Adam(model.parameters())

Вы знаете, что делать.

In [None]:
import numpy as np

min_loss = np.inf

cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
        #YOUR CODE GOES HERE

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # YOUR CODE GOES HERE
    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

Посчитайте f1-score вашего классификатора.

**Ответ**:

Проверим насколько все хорошо!

In [None]:
PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)
vis_data_records_ig = []

interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

In [None]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)