<a href="https://colab.research.google.com/github/Neafiol/Tinkoff/blob/master/Rnn/Rnn_sentiment_dz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import numpy as np

## Как выглядит классификация с RNN в общем виде 

<img src="https://cdn-images-1.medium.com/max/1600/1*vhAfRLlaeOXZ-bruv7Ostg.png" width="400">

In [0]:
class ImageRNN(nn.Module):
    def __init__(self, batch_size, n_steps, n_inputs, n_neurons, n_outputs):
        super(ImageRNN, self).__init__()
        
        self.n_neurons = n_neurons
        self.batch_size = batch_size
        self.n_steps = n_steps
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        self.basic_rnn = nn.RNN(self.n_inputs, self.n_neurons) 
        
        self.FC = nn.Linear(self.n_neurons, self.n_outputs)
        
    def init_hidden(self,):
        # (num_layers, batch_size, n_neurons)
        return (torch.zeros(1, self.batch_size, self.n_neurons))
        
    def forward(self, X):
        # transforms X to dimensions: n_steps X batch_size X n_inputs
        X = X.permute(1, 0, 2) 
        
        self.batch_size = X.size(1)
        self.hidden = self.init_hidden()
        
        lstm_out, self.hidden = self.basic_rnn(X, self.hidden)      
        out = self.FC(self.hidden)
        
        return out.view(-1, self.n_outputs) # batch_size X n_output

# Сентимент анализ по аналогии

пишем сами с нуля

<img src="https://github.com/bentrevett/pytorch-sentiment-analysis/raw/bf8cc46e4823ebf9af721b595501ad6231c73632/assets/sentiment1.png">

In [0]:
import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [3]:
from torchtext import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root="./data")

aclImdb_v1.tar.gz:   0%|          | 164k/84.1M [00:00<00:59, 1.40MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 64.7MB/s]


In [4]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [5]:
print(vars(train_data.examples[0])['text'])
len(train_data.examples)

['I', 'saw', 'Heartland', 'when', 'it', 'was', 'first', 'released', 'in', '1980', 'and', 'I', 'have', 'just', 'seen', 'it', 'again', '.', 'It', 'improves', 'with', 'age', '.', 'Heartland', 'is', 'not', 'just', 'for', 'lovers', 'of', '"', 'indie', '"', 'films', '.', 'At', 'a', 'time', 'when', 'most', 'American', 'films', 'are', 'little', 'more', 'than', 'cynical', 'attempts', 'to', 'make', 'money', 'with', 'CGI', ',', 'pyrotechnics', ',', 'and/or', 'vulgarity', ',', 'Heartland', 'holds', 'up', 'as', 'a', 'slice', 'of', 'American', 'history', '.', 'It', 'is', 'also', 'a', 'reminder', 'of', 'how', 'spoiled', 'most', 'of', 'us', 'modern', ',', 'urbanized', 'Americans', 'are.<br', '/><br', '/>Nothing', 'in', 'this', 'film', 'is', 'overstated', 'or', 'stagey', '.', 'No', 'one', 'declaims', 'any', 'Hollywood', 'movie', 'speeches', '.', 'The', 'actors', 'really', 'inhabit', 'their', 'roles', '.', 'This', 'really', 'feels', 'like', 'a', '"', 'small', '"', 'film', 'but', 'really', 'it', 'is', 'b

25000

In [6]:
# Сделаем еще eval
import random

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

# Сделаем словарь
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [8]:
print(TEXT.vocab.itos[:10])
vars(LABEL.vocab)

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


{'freqs': Counter({'neg': 8810, 'pos': 8690}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 0, 'pos': 1}),
 'vectors': None}

In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# собираем батчи так, чтобы в каждом батче были примеры наиболее похожей длины
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device)

## Делаем модель

<img src="https://github.com/bentrevett/pytorch-sentiment-analysis/raw/bf8cc46e4823ebf9af721b595501ad6231c73632/assets/sentiment7.png" width="450">

* В эмбеддер (emb = [torch.nn.Embedding(num_embeddings, embedding_dim)](https://pytorch.org/docs/stable/nn.html?highlight=embedding#torch.nn.Embedding)) запихиваем тензор размерностью **[sentence length, batch size]**
* Эмбеддер возвращает тензор размерностью **[sentence length, batch size, embedding dim]**
* RNN (torch.nn.RNN(embedding_dim, hidden_dim)) возвращает 2 тензора, *output* размера [sentence length, batch size, hidden dim] и *hidden* размера [1, batch size, hidden dim]

In [0]:
class RNN(nn.Module):
    def __init__(self, batch_size,input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        
        self.batch_size = batch_size

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.basic_rnn = nn.RNN(self.embedding_dim, self.hidden_dim) 
        self.clas = nn.Linear(self.hidden_dim, self.output_dim)
        self.sigm = nn.Sigmoid()
        
        
    def init_hidden(self):
        # (num_layers, batch_size, n_neurons)
        return (torch.zeros(1, self.batch_size, self.hidden_dim))
      
    def forward (self, text):

        #text = [sent len, batch size]
        
        h=self.init_hidden()
        
        embedded = self.embedding (text)
        out, hidden = self.basic_rnn(embedded,h)        

        output=self.clas(hidden)
        output=self.sigm(output)

        return output[0].view(-1,self.output_dim)

In [0]:
input_dim = len(TEXT.vocab.freqs)

batch_size=64
embedding_dim = 12
hidden_dim = 32
output_dim = 2
N_EPHOCS = 100

In [0]:
import torch.optim as optim

# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model instance
model = RNN(batch_size,input_dim, embedding_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def get_accuracy(logit, target, batch_size):
    ''' Obtain accuracy for training round '''
    corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
    accuracy = 100.0 * corrects/batch_size
    return accuracy.item()
  

In [0]:
for epoch in range(N_EPHOCS):  # loop over the dataset multiple times
    train_running_loss = 0.0
    train_acc = 0.0
    model.train()
    
    # TRAINING ROUND
    for i, data in enumerate(train_iterator):
         # zero the parameter gradients
        optimizer.zero_grad()
        
        # get the inputs
        inputs, labels = data.text, data.label.long()
        
        if(inputs.shape[1]!=batch_size):
          continue

        # forward + backward + optimize
        
        out = model(inputs)
        

        loss = criterion(out, labels)
        
        loss.backward()
        optimizer.step()

        train_running_loss += loss.detach().item()
        train_acc += get_accuracy(out, labels, BATCH_SIZE)
         
    model.eval()
    print('Epoch:  %d | Loss: %.4f | Train Accuracy: %.2f' 
          %(epoch, train_running_loss / i, train_acc/i))

Epoch:  0 | Loss: 0.6934 | Train Accuracy: 49.20
Epoch:  1 | Loss: 0.6932 | Train Accuracy: 49.56
Epoch:  2 | Loss: 0.6931 | Train Accuracy: 49.21
Epoch:  3 | Loss: 0.6930 | Train Accuracy: 49.58
Epoch:  4 | Loss: 0.6935 | Train Accuracy: 49.33
Epoch:  5 | Loss: 0.6935 | Train Accuracy: 50.02
Epoch:  6 | Loss: 0.6935 | Train Accuracy: 49.34
Epoch:  7 | Loss: 0.6934 | Train Accuracy: 49.64


In [35]:
inputs.shape[1]

28

In [32]:
labels.long().dtype

torch.int64