# Импорт нужных библиотек.

In [1]:
import os
import codecs
import numpy as np
import re
import collections
from torchtext.data.utils import get_tokenizer
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch import optim

# Загрузка данных

In [2]:
data_path = 'C:\\Users\\TOPKEK\\Documents\\Python_Directory\\aclImdb'
train_texts = []
train_labels = []
for category in ['pos', 'neg']:
    train_data_path = os.path.join(data_path, 'train', category)
    for fname in sorted(os.listdir(train_data_path)):
        if fname.endswith('.txt'):
            with codecs.open(os.path.join(train_data_path, fname), 'r', 'utf_8_sig') as f:
                train_texts.append(f.read())
            train_labels.append(0 if category == 'neg' else 1)

test_texts = []
test_labels = []
for category in ['pos', 'neg']:
    test_data_path = os.path.join(data_path, 'test', category)
    for fname in sorted(os.listdir(test_data_path)):
        if fname.endswith('.txt'):
            with codecs.open(os.path.join(test_data_path, fname), 'r', 'utf_8_sig') as f:
                test_texts.append(f.read())
            test_labels.append(0 if category == 'neg' else 1)

In [3]:
print('Наибольшая длина отзыва', max([len(rewiew) for rewiew in train_texts]))
print('Средняя длина отзыва', int(np.mean([len(rewiew) for rewiew in train_texts])))

Наибольшая длина отзыва 13704
Средняя длина отзыва 1325


# Обработка данных

## Токенизатор

Для обработки текста мной был написан такой токенизатор. Он заменяет некоторые не имеющие для нас смысла символы в тексте на пробелы, также делает ещё пару упрощений. Ну и, соответсвенно, делает саму токенизацию. Также поставил минимальный размер токена равный трём, чтобы учитывать важные для классификации слова как "not", "bad", и так далее.

In [5]:
_patterns = [r'\"',
             r'<br /><br />',
             r'\;',
             r'\:',
             r'\s+',
             r'\(',
             r'\)']

_replacements = ['',
                 ' ',
                 ' ',
                 ' ',
                 ' ',
                '',
                '']
_patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))

def normalize(line):
    line = line.lower()
    for pattern_re, replaced_str in _patterns_dict:
        line = pattern_re.sub(replaced_str, line)
    return line

TOKEN_RE = re.compile(r'[a-z]+|\d+[.,]\d+|\d+')

def tokenize(txt, min_token_size = 3):
    txt = normalize(txt)
    all_tokens = TOKEN_RE.findall(txt)
    return [token for token in all_tokens if len(token) >= min_token_size]

def tokenize_corpus(texts, tokenizer=tokenize, **tokenizer_kwargs):
    return [tokenizer(text, **tokenizer_kwargs) for text in texts]

In [60]:
train_texts = train_texts + test_texts
train_labels = train_labels + test_labels

In [61]:
from string import punctuation

# Функция, собирающая в себе нормализацию и токенизацию всего корпуса одновременно
def preprocess(text):
    text = [normalize(texti) for texti in text]
    all_reviews = tokenize_corpus(text, min_token_size = 3)
    text = " ".join(text)
    all_words = tokenize(text, min_token_size = 3)
    
    return all_reviews, all_words


all_reviews, all_words = preprocess(train_texts)

## Построение словаря

Также необходимо занумеровать токены. Для этого строятся два обратных словаря, сопоставляющие друг другу номер токена и сам токен.

In [64]:
from collections import Counter

word_counts = Counter(all_words)
word_list = sorted(word_counts, key = word_counts.get, reverse = True)
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)}
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}
encoded_reviews = [[vocab_to_int[word] for word in review] for review in all_reviews]


Метки были уже закодированы на процессе считывания данных, так что их оставляем.

In [70]:
encoded_labels = train_labels

In [71]:
encoded_labels = np.array( [label for idx, label in enumerate(encoded_labels) if len(encoded_reviews[idx]) > 0] )
encoded_reviews = [review for review in encoded_reviews if len(review) > 0]

Функция ниже дополняет нулями закодированные тексты так, чтобы они все имели одинаковую длину.

In [72]:
def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0]*(seq_length-len(review)) + review)
        
    return np.array(reviews)


padded_reviews = pad_text(encoded_reviews, seq_length = 200)

Простенькая функция, перемешивающая тексты и метки

In [73]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]
padded_reviews, encoded_labels = unison_shuffled_copies(padded_reviews, encoded_labels)

Пример закодированного текста

In [106]:
padded_reviews[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   345,    58,     3,     6,
          42,  1599,   308,   161,     1,   800,    45,   141,   984,
           2,     5,  9871,     7,   188,     6,    61,    51,   318,
         104,   257,

Ниже мы разбиваем __все__ наши данные на три сета: тренировочный, валидационный и тестовый.

In [74]:
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2
total = padded_reviews.shape[0]
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = padded_reviews[:train_cutoff], encoded_labels[:train_cutoff]
valid_x, valid_y = padded_reviews[train_cutoff : valid_cutoff], encoded_labels[train_cutoff : valid_cutoff]
test_x, test_y = padded_reviews[valid_cutoff:], encoded_labels[valid_cutoff:]

from torch.utils.data import TensorDataset, DataLoader
#---------------
train_x = torch.Tensor(train_x).long()
train_y = torch.Tensor(train_y).long()
valid_x = torch.Tensor(valid_x).long()
valid_y = torch.Tensor(valid_y).long()
test_x = torch.Tensor(test_x).long()
test_y = torch.Tensor(test_y).long()
#---------------
train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

# Построение модели

In [75]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        self.n_vocab = n_vocab     # количесво уникальных слов
        self.n_layers = n_layers   # слои
        self.n_hidden = n_hidden   # нейронов в слоях
        
        self.embedding = nn.Embedding(n_vocab + 1, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                                 # Размерности матриц   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        # Искомое число - лишь последний выход последнего элемента рекуррентной последовательности
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = 'cpu'
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

Настройка параметров. Из-за не самого большого количества как видео, так и оперативной памяти в моём расположении сейчас, сеть будет очень скромная.

In [76]:
n_vocab = len(vocab_to_int)
n_embed = 32
n_hidden = 8
n_output = 1   
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [77]:
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)


In [78]:
print_every = 100
step = 0
n_epochs = 10  
clip = 5  # предотвращение градиентного скачка
device = 'cpu'

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        
        # Наблюдение за процессом обучения.
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(batch_size)
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()



Epoch: 1/10 Step: 100 Training Loss: 0.6938 Validation Loss: 0.6974
Epoch: 1/10 Step: 200 Training Loss: 0.6912 Validation Loss: 0.6929
Epoch: 1/10 Step: 300 Training Loss: 0.6890 Validation Loss: 0.6938
Epoch: 1/10 Step: 400 Training Loss: 0.6829 Validation Loss: 0.6879
Epoch: 1/10 Step: 500 Training Loss: 0.6687 Validation Loss: 0.6753
Epoch: 1/10 Step: 600 Training Loss: 0.6808 Validation Loss: 0.6678
Epoch: 1/10 Step: 700 Training Loss: 0.6288 Validation Loss: 0.5631
Epoch: 1/10 Step: 800 Training Loss: 0.6152 Validation Loss: 0.5583
Epoch: 2/10 Step: 900 Training Loss: 0.6278 Validation Loss: 0.5878
Epoch: 2/10 Step: 1000 Training Loss: 0.5969 Validation Loss: 0.5670
Epoch: 2/10 Step: 1100 Training Loss: 0.5802 Validation Loss: 0.6017
Epoch: 2/10 Step: 1200 Training Loss: 0.5283 Validation Loss: 0.4994
Epoch: 2/10 Step: 1300 Training Loss: 0.6690 Validation Loss: 0.6441
Epoch: 2/10 Step: 1400 Training Loss: 0.6589 Validation Loss: 0.6270
Epoch: 2/10 Step: 1500 Training Loss: 0.529

# Оценка качества модели.

In [79]:
net.eval()
test_losses = []
num_correct = 0
test_h = net.init_hidden(batch_size)

for inputs, labels in test_loader:
    test_h = tuple([each.data for each in test_h])
    test_output, test_h = net(inputs)
    loss = criterion(test_output, labels.float())
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

Test Loss: 0.5193
Test Accuracy: 0.85


In [172]:
def predict(net, review, seq_length = 200):
    device = 'cpu' 
    
    words, _ = preprocess([review])
    #print(words)
    encoded_words = [vocab_to_int[word] for word in words[0]]
    padded_words = pad_text([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)
    
    if(len(padded_words) == 0):
        "Your review must contain at least 1 word!"
        return None
    
    net.eval()
    h = net.init_hidden(1)
    padded_words = torch.Tensor(padded_words.float()).long()
    output, h = net(padded_words)
    pred = output.squeeze()
    
    return pred

def rating_pred(out):
    if out > .5:
        return round(7 + 6 * (out - 0.5))
    else:
        return round(4 - 6 * (-out + 0.5))
    
    
review = "I think that this movie can be better. But, unfortunately, it is not."

print(predict(net, review)[-1].detach().numpy(), rating_pred(predict(net, review1)[-1].detach().numpy())) 


0.37367857 3.0
