In [1]:
import pandas
import re
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import Counter
from torch.utils.data import DataLoader, Dataset

# Load the reviews CSV from the working directory. The rest of the notebook
# reads its 'review' (text) and 'sentiment' (label string) columns.
df = pandas.read_csv('IMDB_Dataset.csv')

class Vocabulary(object):
    """Word-level vocabulary built from the most frequent training tokens.

    ``self.vocabulary`` is a list whose position is the token id: the special
    tokens come first, followed by up to ``size`` words ordered by decreasing
    frequency. A parallel dict gives O(1) word -> id lookup so that encoding a
    batch does not scan the whole list once per token.
    """

    # Compiled once at class creation instead of on every preprocess() call.
    _BR_TAG = re.compile('<br\\s*/?>')
    _NON_WORD = re.compile("[^a-zA-Z']")
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self, train_data, special_chars = ['<pad>', '<unk>'], size=10000):
        """Count tokens in the training texts and keep the most common ones.

        Args:
            train_data: pandas Series of raw texts; only the first 25000
                entries (by position) are counted.
            special_chars: tokens placed at the start of the vocabulary. The
                list is only read, never mutated.
            size: maximum number of words kept in addition to the special
                tokens.
        """
        voc = Counter()

        for record in train_data.iloc[:25000]:
            voc.update(self.preprocess(record))

        self.vocabulary = special_chars + [word for word, count in voc.most_common()[:size]]

        # setdefault keeps the first position of a token, matching list.index().
        self._word2id = {}
        for idx, word in enumerate(self.vocabulary):
            self._word2id.setdefault(word, idx)

    def text2ids(self, text):
        """Map a text to a list of vocabulary ids.

        Args:
            text: either a raw string (it is preprocessed first) or an already
                tokenised list of words.

        Returns:
            A list of ints; words missing from the vocabulary map to the id
            of '<unk>'.

        Raises:
            ValueError: if an unknown word is met and the vocabulary has no
                '<unk>' entry.
        """
        if not isinstance(text, list):
            text = self.preprocess(text)

        lookup = self._word2id
        unk_id = lookup.get('<unk>')
        out = []
        for word in text:
            idx = lookup.get(word, unk_id)
            if idx is None:
                # No '<unk>' configured: raise the same ValueError a list
                # lookup would.
                idx = self.vocabulary.index('<unk>')
            out.append(idx)
        return out

    def make_batch(self, texts):
        """Tokenise, pad and encode a collection of raw texts.

        Args:
            texts: non-empty iterable of raw strings.

        Returns:
            (tokens, lengths): ``tokens`` is an int32 tensor of shape
            (len(texts), longest sequence) padded on the right with the id of
            '<pad>'; ``lengths`` holds the unpadded token count of each text.

        Raises:
            ValueError: if ``texts`` is empty.
        """
        preprocessed = [self.preprocess(text) for text in texts]
        lengths = [len(words) for words in preprocessed]
        max_len = max(lengths)

        # Build padded copies rather than extending the token lists in place.
        padded = [words + ['<pad>'] * (max_len - len(words)) for words in preprocessed]

        tokens = [self.text2ids(words) for words in padded]
        tokens = torch.tensor(tokens, dtype=torch.int32)
        lengths = torch.tensor(lengths)
        return tokens, lengths

    def preprocess(self, text, length=400):
        """Clean a raw review and split it into lowercase word tokens.

        The text is first cut to ``length`` *characters* (not words). Then
        ``<br>`` tags and every character other than ASCII letters and
        apostrophes are replaced by spaces, the text is lowercased, whitespace
        runs are collapsed, and the result is split on single spaces.

        Because the split is on ' ', a text that starts or ends with a
        separator yields an empty-string token at that edge. This is kept
        as-is: token ids (and any model trained on them) depend on it.

        Returns:
            A list of tokens; never empty (an empty text gives ``['']``).
        """
        text = text[:length]
        text = self._BR_TAG.sub(' ', text)
        text = self._NON_WORD.sub(' ', text)
        text = text.lower()
        text = self._WHITESPACE.sub(' ', text)
        return text.split(' ')

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.vocabulary)

class Reviews(Dataset):
    """Dataset of (review text, binary label) pairs.

    ``data`` must expose 'review' and 'sentiment' columns (e.g. a DataFrame).
    Items are addressed by position 0..len-1, whatever index the source frame
    carries, so a slice such as ``df.iloc[25000:]`` works without a prior
    ``reset_index``.
    """

    def __init__(self, data):
        # Materialise the columns as lists: Series[int] is a label lookup and
        # raises KeyError when the frame's index does not start at 0.
        self.reviews = list(data['review'])
        self.labels = list(data['sentiment'])

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(text, label)``; label is 0 for 'negative', 1 otherwise."""
        label = 0 if self.labels[idx] == 'negative' else 1
        return self.reviews[idx], label
    
def collate_fn(batch):
    """Turn a list of (text, label) samples into model-ready tensors.

    Uses the module-level ``voc`` to tokenise and pad the texts.

    Returns:
        (token ids, sequence lengths, float label tensor).
    """
    reviews = []
    targets = []
    for sample in batch:
        reviews.append(sample[0])
        targets.append(sample[1])

    token_ids, seq_lengths = voc.make_batch(reviews)
    target_tensor = torch.tensor(targets).float()
    return token_ids, seq_lengths, target_tensor

# Number of most frequent words kept; the special tokens are added on top.
voc_size = 10000
# The whole 'review' column is passed in, but Vocabulary.__init__ only counts
# tokens from its first 25000 entries, i.e. the same rows used for training.
voc = Vocabulary(df['review'], size=voc_size)
# First 25000 rows form the training split; later rows are held out.
dataset = Reviews(df.iloc[:25000])
loader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
# Smoke test: draw one batch and display (tokens, lengths, labels).
next(iter(loader))

(tensor([[5253,    1,    8,  ...,    0,    0,    0],
         [  51,    7, 1580,  ...,    0,    0,    0],
         [   2,    1,    5,  ...,    0,    0,    0],
         ...,
         [ 485,  315,   11,  ...,    0,    0,    0],
         [ 134,  190,   50,  ...,  757,    0,    0],
         [   9,    8,   22,  ...,    0,    0,    0]], dtype=torch.int32),
 tensor([58, 73, 66, 68, 70, 69, 74, 68, 67, 54, 72, 70, 70, 76, 66, 44, 72, 72,
         78, 74, 69, 69, 70, 74, 71, 75, 72, 70, 49, 68, 76, 38]),
 tensor([1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
         1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1.]))

In [2]:
class LSTM(nn.Module):
    """Recurrent binary classifier over padded token-id sequences.

    Despite its name, the recurrent core is a unidirectional multi-layer
    ``nn.GRU``. The class and attribute names are kept so that previously
    saved state dicts still load.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: embedding width and GRU hidden width.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=3):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(input_size=hidden_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)

        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, text, text_len):
        """Return one probability per sequence.

        Args:
            text: integer tensor of token ids, shape (batch, seq), padded on
                the right.
            text_len: unpadded length of each sequence (tensor or list of
                ints, all >= 1).

        Returns:
            Tensor of shape (batch,) with sigmoid outputs in (0, 1).
        """
        text_emb = self.embedding(text)

        # pack_padded_sequence wants the lengths on the CPU, wherever the
        # inputs live; as_tensor also lets callers pass a plain list.
        lengths = torch.as_tensor(text_len, dtype=torch.int64).cpu()

        packed_input = pack_padded_sequence(text_emb, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Top-layer hidden state at the last real (non-pad) step of each row.
        # The GRU is unidirectional, so there is no reverse-direction half of
        # the feature axis to concatenate.
        batch_idx = torch.arange(output.size(0), device=output.device)
        last_idx = (lengths - 1).to(output.device)
        last_hidden = output[batch_idx, last_idx]

        text_fea = self.drop(last_hidden)
        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)

        return text_out

In [3]:
# Build the model, loss and optimizer. The model is moved to the device once,
# before the optimizer captures its parameters.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(voc)).to(device)
criterion = nn.BCELoss()  # the model already applies a sigmoid
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Set to True to retrain from scratch; otherwise saved weights are loaded.
train=False
if train:
    for epoch in range(1,10):
        # Re-enable dropout explicitly: a previously executed evaluation cell
        # leaves the model in eval mode within the same kernel.
        model.train()
        losses = []
        for data, lengths, label in loader:
            optimizer.zero_grad()
            data = data.to(device)
            label = label.to(device)
            # lengths stay on the CPU, as sequence packing requires.
            output = model(data, lengths)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            # Running mean of the epoch's batch losses, redrawn in place.
            print('\r', sum(losses) / len(losses), end='')
        # Checkpoint after every epoch (overwrites the previous one).
        torch.save(model.state_dict(), './model.pth')
        print("epoch " , epoch)
else:
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load('model.pth', map_location=device))

In [5]:
# Held-out split: every row after the first 25000, re-indexed from 0.
test_dataset = Reviews(df.iloc[25000:].reset_index(drop=True))
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)
model.eval()  # disable dropout for inference
preds = []
labels = []
# No gradients are needed for evaluation; skipping autograd bookkeeping saves
# memory and time.
with torch.no_grad():
    for i, (text, lengths, label) in enumerate(test_loader):
        print('\r', i, end='')
        text = text.to(device)
        output = model(text, lengths)
        # Threshold the probabilities at 0.5 -> int predictions (0 or 1).
        preds.extend((output > 0.5).int().tolist())
        # Labels are collected as plain floats (0.0 / 1.0).
        labels.extend(label.tolist())

 781

In [6]:
import numpy as np
# Convert the collected predictions and labels to arrays, then report
# accuracy as the fraction of positions where they agree.
preds = np.array(preds)
labels = np.array(labels)
print(np.count_nonzero(preds == labels) / len(preds))

0.79556


In [7]:
# Show predictions and ground-truth labels, one array per line.
print(preds, labels, sep='\n')

[0 1 1 ... 1 1 0]
[0. 0. 1. ... 0. 0. 0.]
