# Sentiment Analysis of Movie Reviews

In [132]:
import os

import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
nlp = spacy.load('en')


from torchtext.data import Field, LabelField, TabularDataset, Iterator
from torchtext.vocab import Vectors


In [133]:
class MyDataset(object):

    def __init__(self, root_dir='data/sentiment/', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField()
        vectors = Vectors(name='data/sentiment/mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)

In [134]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes,
                 output_dim, use_dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters,
                      kernel_size=(fs, embedding_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(0.5 if use_dropout else 0.)

    def forward(self, x):
        x = x.permute(1, 0)
        embedded = self.embedding(x)
        embedded = embedded.unsqueeze(1)

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)

In [135]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    # round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    # convert into float for division
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [136]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# device = 'cpu'
dataset = MyDataset(batch_size=64, use_vector=True)


In [137]:
model = CNN(len(dataset.TEXT.vocab), 300, 100, [3, 4, 5], 1, True).to(device)

In [138]:
model.embedding.weight.data.copy_(dataset.TEXT.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0120,  0.2075, -0.1258,  ...,  0.1387, -0.3605, -0.0350],
        ...,
        [ 0.2679,  0.1430,  0.4684,  ...,  0.0252, -0.1573,  0.4174],
        [-0.1195,  0.3609,  0.0184,  ..., -0.6871, -0.6613, -0.2103],
        [ 0.2707, -0.0874, -0.3683,  ...,  0.0368, -0.2172, -0.3406]],
       device='cuda:0')

In [139]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [140]:
criterion = nn.BCEWithLogitsLoss(size_average=True).to(device)



In [141]:
def train(model, iterator, optimizer, criterion, device):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        batch.text = batch.text.permute(1, 0)
        pred = model(batch.text.to(device)).squeeze(1)
        loss = criterion(pred, batch.label.double().to(device))
        acc = binary_accuracy(pred, batch.label.double().to(device))

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion, device):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    for batch in iterator:
        batch.text = batch.text.permute(1, 0)
        pred = model(batch.text.to(device)).squeeze(1)
        loss = criterion(pred, batch.label.double().to(device))
        acc = binary_accuracy(pred, batch.label.double().to(device))

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [142]:
best_acc = 0
for epoch in range(100):
    train_loss, train_acc = train(model, dataset.dataloader['train'],
                                  optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, dataset.dataloader['dev'],
                                     criterion, device)
    test_loss, test_acc = evaluate(model, dataset.dataloader['test'],
                                   criterion, device)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc * 100:.2f}%')

    if best_acc <= valid_acc:
        best_acc = valid_acc
        acc_result = test_acc
        pth = model.state_dict()
        filename = "checkpoints/{}_{}_bs{}_filter{}_acc{:.03f}.pth".format(
                "CNN", "SGD", "64", 100, test_acc
            )
torch.save(model, filename)

print("training done")



Epoch: 01, Train Loss: 0.699, Train Acc: 48.30%, Val. Loss: 0.694, Val. Acc: 48.44%, Test Loss: 0.696, Test Acc: 46.11%
Epoch: 02, Train Loss: 0.699, Train Acc: 49.25%, Val. Loss: 0.694, Val. Acc: 48.34%, Test Loss: 0.695, Test Acc: 48.13%
Epoch: 03, Train Loss: 0.697, Train Acc: 49.83%, Val. Loss: 0.693, Val. Acc: 50.18%, Test Loss: 0.694, Test Acc: 49.96%
Epoch: 04, Train Loss: 0.697, Train Acc: 49.95%, Val. Loss: 0.692, Val. Acc: 51.34%, Test Loss: 0.693, Test Acc: 51.38%
Epoch: 05, Train Loss: 0.695, Train Acc: 50.73%, Val. Loss: 0.691, Val. Acc: 52.39%, Test Loss: 0.692, Test Acc: 52.21%
Epoch: 06, Train Loss: 0.694, Train Acc: 50.59%, Val. Loss: 0.691, Val. Acc: 53.40%, Test Loss: 0.691, Test Acc: 53.43%
Epoch: 07, Train Loss: 0.693, Train Acc: 51.71%, Val. Loss: 0.690, Val. Acc: 54.51%, Test Loss: 0.690, Test Acc: 55.01%
Epoch: 08, Train Loss: 0.693, Train Acc: 52.66%, Val. Loss: 0.689, Val. Acc: 56.07%, Test Loss: 0.690, Test Acc: 55.90%
Epoch: 09, Train Loss: 0.691, Train Acc:

# Inference

In [143]:
model = torch.load('checkpoints/CNN_SGD_bs64_filter100_acc0.730.pth')

In [157]:
sentence = """Holland’s film manages to really get under one’s skin on the whole, 
remaining a compelling and engaging watch throughout in spite of its rambling feel. 
This is in large part thanks to the awesome first-time writer Andrea Chalupa’s 
clear-eyed resolve to find present-day relevancy in Jones’s heroic 
commitment to publishing the facts and agitating those in power, 
when today’s world has almost become numb to fake news. It's a 
fun and exciting movie to watch. Overall we give the movie a 7/10 score
to enjoy with your friends and family. 👍"""

dataset = MyDataset(batch_size=1, use_vector=True)
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [dataset.TEXT.vocab.stoi[t] for t in tokenized]
tensor = torch.LongTensor(indexed).to(device)
tensor = tensor.unsqueeze(1)
prediction = F.sigmoid(model(tensor))
print(prediction.item())

0.6402595639228821


In [145]:
sentence = """While “Mr. Jones” isn’t close to being in the same league as 
great journalism films such as “All The President’s Men” and “Spotlight,” 
it takes a noble page from their book in not delivering a conclusive ending. 
Instead, it leaves things at a place where the work to uncover “only one version of the truth” 
(as often cited by characters in the film) seems to be just beginning. 
This might not be the ultimate movie to honor the idealistic legacy of Gareth Jones, but it’s a dignified one all the same, with an uncompromising moral compass."""

dataset = MyDataset(batch_size=1, use_vector=True)
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [dataset.TEXT.vocab.stoi[t] for t in tokenized]
tensor = torch.LongTensor(indexed).to(device)
tensor = tensor.unsqueeze(1)
prediction = F.sigmoid(model(tensor))
print(prediction.item())

0.5298455357551575


In [146]:
sentence = """They say it takes a village to raise a child, but in “Safety,” 
it takes an entire university campus to do so. Overly sentimental traps line 
the plot of the film, streaming on Disney+. But it scores points for giving 
its lead characters complicated situations, emotional depth and political dimension."""

dataset = MyDataset(batch_size=1, use_vector=True)
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [dataset.TEXT.vocab.stoi[t] for t in tokenized]
tensor = torch.LongTensor(indexed).to(device)
tensor = tensor.unsqueeze(1)
prediction = F.sigmoid(model(tensor))
print(prediction.item())

0.4953272044658661


In [160]:
sentence = """Not only does Ruth succeed in fulfilling the promise, 
but also the child is subsequently cured of his cancer. Baseball player 
Ted Williams believed it to be the worst movie he had ever seen 👎 and 
The Washington Times stated that it "stands as possibly the worst movie ever made" 😒.
The film has been called one of the worst sports films ever by Newsday and The A.V. Club"""

dataset = MyDataset(batch_size=1, use_vector=True)
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [dataset.TEXT.vocab.stoi[t] for t in tokenized]
tensor = torch.LongTensor(indexed).to(device)
tensor = tensor.unsqueeze(1)
prediction = F.sigmoid(model(tensor))
print(prediction.item())

0.4291599690914154
