In [1]:
import random
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn.feature_extraction.text import CountVectorizer
# from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

In [2]:
random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f0a6a3d5b30>

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path=DATA_PATH,
    )

In [5]:
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

(62155, 2)


Unnamed: 0,review,label
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [6]:
class IMDBData(Dataset):
    def __init__(self, df, max_seq_len):
        self.max_seq_len = max_seq_len
        reviews = df.review.tolist()
        labels = df.label.tolist()
        vectorizer = CountVectorizer(stop_words='english', min_df=0.015)
        vectorizer.fit(reviews)

        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        tokenizer = vectorizer.build_analyzer()

        # map the token to its idx in token2idx (if existed)
        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x) if token in self.token2idx]
        self.pad = lambda x: x + (self.max_seq_len - len(x)) * [self.token2idx['<PAD>']]

        texts = [self.encode(text)[:self.max_seq_len] for text in reviews]
        texts, self.labels = zip(*[(text, label) for text, label in zip(texts, labels) if text])

        self.texts = [self.pad(text) for text in texts]

    def __getitem__(self, i):
        assert len(self.texts[i]) == self.max_seq_len
        return self.texts[i], self.labels[i]
    
    def __len__(self):
        return len(self.texts)

In [7]:
dataset = IMDBData(df, max_seq_len=150)
print(len(dataset.token2idx))

1104


In [8]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.FloatTensor([item[1] for item in batch])
    return inputs, target

batch_size = 1024
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

In [9]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls

In [10]:
glove_embeddings = {}
glove_file = open('glove.6B.300d.txt')
print('Start getting words and coefficients')
for line in tqdm(glove_file):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_embeddings[word] = coefs
glove_file.close()
print(f'Found {len(glove_embeddings)} word vectors.')

3026it [00:00, 15246.84it/s]

Start getting words and coefficients


400000it [00:27, 14481.19it/s]

Found 400000 word vectors.





In [11]:
embedding_matrix = np.zeros((len(dataset.token2idx) + 1, 300))
for word, idx in dataset.token2idx.items():
    if word in glove_embeddings.keys():
        embedding_matrix[idx] = glove_embeddings[word]
print(embedding_matrix.shape)

(1105, 300)


In [19]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dims, hidden_size, embedding_weights):
        super(GRU, self).__init__()
        
        self.n_layers = 1
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        self.embeddings = nn.Embedding(vocab_size, embedding_dims)
        """Learning the embeddings from scratch here gave better results than using glove.
        If you want to use glove, just uncomment these two lines below.
        """
        # self.embeddings.weight = nn.Parameter(torch.tensor(embedding_weights, dtype=torch.float32))
        # self.embeddings.weight.requires_grad = False

        self.gru = nn.GRU(
            embedding_dims,
            hidden_size,
            batch_first=True,
        )

        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        batch_size = x.size(0)

        out_embeddings = self.embeddings(x)
        out_gru, _ = self.gru(out_embeddings)
        output = self.fc(out_gru[:, -1, :]).ravel()
        return output

In [20]:
model = GRU(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    128,
    embedding_matrix
)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criteron = nn.BCEWithLogitsLoss()
model

GRU(
  (embeddings): Embedding(1105, 300)
  (gru): GRU(300, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [21]:
model.train()
train_losses = []
for epoch in range(10):
    losses = []
    total_iters = 0
    for input, label in tqdm(train_loader):
        input, label = input.to(device), label.to(device)
        optimizer.zero_grad()
        y_pred = model(input)
        loss = criteron(y_pred, label)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 3)
        optimizer.step()

        losses.append(loss.item())
        total_iters += 1

    epoch_loss = sum(losses) / total_iters
    train_losses.append(epoch_loss)

    print(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

100%|██████████| 61/61 [00:04<00:00, 13.02it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.33it/s]

Epoch #1	Train Loss: 1.503


100%|██████████| 61/61 [00:04<00:00, 13.08it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.68it/s]

Epoch #2	Train Loss: 0.859


100%|██████████| 61/61 [00:04<00:00, 13.01it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.62it/s]

Epoch #3	Train Loss: 0.754


100%|██████████| 61/61 [00:04<00:00, 13.06it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.26it/s]

Epoch #4	Train Loss: 0.722


100%|██████████| 61/61 [00:04<00:00, 13.01it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.28it/s]

Epoch #5	Train Loss: 0.699


100%|██████████| 61/61 [00:04<00:00, 12.99it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.78it/s]

Epoch #6	Train Loss: 0.624


100%|██████████| 61/61 [00:04<00:00, 13.00it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.07it/s]

Epoch #7	Train Loss: 0.602


100%|██████████| 61/61 [00:04<00:00, 12.89it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.31it/s]

Epoch #8	Train Loss: 0.533


100%|██████████| 61/61 [00:04<00:00, 12.90it/s]
  3%|▎         | 2/61 [00:00<00:04, 13.58it/s]

Epoch #9	Train Loss: 0.497


100%|██████████| 61/61 [00:04<00:00, 12.89it/s]

Epoch #10	Train Loss: 0.459





In [22]:
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        test_vector = torch.LongTensor([dataset.pad(dataset.encode(text))]).to(device)
    
        output = model(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5:
            print(f'{prediction:0.3}: Positive sentiment')
        else:
            print(f'{prediction:0.3}: Negative sentiment')

In [23]:
test_text = """
This poor excuse for a movie is terrible. It has been 'so good it's bad' for a
while, and the high ratings are a good form of sarcasm, I have to admit. But
now it has to stop. Technically inept, spoon-feeding mundane messages with the
artistic weight of an eighties' commercial, hypocritical to say the least, it
deserves to fall into oblivion. Mr. Derek, I hope you realize you are like that
weird friend that everybody know is lame, but out of kindness and Christian
duty is treated like he's cool or something. That works if you are a good
decent human being, not if you are a horrible arrogant bully like you are. Yes,
Mr. 'Daddy' Derek will end on the history books of the internet for being a
delusional sour old man who thinks to be a good example for kids, but actually
has a poster of Kim Jong-Un in his closet. Destroy this movie if you all have a
conscience, as I hope IHE and all other youtube channel force-closed by Derek
out of SPITE would destroy him in the courts.This poor excuse for a movie is
terrible. It has been 'so good it's bad' for a while, and the high ratings are
a good form of sarcasm, I have to admit. But now it has to stop. Technically
inept, spoon-feeding mundane messages with the artistic weight of an eighties'
commercial, hypocritical to say the least, it deserves to fall into oblivion.
Mr. Derek, I hope you realize you are like that weird friend that everybody
know is lame, but out of kindness and Christian duty is treated like he's cool
or something. That works if you are a good decent human being, not if you are a
horrible arrogant bully like you are. Yes, Mr. 'Daddy' Derek will end on the
history books of the internet for being a delusional sour old man who thinks to
be a good example for kids, but actually has a poster of Kim Jong-Un in his
closet. Destroy this movie if you all have a conscience, as I hope IHE and all
other youtube channel force-closed by Derek out of SPITE would destroy him in
the courts.
"""
predict_sentiment(test_text)

0.157: Negative sentiment


In [24]:
test_text = """
Cool Cat Saves The Kids is a symbolic masterpiece directed by Derek Savage that
is not only satirical in the way it makes fun of the media and politics, but in
the way in questions as how we humans live life and how society tells us to
live life.

Before I get into those details, I wanna talk about the special effects in this
film. They are ASTONISHING, and it shocks me that Cool Cat Saves The Kids got
snubbed by the Oscars for Best Special Effects. This film makes 2001 look like
garbage, and the directing in this film makes Stanley Kubrick look like the
worst director ever. You know what other film did that? Birdemic: Shock and
Terror. Both of these films are masterpieces, but if I had to choose my
favorite out of the 2, I would have to go with Cool Cat Saves The Kids. It is
now my 10th favorite film of all time.

Now, lets get into the symbolism: So you might be asking yourself, Why is Cool
Cat Orange? Well, I can easily explain. Orange is a color. Orange is also a
fruit, and its a very good fruit. You know what else is good? Good behavior.
What behavior does Cool Cat have? He has good behavior. This cannot be a
coincidence, since cool cat has good behavior in the film.

Now, why is Butch The Bully fat? Well, fat means your wide. You wanna know who
was wide? Hitler. Nuff said this cannot be a coincidence.

Why does Erik Estrada suspect Butch The Bully to be a bully? Well look at it
this way. What color of a shirt was Butchy wearing when he walks into the area?
I don't know, its looks like dark purple/dark blue. Why rhymes with dark? Mark.
Mark is that guy from the Room. The Room is the best movie of all time. What is
the opposite of best? Worst. This is how Erik knew Butch was a bully.

and finally, how come Vivica A. Fox isn't having a successful career after
making Kill Bill.

I actually can't answer that question.

Well thanks for reading my review.
"""
predict_sentiment(test_text)

0.687: Positive sentiment


In [25]:
test_text = """
Don't let any bullies out there try and shape your judgment on this gem of a
title.

Some people really don't have anything better to do, except trash a great movie
with annoying 1-star votes and spread lies on the Internet about how "dumb"
Cool Cat is.

I wouldn't be surprised to learn if much of the unwarranted negativity hurled
at this movie is coming from people who haven't even watched this movie for
themselves in the first place. Those people are no worse than the Butch the
Bully, the film's repulsive antagonist.

As it just so happens, one of the main points of "Cool Cat Saves the Kids" is
in addressing the attitudes of mean naysayers who try to demean others who
strive to bring good attitudes and fun vibes into people's lives. The message
to be learned here is that if one is friendly and good to others, the world is
friendly and good to one in return, and that is cool. Conversely, if one is
miserable and leaving 1-star votes on IMDb, one is alone and doesn't have any
friends at all. Ain't that the truth?

The world has uncovered a great, new, young filmmaking talent in "Cool Cat"
creator Derek Savage, and I sure hope that this is only the first of many
amazing films and stories that the world has yet to appreciate.

If you are a cool person who likes to have lots of fun, I guarantee that this
is a movie with charm that will uplift your spirits and reaffirm your positive
attitudes towards life.
"""
predict_sentiment(test_text)

0.953: Positive sentiment
