In [1]:
## Bag-of-Words Text Classification using PyTorch
# Import libraries
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

In [2]:
# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Load data 
DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path=DATA_PATH,
    )

Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... Done.


In [4]:
# View some example records
pd.read_csv(DATA_PATH).sample(5)

Unnamed: 0,review,label
29982,James Gandolfini is a good actor so what ever ...,0
54409,"Look,I'm reading and reading this comments and...",0
13329,Movie about two Australian girls--Debbie (Nell...,1
10046,I gave Soul Plane the benefit of the doubt and...,0
60943,What's in here ?! Let me tell you. It's the pr...,0


In [5]:
# Pre-processing the data
class Sequences(Dataset):
    def __init__(self, path):
        df = pd.read_csv(path)
        self.vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
        self.sequences = self.vectorizer.fit_transform(df.review.tolist())
        self.labels = df.label.tolist()
        self.token2idx = self.vectorizer.vocabulary_
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
        
    def __getitem__(self, i):
        return self.sequences[i, :].toarray(), self.labels[i]
    
    def __len__(self):
        return self.sequences.shape[0]

In [6]:
dataset = Sequences(DATA_PATH)
train_loader = DataLoader(dataset, batch_size=4096)

print(dataset[5][0].shape)

(1, 3028)


In [7]:
# Create Bag-of-Words classifier
class BagOfWordsClassifier(nn.Module):
    def __init__(self, vocab_size, hidden1, hidden2):
        super(BagOfWordsClassifier, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [8]:
model = BagOfWordsClassifier(len(dataset.token2idx), 128, 64)
model

BagOfWordsClassifier(
  (fc1): Linear(in_features=3028, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [9]:
# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [10]:
# Training
model.train()
train_losses = []
for epoch in range(10):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, target in progress_bar:
        model.zero_grad()

        output = model(inputs)
        loss = criterion(output.squeeze(), target.float())
        
        loss.backward()
              
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()
        
        progress_bar.set_description(f'Loss: {loss.item():.3f}')
        
        losses.append(loss.item())
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #1	Train Loss: 0.706


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #2	Train Loss: 0.676


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #3	Train Loss: 0.643


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #4	Train Loss: 0.584


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #5	Train Loss: 0.503


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #6	Train Loss: 0.427


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #7	Train Loss: 0.371


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #8	Train Loss: 0.335


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #9	Train Loss: 0.312


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch #10	Train Loss: 0.296


In [11]:
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        test_vector = torch.LongTensor(dataset.vectorizer.transform([text]).toarray())

        output = model(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5:
            print(f'{prediction:0.3}: Positive sentiment')
        else:
            print(f'{prediction:0.3}: Negative sentiment')

In [12]:
# Make some predictions on Goodreads reviews of Lord of the Rings: The Return of the King
# Start with a 1 star review
test_text = """
As one who thinks great characters are what makes a book great, Lord of the Rings was a huge disappointment. 
All the characters, without exception, could be described as two-dimensional at best. 
They seem to lack the emotional complexity normally found in intelligent beings, and instead seem more like characters from a fairy tale, where everybody is either 100% Good or 100% Evil. 
Despite having 1,349 pages with which to work, most of the characters' back stories are never really explored, save the odd one paragraph anecdote about a past incident. 
And nor do they ever really change despite their journey and experiences. I found each character to be so vague, I was never able to develop any sympathy or attachment to any of them.
That there wasn't really any story behind most of the characters was only part of the problem however. 
With a couple of different plot lines unfolding in the third book, there would be times that I wouldn't read of Frodo or Mary for nearly a hundred pages. 
It would be so long that I would have trouble remembering what had happened to them or where they were. 
Any attachment I had been developing was long gone, as I found myself flipping back dozens and dozens of pages, trying to refresh myself on what had happened so long ago.
My next beef with this book was the plot. Sure, as I mentioned above, there was magic, there was war, there was adventure, and there's nothing wrong with those things. 
But the story was just too formulaic for my tastes. 
Everything always seemed to reach the only possible conclusion, and any hardships the main players did face (which was usually that they hadn't eaten for twenty minutes), was the briefest of problems, resolved almost immediately, allowing them to continue on their way to a predictable outcome.
"""
predict_sentiment(test_text)

0.107: Negative sentiment


In [14]:
# Continue with 3-star
test_text = """
The only reason he sometimes gets away with breaking such sensible rules of storytelling is that he often has a purpose for breaking them, and is capable of drawing on his wealth of knowledge to instill further depth and richness in his world. 
Sometimes, when he slowed his story down with such asides, they did not have enough purpose to merit inclusion, a flaw in pacing which has only increased with modern authors.
But underneath all of that, Tolkien does have an appealing and exciting story to tell, of war and succession and moral struggles--the same sort of story that has been found in our myths since the very earliest writings of man.
He does not create a straight monomyth, because, like Milton, he presents a hero divided. Frodo takes after the Adam, placing strength in humility and piety, not martial might or wit. 
Aragorn is an attempt to save the warlike, aristocratic hero whom Milton criticized in his portrayal of Satan.
Yet unlike Satan, we do not get an explanation of what makes Strider superior, worthy, or--more importantly--righteous. 
And in this, Tolkien's attempt to recreate the form of the Eddas is completely at odds with the Christian, romantic moral content with which he fills the story. 
This central schism makes his work much less true to the tradition than Anderson's The Broken Sword , which was published the same year.
Not only does Tolkien put forth a vision of chaste, humble, 'everyman' heroes who persevere against temptation through piety, he also presents a world of dualistic good and evil, of eternal, personal morality, prototypical of the Christian worldview, particularly the post-Miltonic view. 
His characters are bloodless, chaste, and noble--and if that nobility is sometimes that of simple, hard-working folk, all the better for his Merrie England analogue.
More interesting than these is his portrayal of Gollum, one of the few characters with a deep psychological contradiction. In some ways, his central, conflicted role resembles Eddison's Lord Gro, whose work inspired Tolkien. 
But even this internal conflict is dualistic. Unlike Gro, Gollum is not a character with an alternative view of the world, but fluctuates between the hyperbolic highs and lows of Tolkien's morality.
It is unfortunate that both good and evil seem to be external forces at work upon man, because it removes much of the agency and psychological depth of the characters. 
There is a hint of very alien morality in the out-of-place episode of Tom Bombadil, expressing the separation between man and fairy that Dunsany's work epitomized. 
Bombadil is the most notorious remainder of the fantastical roots of Tolkien's story which he painstakingly removed in editing in favor of Catholic symbology.
Yet despite internal conflicts, there is something respectable in what he achieved, and no fantasy author has yet been capable of comprehending what Tolkien was trying to do and innovating upon it. 
The best modern writers of fantasy have instead avoided Tolkien, concentrating on other sources of inspiration. 
The dullards of fantasy have merely rehashed and reshuffled the old tropes back and forth, imagining that they are creating something.
"""
predict_sentiment(test_text)

0.901: Positive sentiment


In [15]:
# End-up with 5-star
test_text = """
I was living with my uncle 20 years ago and he is a vast reader in many genres. I was looking for something new to read when he told me to read this series. 
He had these beautiful hardback books with fold-out maps. I had heard of the hobbit and saw the cartoon as a kid. I thought it was an okay movie, but it didn't really impress me. 
He convinced me that I needed to read this. So, I started with the hobbit and read one book after the other until I was done with this book.
I remember being awed by this series. I couldn't wait for the last book. The world was so grand and epic. 
I loved the race of elves and the Ents. I tore through this story as fast as I could go. I read the appendix not being able to get enough. I was hooked on this series. 
I had a major book hangover and it was a bit sad not to have anymore story. I did go on to read the Silmarillion a bit later.
I felt like this was an amazing ending to a beloved series. I loved the whole story. I think it will be time for a re-read soon. 
The movies are stellar and the written story still has so much details to offer. I love a soft-magic system, they are the best.
"""
predict_sentiment(test_text)

0.949: Positive sentiment
