# Sentimental Analysis with PyTorch

In [2]:
# Imports
import torch 
import pandas as pd 
import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F 
import sklearn
from torch.utils.data import DataLoader, Dataset 
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm, tqdm_notebook

In [3]:
# Package versions
%reload_ext watermark
%watermark -a "Data'snow" --iversions

torch    1.5.1
json     2.0.9
sklearn  0.23.1
autopep8 1.4.4
pandas   1.0.1
numpy    1.18.1
Data'snow


In [6]:
# Defining device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## Loading and Exploring Data

We are using two datasets: <br />
https://www.imdb.com/interfaces/ to movie's opnions <br />
https://ai.stanford.edu/~amaas/data/sentiment/ to sentimental labels

In [8]:
# Loading dataset
nomes_colunas = ['Review', 'Sentimento']
dados_filmes = pd.read_csv('../../ia-lucas-dados/dados/imdb_reviews.csv', 
                           sep='\t',
                           names=nomes_colunas)

In [9]:
# Visualizing
dados_filmes.head()

Unnamed: 0,Review,Sentimento
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [10]:
# Shape
dados_filmes.shape

(748, 2)

In [11]:
# Verifyng proportion between labels 
dados_filmes['Sentimento'].value_counts()

1    386
0    362
Name: Sentimento, dtype: int64

## Bag Of Words Representation

In [12]:
# Text Manipulation
# Converting a group of text in a matrix of tokens counts
# Creating the vectorizer
vectorizer = CountVectorizer(stop_words = 'english', max_df = 0.99, min_df = 0.005)
vectorizer

CountVectorizer(max_df=0.99, min_df=0.005, stop_words='english')

In [49]:
# Extracting the sequences of texts applying vectorizer
sequences = vectorizer.fit_transform(dados_filmes.Review.tolist())
sequences

<748x320 sparse matrix of type '<class 'numpy.int64'>'
	with 2931 stored elements in Compressed Sparse Row format>

In [15]:
# Visualizing sparce matrix as data frame
print(pd.DataFrame(sequences).head())

                                                   0
0    (0, 248)\t1\n  (0, 185)\t1\n  (0, 183)\t1\n ...
1    (0, 162)\t1\n  (0, 41)\t1\n  (0, 14)\t1\n  (...
2    (0, 183)\t1\n  (0, 28)\t1\n  (0, 305)\t1\n  ...
3                         (0, 157)\t1\n  (0, 186)\t1
4    (0, 183)\t1\n  (0, 24)\t1\n  (0, 234)\t1\n  ...


In [16]:
# Labels
labels = dados_filmes.Sentimento.tolist()
labels[:5]

[0, 0, 0, 0, 1]

In [17]:
# Creating bag of words (vocabulary)
token2idx = vectorizer.vocabulary_

In [18]:
# Type
type(token2idx)

dict

In [19]:
# Total
print(len(token2idx))

320


In [20]:
print(token2idx)

{'slow': 248, 'moving': 185, 'movie': 183, 'young': 319, 'man': 171, 'lost': 162, 'characters': 41, 'audience': 14, 'half': 119, 'black': 28, 'white': 305, 'clever': 48, 'camera': 35, 'disappointed': 73, 'ridiculous': 227, 'acting': 3, 'poor': 211, 'plot': 209, 'lines': 156, 'non': 190, 'little': 157, 'music': 186, 'best': 24, 'scene': 234, 'trying': 285, 'rest': 226, 'lacks': 147, 'art': 12, 'works': 310, 'guess': 118, 'wasted': 300, 'saw': 232, 'today': 278, 'thought': 275, 'good': 115, 'kids': 144, 'bit': 27, 'predictable': 213, 'loved': 165, 'casting': 38, 'adorable': 8, 'lot': 163, 'look': 160, 'songs': 251, 'hilarious': 122, 'cool': 55, 'right': 228, 'face': 92, 'low': 167, 'budget': 33, 'long': 159, 'consider': 53, 'tale': 266, 'single': 247, 'film': 101, 'll': 158, 'cinematography': 46, 'production': 218, 'editing': 78, 'directing': 70, 'making': 170, 'perfect': 200, 'true': 283, 'history': 123, 'cinema': 45, 'think': 274, 'level': 151, 'films': 102, 'mind': 180, 'quite': 219, 

In [21]:
# How many times the term 'movie' shows in the texts
token2idx['movie']

183

In [39]:
# Inverting the format of dictionary
idx2token = {idx: token for token,idx in token2idx.items()}
print(idx2token)

{248: 'slow', 185: 'moving', 183: 'movie', 319: 'young', 171: 'man', 162: 'lost', 41: 'characters', 14: 'audience', 119: 'half', 28: 'black', 305: 'white', 48: 'clever', 35: 'camera', 73: 'disappointed', 227: 'ridiculous', 3: 'acting', 211: 'poor', 209: 'plot', 156: 'lines', 190: 'non', 157: 'little', 186: 'music', 24: 'best', 234: 'scene', 285: 'trying', 226: 'rest', 147: 'lacks', 12: 'art', 310: 'works', 118: 'guess', 300: 'wasted', 232: 'saw', 278: 'today', 275: 'thought', 115: 'good', 144: 'kids', 27: 'bit', 213: 'predictable', 165: 'loved', 38: 'casting', 8: 'adorable', 163: 'lot', 160: 'look', 251: 'songs', 122: 'hilarious', 55: 'cool', 228: 'right', 92: 'face', 167: 'low', 33: 'budget', 159: 'long', 53: 'consider', 266: 'tale', 247: 'single', 101: 'film', 158: 'll', 46: 'cinematography', 218: 'production', 78: 'editing', 70: 'directing', 170: 'making', 200: 'perfect', 283: 'true', 123: 'history', 45: 'cinema', 274: 'think', 151: 'level', 102: 'films', 180: 'mind', 219: 'quite', 

In [40]:
# Creating a class to handle these transformations
class Sequences():
    def __init__(self):
        self.vectorizer = CountVectorizer(stop_words='english',
                                         max_df = 0.99,
                                         min_df = 0.055)
        self.sequences = self.vectorizer.fit_transform(dados_filmes.Review.tolist())
        self.labels = dados_filmes.Sentimento.tolist()
        self.token2idx = self.vectorizer.vocabulary_
        self.idx2token = {idx:token for token, idx in self.token2idx.items()}
        
    def __getitem__(self, i):
        return self.sequences[i,:].toarray(), self.labels[i]
    
    def __len__(self):
        return self.sequences.shape[0]

In [41]:
# Instance of class Sequences
dados_frases = Sequences()

In [46]:
type(dados_frases)

__main__.Sequences

In [44]:
print(dados_frases[5][0].shape)

(1, 5)


In [45]:
# Preparing the trainning data to PyTorch format: DataLoader()
train_loader = DataLoader(dados_frases, batch_size=4096)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f6dc4c3d990>

## Definition and Building Model

In [53]:
# Classifier Class
class BagOfWordsClassifier(nn.Module):
    
    def __init__(self, vocab_size, hidden1, hidden2):
        super(BagOfWordsClassifier, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
        
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [55]:
# Creating model from the class Classifier
modelo = BagOfWordsClassifier(len(dados_frases.token2idx), 128, 64)
modelo

BagOfWordsClassifier(
  (fc1): Linear(in_features=5, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [56]:
# Defining loss function
criterion = nn.BCEWithLogitsLoss()

In [58]:
# Defining optimizer
# Adam algorithm change LR dinamically
optimizer = optim.Adam([p for p in modelo.parameters() if p.requires_grad], lr = 0.001)

In [59]:
# Trainning the model

# Instance of train model
modelo.train()

# List to save errors for each epoch
train_losses = []

# Number of epochs
epochs = 12

# Trainning Loop:
for epoch in range(epochs):
    
    # Progress Bar:
    progress_bar = tqdm_notebook(train_loader, leave = False)
    
    # Control List
    losses = []
    total = 0
    
    # Inner Loop
    for inputs, target in progress_bar:
        
        # model
        modelo.zero_grad()
        
        # Output
        output = modelo(inputs)
        
        # Error
        loss = criterion(output.squeeze(), target.float())
        
        # Backpropagation Instace
        loss.backward()
        
        # Preparing update of params values
        nn.utils.clip_grad_norm_(modelo.parameters(),3)
        
        # Optimizer
        optimizer.step()
        
        # Update Progress Bar
        progress_bar.set_description(f'\nErro no Modelo: {loss.item():.3f}')
        
        # Error and Total
        losses.append(loss.item())
        total += 1
        
    # Epoch Error
    epoch_loss = sum(losses)/total
    
    # Trainning Error
    train_losses.append(epoch_loss)
    
    tqdm.write(f'Epoch #{epoch+1}\t Error em Treinamento: {epoch_loss:.3f}')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #1	 Error em Treinamento: 0.692


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #2	 Error em Treinamento: 0.690


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #3	 Error em Treinamento: 0.688


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #4	 Error em Treinamento: 0.686


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #5	 Error em Treinamento: 0.684


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #6	 Error em Treinamento: 0.683


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #7	 Error em Treinamento: 0.681


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #8	 Error em Treinamento: 0.680


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #9	 Error em Treinamento: 0.678


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #10	 Error em Treinamento: 0.677


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #11	 Error em Treinamento: 0.675


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Epoch #12	 Error em Treinamento: 0.674


## Predicting Sentimental

In [65]:
# Function to predict sentimental
def predict_sentiment(text):
    
    # Load Model
    modelo.eval()
    
    # Extracting predictions from model
    with torch.no_grad():
        
        # Text received converted to vector
        test_vector = torch.LongTensor(dados_frases.vectorizer.transform([text]).toarray())
        
        # Prediction
        output = modelo(test_vector)
        
        # Checking output and transform between 0 to 1
        prediction = torch.sigmoid(output).item()
        
        # Check the probability with threshold with 0.5
        if prediction >= 0.5:
            print(f'{prediction:0.3}: Sentimento Positivo')
        else:
            print(f'{prediction:0.3}: Sentimento Negativo')

In [66]:
# Testing

# Texto de avaliação de filme
test_text = """
This poor excuse for a movie is terrible. It has been 'so good it's bad' for a
while, and the high ratings are a good form of sarcasm, I have to admit. But
now it has to stop. Technically inept, spoon-feeding mundane messages with the
artistic weight of an eighties' commercial, hypocritical to say the least, it
deserves to fall into oblivion. Mr. Derek, I hope you realize you are like that
weird friend that everybody know is lame, but out of kindness and Christian
duty is treated like he's cool or something. That works if you are a good
decent human being, not if you are a horrible arrogant bully like you are. Yes,
Mr. 'Daddy' Derek will end on the history books of the internet for being a
delusional sour old man who thinks to be a good example for kids, but actually
has a poster of Kim Jong-Un in his closet. Destroy this movie if you all have a
conscience, as I hope IHE and all other youtube channel force-closed by Derek
out of SPITE would destroy him in the courts.This poor excuse for a movie is
terrible. It has been 'so good it's bad' for a while, and the high ratings are
a good form of sarcasm, I have to admit. But now it has to stop. Technically
inept, spoon-feeding mundane messages with the artistic weight of an eighties'
commercial, hypocritical to say the least, it deserves to fall into oblivion.
Mr. Derek, I hope you realize you are like that weird friend that everybody
know is lame, but out of kindness and Christian duty is treated like he's cool
or something. That works if you are a good decent human being, not if you are a
horrible arrogant bully like you are. Yes, Mr. 'Daddy' Derek will end on the
history books of the internet for being a delusional sour old man who thinks to
be a good example for kids, but actually has a poster of Kim Jong-Un in his
closet. Destroy this movie if you all have a conscience, as I hope IHE and all
other youtube channel force-closed by Derek out of SPITE would destroy him in
the courts.
"""

predict_sentiment(test_text)

0.614: Sentimento Positivo


In [67]:
# Texto de avaliação de filme
test_text = """
Cool Cat Saves The Kids is a symbolic masterpiece directed by Derek Savage that
is not only satirical in the way it makes fun of the media and politics, but in
the way in questions as how we humans live life and how society tells us to
live life.

Before I get into those details, I wanna talk about the special effects in this
film. They are ASTONISHING, and it shocks me that Cool Cat Saves The Kids got
snubbed by the Oscars for Best Special Effects. This film makes 2001 look like
garbage, and the directing in this film makes Stanley Kubrick look like the
worst director ever. You know what other film did that? Birdemic: Shock and
Terror. Both of these films are masterpieces, but if I had to choose my
favorite out of the 2, I would have to go with Cool Cat Saves The Kids. It is
now my 10th favorite film of all time.

Now, lets get into the symbolism: So you might be asking yourself, Why is Cool
Cat Orange? Well, I can easily explain. Orange is a color. Orange is also a
fruit, and its a very good fruit. You know what else is good? Good behavior.
What behavior does Cool Cat have? He has good behavior. This cannot be a
coincidence, since cool cat has good behavior in the film.

Now, why is Butch The Bully fat? Well, fat means your wide. You wanna know who
was wide? Hitler. Nuff said this cannot be a coincidence.

Why does Erik Estrada suspect Butch The Bully to be a bully? Well look at it
this way. What color of a shirt was Butchy wearing when he walks into the area?
I don't know, its looks like dark purple/dark blue. Why rhymes with dark? Mark.
Mark is that guy from the Room. The Room is the best movie of all time. What is
the opposite of best? Worst. This is how Erik knew Butch was a bully.

and finally, how come Vivica A. Fox isn't having a successful career after
making Kill Bill.

I actually can't answer that question.

Well thanks for reading my review.
"""

# Previsão
predict_sentiment(test_text)

0.568: Sentimento Positivo


In [68]:
# Texto de avaliação de filme
test_text = """
What the heck is this ? There is not one redeeming quality about this terrible
and very poorly done "movie". I can't even say that it's a "so bad it's good
movie".It is undeniably pointless to address all the things wrong here but
unfortunately even the "life lessons" about bullies and stuff like this are so
wrong and terrible that no kid should hear them.The costume is also horrible
and the acting...just unbelievable.No effort whatsoever was put into this thing
and it clearly shows,I have no idea what were they thinking or who was it even
meant for. I feel violated after watching this trash and I deeply recommend you
stay as far away as possible.This is certainly one of the worst pieces of c***
I have ever seen.
"""

# Previsão
predict_sentiment(test_text)

0.473: Sentimento Negativo


In [69]:
# Texto de avaliação de filme
test_text = """
Don't let any bullies out there try and shape your judgment on this gem of a
title.

Some people really don't have anything better to do, except trash a great movie
with annoying 1-star votes and spread lies on the Internet about how "dumb"
Cool Cat is.

I wouldn't be surprised to learn if much of the unwarranted negativity hurled
at this movie is coming from people who haven't even watched this movie for
themselves in the first place. Those people are no worse than the Butch the
Bully, the film's repulsive antagonist.

As it just so happens, one of the main points of "Cool Cat Saves the Kids" is
in addressing the attitudes of mean naysayers who try to demean others who
strive to bring good attitudes and fun vibes into people's lives. The message
to be learned here is that if one is friendly and good to others, the world is
friendly and good to one in return, and that is cool. Conversely, if one is
miserable and leaving 1-star votes on IMDb, one is alone and doesn't have any
friends at all. Ain't that the truth?

The world has uncovered a great, new, young filmmaking talent in "Cool Cat"
creator Derek Savage, and I sure hope that this is only the first of many
amazing films and stories that the world has yet to appreciate.

If you are a cool person who likes to have lots of fun, I guarantee that this
is a movie with charm that will uplift your spirits and reaffirm your positive
attitudes towards life.
"""

# Previsão
predict_sentiment(test_text)

0.683: Sentimento Positivo


In [70]:
test_text = """
Marvel's The Avengers (2012) is an awesome descent MCU superhero action flick that is spectacular and a real good one. The first time I watched it on my computer I loved it. I have seen this movie many times in the row I have this film on DVD and Blu-ray. The movie features an ensemble cast that includes Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth, Scarlett Johansson, Jeremy Renner, Tom Hiddleston, Clark Gregg, Cobie Smulders, Stellan Skarsgård, and Samuel L. Jackson. Joss Whedon did a fine job directing the film and I loved it is definitely in my top 10 favorite MCU superhero films. In the finale battle in New York the moment right there.. WILL NEVER... EVER.. BE AS COOL.. LIKE THIS WAS A COMIC MASTERPIECE.. I don't even read comics, but I did superheroes when I was younger.. I have a feeling justice league will never have this cohesion... this team work.. and will never be able to recreate this feeling.

"""

# Previsão
predict_sentiment(test_text)

0.617: Sentimento Positivo


In [71]:
predict_sentiment('Awesome movie that was terrific production and actors')

0.531: Sentimento Positivo


In [75]:
predict_sentiment('Awful history and production was bad, wasting of time')

0.42: Sentimento Negativo
