In [1]:
# Import libraries

import warnings
warnings.filterwarnings("ignore")

import torch
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

import random
import re

from parrot import Parrot
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA


import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math

In [2]:
torch.manual_seed(4042)

<torch._C.Generator at 0x227ce3972d0>

# IMDB data set exploration

- Experiments and explorations are first done on the IMDB data set.
- Findings and methods will be applied similarly to the SST data set in the later sections.

In [3]:
imdb = pd.read_csv("IMDB Dataset.csv")
imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
# Clean reviews by removing HTML tags
imdb['review'] = imdb['review'].apply(lambda x: re.sub(r'<br\s*/?>', '', x, flags=re.IGNORECASE))

# Exploring Paraphrasing models

In [5]:
# Humarin's paraphraser based on chatgpt's paraphrases

tokenizer_p = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

n = 4 # Generate n paraphrases

def paraphrase(
    question,
    num_beams= n,
    num_beam_groups= n,
    num_return_sequences= n,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=200
):
    input_ids = tokenizer_p(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer_p.batch_decode(outputs, skip_special_tokens=True)

    return res


In [6]:
# Prithivida's Parrot paraphraser

parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Testing out paraphrasers with a simple sentence

phrase = "this model is not performing up to my expectations"

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False, max_return_phrases = 4) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("ChatGPT paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)


Parrot output:
this model doesn't perform like i expected
this model does not meet my expectations
this model isn't up to my expectations
--------------------------------------------------------
ChatGPT paraphraser output:
This model is not meeting my expectations.
I am not satisfied with the performance of this model.
The quality of this model is not satisfactory.
My impressions of this model are not up to par.


In [7]:
# Testing using a sample sentence from the reviews

phrase = " A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story ."

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False,max_length=len(phrase)) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)

Parrot output:
a welcome relief from baseball films that try too hard to be mythic this is a sweet and modest and ultimately winning story
a welcome relief from baseball films that try too hard to be mythic this one is a sweet and modest and ultimately winning story
a welcome relief from baseball movies that try too hard to be mythic this one is a sweet and modest and ultimately winning story
--------------------------------------------------------
Humarin's paraphraser output:
Unlike baseball movies that strive to be overhyped, this story is both humble and ultimately successful.
This baseball movie is a welcome change from the overly ambitious and overblown tale of triumphant team members, as it's genuinely sweet and modest.
It's a welcome change from baseball movies that strive to be mythical, as it'll end up being genuinely sweet, modest, and ultimately successful.


In [8]:
# Testing using a sample review

phrase = imdb["review"].values[random.randint(0,50000)]
print(f'review: {phrase}')
print("--------------------------------------------------------")

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)

review: I watched the beginning twice, could NOT make sense of it, and it bothered me for the whole movie.So, work this out with me: Wayne (the GOOD guy) jumps on the stagecoach, disarms the drivers (!), steals the money (?!), and takes off.Disarmed, one driver is then killed and the other wounded by the bad guys. Thanks to Wayne, who disarmed them, and then watched it happen.Then Wayne drops the money in the dirt, rescues the girl, rides into town, chuckles it up with Yak (too bad about the dead guy, I guess)...and then later says he "found" the money back at the scene. And everyone's okay with that.And he's the good guy? And I'm pretty sure there weren't small, hand-held flashlights at the time. And Bell did his first phone demo in 1876... were they in houses then? Am I thinking too hard about this one? Normally, I'm happy to suspend judgment to enjoy a movie, but this one bothered me. And that's a sign the move didn't really work for me.
---------------------------------------------

### Observations

- Parrot cannot paraphrase multiple sentences.
- Some reviews are extremely lengthy and only some of the sentences will be captured in the ChatGPT paraphraser. However, this may not be an issue as for the LSTM model to be built. Reviews longer than 200 characters will be truncated as well.

# Validating paraphrasing models on imdb data set

### Review wise paraphrasing

In [9]:
# Using Nltk.vader's Sentiment intensity Analyser for validation
sia = SIA()

In [16]:
# A random review is picked for validation
# A base polarity score of the review is computed
# Polarity of each paraphrase is computed
# The absolute difference between the base and paraphrase is then calculated

phrase = imdb["review"].values[random.randint(0,50000)]
base_score = sia.polarity_scores(phrase)['compound'] 
print(f'base score: {base_score}')

print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    para_score = sia.polarity_scores(para_phrase)['compound']
    print(f'para score: {para_score}')
    print(f'difference: {abs(para_score - base_score)}')

base score: -0.9971
--------------------------------------------------------
Humarin's paraphraser output:
para score: 0.3506
difference: 1.3477000000000001
para score: -0.6652
difference: 0.3319
para score: -0.9705
difference: 0.026599999999999957
para score: -0.9921
difference: 0.0050000000000000044


- Review wise paraphrsing can result in similar polarity score. Although some paraphrases may differ greatly

# Sentencewise paraphrasing

In [18]:
sentences = phrase.split(".")

res = [''] * n
for s in sentences:
    para_phrase =paraphrase(s)
    for i in range(len(res)):
        res[i] += para_phrase[i]

for rs in res:
    para_score = sia.polarity_scores(rs)['compound']
    print(f'para score: {para_score}')
    print(f'difference: {abs(para_score - base_score)}')

    

para score: -0.8299
difference: 0.16720000000000002
para score: 0.7264
difference: 1.7235
para score: -0.1531
difference: 0.844
para score: -0.4871
difference: 0.51


- Sentecewise paraphrasing does not guarantee closer polarity scores and semantics may be lost

# Training using small dataset

In [19]:
# Sample a small data set from the main dataframe

small_db = imdb.sample(n = 200)
imdb = imdb.drop(small_db.index)
small_db['sentiment'].value_counts()

sentiment
positive    111
negative     89
Name: count, dtype: int64

In [21]:
# Some statistics of the reviews sampled
l = []
for r in small_db['review'].values:
    l.append(len(r))
print(f"Mean length: {np.mean(l)}")
print(f"Max length: {max(l)}")

Mean length: 1370.515
Max length: 6970


## Sentence-wise

- Sentence wise takes an extremely long time as many reviews are many sentences long.
- Generating candidate reviews for the long reviews requires a lot of paraphrase model calls.
- The below code was tested on only a smaller dataset of 20 reviews but takes up to an hour to run.

In [42]:
sentence_db = small_db.copy()
polarity_s = []

reviews = sentence_db['review'].values
sentiments = sentence_db['sentiment'].values

for i in range(len(reviews)):
    base_score = sia.polarity_scores(reviews[i])['compound'] # Get sentiment of original review
    sentences = reviews[i].split(".") # Separate into sentences

    # Get new reviews paraphrased by sentences
    res = ['','','']
    for s in sentences:
        print(s)
        para_phrase =paraphrase(s)
        for j in range(len(res)):
            res[j] += para_phrase[j]

    # New review is added to dataframe if score difference is less than 0.2
    for rs in res:
        para_score = sia.polarity_scores(rs)['compound']
        diff = abs(para_score - base_score)
        print(f'difference: {diff}')
        if diff <= 0.2:
            new_row = {'review': rs, 'sentiment': sentiments[i]}
            sentence_db = pd.concat([sentence_db, pd.DataFrame([new_row])], ignore_index=True)
            polarity_s.append(diff)
    print(f'Review {i+1} done.')
    

The real life case of an innocent First Nations chief(the Indian) by an Winnipeg city officer(the Cowboy) is the basis of this TV movie
 The actual case caused its fair share of racial tension in Canada, a small scale Martin Luther King thing
 The misjustice of First Nations people is becoming a staple in the Canadian cinema diet
 What makes this film worth viewing is the focus on the family's reactions
 The father played by Gordon Tootoosis demands forgiveness and the brother played by Eric Schweig demands justice
 The stars Gordon Tootoosis and Adam Beach(WINDTALKERS, SKINWALKERS)have minor, almost cameo, appearances
 Soon-to-be star Eric Schweig makes his mark in this film with a powerful performance
 An honourable mention goes to veteran actor Gary Chalk who has chalked up over 100 movies to his credit
 His portrayal of the troubled soul Inspector Dowson was worthy of a Gemini Award(the Canadian Emmy)along with Eric Schweig
 The special effects(jump cuts, dream sequences) are occas

 Gum disease is less painful
 No wonder, with the exception of Corner Gas, Canadians generally avoid Canadian TV
 Come on CBC you're suppose to be our leading station showcasing the best of Canadian talent
 Pull the plug on this amateurish mess

difference: 0.042099999999999915
difference: 0.0042000000000000925
difference: 0.033399999999999985
Review 6 done.
I wish I could give this movie a zero, or even lower, because sadly that's what it deserves
 I honestly never walk out of a movie, but this one was so dreadfully awful that I couldn't stand another minute of it
 Please,please, please- for the sake of mankind- skip this movie
 If you want a hot lesbian movie that you can really delve into, this isn't it
It has unattractive, unappealing leads, choppy structure, ridiculous dialog, and it is absolutely unconvincing in every imaginable way
 On an absolutely basic level, it fails to entertain
 Everything about "Mango Kiss" is so stagey, it is WORSE than any student film I have seen
As if

 Throw in a bunch of very colorful supporting characters (such as the guy with the chopsticks and the policemen) and "Pickup on South Street" treats you to a splendor of personalities as they hunt down the mysterious and accidentally stolen microfilm frames
--PolarisDiB
difference: 0.8823000000000001
difference: 0.025800000000000045
difference: 0.09720000000000006
Review 13 done.
I saw the The Bourne Ultimatum last summer with a friend, and, wow! I had already seen the first two films and I liked them, but Ultimatum, I loved
Matt Damon plays Jason Bourne, a amnesia suffering CIA agent on the run, trying to discover who he is
Like I already said, I loved this movie from start to finish, no plot holes, slow scenes, everything was paced just right and it fit in well with the other films, but in all senses it was much better
Best stunts, car chases, actors, and effects I've seen in an action movie all summer, (surprisingly due to Spider-Man 3, Pirates, etc
) But I it wasn't just action in 

 After the euphoria of the fabulous ending wore off, I concluded that they are equal in their excellence
 I am just confused about why its not in the Top 50 along with Memento
 I'm going to venture a guess that (sadly) it's because it's in black and white or because (again sadly)that the characters all have British accents 


sadly because that is no reason to not appreciate a great movie like this
I'm telling you that if you loved Memento, you will love Following as well
 Brilliant!
difference: 0.0021999999999999797
difference: 0.04400000000000004
difference: 0.06169999999999998
Review 17 done.
This is the most cliche ridden and worst romantic comedy I have ever seen
 Every scene is cringe worthy and the two lead actors - Corey and Danny are soo annoying
 Corey is very dumb and naive and should have never listened to Danny's false promises
Neve Campbell and the killer from Urban Legend are the only redeeming qualities in this poor attempt of a film
 Danny (Dean Paras) looks in his lat

In [48]:
print(len(sentence_db))
sentence_db["sentiment"].value_counts()

57


sentiment
positive    39
negative    18
Name: count, dtype: int64

# Whole review wise

In [22]:
para_db = small_db.copy()
polarity_p = []

reviews = para_db['review'].values
sentiments = para_db['sentiment'].values

for i in range(len(reviews)):
    base_score = sia.polarity_scores(reviews[i])['compound'] # Get sentiment of original review

    # New review is added to dataframe if score difference is less than 0.2
    new_reviews = paraphrase(reviews[i])
    for new_r in new_reviews:
        para_score = sia.polarity_scores(new_r)['compound']
        diff = abs(para_score - base_score)
        print(f'difference: {diff}')
        if diff <= 0.2:
            new_row = {'review': new_r, 'sentiment': sentiments[i]}
            para_db = pd.concat([para_db, pd.DataFrame([new_row])], ignore_index=True)
            polarity_p.append(diff)
    print(f'Review {i+1} done.')
    

difference: 1.2647
difference: 1.1275
difference: 0.009499999999999953
difference: 0.8911
Review 1 done.
difference: 1.7423
difference: 1.7618
difference: 0.30679999999999996
difference: 0.0502999999999999
Review 2 done.
difference: 0.35209999999999997
difference: 0.13829999999999998
difference: 0.2862
difference: 0.042700000000000016
Review 3 done.
difference: 0.4005000000000001
difference: 1.3104
difference: 0.30890000000000006
difference: 0.11279999999999996
Review 4 done.
difference: 0.5569999999999999
difference: 0.4124
difference: 0.7493
difference: 0.05679999999999996
Review 5 done.
difference: 1.7231
difference: 0.3266
difference: 0.744
difference: 1.6093000000000002
Review 6 done.
difference: 0.30950000000000005
difference: 0.6277
difference: 0.15100000000000002
difference: 0.031000000000000028
Review 7 done.
difference: 0.8905
difference: 1.674
difference: 0.49929999999999997
difference: 1.3951
Review 8 done.
difference: 0.027100000000000013
difference: 0.23860000000000003
di

difference: 0.19830000000000003
difference: 0.23530000000000006
difference: 0.02859999999999996
difference: 0.1411
Review 69 done.
difference: 0.4745
difference: 0.4874
difference: 0.19940000000000008
difference: 0.3048
Review 70 done.
difference: 1.0796000000000001
difference: 0.504
difference: 0.504
difference: 0.1663
Review 71 done.
difference: 0.9315
difference: 0.3801
difference: 1.799
difference: 0.6739999999999999
Review 72 done.
difference: 1.032
difference: 0.37449999999999994
difference: 1.811
difference: 0.4295
Review 73 done.
difference: 0.012800000000000034
difference: 0.00770000000000004
difference: 0.04579999999999995
difference: 0.9202
Review 74 done.
difference: 0.1733
difference: 0.3696
difference: 0.46
difference: 0.427
Review 75 done.
difference: 0.1135
difference: 0.7692
difference: 0.6545
difference: 0.2735
Review 76 done.
difference: 0.05710000000000004
difference: 0.24329999999999996
difference: 0.010000000000000009
difference: 0.08409999999999995
Review 77 done

difference: 0.6058
difference: 0.5643
difference: 0.6612
difference: 0.11399999999999999
Review 140 done.
difference: 0.5609
difference: 0.08190000000000008
difference: 0.038900000000000046
difference: 0.08710000000000007
Review 141 done.
difference: 0.9094
difference: 0.4572
difference: 0.2978
difference: 0.40099999999999997
Review 142 done.
difference: 0.5265
difference: 0.08599999999999997
difference: 0.015600000000000058
difference: 0.9077
Review 143 done.
difference: 1.3155
difference: 1.0163
difference: 0.4745
difference: 0.1896
Review 144 done.
difference: 0.18170000000000008
difference: 0.13050000000000006
difference: 0.47840000000000005
difference: 0.28470000000000006
Review 145 done.
difference: 0.4718
difference: 0.2659
difference: 0.1456
difference: 0.20929999999999999
Review 146 done.
difference: 0.6554
difference: 0.9954
difference: 0.8886
difference: 1.5703
Review 147 done.
difference: 0.22009999999999996
difference: 0.2592
difference: 0.261
difference: 0.714
Review 148 

In [23]:
print(len(para_db))
para_db["sentiment"].value_counts()

479


sentiment
positive    307
negative    172
Name: count, dtype: int64

# LSTM Parameters and model building

In [39]:
# Sample another small dataset from the remaining data for testing

test = imdb.sample(n = 50)
test['sentiment'].value_counts()

sentiment
negative    27
positive    23
Name: count, dtype: int64

In [25]:
encoder = LabelEncoder()
# Hyperparameters of the model
vocab_size = 5000 
oov_tok = ''
embedding_dim = 100
max_length = 200 
padding_type='post'
trunc_type='post'
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

# Transform test data 
test_sentences = test['review'].values
test_labels = encoder.fit_transform(test['sentiment'].values)

# Test on sampled data set

In [41]:
# Create a model for the  small data set
train_sentences = small_db['review'].values
train_labels = encoder.fit_transform(small_db['sentiment'].values)

# tokenize sentences
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 100)          500000    
                                                                 
 bidirectional_3 (Bidirecti  (None, 128)               84480     
 onal)                                                           
                                                                 
 dense_6 (Dense)             (None, 24)                3096      
                                                                 
 dense_7 (Dense)             (None, 1)                 25        
                                                                 
Total params: 587601 (2.24 MB)
Trainable params: 587601 (2.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
# Fit model
num_epochs = 5
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
# Get test accuracy

prediction = model.predict(test_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.52


## Test on whole paragraph paraphrase

In [44]:
# Model for paraphrased dataset

train_sentences = para_db['review'].values
train_labels = encoder.fit_transform(para_db['sentiment'].values)

# tokenize sentences
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 100)          500000    
                                                                 
 bidirectional_4 (Bidirecti  (None, 128)               84480     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 24)                3096      
                                                                 
 dense_9 (Dense)             (None, 1)                 25        
                                                                 
Total params: 587601 (2.24 MB)
Trainable params: 587601 (2.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [45]:
# Fit model
num_epochs = 5
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [46]:
prediction = model.predict(test_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.6


# Standford Sentiment Treebank dataset

- Similar procedures will now be repeated for the SST dataset

In [49]:
sst = pd.read_csv('sst.txt', sep ='|', encoding='latin-1',header = None)
sst.rename(columns = { 0 : 'sentiment', 1: 'review'}, inplace = True)
sst

Unnamed: 0,sentiment,review
0,4,The Rock is destined to be the 21st Century '...
1,5,The gorgeously elaborate continuation of `` T...
2,4,Singer/composer Bryan Adams contributes a sle...
3,3,You 'd think by now America would have had en...
4,4,Yet the act is still charming here .
...,...,...
8539,1,A real snooze .
8540,2,No surprises .
8541,4,We 've seen the hippie-turned-yuppie plot bef...
8542,1,Her fans walked out muttering words like `` h...


In [50]:
l = []
for r in sst['review'].values:
    l.append(len(r))
print(f"Mean length: {np.mean(l)}")
print(f"Max length: {max(l)}")

Mean length: 103.31039325842697
Max length: 268


- The sst data set contains much shorter reviews.

In [102]:
# ChatGPT parapharser's hyperperimeters are retuned to better fit sst dataset

tokenizer_p = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

n = 5 # Generate n paraphrases

def paraphrase(
    question,
    num_beams= n,
    num_beam_groups= n,
    num_return_sequences= n,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=250
):
    input_ids = tokenizer_p(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer_p.batch_decode(outputs, skip_special_tokens=True)

    return res


## Testing paraphrasers on sst

In [53]:
# Testing out the paraphrasers using a sample reviews
phrase = sst["review"].values[random.randint(0,8544)]
print(f'review: {phrase}')
print("--------------------------------------------------------")

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)

review:  When compared to the usual , more somber festival entries , Davis ' highly personal brand of romantic comedy is a tart , smart breath of fresh air that stands out from the pack even if the picture itself is somewhat problematic .
--------------------------------------------------------
Parrot output:
 When compared to the usual , more somber festival entries , Davis ' highly personal brand of romantic comedy is a tart , smart breath of fresh air that stands out from the pack even if the picture itself is somewhat problematic .
--------------------------------------------------------
Humarin's paraphraser output:
In contrast to the usual gloomy festival posts, Davis' personal romantic comedy is a sharp and intelligent piece of literature that stands out, even though the visuals are somewhat problematic.
While the usual festival-centric entries may be, Davis' personal romantic comedy is a sharp and intelligent piece of entertainment that stands out, even though the visuals are s

In [54]:
# Testing out the paraphrasers using a sample reviews
phrase = sst["review"].values[random.randint(0,8544)]
print(f'review: {phrase}')
print("--------------------------------------------------------")

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)

review:  The film tries to touch on spousal abuse but veers off course and becomes just another revenge film .
--------------------------------------------------------
Parrot output:
 The film tries to touch on spousal abuse but veers off course and becomes just another revenge film .
--------------------------------------------------------
Humarin's paraphraser output:
Spousal abuse is a topic of discussion in the film, but it diverges from its original message and becomes merely another revenge movie.
Despite its attempts to address spousal abuse, the film deviates from its original message and becomes a repetition of seeking revenge.
The film attempts to address spousal abuse but diverges into another revenge movie.
While the film makes an attempt to touch on spousal abuse, it deviates from its original message and becomes a mere revenge movie.
Spousal abuse attempts are made, but the film becomes repetitive and seeks revenge.


- Once again. Parrot fails to paraphrase long sentences and ChatGPT paraphraser is robust enought to handle the reviews
- Only ChatGPT paraphraser will be used from now onwards

## Validating paraphrases on sst data set

In [57]:
phrase = sst["review"].values[random.randint(0,8500)]
base_score = sia.polarity_scores(phrase)['compound']
print(f'base score: {base_score}')

print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    para_score = sia.polarity_scores(para_phrase)['compound']
    print(f'para score: {para_score}')
    print(f'difference: {abs(para_score - base_score)}')

base score: 0.8225
--------------------------------------------------------
Humarin's paraphraser output:
para score: 0.7269
difference: 0.09560000000000002
para score: 0.8126
difference: 0.00990000000000002
para score: 0.5859
difference: 0.23660000000000003
para score: 0.7003
difference: 0.12219999999999998
para score: 0.7411
difference: 0.08140000000000003


In [69]:
# Sample a small data set from the main dataframe

sst_small_db = sst.sample(n = 500)
sst = sst.drop(sst_small_db.index)
sst_small_db['sentiment'].value_counts()

sentiment
2    142
4    128
3     94
5     79
1     57
Name: count, dtype: int64

In [72]:
sst_para_db = sst_small_db.copy()

reviews = sst_para_db['review'].values
sentiments = sst_para_db['sentiment'].values

for i in range(len(reviews)):
    base_score = sia.polarity_scores(reviews[i])['compound'] # Get sentiment of original review

    # New review is added to dataframe if score difference is less than 0.1
    new_reviews = paraphrase(reviews[i])
    for new_r in new_reviews:
        para_score = sia.polarity_scores(new_r)['compound']
        diff = abs(para_score - base_score)
        print(f'difference: {diff}')
        if diff <= 0.1:
            new_row = {'review': new_r, 'sentiment': sentiments[i]}
            sst_para_db = pd.concat([sst_para_db, pd.DataFrame([new_row])], ignore_index=True)
    print(f'Review {i+1} done.')

difference: 1.0815000000000001
difference: 1.2675
difference: 0.06470000000000004
difference: 1.122
difference: 1.2675
Review 1 done.
difference: 0.2732
difference: 0.23740000000000006
difference: 0.2216
difference: 0.4619
difference: 0.655
Review 2 done.
difference: 0.1285
difference: 0.1285
difference: 0.1521
difference: 0.10300000000000001
difference: 0.10300000000000001
Review 3 done.
difference: 0.00039999999999995595
difference: 0.0381999999999999
difference: 0.0024999999999999467
difference: 0.00039999999999995595
difference: 0.0023000000000000798
Review 4 done.
difference: 0.4886
difference: 0.39020000000000005
difference: 0.48100000000000004
difference: 0.2811
difference: 0.3331
Review 5 done.
difference: 0.3545
difference: 0.0
difference: 0.3612
difference: 0.4005
difference: 0.0
Review 6 done.
difference: 0.05679999999999999
difference: 0.4106
difference: 0.49779999999999996
difference: 0.3817
difference: 0.43339999999999995
Review 7 done.
difference: 0.44310000000000005
dif

difference: 0.9481999999999999
difference: 0.4215
difference: 0.4473
difference: 0.6477999999999999
difference: 0.6947
Review 61 done.
difference: 0.09399999999999997
difference: 0.28
difference: 0.28
difference: 0.17410000000000003
difference: 0.0033999999999999586
Review 62 done.
difference: 0.0534
difference: 0.0534
difference: 0.11250000000000004
difference: 0.16510000000000002
difference: 0.13490000000000002
Review 63 done.
difference: 0.024499999999999966
difference: 0.32519999999999993
difference: 1.1988
difference: 0.15849999999999997
difference: 0.2628999999999999
Review 64 done.
difference: 0.7086
difference: 1.2625000000000002
difference: 1.4344000000000001
difference: 0.7342000000000001
difference: 0.04260000000000008
Review 65 done.
difference: 0.06500000000000006
difference: 0.06500000000000006
difference: 0.06500000000000006
difference: 0.06500000000000006
difference: 0.6369
Review 66 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0772
difference: 0.

difference: 0.47379999999999994
difference: 0.47379999999999994
difference: 0.16449999999999998
difference: 0.11659999999999993
difference: 0.8814
Review 122 done.
difference: 0.5994
difference: 0.3971
difference: 0.5994
difference: 0.05710000000000004
difference: 0.08879999999999999
Review 123 done.
difference: 0.5592
difference: 0.5592
difference: 0.5316
difference: 0.9978
difference: 0.5592
Review 124 done.
difference: 0.03479999999999994
difference: 0.05170000000000008
difference: 0.18969999999999998
difference: 0.18969999999999998
difference: 0.05170000000000008
Review 125 done.
difference: 0.3612
difference: 0.7003
difference: 0.5106
difference: 0.4401
difference: 0.5106
Review 126 done.
difference: 0.0
difference: 0.12479999999999997
difference: 0.12479999999999997
difference: 0.5008999999999999
difference: 0.06789999999999996
Review 127 done.
difference: 0.8992
difference: 0.14189999999999997
difference: 0.018399999999999972
difference: 0.8601
difference: 0.4588
Review 128 done

difference: 0.9385
difference: 0.24760000000000004
difference: 0.5982
difference: 0.495
difference: 0.4694
Review 180 done.
difference: 0.5106
difference: 0.0
difference: 0.5106
difference: 0.38260000000000005
difference: 0.7369000000000001
Review 181 done.
difference: 0.04069999999999996
difference: 0.04069999999999996
difference: 0.0
difference: 0.04069999999999996
difference: 0.04069999999999996
Review 182 done.
difference: 0.43610000000000004
difference: 0.43610000000000004
difference: 0.43610000000000004
difference: 0.1794
difference: 0.43610000000000004
Review 183 done.
difference: 0.17870000000000008
difference: 0.1512
difference: 0.17870000000000008
difference: 0.17870000000000008
difference: 0.9769000000000001
Review 184 done.
difference: 1.1329
difference: 0.7717
difference: 0.669
difference: 0.9740000000000001
difference: 0.45350000000000007
Review 185 done.
difference: 0.40790000000000004
difference: 0.19899999999999995
difference: 0.19899999999999995
difference: 0.12880000

difference: 0.2676
difference: 1.4011
difference: 0.0
difference: 0.0676
difference: 0.2207
Review 240 done.
difference: 0.0
difference: 0.0
difference: 0.378
difference: 0.378
difference: 0.0
Review 241 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.11429999999999996
Review 242 done.
difference: 0.06719999999999993
difference: 0.6054
difference: 0.05919999999999992
difference: 0.22760000000000002
difference: 0.0847
Review 243 done.
difference: 0.3089
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 244 done.
difference: 1.1433
difference: 0.5600999999999999
difference: 1.1983000000000001
difference: 0.739
difference: 0.43279999999999996
Review 245 done.
difference: 0.6447
difference: 0.03859999999999997
difference: 0.17559999999999998
difference: 0.17559999999999998
difference: 0.08679999999999999
Review 246 done.
difference: 0.006799999999999973
difference: 0.15600000000000003
difference: 0.452
difference: 0.5853
difference: 

difference: 0.5499
difference: 0.6271000000000001
difference: 0.5499
difference: 0.5499
difference: 0.25750000000000006
Review 303 done.
difference: 0.2957
difference: 0.10770000000000002
difference: 0.03290000000000004
difference: 0.13590000000000002
difference: 0.21739999999999998
Review 304 done.
difference: 0.42
difference: 0.5106999999999999
difference: 1.1421999999999999
difference: 0.6428
difference: 0.06119999999999992
Review 305 done.
difference: 0.2889
difference: 0.09139999999999993
difference: 0.5128999999999999
difference: 0.09139999999999993
difference: 0.07839999999999991
Review 306 done.
difference: 0.7003
difference: 0.7003
difference: 0.10059999999999991
difference: 0.5224
difference: 0.22360000000000002
Review 307 done.
difference: 0.04730000000000001
difference: 0.12869999999999998
difference: 0.12869999999999998
difference: 0.3637
difference: 0.2232
Review 308 done.
difference: 0.0
difference: 0.07089999999999999
difference: 0.2381
difference: 0.5627
difference: 0.

difference: 0.5966
difference: 0.4845
difference: 0.4845
difference: 0.7935
difference: 0.4845
Review 363 done.
difference: 0.8106
difference: 0.1552
difference: 0.1552
difference: 1.0941
difference: 1.4238
Review 364 done.
difference: 0.0
difference: 0.0511
difference: 0.0
difference: 0.0511
difference: 0.38880000000000003
Review 365 done.
difference: 0.3612
difference: 0.7943
difference: 0.4404
difference: 0.3612
difference: 0.7964
Review 366 done.
difference: 0.23760000000000003
difference: 0.09750000000000003
difference: 0.0796
difference: 0.26030000000000003
difference: 0.0796
Review 367 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 368 done.
difference: 0.5878000000000001
difference: 0.5878000000000001
difference: 0.0
difference: 0.1491
difference: 0.0
Review 369 done.
difference: 0.16709999999999992
difference: 1.3831
difference: 0.8155
difference: 1.1667
difference: 0.5617000000000001
Review 370 done.
difference: 0.2976
difference:

difference: 0.1154
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 427 done.
difference: 0.10919999999999996
difference: 0.1482
difference: 0.0807
difference: 0.13570000000000004
difference: 0.1482
Review 428 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.6249
difference: 0.0
Review 429 done.
difference: 0.2839999999999999
difference: 0.5433999999999999
difference: 1.0114
difference: 1.0114
difference: 0.44299999999999995
Review 430 done.
difference: 0.6369
difference: 0.6369
difference: 0.25510000000000005
difference: 0.0
difference: 0.14139999999999997
Review 431 done.
difference: 0.2732
difference: 0.2732
difference: 0.7948
difference: 0.2732
difference: 0.2732
Review 432 done.
difference: 0.3246
difference: 0.0
difference: 0.0
difference: 0.0363
difference: 0.3722
Review 433 done.
difference: 0.3665
difference: 0.5968
difference: 0.22150000000000003
difference: 0.15570000000000006
difference: 1.0364
Review 434 done.
difference: 0.4215
dif

difference: 0.1451
difference: 0.21629999999999994
difference: 0.09560000000000002
difference: 0.13560000000000005
difference: 0.03810000000000002
Review 490 done.
difference: 0.04730000000000001
difference: 0.2141
difference: 0.0
difference: 0.9510000000000001
difference: 1.073
Review 491 done.
difference: 0.5423
difference: 0.5423
difference: 0.08460000000000001
difference: 0.5423
difference: 0.08350000000000002
Review 492 done.
difference: 0.2789
difference: 0.0
difference: 0.6808
difference: 0.0
difference: 0.2789
Review 493 done.
difference: 0.08410000000000006
difference: 0.05599999999999994
difference: 0.049899999999999944
difference: 0.6013999999999999
difference: 0.05599999999999994
Review 494 done.
difference: 0.5727
difference: 0.42150000000000004
difference: 0.5028999999999999
difference: 0.5028999999999999
difference: 0.1521
Review 495 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 496 done.
difference: 0.1033
difference: 0.103

In [74]:
print(len(sst_para_db))
sst_para_db["sentiment"].value_counts()

1343


sentiment
2    373
4    359
3    259
5    223
1    129
Name: count, dtype: int64

## LSTM model for sst data

In [75]:
# Sample test data
sst_test = sst.sample(n = 200)
sst_test['sentiment'].value_counts()

sentiment
4    58
2    52
3    32
5    32
1    26
Name: count, dtype: int64

In [76]:
encoder = LabelEncoder()
# Hyperparameters of the model
vocab_size = 5000 
oov_tok = ''
embedding_dim = 100
max_length = 250 
padding_type='post'
trunc_type='post'
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

# Transform test data 
test_sentences = sst_test['review'].values
test_labels = encoder.fit_transform(sst_test['sentiment'].values)

# Test on sampled dataset

In [98]:
train_sentences = sst_small_db['review'].values
train_labels = encoder.fit_transform(sst_small_db['sentiment'].values)

# tokenize sentences
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(5, activation='softmax')
])
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 250, 100)          500000    
                                                                 
 bidirectional_13 (Bidirect  (None, 128)               84480     
 ional)                                                          
                                                                 
 dense_23 (Dense)            (None, 24)                3096      
                                                                 
 dense_24 (Dense)            (None, 5)                 125       
                                                                 
Total params: 587701 (2.24 MB)
Trainable params: 587701 (2.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [99]:
num_epochs = 10
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [100]:
prediction = model.predict(test_padded)
# Get labels based on probability
encoded_labels = [0,1,2,3,4]
pred_labels = []
for p in prediction:
    pred_labels.append(encoded_labels[np.argmax(p)])
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.245


# Test on paraphrased dataset

In [91]:
train_sentences = sst_para_db['review'].values
train_labels = encoder.fit_transform(sst_para_db['sentiment'].values)

# tokenize sentences
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(5, activation='softmax')
])
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 250, 100)          500000    
                                                                 
 bidirectional_11 (Bidirect  (None, 128)               84480     
 ional)                                                          
                                                                 
 dense_19 (Dense)            (None, 24)                3096      
                                                                 
 dense_20 (Dense)            (None, 5)                 125       
                                                                 
Total params: 587701 (2.24 MB)
Trainable params: 587701 (2.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [92]:
num_epochs = 10
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [95]:
prediction = model.predict(test_padded)
# Get labels based on probability
encoded_labels = [0,1,2,3,4]
pred_labels = []
for p in prediction:
    pred_labels.append(encoded_labels[np.argmax(p)])
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.36


# Experiment with lenient validation

- The threshold for paraphrasing validation was raised to 0.3 for the SST data set to see if higher number of training data with possibly less accurate labels might perform better.

In [105]:
sst_lpara_db = sst_small_db.copy()

reviews = sst_lpara_db['review'].values
sentiments = sst_lpara_db['sentiment'].values

for i in range(len(reviews)):
    base_score = sia.polarity_scores(reviews[i])['compound'] # Get sentiment of original review

    # New review is added to dataframe if score difference is less than 0.2
    new_reviews = paraphrase(reviews[i])
    for new_r in new_reviews:
        para_score = sia.polarity_scores(new_r)['compound']
        diff = abs(para_score - base_score)
        print(f'difference: {diff}')
        if diff <= 0.3:
            new_row = {'review': new_r, 'sentiment': sentiments[i]}
            sst_lpara_db = pd.concat([sst_lpara_db, pd.DataFrame([new_row])], ignore_index=True)
    print(f'Review {i+1} done.')

difference: 1.0815000000000001
difference: 1.2675
difference: 0.06470000000000004
difference: 1.122
difference: 1.2675
Review 1 done.
difference: 0.2732
difference: 0.23740000000000006
difference: 0.2216
difference: 0.4619
difference: 0.655
Review 2 done.
difference: 0.1285
difference: 0.1285
difference: 0.1521
difference: 0.10300000000000001
difference: 0.10300000000000001
Review 3 done.
difference: 0.00039999999999995595
difference: 0.0381999999999999
difference: 0.0024999999999999467
difference: 0.00039999999999995595
difference: 0.0023000000000000798
Review 4 done.
difference: 0.4886
difference: 0.39020000000000005
difference: 0.48100000000000004
difference: 0.2811
difference: 0.3331
Review 5 done.
difference: 0.3545
difference: 0.0
difference: 0.3612
difference: 0.4005
difference: 0.0
Review 6 done.
difference: 0.05679999999999999
difference: 0.4106
difference: 0.49779999999999996
difference: 0.3817
difference: 0.43339999999999995
Review 7 done.
difference: 0.44310000000000005
dif

difference: 0.9481999999999999
difference: 0.4215
difference: 0.4473
difference: 0.6477999999999999
difference: 0.6947
Review 61 done.
difference: 0.09399999999999997
difference: 0.28
difference: 0.28
difference: 0.17410000000000003
difference: 0.0033999999999999586
Review 62 done.
difference: 0.0534
difference: 0.0534
difference: 0.11250000000000004
difference: 0.16510000000000002
difference: 0.13490000000000002
Review 63 done.
difference: 0.024499999999999966
difference: 0.32519999999999993
difference: 1.1988
difference: 0.15849999999999997
difference: 0.2628999999999999
Review 64 done.
difference: 0.7086
difference: 1.2625000000000002
difference: 1.4344000000000001
difference: 0.7342000000000001
difference: 0.04260000000000008
Review 65 done.
difference: 0.06500000000000006
difference: 0.06500000000000006
difference: 0.06500000000000006
difference: 0.06500000000000006
difference: 0.6369
Review 66 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0772
difference: 0.

difference: 0.47379999999999994
difference: 0.47379999999999994
difference: 0.16449999999999998
difference: 0.11659999999999993
difference: 0.8814
Review 122 done.
difference: 0.5994
difference: 0.3971
difference: 0.5994
difference: 0.05710000000000004
difference: 0.08879999999999999
Review 123 done.
difference: 0.5592
difference: 0.5592
difference: 0.5316
difference: 0.9978
difference: 0.5592
Review 124 done.
difference: 0.03479999999999994
difference: 0.05170000000000008
difference: 0.18969999999999998
difference: 0.18969999999999998
difference: 0.05170000000000008
Review 125 done.
difference: 0.3612
difference: 0.7003
difference: 0.5106
difference: 0.4401
difference: 0.5106
Review 126 done.
difference: 0.0
difference: 0.12479999999999997
difference: 0.12479999999999997
difference: 0.5008999999999999
difference: 0.06789999999999996
Review 127 done.
difference: 0.8992
difference: 0.14189999999999997
difference: 0.018399999999999972
difference: 0.8601
difference: 0.4588
Review 128 done

difference: 0.9385
difference: 0.24760000000000004
difference: 0.5982
difference: 0.495
difference: 0.4694
Review 180 done.
difference: 0.5106
difference: 0.0
difference: 0.5106
difference: 0.38260000000000005
difference: 0.7369000000000001
Review 181 done.
difference: 0.04069999999999996
difference: 0.04069999999999996
difference: 0.0
difference: 0.04069999999999996
difference: 0.04069999999999996
Review 182 done.
difference: 0.43610000000000004
difference: 0.43610000000000004
difference: 0.43610000000000004
difference: 0.1794
difference: 0.43610000000000004
Review 183 done.
difference: 0.17870000000000008
difference: 0.1512
difference: 0.17870000000000008
difference: 0.17870000000000008
difference: 0.9769000000000001
Review 184 done.
difference: 1.1329
difference: 0.7717
difference: 0.669
difference: 0.9740000000000001
difference: 0.45350000000000007
Review 185 done.
difference: 0.40790000000000004
difference: 0.19899999999999995
difference: 0.19899999999999995
difference: 0.12880000

difference: 0.2676
difference: 1.4011
difference: 0.0
difference: 0.0676
difference: 0.2207
Review 240 done.
difference: 0.0
difference: 0.0
difference: 0.378
difference: 0.378
difference: 0.0
Review 241 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.11429999999999996
Review 242 done.
difference: 0.06719999999999993
difference: 0.6054
difference: 0.05919999999999992
difference: 0.22760000000000002
difference: 0.0847
Review 243 done.
difference: 0.3089
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 244 done.
difference: 1.1433
difference: 0.5600999999999999
difference: 1.1983000000000001
difference: 0.739
difference: 0.43279999999999996
Review 245 done.
difference: 0.6447
difference: 0.03859999999999997
difference: 0.17559999999999998
difference: 0.17559999999999998
difference: 0.08679999999999999
Review 246 done.
difference: 0.006799999999999973
difference: 0.15600000000000003
difference: 0.452
difference: 0.5853
difference: 

difference: 0.5499
difference: 0.6271000000000001
difference: 0.5499
difference: 0.5499
difference: 0.25750000000000006
Review 303 done.
difference: 0.2957
difference: 0.10770000000000002
difference: 0.03290000000000004
difference: 0.13590000000000002
difference: 0.21739999999999998
Review 304 done.
difference: 0.42
difference: 0.5106999999999999
difference: 1.1421999999999999
difference: 0.6428
difference: 0.06119999999999992
Review 305 done.
difference: 0.2889
difference: 0.09139999999999993
difference: 0.5128999999999999
difference: 0.09139999999999993
difference: 0.07839999999999991
Review 306 done.
difference: 0.7003
difference: 0.7003
difference: 0.10059999999999991
difference: 0.5224
difference: 0.22360000000000002
Review 307 done.
difference: 0.04730000000000001
difference: 0.12869999999999998
difference: 0.12869999999999998
difference: 0.3637
difference: 0.2232
Review 308 done.
difference: 0.0
difference: 0.07089999999999999
difference: 0.2381
difference: 0.5627
difference: 0.

difference: 0.5966
difference: 0.4845
difference: 0.4845
difference: 0.7935
difference: 0.4845
Review 363 done.
difference: 0.8106
difference: 0.1552
difference: 0.1552
difference: 1.0941
difference: 1.4238
Review 364 done.
difference: 0.0
difference: 0.0511
difference: 0.0
difference: 0.0511
difference: 0.38880000000000003
Review 365 done.
difference: 0.3612
difference: 0.7943
difference: 0.4404
difference: 0.3612
difference: 0.7964
Review 366 done.
difference: 0.23760000000000003
difference: 0.09750000000000003
difference: 0.0796
difference: 0.26030000000000003
difference: 0.0796
Review 367 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 368 done.
difference: 0.5878000000000001
difference: 0.5878000000000001
difference: 0.0
difference: 0.1491
difference: 0.0
Review 369 done.
difference: 0.16709999999999992
difference: 1.3831
difference: 0.8155
difference: 1.1667
difference: 0.5617000000000001
Review 370 done.
difference: 0.2976
difference:

difference: 0.1154
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 427 done.
difference: 0.10919999999999996
difference: 0.1482
difference: 0.0807
difference: 0.13570000000000004
difference: 0.1482
Review 428 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.6249
difference: 0.0
Review 429 done.
difference: 0.2839999999999999
difference: 0.5433999999999999
difference: 1.0114
difference: 1.0114
difference: 0.44299999999999995
Review 430 done.
difference: 0.6369
difference: 0.6369
difference: 0.25510000000000005
difference: 0.0
difference: 0.14139999999999997
Review 431 done.
difference: 0.2732
difference: 0.2732
difference: 0.7948
difference: 0.2732
difference: 0.2732
Review 432 done.
difference: 0.3246
difference: 0.0
difference: 0.0
difference: 0.0363
difference: 0.3722
Review 433 done.
difference: 0.3665
difference: 0.5968
difference: 0.22150000000000003
difference: 0.15570000000000006
difference: 1.0364
Review 434 done.
difference: 0.4215
dif

difference: 0.1451
difference: 0.21629999999999994
difference: 0.09560000000000002
difference: 0.13560000000000005
difference: 0.03810000000000002
Review 490 done.
difference: 0.04730000000000001
difference: 0.2141
difference: 0.0
difference: 0.9510000000000001
difference: 1.073
Review 491 done.
difference: 0.5423
difference: 0.5423
difference: 0.08460000000000001
difference: 0.5423
difference: 0.08350000000000002
Review 492 done.
difference: 0.2789
difference: 0.0
difference: 0.6808
difference: 0.0
difference: 0.2789
Review 493 done.
difference: 0.08410000000000006
difference: 0.05599999999999994
difference: 0.049899999999999944
difference: 0.6013999999999999
difference: 0.05599999999999994
Review 494 done.
difference: 0.5727
difference: 0.42150000000000004
difference: 0.5028999999999999
difference: 0.5028999999999999
difference: 0.1521
Review 495 done.
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
difference: 0.0
Review 496 done.
difference: 0.1033
difference: 0.103

In [106]:
print(len(sst_lpara_db))
sst_lpara_db["sentiment"].value_counts()

2035


sentiment
4    552
2    548
3    384
5    344
1    207
Name: count, dtype: int64

- The number of training data has increased to 2035 from 1343 when validation threshold was lower

In [107]:
train_sentences = sst_lpara_db['review'].values
train_labels = encoder.fit_transform(sst_lpara_db['sentiment'].values)

# tokenize sentences
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(5, activation='softmax')
])
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 250, 100)          500000    
                                                                 
 bidirectional_14 (Bidirect  (None, 128)               84480     
 ional)                                                          
                                                                 
 dense_25 (Dense)            (None, 24)                3096      
                                                                 
 dense_26 (Dense)            (None, 5)                 125       
                                                                 
Total params: 587701 (2.24 MB)
Trainable params: 587701 (2.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [108]:
num_epochs = 10
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [109]:
prediction = model.predict(test_padded)
# Get labels based on probability
encoded_labels = [0,1,2,3,4]
pred_labels = []
for p in prediction:
    pred_labels.append(encoded_labels[np.argmax(p)])
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.325


- Despite the increase in training data, the model with lenient validation does not perform better than the model with stricter validation when test data was used.