In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk
import re


from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.python.keras.callbacks import EarlyStopping

In [2]:
csv_file_path = os.path.join('..', '..', 'data', 'IMDB Dataset.csv')

if os.path.exists(csv_file_path):
    df = pd.read_csv(csv_file_path)
    print(df.head())
else:
    print("CSV file not found.")

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [12]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
sentiment_mapping = {'positive': 1, 'negative': 0}

# Apply the mapping to the "sentiment" column
df['sentiment_number'] = df['sentiment'].map(sentiment_mapping)


In [51]:
df

Unnamed: 0,review,sentiment,sentiment_number
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [4]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

df['review']=df['review'].apply(strip_html)

  soup = BeautifulSoup(text, "html.parser")


In [5]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Print the shapes of the train and test DataFrames
print("Train Data:")
print("Shape:", train_df.shape)

print("Test Data:")
print("Shape:", test_df.shape)

Train Data:
Shape: (35000, 3)
Test Data:
Shape: (15000, 3)


In [58]:
vd=SentimentIntensityAnalyzer()
test_df.iloc[0]['review']

"I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozuna defended th

In [59]:
vd.polarity_scores(test_df.iloc[0]['review'])

{'neg': 0.103, 'neu': 0.775, 'pos': 0.122, 'compound': 0.0891}

In [60]:
test_df['score']=test_df['review'].apply(lambda text:vd.polarity_scores(text))

In [61]:
test_df

Unnamed: 0,review,sentiment,sentiment_number,score
33553,I really liked this Summerslam due to the look...,positive,1,"{'neg': 0.103, 'neu': 0.775, 'pos': 0.122, 'co..."
9427,Not many television shows appeal to quite as m...,positive,1,"{'neg': 0.023, 'neu': 0.865, 'pos': 0.112, 'co..."
199,The film quickly gets to a major chase scene w...,negative,0,"{'neg': 0.052, 'neu': 0.808, 'pos': 0.14, 'com..."
12447,Jane Austen would definitely approve of this o...,positive,1,"{'neg': 0.114, 'neu': 0.681, 'pos': 0.205, 'co..."
39489,Expectations were somewhat high for me when I ...,negative,0,"{'neg': 0.088, 'neu': 0.824, 'pos': 0.087, 'co..."
...,...,...,...,...
15168,"""Landscape after a battle"" opens with escaping...",positive,1,"{'neg': 0.148, 'neu': 0.679, 'pos': 0.173, 'co..."
49241,Jake Speed (1986) was an amusing parody of Ind...,positive,1,"{'neg': 0.083, 'neu': 0.737, 'pos': 0.18, 'com..."
39317,"PLAN B has the appearance of a quickly made, u...",negative,0,"{'neg': 0.094, 'neu': 0.831, 'pos': 0.075, 'co..."
42191,One of the perks of my job is that when things...,positive,1,"{'neg': 0.071, 'neu': 0.783, 'pos': 0.146, 'co..."


In [62]:
test_df['compound']=test_df['score'].apply(lambda d:d['compound'])
test_df['compound_score']=test_df['compound'].apply(lambda score:1 if score>=0 else 0)

In [63]:
test_df

Unnamed: 0,review,sentiment,sentiment_number,score,compound,compound_score
33553,I really liked this Summerslam due to the look...,positive,1,"{'neg': 0.103, 'neu': 0.775, 'pos': 0.122, 'co...",0.0891,1
9427,Not many television shows appeal to quite as m...,positive,1,"{'neg': 0.023, 'neu': 0.865, 'pos': 0.112, 'co...",0.9848,1
199,The film quickly gets to a major chase scene w...,negative,0,"{'neg': 0.052, 'neu': 0.808, 'pos': 0.14, 'com...",0.9245,1
12447,Jane Austen would definitely approve of this o...,positive,1,"{'neg': 0.114, 'neu': 0.681, 'pos': 0.205, 'co...",0.8873,1
39489,Expectations were somewhat high for me when I ...,negative,0,"{'neg': 0.088, 'neu': 0.824, 'pos': 0.087, 'co...",0.2058,1
...,...,...,...,...,...,...
15168,"""Landscape after a battle"" opens with escaping...",positive,1,"{'neg': 0.148, 'neu': 0.679, 'pos': 0.173, 'co...",0.9761,1
49241,Jake Speed (1986) was an amusing parody of Ind...,positive,1,"{'neg': 0.083, 'neu': 0.737, 'pos': 0.18, 'com...",0.9041,1
39317,"PLAN B has the appearance of a quickly made, u...",negative,0,"{'neg': 0.094, 'neu': 0.831, 'pos': 0.075, 'co...",-0.7551,0
42191,One of the perks of my job is that when things...,positive,1,"{'neg': 0.071, 'neu': 0.783, 'pos': 0.146, 'co...",0.9156,1


In [64]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [65]:
print(classification_report(test_df['sentiment_number'],test_df['compound_score']))

              precision    recall  f1-score   support

           0       0.78      0.54      0.64      7411
           1       0.65      0.85      0.74      7589

    accuracy                           0.70     15000
   macro avg       0.72      0.70      0.69     15000
weighted avg       0.72      0.70      0.69     15000



In [None]:
vd=SentimentIntensityAnalyzer()
test_df.iloc[0]['review']
vd.polarity_scores(test_df.iloc[0]['review'])

In [68]:
blob = TextBlob(test_df.iloc[0]['review'])
polarity = blob.sentiment.polarity

print("Polarity:", polarity)

Polarity: -0.003968253968253997


In [69]:
test_df['tb_score'] = test_df['review'].apply(lambda text: TextBlob(text).sentiment.polarity)

def convert_to_compound_score(compound):
    return 1 if compound >= 0 else 0

test_df['tb_label'] = test_df['tb_score'].apply(convert_to_compound_score)

print(test_df)

                                                  review sentiment  \
33553  I really liked this Summerslam due to the look...  positive   
9427   Not many television shows appeal to quite as m...  positive   
199    The film quickly gets to a major chase scene w...  negative   
12447  Jane Austen would definitely approve of this o...  positive   
39489  Expectations were somewhat high for me when I ...  negative   
...                                                  ...       ...   
15168  "Landscape after a battle" opens with escaping...  positive   
49241  Jake Speed (1986) was an amusing parody of Ind...  positive   
39317  PLAN B has the appearance of a quickly made, u...  negative   
42191  One of the perks of my job is that when things...  positive   
15109  Once you can get past the film's title, "Pecke...  positive   

       sentiment_number                                              score  \
33553                 1  {'neg': 0.103, 'neu': 0.775, 'pos': 0.122, 'co...   
942

In [70]:
print(classification_report(test_df['sentiment_number'],test_df['tb_label']))

              precision    recall  f1-score   support

           0       0.89      0.42      0.58      7411
           1       0.63      0.95      0.76      7589

    accuracy                           0.69     15000
   macro avg       0.76      0.69      0.67     15000
weighted avg       0.76      0.69      0.67     15000



In [6]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

In [7]:
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
dtype=tf.string, trainable=True)

In [8]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 20)                400020    
                                                                 
 dense (Dense)               (None, 16)                336       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 400373 (1.53 MB)
Trainable params: 400373 (1.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
model.compile(optimizer='adam', 
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
metrics=['accuracy'])

In [11]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_df['review'], train_df['sentiment_number']))
test_dataset = tf.data.Dataset.from_tensor_slices((test_df['review'], test_df['sentiment_number']))


In [12]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [13]:
history = model.fit(

    train_dataset.shuffle(10000).batch(512),
    epochs=100,
    validation_data=test_dataset.batch(512),
    verbose=1,
    callbacks=[early_stopping]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
