In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk
import re


from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import f1_score, accuracy_score
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import string

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.python.keras.callbacks import EarlyStopping

In [2]:
csv_file_path = os.path.join('..', '..', 'data', 'IMDB Dataset.csv')

if os.path.exists(csv_file_path):
    df = pd.read_csv(csv_file_path)
    print(df.head())
else:           
    print("CSV file not found.")

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
sentiment_mapping = {'positive': 1, 'negative': 0}

# Apply the mapping to the "sentiment" column
df['sentiment_number'] = df['sentiment'].map(sentiment_mapping)


In [5]:
df

Unnamed: 0,review,sentiment,sentiment_number
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [8]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def preprocess_text(text):
    text = text.lower()
    return text

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    filtered_text = [word for word in text.split() if word not in stop_words]
    return " ".join(filtered_text)
df['review']=df['review'].apply(strip_html)
df['review']=df['review'].apply(preprocess_text)
df['review']=df['review'].apply(remove_punctuation)
df['review']=df['review'].apply(remove_stopwords)

  soup = BeautifulSoup(text, "html.parser")


In [9]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Print the shapes of the train and test DataFrames
print("Train Data:")
print("Shape:", train_df.shape)

print("Test Data:")
print("Shape:", test_df.shape)

Train Data:
Shape: (35000, 3)
Test Data:
Shape: (15000, 3)


In [10]:
vd=SentimentIntensityAnalyzer()
test_df.iloc[0]['review']

'really liked summerslam due look arena curtains look overall interesting reason anyways could one best summerslams ever wwf didnt lex luger main event yokozuna time ok huge fat man vs strong man im glad times changed terrible main event like every match luger terrible matches card razor ramon vs ted dibiase steiner brothers vs heavenly bodies shawn michaels vs curt hening event shawn named big monster body guard diesel irs vs 123 kid bret hart first takes doink takes jerry lawler stuff harts lawler always interesting ludvig borga destroyed marty jannetty undertaker took giant gonzalez another terrible match smoking gunns tatanka took bam bam bigelow headshrinkers yokozuna defended world title lex luger match boring terrible ending however deserves 810'

In [11]:
vd.polarity_scores(test_df.iloc[0]['review'])

{'neg': 0.119, 'neu': 0.683, 'pos': 0.199, 'compound': 0.902}

In [12]:
test_df['score']=test_df['review'].apply(lambda text:vd.polarity_scores(text))

In [13]:
test_df

Unnamed: 0,review,sentiment,sentiment_number,score
33553,really liked summerslam due look arena curtain...,positive,1,"{'neg': 0.119, 'neu': 0.683, 'pos': 0.199, 'co..."
9427,many television shows appeal quite many differ...,positive,1,"{'neg': 0.03, 'neu': 0.803, 'pos': 0.168, 'com..."
199,film quickly gets major chase scene ever incre...,negative,0,"{'neg': 0.131, 'neu': 0.699, 'pos': 0.17, 'com..."
12447,jane austen would definitely approve onegwynet...,positive,1,"{'neg': 0.123, 'neu': 0.556, 'pos': 0.322, 'co..."
39489,expectations somewhat high went see movie thou...,negative,0,"{'neg': 0.143, 'neu': 0.706, 'pos': 0.15, 'com..."
...,...,...,...,...
15168,landscape battle opens escaping prisoners snow...,positive,1,"{'neg': 0.196, 'neu': 0.58, 'pos': 0.224, 'com..."
49241,jake speed 1986 amusing parody indiana jones a...,positive,1,"{'neg': 0.152, 'neu': 0.587, 'pos': 0.262, 'co..."
39317,plan b appearance quickly made unedited sloppy...,negative,0,"{'neg': 0.191, 'neu': 0.671, 'pos': 0.138, 'co..."
42191,one perks job things slow watch movie downstai...,positive,1,"{'neg': 0.144, 'neu': 0.701, 'pos': 0.155, 'co..."


In [14]:
test_df['compound']=test_df['score'].apply(lambda d:d['compound'])
test_df['compound_score']=test_df['compound'].apply(lambda score:1 if score>=0 else 0)

In [15]:
test_df

Unnamed: 0,review,sentiment,sentiment_number,score,compound,compound_score
33553,really liked summerslam due look arena curtain...,positive,1,"{'neg': 0.119, 'neu': 0.683, 'pos': 0.199, 'co...",0.9020,1
9427,many television shows appeal quite many differ...,positive,1,"{'neg': 0.03, 'neu': 0.803, 'pos': 0.168, 'com...",0.9788,1
199,film quickly gets major chase scene ever incre...,negative,0,"{'neg': 0.131, 'neu': 0.699, 'pos': 0.17, 'com...",0.2748,1
12447,jane austen would definitely approve onegwynet...,positive,1,"{'neg': 0.123, 'neu': 0.556, 'pos': 0.322, 'co...",0.9630,1
39489,expectations somewhat high went see movie thou...,negative,0,"{'neg': 0.143, 'neu': 0.706, 'pos': 0.15, 'com...",0.3431,1
...,...,...,...,...,...,...
15168,landscape battle opens escaping prisoners snow...,positive,1,"{'neg': 0.196, 'neu': 0.58, 'pos': 0.224, 'com...",0.9501,1
49241,jake speed 1986 amusing parody indiana jones a...,positive,1,"{'neg': 0.152, 'neu': 0.587, 'pos': 0.262, 'co...",0.8315,1
39317,plan b appearance quickly made unedited sloppy...,negative,0,"{'neg': 0.191, 'neu': 0.671, 'pos': 0.138, 'co...",-0.9291,0
42191,one perks job things slow watch movie downstai...,positive,1,"{'neg': 0.144, 'neu': 0.701, 'pos': 0.155, 'co...",0.2604,1


In [16]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [17]:
print(classification_report(test_df['sentiment_number'],test_df['compound_score']))

              precision    recall  f1-score   support

           0       0.78      0.49      0.60      7411
           1       0.63      0.86      0.73      7589

    accuracy                           0.68     15000
   macro avg       0.71      0.68      0.67     15000
weighted avg       0.71      0.68      0.67     15000



In [32]:
def f1_acc(df,pred_column):
    f1_macro  = f1_score(df['sentiment_number'],df[pred_column],average='macro')
    acc = accuracy_score(df['sentiment_number'],df[pred_column])
    print("F1 Score : {} \n Accuracy : {}".format(f1_macro,acc))

In [33]:
f1_acc(test_df,"compound_score")

F1 Score : 0.6654370080291651 
 Accuracy : 0.6783333333333333


In [18]:
vd=SentimentIntensityAnalyzer()
test_df.iloc[0]['review']
vd.polarity_scores(test_df.iloc[0]['review'])

{'neg': 0.119, 'neu': 0.683, 'pos': 0.199, 'compound': 0.902}

In [19]:
blob = TextBlob(test_df.iloc[0]['review'])
polarity = blob.sentiment.polarity

print("Polarity:", polarity)

Polarity: -0.005416666666666692


In [20]:
test_df['tb_score'] = test_df['review'].apply(lambda text: TextBlob(text).sentiment.polarity)

def convert_to_compound_score(compound):
    return 1 if compound >= 0 else 0

test_df['tb_label'] = test_df['tb_score'].apply(convert_to_compound_score)

print(test_df)

                                                  review sentiment  \
33553  really liked summerslam due look arena curtain...  positive   
9427   many television shows appeal quite many differ...  positive   
199    film quickly gets major chase scene ever incre...  negative   
12447  jane austen would definitely approve onegwynet...  positive   
39489  expectations somewhat high went see movie thou...  negative   
...                                                  ...       ...   
15168  landscape battle opens escaping prisoners snow...  positive   
49241  jake speed 1986 amusing parody indiana jones a...  positive   
39317  plan b appearance quickly made unedited sloppy...  negative   
42191  one perks job things slow watch movie downstai...  positive   
15109  get past films title pecker great film perhaps...  positive   

       sentiment_number                                              score  \
33553                 1  {'neg': 0.119, 'neu': 0.683, 'pos': 0.199, 'co...   
942

In [21]:
print(classification_report(test_df['sentiment_number'],test_df['tb_label']))

              precision    recall  f1-score   support

           0       0.87      0.46      0.60      7411
           1       0.64      0.93      0.76      7589

    accuracy                           0.70     15000
   macro avg       0.75      0.70      0.68     15000
weighted avg       0.75      0.70      0.68     15000



In [34]:
f1_acc(test_df,"tb_label")

F1 Score : 0.6796813732872438 
 Accuracy : 0.6988666666666666


In [22]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

In [23]:
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
dtype=tf.string, trainable=True)

In [24]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 20)                400020    
                                                                 
 dense (Dense)               (None, 16)                336       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 400373 (1.53 MB)
Trainable params: 400373 (1.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
model.compile(optimizer='adam', 
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
metrics=['accuracy'])

In [27]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_df['review'], train_df['sentiment_number']))
test_dataset = tf.data.Dataset.from_tensor_slices((test_df['review'], test_df['sentiment_number']))


In [28]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [29]:
history = model.fit(

    train_dataset.shuffle(10000).batch(512),
    epochs=100,
    validation_data=test_dataset.batch(512),
    verbose=1,
    callbacks=[early_stopping]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
