In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk
import re


from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import f1_score, accuracy_score
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import string

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.python.keras.callbacks import EarlyStopping

### LOADING THE DATASET

In [2]:
csv_file_path = os.path.join('..', '..', 'data', 'IMDB Dataset.csv')

if os.path.exists(csv_file_path):
    df = pd.read_csv(csv_file_path)
    print(df.head())
else:           
    print("CSV file not found.")

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### DATA PREPROCESSING

In [3]:
sentiment_mapping = {'positive': 1, 'negative': 0}

# Apply the mapping to the "sentiment" column
df['sentiment_number'] = df['sentiment'].map(sentiment_mapping)


In [4]:
df

Unnamed: 0,review,sentiment,sentiment_number
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [5]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def preprocess_text(text):
    text = text.lower()
    return text

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    filtered_text = [word for word in text.split() if word not in stop_words]
    return " ".join(filtered_text)
df['review']=df['review'].apply(strip_html)
df['review']=df['review'].apply(preprocess_text)
df['review']=df['review'].apply(remove_punctuation)
df['review']=df['review'].apply(remove_stopwords)

  soup = BeautifulSoup(text, "html.parser")


In [6]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Print the shapes of the train and test DataFrames
print("Train Data:")
print("Shape:", train_df.shape)

print("Test Data:")
print("Shape:", test_df.shape)

Train Data:
Shape: (35000, 3)
Test Data:
Shape: (15000, 3)


### MULTI-LAYER PERCEPTRON (MLP)

In [21]:
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"

hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))
model.summary()

model.compile(optimizer='adam', 
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
metrics=['accuracy'])

train_dataset = tf.data.Dataset.from_tensor_slices((train_df['review'], train_df['sentiment_number']))
test_dataset = tf.data.Dataset.from_tensor_slices((test_df['review'], test_df['sentiment_number']))


early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(

    train_dataset.shuffle(10000).batch(512),
    epochs=100,
    validation_data=test_dataset.batch(512),
    verbose=1,
    callbacks=[early_stopping]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


### TEXTBLOB

In [7]:
blob = TextBlob(test_df.iloc[0]['review'])
polarity = blob.sentiment.polarity

print("Polarity:", polarity)

Polarity: -0.005416666666666692


In [8]:
test_df['tb_score'] = test_df['review'].apply(lambda text: TextBlob(text).sentiment.polarity)

def convert_to_compound_score(compound):
    return 1 if compound >= 0 else 0

test_df['tb_label'] = test_df['tb_score'].apply(convert_to_compound_score)

print(test_df)

                                                  review sentiment  \
33553  really liked summerslam due look arena curtain...  positive   
9427   many television shows appeal quite many differ...  positive   
199    film quickly gets major chase scene ever incre...  negative   
12447  jane austen would definitely approve onegwynet...  positive   
39489  expectations somewhat high went see movie thou...  negative   
...                                                  ...       ...   
15168  landscape battle opens escaping prisoners snow...  positive   
49241  jake speed 1986 amusing parody indiana jones a...  positive   
39317  plan b appearance quickly made unedited sloppy...  negative   
42191  one perks job things slow watch movie downstai...  positive   
15109  get past films title pecker great film perhaps...  positive   

       sentiment_number  tb_score  tb_label  
33553                 1 -0.005417         0  
9427                  1  0.177121         1  
199                  

In [9]:
def f1_acc(df,pred_column):
    f1_macro  = f1_score(df['sentiment_number'],df[pred_column],average='macro')
    acc = accuracy_score(df['sentiment_number'],df[pred_column])
    print("F1 Score : {} \n Accuracy : {}".format(f1_macro,acc))

In [10]:
f1_acc(test_df,"tb_label")

F1 Score : 0.6796813732872438 
 Accuracy : 0.6988666666666666


### VADER

In [11]:
vd=SentimentIntensityAnalyzer()
test_df['score']=test_df['review'].apply(lambda text:vd.polarity_scores(text))

In [12]:
test_df['compound']=test_df['score'].apply(lambda d:d['compound'])
test_df['compound_score']=test_df['compound'].apply(lambda score:1 if score>=0 else 0)

In [13]:
f1_acc(test_df,"compound_score")

F1 Score : 0.6654370080291651 
 Accuracy : 0.6783333333333333


In [14]:
test_df

Unnamed: 0,review,sentiment,sentiment_number,tb_score,tb_label,score,compound,compound_score
33553,really liked summerslam due look arena curtain...,positive,1,-0.005417,0,"{'neg': 0.119, 'neu': 0.683, 'pos': 0.199, 'co...",0.9020,1
9427,many television shows appeal quite many differ...,positive,1,0.177121,1,"{'neg': 0.03, 'neu': 0.803, 'pos': 0.168, 'com...",0.9788,1
199,film quickly gets major chase scene ever incre...,negative,0,0.174583,1,"{'neg': 0.131, 'neu': 0.699, 'pos': 0.17, 'com...",0.2748,1
12447,jane austen would definitely approve onegwynet...,positive,1,0.365385,1,"{'neg': 0.123, 'neu': 0.556, 'pos': 0.322, 'co...",0.9630,1
39489,expectations somewhat high went see movie thou...,negative,0,0.017917,1,"{'neg': 0.143, 'neu': 0.706, 'pos': 0.15, 'com...",0.3431,1
...,...,...,...,...,...,...,...,...
15168,landscape battle opens escaping prisoners snow...,positive,1,0.112626,1,"{'neg': 0.196, 'neu': 0.58, 'pos': 0.224, 'com...",0.9501,1
49241,jake speed 1986 amusing parody indiana jones a...,positive,1,-0.021212,0,"{'neg': 0.152, 'neu': 0.587, 'pos': 0.262, 'co...",0.8315,1
39317,plan b appearance quickly made unedited sloppy...,negative,0,-0.044015,0,"{'neg': 0.191, 'neu': 0.671, 'pos': 0.138, 'co...",-0.9291,0
42191,one perks job things slow watch movie downstai...,positive,1,0.144038,1,"{'neg': 0.144, 'neu': 0.701, 'pos': 0.155, 'co...",0.2604,1
