# Setup

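Import the core packages (pandas, NumPy, os) and set the base path and file name of the dataset. Further libraries are imported in later cells, right before they are first used.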

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds the datasets and the name of the CSV file inside it.
datasetsPath, datasetFile = "./sentimentalAnalysisDatasets", "IMDB_dataset.csv"

# Echo the configured directory so it is visible in the notebook output.
print("Path to dataset files:", datasetsPath)

Path to dataset files: ./sentimentalAnalysisDatasets


In [3]:
# Join the configured directory and file name, then load the CSV into a DataFrame.
datasetCsvPath = os.path.join(datasetsPath, datasetFile)
data = pd.read_csv(datasetCsvPath)

In [4]:
# Preview the first five rows of the raw data (head() defaults to n=5).
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Count how many reviews carry each sentiment label.
sentiment_column = data["sentiment"]
sentiment_column.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
# Opt in to pandas' future behaviour so that `replace` does not emit the
# silent-downcasting deprecation warning.
pd.set_option("future.no_silent_downcasting", True)

# Encode the target column numerically: "positive" -> 1, "negative" -> 0.
sentiment_codes = {"positive": 1, "negative": 0}
data.replace({"sentiment": sentiment_codes}, inplace=True)

In [7]:
# Target vector: the encoded sentiment column as a float32 NumPy array.
labels = data["sentiment"].values.astype(np.float32)

In [8]:
from bs4 import BeautifulSoup
import re

# Compiled once at definition time: the function is applied to every review,
# so the pattern should not be rebuilt on each call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E6-\U0001F1FF"  # Regional indicator symbols (flags)
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F700-\U0001F77F"  # Alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric shapes
    "\U0001F800-\U0001F8FF"  # Supplemental arrows
    "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols & pictographs extended-A
    "\u2600-\u26FF"          # Miscellaneous symbols
    "\u2700-\u27BF"          # Dingbats
    "]+"
)


def remove_emojis(text):
    """Return ``text`` with every run of emoji characters deleted.

    Characters are matched by Unicode block (see ``_EMOJI_PATTERN``); each
    matched run is replaced with the empty string, so the surrounding
    whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched emoji characters. Text that contains
        no emoji is returned unchanged.
    """
    return _EMOJI_PATTERN.sub("", text)


# Remove any and all HTML tags from the review.
def remove_html_tags(text):
    """Return ``text`` with all HTML markup stripped.

    The text fragments on either side of a tag are joined with a single
    space so that words separated only by markup stay separate words.

    Args:
        text: Raw review text that may contain HTML tags such as ``<br />``.

    Returns:
        The plain-text content of ``text``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Without a separator, get_text() glues neighbouring text nodes together:
    # "good.<br /><br />The" would become "good.The", which turns into the
    # single bogus token "goodThe" once punctuation is stripped.
    return soup.get_text(separator=" ")

# Clean the review text in three passes: strip HTML markup, drop emojis, then
# delete every character that is not an ASCII letter, a digit or whitespace.
data["review"] = (
    data["review"]
    .apply(remove_html_tags)
    .apply(remove_emojis)
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)


  soup = BeautifulSoup(text, "html.parser")


In [9]:
# Preview the first five rows after encoding and cleaning (head() defaults to n=5).
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically theres a family where a little boy J...,0
4,Petter Matteis Love in the Time of Money is a ...,1


In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [11]:
# Build the word index from the cleaned reviews; num_words=10000 caps the
# vocabulary the tokenizer uses when texts are later converted to sequences.
# NOTE(review): the tokenizer is fitted on every review, i.e. before the
# train/test split performed in a later cell, so test reviews contribute to
# the vocabulary.
review_texts = data["review"]
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(review_texts)

In [12]:
# Encode every review as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(data["review"])

# Bring all sequences to a common length of 400. "pre" padding and "pre"
# truncation are the pad_sequences defaults, spelled out here for clarity.
padded_sequences = pad_sequences(
    sequences,
    maxlen=400,
    padding="pre",
    truncating="pre",
)

In [13]:

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [14]:
# 1D-convolutional text classifier, declared as an ordered list of layers.
model = Sequential([
    # Map each of the 10,000 word indices to a 128-dimensional vector
    # for sequences of length 400.
    Embedding(input_dim=10000, output_dim=128, input_length=400),
    # 64 filters over windows of 5 tokens, with L2 weight regularisation.
    Conv1D(64, 5, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.03)),
    BatchNormalization(),
    Dropout(0.5),
    # Collapse the sequence axis by keeping the maximum of each filter.
    GlobalMaxPooling1D(),
    # Single sigmoid unit for the binary sentiment output.
    Dense(1, activation="sigmoid"),
])


In [15]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 128)          1280000   
                                                                 
 conv1d (Conv1D)             (None, 396, 64)           41024     
                                                                 
 batch_normalization (BatchN  (None, 396, 64)          256       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 396, 64)           0         
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 1)                 6

In [16]:
# Configure training: Adam optimiser, binary cross-entropy loss for the single
# sigmoid output, and accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [17]:
# Training settings gathered in one place so they are easy to read and tweak.
fit_settings = {
    "epochs": 10,
    "batch_size": 64,
    # Each epoch is limited to 150 batches instead of a full pass over the data.
    "steps_per_epoch": 150,
    # Fraction of the training arrays set aside for validation.
    "validation_split": 0.3,
}

history = model.fit(X_train, y_train, **fit_settings)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Evaluate on the held-out test split; with one compiled metric the result is
# [loss, accuracy].
model_eval = model.evaluate(X_test, y_test)
test_loss, test_accuracy = model_eval

print(f"Test Loss: {test_loss}")
# NOTE(review): "Accuarcy" is a typo for "Accuracy"; kept as-is so the printed
# label matches the recorded output.
print(f"Test Accuarcy: {test_accuracy}")

Test Loss: 0.3838854432106018
Test Accuarcy: 0.883400022983551
