# Setup

Import all required packages and set the base path for the datasets

In [1]:
import pandas as pd
import numpy as np
import os

In [None]:
# Where the data lives: the CSV file name and the directory that contains it.
datasetFile = "IMDB_dataset.csv"
datasetsPath = "./sentimentalAnalysisDatasets"

# Echo the directory so a wrong working directory is easy to spot.
print("Path to dataset files:", datasetsPath)

In [3]:
# Load the dataset CSV (directory + file name joined portably) into a DataFrame.
data = pd.read_csv(os.path.join(datasetsPath, datasetFile))

In [None]:
# Preview the first five rows of the freshly loaded data.
data.head(5)

In [None]:
# Count the rows per sentiment label to check how balanced the classes are.
data["sentiment"].value_counts()

In [6]:
# Opt in to pandas' future behaviour so replace() does not emit the
# silent-downcasting deprecation warning.
pd.set_option("future.no_silent_downcasting", True)

# Encode the labels as integers: "positive" -> 1, "negative" -> 0.
data["sentiment"] = data["sentiment"].replace({"positive": 1, "negative": 0})

In [7]:
# Extract the encoded sentiment column as a float32 NumPy array of targets.
labels = data["sentiment"].to_numpy().astype(np.float32)

In [None]:
from bs4 import BeautifulSoup
import re

# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E6-\U0001F1FF"  # Regional indicators (flags)
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F700-\U0001F77F"  # Alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric shapes
    "\U0001F800-\U0001F8FF"  # Supplemental arrows
    "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # Chess symbols, symbols & pictographs extended-A
    "\u2600-\u26FF"          # Miscellaneous symbols (e.g. U+263A smiling face)
    "\u2700-\u27BF"          # Dingbats (e.g. U+2764 heavy black heart)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Return *text* with emoji and pictographic symbols removed.

    Every run of characters from the Unicode blocks listed in
    ``_EMOJI_PATTERN`` is deleted; all other characters (including
    accented letters and punctuation) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched characters.
    """
    return _EMOJI_PATTERN.sub("", text)


# Remove any and all HTML tags from the review.
def remove_html_tags(text):
    """Return the plain text of *text* with all HTML markup stripped.

    ``<br>`` elements are replaced by a single space before the text is
    extracted, so the words on either side of a line break stay separate
    tokens instead of being fused together (e.g. ``"great.<br /><br />The"``
    no longer becomes ``"great.The"``).

    Args:
        text: A string that may contain HTML markup.

    Returns:
        The text content of the parsed markup.
    """
    soup = BeautifulSoup(text, "html.parser")
    # find_all() returns a list, so replacing nodes while looping is safe.
    for line_break in soup.find_all("br"):
        line_break.replace_with(" ")
    return soup.get_text()

# Clean every review in three passes: strip HTML markup, strip emoji, then
# delete every character that is not an ASCII letter, a digit or whitespace.
data["review"] = (
    data["review"]
    .apply(remove_html_tags)
    .apply(remove_emojis)
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)


In [None]:
# Preview the first five rows again to inspect the cleaned review text.
data.head(5)

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

In [11]:
# Build the word index from the cleaned reviews; num_words caps the
# vocabulary that is used when texts are converted to sequences.
# NOTE(review): the tokenizer is fitted on every review, including the ones
# that later end up in the test split — confirm this is acceptable.
tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(data["review"])

In [12]:
# Convert each review into a list of integer word indices, then pad or
# truncate every list to a fixed length of 400 so they can be batched.
sequences = tokenizer.texts_to_sequences(data["review"])
padded_sequences = pad_sequences(sequences, maxlen=400)

In [13]:

# Hold out 20% of the samples as a test set; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [14]:
# 1D-CNN sentiment classifier:
# token ids -> embeddings -> convolution -> global max pooling -> sigmoid.
model = Sequential()

# Declare the input explicitly (400 token ids per review) instead of passing
# the deprecated `input_length` argument to Embedding. This builds the model
# up front, so model.summary() can report concrete output shapes.
model.add(tf.keras.Input(shape=(400,), dtype="int32"))
model.add(Embedding(input_dim=10000, output_dim=128))
# 64 filters over windows of 5 tokens, with L2 weight decay on the kernels.
model.add(Conv1D(64, 5, activation="relu",
                 kernel_regularizer=tf.keras.regularizers.l2(0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
# Keep the strongest activation of each filter across the whole sequence.
model.add(GlobalMaxPooling1D())
# Single sigmoid unit: probability that the review is positive (label 1).
model.add(Dense(1, activation="sigmoid"))


In [15]:
# Stop training once the validation loss has not improved for 4 epochs.
# restore_best_weights rolls the model back to the epoch with the lowest
# val_loss; without it the model would keep the weights of the final epoch,
# which is up to `patience` epochs past the best one.
early_stop = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=4,
    restore_best_weights=True,
)

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

In [17]:
# Configure training: Adam optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit), and accuracy reported as a metric.
model.compile(optimizer = "adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

In [None]:
# Train for at most 15 epochs, holding out 30% of the training data for
# validation; early_stop may end training sooner based on val_loss.
# NOTE(review): steps_per_epoch=150 with batch_size=64 means each epoch
# processes 150 * 64 = 9,600 samples — confirm this is intended rather than
# a full pass over the training data per epoch.
history = model.fit(X_train,
                    y_train,
                    epochs = 15,
                    batch_size = 64,
                    steps_per_epoch = 150,
                    validation_split = 0.3,
                    callbacks=[early_stop])

In [None]:
# Evaluate on the held-out test split. With the loss and the single
# "accuracy" metric configured in compile(), evaluate() returns
# [loss, accuracy].
model_eval = model.evaluate(X_test, y_test)

print(f"Test Loss: {model_eval[0]}")
# The label previously read "Accuarcy"; spelling fixed.
print(f"Test Accuracy: {model_eval[1]}")