# Overview

## Dataset

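A public dataset from Kaggle was used. The original can be found [here](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia). Note: this link points to the Chest X-Ray Pneumonia dataset, whereas the code below loads `IMDB_dataset.csv`; the link should be verified.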

# Setup

Import all required packages and set the base path for the datasets.

In [23]:
import pandas as pd
import numpy as np
import os

In [None]:
# Name of the CSV file and the directory it is read from.
datasetFile = "IMDB_dataset.csv"
datasetsPath = "./sentimentalAnalysisDatasets"

# Echo the directory so a wrong path is easy to spot before loading.
print(f"Path to dataset files: {datasetsPath}")

In [25]:
# Build the full path to the CSV and load it into a DataFrame.
dataset_csv = os.path.join(datasetsPath, datasetFile)
data = pd.read_csv(dataset_csv)

In [None]:
# Preview the first five rows of the loaded dataset.
data.head(5)

In [None]:
# Count how many rows carry each sentiment label.
data["sentiment"].value_counts()

In [28]:
# Opt in to pandas' future behaviour to silence the downcasting
# deprecation warning.
pd.set_option("future.no_silent_downcasting", True)

# Encode the labels as integers: "positive" -> 1, "negative" -> 0.
sentiment_codes = {"positive": 1, "negative": 0}
data.replace({"sentiment": sentiment_codes}, inplace=True)

In [None]:
from bs4 import BeautifulSoup
import re

# Compiled once when this code is first run instead of inside the function:
# remove_emojis is applied to every review, so the pattern should not be
# rebuilt (or looked up in the re cache) for each row.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F700-\U0001F77F"  # Alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric shapes
    "\U0001F800-\U0001F8FF"  # Supplemental arrows
    "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols & pictographs extended-A
    "\U0001F1E6-\U0001F1FF"  # Regional indicators (flag pairs)
    "\U00002600-\U000026FF"  # Miscellaneous symbols
    "\U00002700-\U000027BF"  # Dingbats
    "]+"
)


def remove_emojis(text):
    """Return *text* with emoji and pictograph characters removed.

    Every run of characters that falls in one of the Unicode ranges listed
    in ``_EMOJI_PATTERN`` is deleted; all other characters (including the
    whitespace around the emoji) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub("", text)


# Remove any and all HTML tags from the review.
def remove_html_tags(text):
    """Strip HTML markup from *text* and return the remaining plain text.

    The text fragments are joined with a single space. With the default
    empty separator, words on either side of a tag are glued together
    (``"good.<br /><br />Bad"`` would become ``"good.Bad"``), which later
    turns them into a single token.

    Args:
        text: The raw string, possibly containing HTML tags.

    Returns:
        The text content with tags removed.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

# Clean every review in three passes: drop HTML markup, drop emojis, then
# delete any character that is not a letter, digit, or whitespace.
cleaned_reviews = data["review"].apply(remove_html_tags).apply(remove_emojis)
data["review"] = cleaned_reviews.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)


In [None]:
# Preview the first five rows again, now that the review text has been cleaned.
data.head(5)

In [None]:
# Count the rows per sentiment label again after preprocessing.
data["sentiment"].value_counts()

In [32]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [33]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Show the (rows, columns) shape of the training split.
train_data.shape


In [None]:
# Show the (rows, columns) shape of the test split.
test_data.shape

In [36]:
# Build the word index from the training reviews only, so no test text is
# used when fitting. num_words limits the vocabulary used when texts are
# later converted to sequences.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data["review"])

In [37]:
# Turn each review into a sequence of word indices, then bring every
# sequence to a common length of 400.
max_len = 400
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])
X_train = pad_sequences(train_sequences, maxlen=max_len)
X_test = pad_sequences(test_sequences, maxlen=max_len)

In [38]:
# Cast the sentiment labels of both splits to 64-bit integers.
label_dtype = 'int64'
Y_train = train_data["sentiment"].astype(label_dtype)
Y_test = test_data["sentiment"].astype(label_dtype)

In [None]:
# Display the training labels.
Y_train

In [None]:
# Sentiment classifier: embedding -> LSTM -> single sigmoid output unit.
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=400),
    LSTM(128, dropout=0.4, recurrent_dropout=0.4),
    Dense(1, activation="sigmoid"),
])

In [None]:
# Print a summary of the model that was just assembled.
model.summary()

In [42]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the model and keep the returned history object for later inspection.
# validation_split=0.3 asks Keras to set aside 30% of the training data for
# validation; steps_per_epoch caps each epoch at 150 batches of 64.
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=10,
    steps_per_epoch=150,
    validation_split=0.3,
)


In [None]:
# Evaluate on the held-out test split. The result is indexed as
# [loss, accuracy], matching the single "accuracy" metric passed to compile().
model_eval = model.evaluate(X_test, Y_test)

print(f"Test Loss: {model_eval[0]}")
print(f"Test Accuracy: {model_eval[1]}")