# Overview

## Dataset

A public dataset from kaggle was used. The original can be found [here](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia)

# Setup

Import all required packages and set the base path for the datasets

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
datasetsPath = "./sentimentalAnalysisDatasets"
datasetFile = "IMDB_dataset.csv"

print("Path to dataset files:", datasetsPath)

Path to dataset files: ./sentimentalAnalysisDatasets


In [3]:
data = pd.read_csv(os.path.join(datasetsPath, datasetFile))

In [4]:
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
# Silence downcasting deprication warning
pd.set_option('future.no_silent_downcasting', True)

# Replace "positive" -> 1, "negative" -> 0

data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

In [7]:
from bs4 import BeautifulSoup
import re

def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emoticons
                           u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # Transport & map symbols
                           u"\U0001F700-\U0001F77F"  # Alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric shapes
                           u"\U0001F800-\U0001F8FF"  # Supplemental arrows
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


# Remove any and all HTML tags from the review.
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

data["review"] = data["review"].apply(remove_html_tags)
data["review"] = data["review"].apply(remove_emojis)
data["review"] = data["review"].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)


  soup = BeautifulSoup(text, "html.parser")


In [8]:
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically theres a family where a little boy J...,0
4,Petter Matteis Love in the Time of Money is a ...,1


In [9]:
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
(train_data, test_data) = train_test_split(data, test_size = 0.2, random_state=42)

In [12]:
train_data.shape


(40000, 2)

In [13]:
test_data.shape

(10000, 2)

In [14]:
tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(train_data["review"])

In [15]:
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=400)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=400)

In [16]:
Y_train = train_data["sentiment"].astype('int64')
#Y_train = np.array(Y_train, dtype='float32')  # Converts to float32
Y_test = test_data["sentiment"].astype('int64')
#Y_test = np.array(Y_test, dtype='float32')  # Converts to float32

In [17]:
Y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [18]:
model = Sequential()
model.add(Embedding(input_dim = 10000, output_dim = 128, input_length = 400))
model.add(LSTM(128, dropout=0.4, recurrent_dropout = 0.4))
model.add(Dense(1, activation = "sigmoid"))



In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics=["accuracy"])

In [21]:
history = model.fit(X_train,
                    Y_train,
                    epochs = 5,
                    batch_size = 64,
                    steps_per_epoch = 150,
                    validation_split = 0.2)


Epoch 1/5


ValueError: in user code:

    File "c:\Users\kille\.conda\envs\tensorflowgpu\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\kille\.conda\envs\tensorflowgpu\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\kille\.conda\envs\tensorflowgpu\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\kille\.conda\envs\tensorflowgpu\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\kille\.conda\envs\tensorflowgpu\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\kille\.conda\envs\tensorflowgpu\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 200), found shape=(64, 400)


In [None]:
model_eval = model.evaluate(X_test, Y_test)

print(f"Test Loss: {model_eval[0]}")
print(f"Test Accuarcy: {model_eval[1]}")