## Import

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.utils import set_random_seed

## Load Dataset

In [3]:
# Location of the reviews CSV, given relative to the current working directory.
dataset_path = "./IMDB Dataset.csv"

In [4]:
# Load the CSV at `dataset_path` into a DataFrame with pandas' default parsing options.
df = pd.read_csv(dataset_path)

## EDA

In [5]:
# Per-column summary (count / unique / top / freq) of the freshly loaded frame.
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
# Column names are picked by position: the first column holds the review
# text and the last column holds the sentiment label.
reviews_column, target_column = df.columns[0], df.columns[-1]

In [7]:
# Class balance: number of rows per sentiment label.
df[target_column].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

## Preprocessing

### Drop Duplicates

In [8]:
# Remove rows that repeat an earlier row in every column (the first occurrence
# is kept). `df` is rebound here, so the unfiltered frame is no longer available
# after this cell runs.
df = df.drop_duplicates()

In [9]:
# Re-run the summary after duplicate removal to compare counts with the raw frame.
df.describe()

Unnamed: 0,review,sentiment
count,49582,49582
unique,49582,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,24884


In [10]:
# Re-check the class balance now that duplicate rows are gone.
df[target_column].value_counts()

sentiment
positive    24884
negative    24698
Name: count, dtype: int64

### TF-IDF vectorization

In [11]:
# Cap on the TF-IDF vocabulary size; the same value is reused as the model's input width.
max_features = 5000

In [12]:
# TF-IDF vectorizer whose vocabulary is limited to `max_features` terms;
# every other option is left at the scikit-learn default.
tf_idf_vectorizer = TfidfVectorizer(max_features = max_features)

In [13]:
# Learn the vocabulary / IDF weights and encode every review as one TF-IDF row.
# The sparse result is cast to float32 *before* it is densified: a dense
# float64 array of roughly 50k x 5000 needs about 2 GB, float32 halves that,
# and Keras works in float32 by default anyway.
# NOTE(review): the vectorizer is fitted on all rows, including the ones that
# later end up in the test split, so the IDF statistics have seen test data.
# Fitting on training rows only would require splitting the raw text first.
reviews_vectorized = (
    tf_idf_vectorizer
    .fit_transform(df[reviews_column])
    .astype("float32")
    .toarray()
)

In [14]:
# Sanity check: (number of reviews, number of TF-IDF features).
reviews_vectorized.shape

(49582, 5000)

### Label Encoding

In [15]:
# Encoder that maps each distinct label value to an integer code.
label_encoder = LabelEncoder()

In [16]:
# Fit on the sentiment column and convert it to integer class ids in one step.
# LabelEncoder orders its classes by sorted value, so with the two labels in
# this column 'negative' becomes 0 and 'positive' becomes 1.
encoded_sentiment = label_encoder.fit_transform(df[target_column])

In [17]:
# Display the encoded label array (NumPy abbreviates long arrays in its repr).
encoded_sentiment

array([1, 1, 1, ..., 0, 0, 0])

### Split Dataset

In [18]:
# Conventional feature / target aliases used by the split and training cells.
x, y = reviews_vectorized, encoded_sentiment

In [19]:
# Hold out a test set (scikit-learn's default test share is 25%), stratified
# on the label so both splits keep the same class ratio. `random_state` pins
# the shuffle so the split, and every metric computed on it, is reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42
)

In [20]:
# Confirm that the training features and labels have matching row counts.
x_train.shape, y_train.shape

((37186, 5000), (37186,))

## Train Model

In [21]:
# Network dimensions: the input width equals the TF-IDF vocabulary size, both
# hidden layers share one width, and a single output unit carries the binary
# sentiment prediction.
input_shape = (max_features,)
layer_1_units = layer_2_units = 256
output_layer_units = 1


In [22]:
# Seed the Python, NumPy and Keras-backend random generators before any weights
# are created, so weight initialisation is repeatable when the notebook is run
# top to bottom.
set_random_seed(42)

# Fully connected binary classifier: two ReLU hidden layers followed by a
# single sigmoid unit whose output is the predicted probability of class 1.
model = Sequential([
    Input(shape=input_shape),
    Dense(units=layer_1_units, activation='relu'),
    Dense(units=layer_2_units, activation='relu'),
    Dense(units=output_layer_units, activation='sigmoid'),
])

In [23]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

In [24]:
# Compile-time settings: Adam optimiser, binary cross-entropy loss (the usual
# pairing for a single sigmoid output), and accuracy as the tracked metric.
optimizer, loss, metrics = 'adam', 'binary_crossentropy', ['accuracy']

In [25]:
# Attach the optimiser, loss and metric settings to the model so it can be trained.
model.compile(optimizer = optimizer, loss = loss, metrics = metrics)

In [26]:
# Training schedule: number of passes over the data, mini-batch size, and the
# fraction of the training rows that Keras sets aside for validation.
epochs, batch_size, validation_split = 2, 32, 0.1

In [27]:
# Train on the training split; Keras reports validation metrics after each
# epoch on the `validation_split` share it holds back. The returned History
# object is the cell's displayed value.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
)

Epoch 1/2
[1m1046/1046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.8392 - loss: 0.3678 - val_accuracy: 0.8933 - val_loss: 0.2557
Epoch 2/2
[1m1046/1046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.9221 - loss: 0.2011 - val_accuracy: 0.8922 - val_loss: 0.2579


<keras.src.callbacks.history.History at 0x1ffa3838190>

In [29]:
# Score the trained model on the held-out test split; with one compiled metric
# `evaluate` returns the loss followed by that metric.
# NOTE(review): this rebinds `loss`, which an earlier cell set to the loss name
# passed to `model.compile`. Re-running that compile cell after this one would
# hand it a float instead of the loss name.
loss, accuracy = model.evaluate(x_test, y_test)

[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8871 - loss: 0.2641


In [30]:
# Report the test-set loss and accuracy, one labelled line each.
for label, value in (("Loss : ", loss), ("Accuracy : ", accuracy)):
    print(label, value)

Loss :  0.2807644307613373
Accuracy :  0.8796386122703552
