# DATA PREPARATION

In this file, we are going to prepare our data for the deep learning algorithms we want to use for the classification problem.

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from utils import utils

# --------- Step 1: Load Data ---------
def load_labeled_corpus(pos_path, neg_path):
    data = []
    for file_path in Path(pos_path).rglob("*.txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            data.append((f.read(), 1))  # positive = 1
    for file_path in Path(neg_path).rglob("*.txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            data.append((f.read(), 0))  # negative = 0
    return pd.DataFrame(data, columns=['text', 'label'])

# Set path to your data
train_path = '../../aclImdb_v1/aclImdb/train'
df = load_labeled_corpus(train_path + '/pos', train_path + '/neg')

# --------- Step 2: Preprocess Text ---------
df['clean_text'] = df['text'].apply(utils.clean_text)

# --------- Step 3: Tokenize and Pad Sequences ---------
MAX_VOCAB = 10000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(df['clean_text'])

X = tokenizer.texts_to_sequences(df['clean_text'])
X = pad_sequences(X, maxlen=MAX_LEN)

y = df['label'].values

# --------- Step 4: Train/Test Split ---------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------- Step 5: Build RNN (LSTM) Model ---------
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=128, input_length=MAX_LEN),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [4]:
# --------- Step 6: Train Model ---------
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

# --------- Step 7: Evaluate ---------
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")

Epoch 1/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 104ms/step - accuracy: 0.7303 - loss: 0.5225 - val_accuracy: 0.8705 - val_loss: 0.3045
Epoch 2/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9296 - loss: 0.2031 - val_accuracy: 0.8610 - val_loss: 0.3156
Epoch 3/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 104ms/step - accuracy: 0.9511 - loss: 0.1435 - val_accuracy: 0.8735 - val_loss: 0.3981
Epoch 4/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9695 - loss: 0.0929 - val_accuracy: 0.8655 - val_loss: 0.4317
Epoch 5/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 108ms/step - accuracy: 0.9777 - loss: 0.0731 - val_accuracy: 0.8570 - val_loss: 0.4810
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.8445 - loss: 0.5016

Test Accuracy: 0.8498


In [9]:
model.summary()

In [None]:
# Load test data
test_path = '../../aclImdb_v1/aclImdb/test'
df_test = load_labeled_corpus(test_path + '/pos', test_path + '/neg')

# Optional: Clean the text using the same function
df_test['clean_text'] = df_test['text'].apply(utils.clean_text)

In [6]:
X_test_seq = tokenizer.texts_to_sequences(df_test['clean_text'])
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

y_test = df_test['label'].values

In [7]:
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Accuracy: {accuracy:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 25ms/step - accuracy: 0.8650 - loss: 0.4080
Test Accuracy: 0.8400


In [8]:
from sklearn.metrics import classification_report

# Predict probabilities and convert to class labels
y_pred_probs = model.predict(X_test_pad)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Print detailed classification metrics
print(classification_report(y_test, y_pred))


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step
              precision    recall  f1-score   support

           0       0.86      0.81      0.84     12500
           1       0.82      0.87      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

