In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# 3. Preprocess text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [5]:
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['clean_review'] = df['review'].apply(preprocess_text)

In [6]:
# 4. Encode labels (pos=1, neg=0)
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])  # positive=1, negative=0


In [7]:
# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'], df['label'], test_size=0.2, random_state=42
)


In [8]:
# 6. TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)  # increased features
X_traintf = tfidf.fit_transform(X_train).toarray()
X_testtf = tfidf.transform(X_test).toarray()

In [9]:
# 7. Build a deeper, regularized model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_traintf.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # sigmoid for binary classification

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# 8. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [11]:
# 9. Train the model with validation split and early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [12]:
model.fit(
    X_traintf, y_train,
    epochs=20,  # reduce from 100
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.7933 - loss: 0.4250 - val_accuracy: 0.8863 - val_loss: 0.2800
Epoch 2/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.9148 - loss: 0.2136 - val_accuracy: 0.8865 - val_loss: 0.2759
Epoch 3/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.9482 - loss: 0.1431 - val_accuracy: 0.8860 - val_loss: 0.3236
Epoch 4/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.9736 - loss: 0.0804 - val_accuracy: 0.8838 - val_loss: 0.4020
Epoch 5/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.9851 - loss: 0.0462 - val_accuracy: 0.8813 - val_loss: 0.4708


<keras.src.callbacks.history.History at 0x22620658c40>

In [13]:
# 10. Evaluate the model
loss, accuracy = model.evaluate(X_testtf, y_test)
print("Test Accuracy:", accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8846 - loss: 0.2789
Test Accuracy: 0.8848999738693237
