Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import re
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

from transformers import pipeline

# Reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.20.0


In [None]:
DATA_PATH = "C:/Users/james/Downloads/Amazon_reviews/Amazon_reviews.csv"

PRODUCT_COL = "ProductId"   # from your CSV
REVIEW_COL = "Text"         # from your CSV

RNN_CELL_TYPE = "LSTM"      # or "GRU"

MODEL_PATH = "amazon_rnn_model.keras"
TOKENIZER_PATH = "amazon_tokenizer.pkl"
OUTPUT_EXCEL = "C:/Users/james/Downloads/filtered_reviews_with_sentiment.xlsx"


Load Data and Filter

In [None]:
df = pd.read_csv(DATA_PATH)

# Drop rows without review text
df = df.dropna(subset=[REVIEW_COL])

print("Total rows:", len(df))
print("Columns:", df.columns.tolist())

# Top 5 products by number of reviews
top_products = (
    df[PRODUCT_COL]
    .value_counts()
    .nlargest(5)
    .index
    .tolist()
)

print("\nTop 5 products by review count:")
for i, p in enumerate(top_products):
    print(f"{i+1}. {p}")

# ONE product to train on (changing index 0-4 for top 5 products)
target_product = top_products[0]
print("\nUsing target product:", target_product)

df_product = df[df[PRODUCT_COL] == target_product].copy()
print("Rows for chosen product (before dedup):", len(df_product))


Total rows: 568454
Columns: ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']

Top 5 products by review count:
1. B007JFMH8M
2. B002QWP8H0
3. B002QWHJOU
4. B002QWP89S
5. B0026RQTGE

Using target product: B007JFMH8M
Rows for chosen product (before dedup): 913


Parameters

In [None]:
# Parameters
max_sequence_length = 40
embedding_dim = 100
units = 128
vocab_size_limit = 10000
batch_size = 64
epochs = 100              # requirement; EarlyStopping will cut it short


Preprocessing

In [None]:
# Drop missing review text
df = df.dropna(subset=["Text"])

# Pick top product by review count
top_products = df["ProductId"].value_counts().nlargest(5).index.tolist()
target_product = top_products[0]
print("Using product:", target_product)

df_product = df[df["ProductId"] == target_product].copy()

# Normalize and deduplicate reviews
def normalize_for_dedup(text):
    text = str(text).lower().strip()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text

df_product["clean_review"] = df_product["Text"].apply(normalize_for_dedup)
df_product = df_product.drop_duplicates(subset=["clean_review"], keep="first")

# Save the cleaned reviews list for tokenizer
reviews = df_product["clean_review"].tolist()
print("Unique reviews:", len(reviews))
print(reviews[:3])  # preview


# tokenize the text
tokenizer = Tokenizer(num_words=vocab_size_limit, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)

word_index = tokenizer.word_index
vocab_size = min(vocab_size_limit, len(word_index) + 1)

print("Actual vocab_size used:", vocab_size)

sequences = []

for sentence in reviews:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    if len(token_list) < 2:
        continue
    for i in range(2, len(token_list) + 1):
        n_gram_seq = token_list[:i]
        n_gram_seq = n_gram_seq[-max_sequence_length:]
        sequences.append(n_gram_seq)

if not sequences:
    raise ValueError("No sequences created. Check reviews/product choice.")

# pad to fixed length
sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding="pre"
)

# split into inputs (all but last) and labels (last token)
X = sequences[:, :-1]
y = sequences[:, -1]   # integer class ids

print("X shape:", X.shape)
print("y shape:", y.shape)


Using product: B007JFMH8M
Unique reviews: 910
['i love these cookies  not only are they healthy but they taste great and are so soft  i will definitely add these to my grocery list', 'quaker soft baked oatmeal cookies with raisins are a delicious treat great for anytime of day  for examplebr br at breakfast i had one with a large banana and a cup of coffee and felt id had a relatively healthy start to the daybr br the next day at lunch following a tuna sandwich i had one with a glass of milk and was satisfied enough to not need a snack before dinner at 630br br the following night after dinner i had one with the remainder of my glass of wine delicious and again didnt feel the need to snack later in the eveningbr br each cookie is individually packaged and their texture is soft and moist with just the right amount of sweetness natural flavors used in the making are cinnamon and all spice  these flavorings give the cookies a real oldfashioned homemade tastebr br nutritionally the cookies

Create/Train Model

In [157]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Create model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length - 1),
    # choose ONE: LSTM or GRU
    LSTM(units),
    # GRU(units),
    Dense(vocab_size, activation="softmax")
])

# a bit higher LR helps converge faster on this tiny model
optimizer = Adam(learning_rate=0.001)

model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# Early stopping
early_stopping = EarlyStopping(
    monitor="accuracy",
    patience=5,
    min_delta=0.001,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X,
    y,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=1
)

final_acc = history.history["accuracy"][-1]
print("Final training accuracy:", final_acc)

if final_acc < 0.70:
    print("WARNING: accuracy < 70%. You can:")
    print("- increase units to 64")
    print("- lower learning rate to 0.002")
    print("- increase batch_size for stability/speed.")




Epoch 1/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.0488 - loss: 5.8547
Epoch 2/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.1164 - loss: 5.1494
Epoch 3/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.1641 - loss: 4.6757
Epoch 4/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.1924 - loss: 4.3941
Epoch 5/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.2136 - loss: 4.1795
Epoch 6/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.2311 - loss: 4.0014
Epoch 7/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.2429 - loss: 3.8471
Epoch 8/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - accuracy: 0.2554 - loss: 3.7110
Epoch 9/100
[1m

Save Model

In [158]:
model.save(MODEL_PATH)
print(f"Saved model to: {MODEL_PATH}")

with open(TOKENIZER_PATH, "wb") as f:
    pickle.dump(tokenizer, f)
print(f"Saved tokenizer to: {TOKENIZER_PATH}")

Saved model to: amazon_rnn_model.keras
Saved tokenizer to: amazon_tokenizer.pkl


Reload Model

In [None]:
loaded_model = load_model(MODEL_PATH)

with open(TOKENIZER_PATH, "rb") as f:
    loaded_tokenizer = pickle.load(f)

input_length = loaded_model.input_shape[1]   # max_sequence_len - 1
max_sequence_len_loaded = input_length + 1

print("Reloaded model. Input length:", input_length)
print("Max sequence length (for generation):", max_sequence_len_loaded)

Reloaded model. Input length: 39
Max sequence length (for generation): 40


Text Generation

In [160]:
def generate_text(seed_text, next_words, tokenizer, model, max_sequence_length):
    text = seed_text

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences(
            [token_list],
            maxlen=max_sequence_length - 1,
            padding="pre"
        )

        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]
        next_word = tokenizer.index_word.get(predicted_index, "")

        if not next_word:
            break

        text += " " + next_word

    return text

# Text Generation Runs

print("=== Text Generation Outputs ===")

out1 = generate_text(
    seed_text="This product",
    next_words=10,
    tokenizer=loaded_tokenizer,
    model=loaded_model,
    max_sequence_length=max_sequence_len_loaded
)
print("1)", out1)

# "These [product]" — simple version using first token of product name
product_label = "Quaker Soft Baked Oatmeal Raisin Cookies"
seed2 = f"These {product_label}"
out2 = generate_text(
    seed_text=seed2,
    next_words=3,
    tokenizer=loaded_tokenizer,
    model=loaded_model,
    max_sequence_length=max_sequence_len_loaded
)
print("2)", out2)

out3 = generate_text(
    seed_text="I think",
    next_words=5,
    tokenizer=loaded_tokenizer,
    model=loaded_model,
    max_sequence_length=max_sequence_len_loaded
)
print("3)", out3)


=== Text Generation Outputs ===
1) This product was very good but i was pleasantly surprised how soft
2) These Quaker Soft Baked Oatmeal Raisin Cookies are my favorite
3) I think this cookie was in my


Sentiment Analysis

In [161]:
%%capture
!pip install openpyxl

In [162]:
# Sentiment Analysis (VADER + TextBlob) & Excel Export

analyzer = SentimentIntensityAnalyzer()

def clean_for_sentiment(text: str) -> str:
    return str(text).strip()

vader_scores = []
tb_polarities = []

for review in df_product[REVIEW_COL]:
    txt = clean_for_sentiment(review)

    vs = analyzer.polarity_scores(txt)["compound"]
    vader_scores.append(vs)

    tb = TextBlob(txt).sentiment.polarity
    tb_polarities.append(tb)

df_product["VADER_Compound"] = vader_scores
df_product["TextBlob_Polarity"] = tb_polarities

# Save Excel file with filtered reviews + sentiment scores
df_product.to_excel(OUTPUT_EXCEL, index=False)
print(f"Saved Excel with sentiment scores to: {OUTPUT_EXCEL}")


Saved Excel with sentiment scores to: C:/Users/james/Downloads/filtered_reviews_with_sentiment.xlsx


Transformer

In [None]:
# Summarization
print("=== Transformer Summarization ===")

# Concatenate reviews (truncate to keep it reasonable for the model)
all_reviews_text = " ".join(df_product[REVIEW_COL].astype(str).tolist())
max_chars = 4000
if len(all_reviews_text) > max_chars:
    all_reviews_text = all_reviews_text[:max_chars]

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn"
)

summary = summarizer(
    all_reviews_text,
    max_length=50,    # target about 30-50 words
    min_length=30,
    do_sample=False
)[0]["summary_text"]

print("Summary (English):")
print(summary)


# Translation (EN -> ES)

print("=== Translation EN -> ES ===")

translator = pipeline(
    "translation_en_to_es",
    model="Helsinki-NLP/opus-mt-en-es"
)

translation = translator(summary)[0]["translation_text"]

print("Summary (Spanish):")
print(translation)


=== Transformer Summarization ===


Device set to use cpu


Summary (English):
Quaker Soft Baked Oatmeal Cookies with raisins are a delicious treat, great for anytime of day. Nutritionally, the cookies have 170 calories each, 1.5g saturated fat, 150 mg sodium, and 12
=== Translation EN -> ES ===


Device set to use cpu


Summary (Spanish):
Las galletas de avena al horno suave con pasas son una deliciosa delicia, ideal para cualquier momento del día. Nutricionalmente, las galletas tienen 170 calorías cada una, 1,5 g de grasa saturada, 150 mg de sodio y 12
