In [1]:
import pandas as pd
import regex as re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/amiralid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!unzip data/amazon_reviews.csv.zip

In [2]:
df = pd.read_csv('amazon_reviews.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [4]:
# Keep only reviewText column
df = df[['reviewText']]

In [5]:
df.head()

Unnamed: 0,reviewText
0,No issues.
1,"Purchased this for my device, it worked as adv..."
2,it works as expected. I should have sprung for...
3,This think has worked out great.Had a diff. br...
4,"Bought it with Retail Packaging, arrived legit..."


In [6]:
def process(text: str):
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[\/|-|>|<|=|\(|\)|\{|\}|\[\]]+', '', text) # Eliminate slash and hyphen and other chars
    text = re.sub(r'[0-9]+(gb|GB|Gb)', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2}', '', text)  # Remove dates
    
    if not text.endswith('.'):
        text += '.'  # Add a period to the end of the sentence if it doesn't have one
    text += " "
    return text

# Apply the function to each item in the column and then concatenate the results
running_string = "".join(df['reviewText'].apply(lambda x: process(str(x))))

In [7]:
sentences = sent_tokenize(running_string)

In [8]:
sentences = sentences[:9000]

In [9]:
sentences[:10]

['No issues.',
 'Purchased this for my device, it worked as advertised.',
 'You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.',
 'it works as expected.',
 'I should have sprung for the higher capacity.',
 'I think its made a bit cheesier than the earlier versions; the paint looks not as clean as before.',
 'This think has worked out great.Had a diff.',
 'bran  card and if went south after  months.This one has held up pretty well since I had my S, now on my Note.',
 "*** update I've had this for a few months and have had ZERO issue's since it was transferred from my S to my Note and into a note.",
 'This card is reliable and solid!Cheers!.']

In [10]:
sentences[100:110]

['This item is great!',
 "I can't believe how small it is.",
 "Imagine  this size back in the 's when they had whole buildings full of computers that couldn't pack this much space!",
 'Great item.',
 "I've been trying for a while to get a hold of a  Micro SD for my phone and tablet.",
 "It's pretty fast compared to the  cards I've been using.",
 'I really saw the performance boost with my phone, which I have many apps, pictures, music, and other documents on.',
 'The phone boots fully stable in about  mins as apposed to - mins.',
 'My computer also can write to these cards much faster than the  ones I was using.',
 'If you phone, tablet or any other device can support it I recommend you purchase these.']

In [11]:
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [12]:
all_words = [word.strip() for sentence in tokenized_sentences for word in sentence]
vocabulary = set(all_words)

In [13]:
all_words[115:135]

['for',
 'a',
 'few',
 'months',
 'and',
 'have',
 'had',
 'ZERO',
 'issue',
 "'s",
 'since',
 'it',
 'was',
 'transferred',
 'from',
 'my',
 'S',
 'to',
 'my',
 'Note']

In [14]:
len(vocabulary)

8198

In [15]:
# Using a list comprehension, we can loop through each (idx, word) pair from enumerate
# Each pair holds the key and value we want in our word_to_idx dict
# We start the enumeration from 1, not 0, because want 0 to represent the padding token
word_to_idx = {word.strip() : idx for idx, word in enumerate(vocabulary, 1)}
# Let's also create a idx_to_word dict so we can interpet the results of the model later
idx_to_word = {idx : word.strip() for word, idx in word_to_idx.items()}
vocab_size = len(vocabulary) + 1

In [16]:
import pickle
with open('amzn_word_to_idx.pkl', 'wb') as f:  # open a text file
    pickle.dump(word_to_idx, f)
with open('amzn_idx_to_word.pkl', 'wb') as f:
    pickle.dump(idx_to_word, f)

In [17]:
input_sequences = []
for sentence in tokenized_sentences:
  # Convert the sentence to its numerical representation with the word_to_idx mapping
  numerized_sentence = [word_to_idx[word.strip()] for word in sentence]
  # Create ngrams from size 2 to the size of the sentence
  for i in range(2, len(sentence) + 1):
    ngram = numerized_sentence[:i]
    input_sequences.append(ngram)

In [18]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Now, let's pad the sequences so they are all the same length
max_sequence_len = max([len(seq) for seq in input_sequences])
# pad_sequences adds 0s to the beginning of each array until size(vector) = max_sequence_len
# This is why we started our enumeration from 1, not 0, because 0 represents the padding token
# We use pre padding because padding at the end would cause us to lose the location of the label
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

In [20]:
X = [sequence[:-1] for sequence in input_sequences]
y = [sequence[-1] for sequence in input_sequences]

In [21]:
def pretty_display(vec: list[int], idx_to_word: dict):
    print(" ".join([idx_to_word[idx] if idx else '' for idx in vec]).strip())

def pretty_display_all(vecs: list[list[int]], idx_to_word: dict):
    for vec in vecs:
        pretty_display(vec, idx_to_word)

def pretty_display_one(idx: int, idx_to_word: dict):
    print(idx_to_word[idx])

In [22]:
pretty_display(X[1003], idx_to_word)

I would not hesitate to buy


In [23]:
pretty_display(y[1003:1008], idx_to_word)

another one of these great


In [24]:
y = to_categorical(y, num_classes=vocab_size)

In [30]:
import numpy as np

In [31]:
arr_x = np.array(X)
arr_x.shape

(140881, 198)

In [None]:
# arr_y = np.array(y)
# arr_y.shape

# Building the Model

In [25]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, SpatialDropout1D, GaussianNoise
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

In [27]:
# Building the RNN model
model = Sequential()

# Embedding layer
model.add(Embedding(vocab_size, 48, input_length=max_sequence_len-1))  # Increased embedding dimensions
model.add(SpatialDropout1D(0.15))
model.add(GaussianNoise(0.1))

# RNN 1
model.add(LSTM(96, dropout=0.2, recurrent_dropout=0.2))  # Increased units, added dropout
model.add(BatchNormalization())

# Final Layer
model.add(Dense(vocab_size, activation='softmax'))



In [28]:
model.compile(optimizer=Adam(learning_rate=0.01, clipnorm=1.0), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 198, 48)           393552    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 198, 48)          0         
 lDropout1D)                                                     
                                                                 
 gaussian_noise_1 (GaussianN  (None, 198, 48)          0         
 oise)                                                           
                                                                 
 lstm_1 (LSTM)               (None, 96)                55680     
                                                                 
 batch_normalization_1 (Batc  (None, 96)               384       
 hNormalization)                                                 
                                                      

In [32]:
# Stop the model early
early_stop = EarlyStopping(monitor='loss', patience=12, verbose=1, restore_best_weights=True)

In [29]:
from tensorflow import convert_to_tensor

In [34]:
X = convert_to_tensor(arr_x)
y = convert_to_tensor(y)

In [35]:
# This will train the model; adjust epochs and batch size as necessary
history = model.fit(X, y, epochs=50, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [36]:
model.save('amazon_reviews_LSTM.h5')

In [37]:
def predict_next_n_words(model, text, n, max_sequence_len, word_to_index, index_to_word):
    """
    Predict the next n words based on the input text.

    Args:
    - model (tf.keras.Model): Trained model for prediction.
    - text (str): Input string.
    - n (int): Number of words to predict.
    - max_sequence_len (int): Maximum length of input sequences.
    - word_to_index (dict): Mapping from words to their respective indices.
    - index_to_word (dict): Mapping from indices to their respective words.

    Returns:
    - str: Predicted sequence of words.
    """

    predicted_sequence = []

    for _ in range(n):
        # Tokenize the input string
        token_list = [word_to_index[word] for word in word_tokenize(text) if word in word_to_index]

        # Pad the token sequence
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Predict the token of the next word
        predicted_idx = np.argmax(model.predict(token_list), axis=-1)

        # Convert the token back to a word
        predicted_word = index_to_word.get(predicted_idx[0], '')

        # Append the predicted word to the sequence and to the text (for the next iteration)
        predicted_sequence.append(predicted_word)
        text += " " + predicted_word

    return ' '.join(predicted_sequence)

In [38]:
input_text = "This new phone is going to be the next biggest thing"
prediction = predict_next_n_words(model, input_text, 5, max_sequence_len, word_to_idx, idx_to_word)
print(input_text + " " + prediction)

This new phone is going to be the next biggest thing . , and they are
