In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.utils import pad_sequences

In [None]:
# Load the preprocessed corpus; only the 'review' text column is used downstream.
train_df = pd.read_csv('EcoPreprocessed.csv')
# Empty CSV fields are parsed as NaN (a float), which cannot be tokenised as text.
# Drop them, force the remainder to str, and renumber so positions stay contiguous.
text = train_df['review'].dropna().astype(str).reset_index(drop=True)
print('Number of training sentences: ',len(text))
# Sanity check: show the second review by position (skipped if the corpus is too small).
if len(text) > 1:
    print(text.iloc[1])

In [None]:
# Learn the vocabulary from every review, then encode each review as a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text.values)
sequences = tokenizer.texts_to_sequences(text.values)
print('One sequence of a sentence looks like this: ', sequences[0])
# Concatenate the per-review id lists into a single one-dimensional stream of ids,
# so that a sliding window can later be run over it to build next-word training examples.
words = [word_id for review_ids in sequences for word_id in review_ids]
print("The words list first 10 elements: ", words[:10])
# One entry in word_index per distinct word seen while fitting.
vocab_size = len(tokenizer.word_index)
print("The number of different words: ", vocab_size)

In [None]:
sentence_len = 15                    # tokens per sliding window (input + target)
pred_len = 1                         # tokens at the end of each window to predict
train_len = sentence_len - pred_len  # tokens fed to the model as input (14)

# Sliding window over the flattened token stream, one window per start offset.
# The "+ 1" keeps the final full-length window (start = len(words) - sentence_len);
# without it the last valid window is silently dropped. If the stream is shorter
# than sentence_len the range is empty and no windows are produced.
# Note: `words` is the concatenation of all reviews, so windows may span two reviews.
seq = [words[start:start + sentence_len]
       for start in range(len(words) - sentence_len + 1)]
print("The first two 15-length lists: ", seq[:2])

# Reverse dictionary (id -> word) to decode tokenized sequences back to words
reverse_word_map = {word_id: word for word, word_id in tokenizer.word_index.items()}

# Each row in seq is a 15-token window: the first train_len (14) tokens are the
# model input and the last (15th) token is the word to predict.
trainX = [window[:train_len] for window in seq]
trainy = [window[-1] for window in seq]

In [None]:
# Next-word model: embedded token ids -> two stacked LSTMs -> dense head -> softmax over the vocabulary.
model = Sequential()
# The embedding table has vocab_size+1 rows, one more than the number of distinct words,
# and maps each id in a train_len-long input to a 50-dimensional vector.
model.add(Embedding(vocab_size+1, 50, input_length=train_len))
# return_sequences=True hands the whole sequence of hidden states to the next LSTM.
model.add(LSTM(100, return_sequences=True))
# The second LSTM returns only its final hidden state.
model.add(LSTM(100))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
# One output unit per vocabulary word; softmax turns the scores into a distribution.
model.add(Dense(vocab_size, activation='softmax'))

In [None]:
# Targets are tokenizer ids in 1..vocab_size, while the softmax layer has vocab_size
# units (classes 0..vocab_size-1), so shift every id down by one. Generation undoes
# this shift with `argmax() + 1`.
#
# Integer targets with the sparse loss replace pd.get_dummies(trainy): get_dummies
# only creates columns for ids that actually occur as a target, so the label matrix
# could end up narrower than the output layer (or with columns that do not line up
# with class indices), and it materialises a dense samples x vocabulary matrix.
train_targets = np.asarray(trainy) - 1

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(np.asarray(trainX),
          train_targets,
          epochs = 200,
          batch_size = 128)

In [None]:
def gen(model,seq,max_len = 15,input_len = None):
    """Extend a seed text by repeatedly predicting the next word.

    Args:
        model: trained next-word model. ``model.predict`` must return one score
            per class for a single input row, where class ``k`` stands for
            tokenizer id ``k + 1``.
        seq: seed text; it is tokenised with the notebook-level ``tokenizer``.
        max_len: number of words to generate after the seed.
        input_len: number of tokens the model expects as input. Defaults to the
            notebook-level ``train_len`` (the width used to build ``trainX``).

    Returns:
        The tokenised seed words followed by the generated words, joined by
        single spaces and decoded through ``reverse_word_map``.
    """
    if input_len is None:
        input_len = train_len

    tokens = tokenizer.texts_to_sequences([seq])[0]
    target_len = len(tokens) + max_len

    while len(tokens) < target_len:
        # Feed only the most recent `input_len` tokens. The slice is taken on the
        # token list itself (not on the outer batch list); when fewer tokens are
        # available, pad_sequences left-pads with zeros up to the input width.
        context = pad_sequences([tokens[-input_len:]], maxlen=input_len)
        # verbose=0: avoid one progress bar per generated word.
        scores = model.predict(context, verbose=0)
        # Class index k corresponds to tokenizer id k + 1.
        tokens.append(int(scores.argmax()) + 1)

    return " ".join(reverse_word_map[token] for token in tokens)

In [None]:
# Generate a continuation of the seed word "sound"; max_len is left at its default of 15.
gen(model, "sound")