In [1]:

import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense,Embedding,LSTM,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Persist the cleaned corpus to disk, one entry per line (UTF-8).
with open('movie_clean.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(entry + '\n' for entry in cleaned_lines)


DATA PREPROCESSING

In [3]:
# Fit a word-level tokenizer on the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_lines)

# Vocabulary size: one more than the number of known words, because Keras
# reserves index 0 (used for padding) and numbers words from 1.
total_words = 1 + len(tokenizer.word_index)

In [4]:
MAX_TOKENS = 20  # Longest token prefix kept per line; change this as needed

# Convert the whole corpus to token ids in a single call rather than once per line.
tokenized_lines = tokenizer.texts_to_sequences(cleaned_lines)

input_sequences = []
for token_list in tokenized_lines:
    # Long lines are truncated (not skipped): only their first MAX_TOKENS
    # tokens are used. Slicing is a no-op for lines that are already short enough.
    token_list = token_list[:MAX_TOKENS]

    # Every prefix of length >= 2 becomes one training sequence; its last
    # token is the prediction target and the preceding tokens are the context.
    for end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:end])


In [5]:
# Inspect the generated n-gram prefixes.
# NOTE(review): this echoes the entire list; consider input_sequences[:5] to keep the output short.
input_sequences

[[1, 1896],
 [1, 1896, 36],
 [1, 1896, 36, 22],
 [1, 1896, 36, 22, 31],
 [1, 2493],
 [1, 2493, 36],
 [1, 2493, 36, 22],
 [1, 2493, 36, 22, 5],
 [1, 1896],
 [1, 1896, 3],
 [1, 1896, 3, 392],
 [1, 1896, 3, 392, 45],
 [1, 2493],
 [1, 2493, 49],
 [1, 2493, 49, 110],
 [1, 1896],
 [1, 1896, 92],
 [1, 1896, 92, 7],
 [1, 1896, 92, 7, 61],
 [1, 2493],
 [1, 2493, 1258],
 [1, 1896],
 [1, 1896, 110],
 [1, 1896, 110, 2],
 [1, 1896, 110, 2, 24],
 [1, 1896, 110, 2, 24, 115],
 [1, 1896, 110, 2, 24, 115, 127],
 [1, 1896, 110, 2, 24, 115, 127, 5],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832, 53],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832, 53, 5],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832, 53, 5, 776],
 [1, 2493],
 [1, 2493, 30],
 [1, 1896],
 [1, 1896, 3],
 [1, 1896, 3, 1],
 [1, 1896, 3, 1, 871],
 [1, 1896, 3, 1, 871, 2],
 [1, 1896, 3, 1, 871, 2, 23],
 [1, 1896, 3, 1, 871, 2, 23, 53],
 [1, 1896, 3, 1, 871, 2, 23, 53, 545],
 [1, 1896, 3, 1, 871, 2, 23, 53, 545, 2],
 [1, 1

In [6]:
# Length of the longest n-gram sequence; every sequence is padded to this length.
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

20

In [7]:
# Left-pad every sequence with zeros to a common length.
# pad_sequences already returns a NumPy array, so wrapping it in np.array(...)
# would only create a second full copy of the data.
input_sequence = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

In [8]:
# Inspect the padded sequence matrix.
input_sequence

array([[   0,    0,    0, ...,    0,    1, 1896],
       [   0,    0,    0, ...,    1, 1896,   36],
       [   0,    0,    0, ..., 1896,   36,   22],
       ...,
       [   0,    0,    0, ...,    2,   58,   88],
       [   0,    0,    0, ...,   58,   88, 7597],
       [   0,    0,    0, ...,   88, 7597, 3403]])

In [9]:
# Model inputs: every token of a row except the last one.
x = input_sequence[:, :-1]
# Targets: the final token of each row (the word to predict).
y = input_sequence[:, -1]

In [10]:
# Inspect the model inputs (context tokens).
x

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1, 1896],
       [   0,    0,    0, ...,    1, 1896,   36],
       ...,
       [   0,    0,    0, ...,  236,    2,   58],
       [   0,    0,    0, ...,    2,   58,   88],
       [   0,    0,    0, ...,   58,   88, 7597]])

In [11]:
# Inspect the targets (next-token ids).
y

array([1896,   36,   22, ...,   88, 7597, 3403])

In [12]:
# Hold out 20% of the examples. The fixed random_state makes the split (and
# therefore every downstream metric) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [13]:
# Inspect the training inputs and targets.
x_train,y_train

(array([[    0,     0,     0, ...,   156,    52,     4],
        [    0,     0,     0, ...,     8,     3,    40],
        [    0,     0,     0, ...,    13,     7,    35],
        ...,
        [    0,     0,     0, ...,     0,     0,     1],
        [    0,     0,     0, ...,   695,    27, 15508],
        [    0,     0,     0, ...,    32,     6,   746]]),
 array([13824,  1717,    26, ...,  2702,     3,    49]))

In [14]:
## Model definition

# Embedding -> two stacked LSTMs (with dropout in between) -> softmax over the vocabulary.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=200, input_length=max_sequence_len-1),
    LSTM(150, return_sequences=True),  # emits the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(256),
    Dense(total_words, activation="softmax"),
])

# Targets are integer token ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [15]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 200)           9774400   
                                                                 
 lstm (LSTM)                 (None, 19, 150)           210600    
                                                                 
 dropout (Dropout)           (None, 19, 150)           0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               416768    
                                                                 
 dense (Dense)               (None, 48872)             12560104  
                                                                 
Total params: 22,961,872
Trainable params: 22,961,872
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Stop training once val_loss has not improved for 2 consecutive epochs and
# roll back to the best weights seen. patience must stay below the number of
# training epochs (5 in the fit call), otherwise the callback can never fire.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [20]:
# Train the model; (x_test, y_test) is the validation set that EarlyStopping monitors.
# Only validation_data is passed: Keras ignores validation_split whenever
# validation_data is supplied, so giving both is misleading.
# NOTE(review): because the held-out split also drives early stopping, the
# reported val_* metrics are not an unbiased test estimate.
history = model.fit(
    x_train, y_train,
    epochs=5,
    batch_size=128,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
def predict_next_word(model,tokenizer,text,max_sequence_length):
    """Predict the most likely next word for ``text``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-token scores for each input row.
        tokenizer: Fitted Keras ``Tokenizer`` (provides ``texts_to_sequences``
            and the ``index_word`` reverse mapping).
        text: Prompt to continue.
        max_sequence_length: Padded sequence length used when building the
            training data; the model consumes ``max_sequence_length - 1``
            context tokens.

    Returns:
        The predicted word, or ``None`` when the winning index has no word
        (e.g. the reserved padding index 0).
    """
    context_len = max_sequence_length - 1
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Left-pad short prompts with zeros and keep only the most recent
    # ``context_len`` tokens of long ones, so the input always matches the
    # width the model expects.
    padded = pad_sequences(
        [token_list], maxlen=context_len, padding='pre', truncating='pre'
    )

    scores = model.predict(padded, verbose=0)
    # Single prompt -> single row; reduce to a plain int so it can be used as a dict key.
    predicted_index = int(np.argmax(scores, axis=-1)[0])

    # Direct reverse lookup instead of scanning the whole vocabulary.
    return tokenizer.index_word.get(predicted_index)

In [22]:
# Save the model to an .h5 file in the working directory.
model.save('next_word_predictor_model.h5')


In [23]:
# Serialize the fitted tokenizer so inference code can reuse the exact same vocabulary.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [25]:
# Quick smoke test: ask the model to continue a short prompt.
input_text = "All these people are "
result = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_length=max_sequence_len,
)
print(result)

going
