In [None]:
import nltk
nltk.download('gutenberg')

nltk.download('punkt')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [1]:


import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense,Embedding,LSTM,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
import re

# Step 1: Load the file
with open('movie_lines.txt', 'r', encoding='utf-8', errors='ignore') as file:
    raw_lines = file.readlines()

cleaned_lines = []

# Step 2: Process each line
for line in raw_lines:
    parts = line.strip().split()

    # Skip lines that are too short to contain dialogue
    if len(parts) < 5:
        continue

    # Only keep parts from index 4 onward (dialogue part)
    dialogue = ' '.join(parts[4:])

    # Clean the dialogue
    dialogue = re.sub(r'[^a-zA-Z\s]', ' ', dialogue)  # remove non-alphabetic characters
    dialogue = re.sub(r'\s+', ' ', dialogue).strip()  # normalize spaces

    if dialogue:  # skip empty results
        cleaned_lines.append(dialogue.lower())

# Step 3: Save to cleaned file
with open('movie_clean.txt', 'w', encoding='utf-8') as file:
    for line in cleaned_lines:
        file.write(line + '\n')


DATA PREPROCESSING

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_lines)
total_words = len(tokenizer.word_index) + 1

In [4]:
input_sequences = []

MAX_TOKENS = 20  # Change this as needed

for line in cleaned_lines:
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Skip lines that are too long
    if len(token_list) > MAX_TOKENS:
        token_list = token_list[:MAX_TOKENS]

    # generate n-gram sequences
    for i in range(1, len(token_list)):
        n_gram_seq = token_list[:i+1]
        input_sequences.append(n_gram_seq)


In [5]:
input_sequences

[[1, 1896],
 [1, 1896, 36],
 [1, 1896, 36, 22],
 [1, 1896, 36, 22, 31],
 [1, 2493],
 [1, 2493, 36],
 [1, 2493, 36, 22],
 [1, 2493, 36, 22, 5],
 [1, 1896],
 [1, 1896, 3],
 [1, 1896, 3, 392],
 [1, 1896, 3, 392, 45],
 [1, 2493],
 [1, 2493, 49],
 [1, 2493, 49, 110],
 [1, 1896],
 [1, 1896, 92],
 [1, 1896, 92, 7],
 [1, 1896, 92, 7, 61],
 [1, 2493],
 [1, 2493, 1258],
 [1, 1896],
 [1, 1896, 110],
 [1, 1896, 110, 2],
 [1, 1896, 110, 2, 24],
 [1, 1896, 110, 2, 24, 115],
 [1, 1896, 110, 2, 24, 115, 127],
 [1, 1896, 110, 2, 24, 115, 127, 5],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832, 53],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832, 53, 5],
 [1, 1896, 110, 2, 24, 115, 127, 5, 832, 53, 5, 776],
 [1, 2493],
 [1, 2493, 30],
 [1, 1896],
 [1, 1896, 3],
 [1, 1896, 3, 1],
 [1, 1896, 3, 1, 871],
 [1, 1896, 3, 1, 871, 2],
 [1, 1896, 3, 1, 871, 2, 23],
 [1, 1896, 3, 1, 871, 2, 23, 53],
 [1, 1896, 3, 1, 871, 2, 23, 53, 545],
 [1, 1896, 3, 1, 871, 2, 23, 53, 545, 2],
 [1, 1

In [6]:
##pad sequence
max_sequence_len = max([len(x) for x in input_sequences])
max_sequence_len

20

In [7]:
input_sequence = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [8]:
input_sequence

array([[   0,    0,    0, ...,    0,    1, 1896],
       [   0,    0,    0, ...,    1, 1896,   36],
       [   0,    0,    0, ..., 1896,   36,   22],
       ...,
       [   0,    0,    0, ...,    2,   58,   88],
       [   0,    0,    0, ...,   58,   88, 7597],
       [   0,    0,    0, ...,   88, 7597, 3403]], dtype=int32)

In [9]:
x,y = input_sequence[:,:-1],input_sequence[:,-1]

In [10]:
x

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1, 1896],
       [   0,    0,    0, ...,    1, 1896,   36],
       ...,
       [   0,    0,    0, ...,  236,    2,   58],
       [   0,    0,    0, ...,    2,   58,   88],
       [   0,    0,    0, ...,   58,   88, 7597]], dtype=int32)

In [11]:
y

array([1896,   36,   22, ...,   88, 7597, 3403], dtype=int32)

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [13]:
x_train,y_train

(array([[   0,    0,    0, ...,    1,  993,    8],
        [   0,    0,    0, ...,   44,  247,   40],
        [   0,    0,    0, ...,  144, 1763, 3106],
        ...,
        [   0,    0,    0, ...,    0,    1,  949],
        [   0,    1, 4897, ...,    8,   46,   21],
        [   0,    0,    0, ...,    4,  467,   11]], dtype=int32),
 array([  256, 29242,  2631, ...,   586,  1802,    93], dtype=int32))

In [14]:
## Model Training

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=200, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dense(total_words, activation="softmax"))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [15]:
model.summary()

In [16]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)

In [17]:
history = model.fit(
    x_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(x_test, y_test),
    validation_split=0.1,
    callbacks=[early_stopping]
)

Epoch 1/5
[1m37662/37662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1044s[0m 28ms/step - accuracy: 0.1202 - loss: 5.9476 - val_accuracy: 0.1618 - val_loss: 5.3829
Epoch 2/5
[1m37662/37662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1033s[0m 27ms/step - accuracy: 0.1638 - loss: 5.3158 - val_accuracy: 0.1716 - val_loss: 5.3022
Epoch 3/5
[1m37662/37662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1044s[0m 27ms/step - accuracy: 0.1725 - loss: 5.2076 - val_accuracy: 0.1755 - val_loss: 5.2719
Epoch 4/5
[1m37662/37662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1034s[0m 27ms/step - accuracy: 0.1778 - loss: 5.1342 - val_accuracy: 0.1787 - val_loss: 5.2602
Epoch 5/5
[1m37662/37662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1035s[0m 27ms/step - accuracy: 0.1814 - loss: 5.0827 - val_accuracy: 0.1807 - val_loss: 5.2447


In [18]:
def predict_next_word(model,tokenizer,text,max_sequence_length):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if(len(token_list) > max_sequence_length):
        token_list=token_list[-(max_sequence_length - 1):]
    token_list = pad_sequences([token_list],maxlen=max_sequence_length-1,padding='pre')
    predict = model.predict(token_list,verbose=0)
    predicted_word_index = np.argmax(predict,axis=1)
    for word,index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [35]:
model.save('wiki_model.h5')




In [36]:
with open('tokenizerr_wiki.pickle', 'wb') as handle:
    import pickle
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [37]:
from google.colab import files

# Download model
files.download('wiki_model.h5')

# Download tokenizer
files.download('tokenizerr_wiki.pickle')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [43]:
input_text = "do you"
result=predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(result)

know
