In [1]:
import numpy as np
import tensorflow as tf

In [2]:
!pip install PyPDF2



In [3]:
from PyPDF2 import PdfReader

pdf_file = open('/content/Harry Potter and the Prisoner of Azkaban.pdf', 'rb')

pdf_reader = PdfReader(pdf_file)

text = ''
for page_num in range(len(pdf_reader.pages)):
    page = pdf_reader.pages[page_num]
    text += page.extract_text()
pdf_file.close()



In [4]:
import re
text = re.sub(r'\s+', ' ', text)
text = text.replace('\n', ' ')

In [196]:
# print(text)

In [6]:
len(text)

145934

In [195]:
def add_newline(text, n=30):
    words = text.split()
    result = ""
    count = 0
    for word in words:
        result += word + " "
        count += 1
        if count == n:
            result += "\n"
            count = 0
    return result.strip()


formatted_text = add_newline(text, 20)
# print(formatted_text)

In [8]:
import spacy

nlp = spacy.load('en_core_web_sm')

def extract_dialogues_nlp(script_text):
    doc = nlp(script_text)
    dialogues = []
    current_dialogue = []

    for sent in doc.sents:
        if sent.text.startswith(('INT.', 'EXT.')):
            continue  # Skip scene headers
        if sent.text.startswith(('HARRY', 'RON', 'HERMIONE')):
            if current_dialogue:
                dialogues.append(' '.join(current_dialogue))
                current_dialogue = []
            current_dialogue.append(sent.text)
        elif current_dialogue:
            current_dialogue.append(sent.text)

    if current_dialogue:
        dialogues.append(' '.join(current_dialogue))

    return dialogues

In [9]:
dialogue_segments = extract_dialogues_nlp(formatted_text)

# for segment in dialogue_segments:
#     print(segment)

In [10]:
len(dialogue_segments)

254

In [11]:
def clean_dialogue_segments(dialogue_segments):
    cleaned_segments = []
    for segment in dialogue_segments:
        if not any(direction in segment.upper() for direction in ['(MOMENTS LATER)', '(CONTINUED)', 'INT.', 'EXT.']) and not any(char.isdigit() for char in segment):
            cleaned_segments.append(segment)
    return cleaned_segments

In [12]:
sc = clean_dialogue_segments(dialogue_segments)

In [191]:
# sc

In [14]:
def remove_text_in_brackets(sc):
    cleaned_sc = []
    for sentence in sc:
        words = sentence.split()
        cleaned_sentence = []
        skip = False
        for word in words:
            if '(' in word:
                skip = True
            if not skip:
                cleaned_sentence.append(word)
            if ')' in word:
                skip = False
        cleaned_sc.append(' '.join(cleaned_sentence))
    return cleaned_sc

In [192]:
cleaned_sc = remove_text_in_brackets(sc)
# for sentence in cleaned_sc:
#     print(sentence)

In [16]:
len(cleaned_sc)

146

In [193]:
# cleaned_sc

In [18]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [194]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_sc)
total_words = len(tokenizer.word_index) + 1  # Adding 1 for padding

In [20]:
input_sequences = []
for line in cleaned_sc:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [21]:
# input_sequences

In [22]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

print("Max Sequence Length after padding:", max_sequence_len)
print("Shape of padded sequences:", input_sequences.shape)

Max Sequence Length after padding: 170
Shape of padded sequences: (5178, 170)


In [23]:
max_sequence_len

170

In [24]:
input_sequences = np.array(input_sequences)

print(input_sequences.shape)

(5178, 170)


In [25]:
from tensorflow import keras

In [26]:
x, y = input_sequences[:,:-1], input_sequences[:,-1]

In [27]:
total_words = len(tokenizer.word_index) + 1
y = keras.utils.to_categorical(y, num_classes=total_words)

In [28]:
print(x.shape, y.shape)

(5178, 169) (5178, 2026)


In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,)

In [30]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_len-1),
#     tf.keras.layers.LSTM(150),
#     tf.keras.layers.Dense(total_words, activation='softmax')
# ])

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout

In [32]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

In [33]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 169, 100)          202600    
                                                                 
 lstm (LSTM)                 (None, 169, 150)          150600    
                                                                 
 dropout (Dropout)           (None, 169, 150)          0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 2026)              204626    
                                                                 
Total params: 658226 (2.51 MB)
Trainable params: 658226 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [121]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=30,restore_best_weights=True)

In [122]:
history = model.fit(x_train, y_train, epochs=100, verbose=1, validation_data=(x_test,y_test), callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


In [118]:
def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def predict_words(model, tokenizer, input_text, max_sequence_len, n=10, temperature=1.0):
    for _ in range(n):
        input_sequence = tokenizer.texts_to_sequences([input_text])[0]
        input_sequence = np.pad(input_sequence, (max_sequence_len-len(input_sequence)-1, 0), 'constant')
        input_sequence = np.array([input_sequence])
        predicted_probabilities = model.predict(input_sequence, verbose=0)[0]
        predicted_index = sample_with_temperature(predicted_probabilities, temperature)

        # Convert the predicted index to a word
        predicted_word = None
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                predicted_word = word
                break

        if predicted_word is None:
            break
        input_text += ' ' + predicted_word

    return input_text

In [135]:
input_text = "Harry potter"
output = predict_words(model, tokenizer, input_text, max_sequence_len, n=10, temperature=0.77)
print(f"Output : {output}")

Output : Harry potter the trees sight at the window they're and harry edges


In [163]:
input_text = "Professor. Just so you know, I don't think the map"
output = predict_words(model, tokenizer, input_text, max_sequence_len, n=10, temperature=0.65)
print(f"Output : {output}")

Output : Professor. Just so you know, I don't think the map have over the shallows of his squinting the shape thatappears


In [190]:
input_text = "Sirius finds peter pettigrew is alive and among us"
output = predict_words(model, tokenizer, input_text, max_sequence_len, n=10, temperature=0.54)
print(f"Output : {output}")

Output : Sirius finds peter pettigrew is alive and among us shock ron turns harry the invisibility would'vebetrayed ground are danger


In [198]:
input_text = "It is not and you bloody well know"
output = predict_words(model, tokenizer, input_text, max_sequence_len, n=1, temperature=1)
print(f"Output : {output}")

Output : It is not and you bloody well know her
