In [1]:
import tensorflow as tf

In [2]:
tf.__version__

'2.10.1'

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# GPU memory configuration
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only allocate memory as needed on the GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth set for GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

Memory growth set for GPUs


In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
import os

In [6]:
hp1 = pd.read_csv(r"C:\Users\thakk\Desktop\Tech\Projects\Next Word Prediction using RNNs\data\Harry Potter 1.csv",delimiter=';')
hp2 = pd.read_csv(r"C:\Users\thakk\Desktop\Tech\Projects\Next Word Prediction using RNNs\data\Harry Potter 2.csv",delimiter=';')
hp3 = pd.read_csv(r"C:\Users\thakk\Desktop\Tech\Projects\Next Word Prediction using RNNs\data\Harry Potter 3.csv",delimiter=';')

In [7]:
hp3

Unnamed: 0,CHARACTER,SENTENCE
0,HARRY,Lumos Maxima...
1,HARRY,Lumos Maxima...
2,HARRY,Lumos Maxima...
3,HARRY,Lumos... MAXIMA!
4,AUNT PETUNIA,Harry! Harry!
...,...,...
1633,HERMIONE,"How fast is it, Harry?"
1634,HARRY,Lumos.
1635,HARRY,I solemnly swear that I am up to no good.
1636,HARRY,Mischief managed.


In [8]:
hp3 = hp3.rename(columns={'CHARACTER':'Character','SENTENCE':'Sentence'})

### Merging all three dataframes into one, resetting indexes and lowercasing all the sentences

In [9]:
df = pd.concat([hp1,hp2,hp3],axis=0)

In [10]:
df[df['Character']=='Harry']

Unnamed: 0,Character,Sentence
36,Harry,"Yes, Aunt Petunia."
39,Harry,"Yes, Uncle Vernon."
59,Harry,He's asleep!
61,Harry,Sorry about him.
62,Harry,"He doesn't understand what it's like, lying th..."
...,...,...
1563,Harry,Good job.
1577,Harry,One minute.
1580,Harry,"Thanks, Hagrid."
1583,Harry,"But Hagrid, we're not allowed to do magic away..."


In [11]:
char_lengths = {char: len(char) for char in list(df['Character'].unique())}

### The lenghts for all the characters are varying, leading and trailing spaces from the Character column and lowering all names

In [12]:
df['Character'] = df['Character'].str.strip().str.lower()

In [13]:
df

Unnamed: 0,Character,Sentence
0,dumbledore,"I should've known that you would be here, Prof..."
1,mcgonagall,"Good evening, Professor Dumbledore."
2,mcgonagall,"Are the rumors true, Albus?"
3,dumbledore,"I'm afraid so, professor."
4,dumbledore,The good and the bad.
...,...,...
1633,hermione,"How fast is it, Harry?"
1634,harry,Lumos.
1635,harry,I solemnly swear that I am up to no good.
1636,harry,Mischief managed.


In [14]:
new_char_lengths = {char: len(char) for char in list(df['Character'].unique())}

In [15]:
new_char_lengths

{'dumbledore': 10,
 'mcgonagall': 10,
 'hagrid': 6,
 'petunia': 7,
 'dudley': 6,
 'vernon': 6,
 'harry': 5,
 'snake': 5,
 'someone': 7,
 'barkeep\xa0tom': 11,
 'man': 3,
 'witch': 5,
 'quirrell': 8,
 'boy': 3,
 'goblin': 6,
 'griphook': 8,
 'ollivander': 10,
 'trainmaster': 11,
 'mrs. weasley': 12,
 'george': 6,
 'fred': 4,
 'ginny': 5,
 'ron': 3,
 'woman': 5,
 'hermione': 8,
 'neville': 7,
 'malfoy': 6,
 'whispers': 8,
 'sorting hat': 11,
 'seamus': 6,
 'percy': 5,
 'sir nicholas': 12,
 'girl': 4,
 'man in paint': 12,
 'fat lady': 8,
 'snape': 5,
 'dean': 4,
 'madam hooch': 11,
 'class': 5,
 'filch': 5,
 'all': 3,
 'oliver': 6,
 'flitwick': 8,
 'draco': 5,
 'ron and harry': 13,
 'oiiver': 6,
 'lee jordan': 10,
 'gryffindors': 11,
 'flint': 5,
 'crowd': 5,
 'lee  jordan': 11,
 'hermoine': 8,
 'all 3': 5,
 'firenze': 7,
 'voldemort': 9,
 'students': 8,
 'uncle vernon': 12,
 'aunt petunia': 12,
 'dobby': 5,
 'aunt\xa0petunia\xa0& dudley': 21,
 'mr. weasley': 11,
 'fred, george, ron': 17,

### Lower casing the sentences column

In [16]:
df['Sentence'] = df['Sentence'].str.lower()

In [17]:
df

Unnamed: 0,Character,Sentence
0,dumbledore,"i should've known that you would be here, prof..."
1,mcgonagall,"good evening, professor dumbledore."
2,mcgonagall,"are the rumors true, albus?"
3,dumbledore,"i'm afraid so, professor."
4,dumbledore,the good and the bad.
...,...,...
1633,hermione,"how fast is it, harry?"
1634,harry,lumos.
1635,harry,i solemnly swear that i am up to no good.
1636,harry,mischief managed.


### Now, let's remove all the sentences by characters that are not fun

In [18]:
df['Character'].value_counts().to_dict()

{'harry': 1028,
 'ron': 536,
 'hermione': 485,
 'hagrid': 394,
 'dumbledore': 239,
 'lupin': 207,
 'mcgonagall': 152,
 'snape': 121,
 'gilderoy lockhart': 113,
 'draco': 95,
 'mrs. weasley': 75,
 'lucius malfoy': 75,
 'tom riddle': 70,
 'sirius': 70,
 'dobby': 69,
 'fudge': 65,
 'uncle vernon': 51,
 'filch': 50,
 'petunia': 46,
 'percy': 44,
 'fred': 39,
 'trelawney': 39,
 'vernon': 39,
 'mr. weasley': 39,
 'aunt marge': 37,
 'neville': 36,
 'malfoy': 36,
 'quirrell': 36,
 'george': 32,
 'seamus': 31,
 'stan shunpike': 28,
 'voldemort': 26,
 'moaning myrtle': 25,
 'madam hooch': 25,
 'oliver': 24,
 'sorting hat': 23,
 'voice': 21,
 'professor sprout': 20,
 'aragog': 20,
 'ollivander': 20,
 'dudley': 20,
 'pettigrew': 19,
 'lee jordan': 16,
 'shrunken head': 16,
 'wood': 14,
 'firenze': 14,
 'sir nicholas': 14,
 'flitwick': 13,
 'madam pomfrey': 13,
 'madam rosmerta': 13,
 'boy': 10,
 'mr. borgin': 10,
 'fat lady': 10,
 'aunt petunia': 9,
 'bem': 8,
 'ginny': 8,
 'man': 8,
 'lockhart': 

In [19]:
names = ['harry','ron','hermione']
df = df[df['Character'].isin(names)]

In [20]:
df

Unnamed: 0,Character,Sentence
36,harry,"yes, aunt petunia."
39,harry,"yes, uncle vernon."
59,harry,he's asleep!
61,harry,sorry about him.
62,harry,"he doesn't understand what it's like, lying th..."
...,...,...
1633,hermione,"how fast is it, harry?"
1634,harry,lumos.
1635,harry,i solemnly swear that i am up to no good.
1636,harry,mischief managed.


### Preprocessing all the sentences

In [21]:
sentences = df['Sentence'].to_list()

In [22]:
#tokenize 

tokenizer = Tokenizer()

In [23]:
tokenizer.fit_on_texts(sentences)

In [24]:
total_words = len(tokenizer.word_index)+1

In [25]:
# creating input sequences 

input_sequences = []

for line in sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [26]:
input_sequences

[[113, 867],
 [113, 867, 868],
 [113, 447],
 [113, 447, 584],
 [55, 448],
 [101, 61],
 [101, 61, 42],
 [24, 245],
 [24, 245, 269],
 [24, 245, 269, 8],
 [24, 245, 269, 8, 13],
 [24, 245, 269, 8, 13, 52],
 [24, 245, 269, 8, 13, 52, 585],
 [24, 245, 269, 8, 13, 52, 585, 35],
 [24, 245, 269, 8, 13, 52, 585, 35, 157],
 [24, 245, 269, 8, 13, 52, 585, 35, 157, 173],
 [24, 245, 269, 8, 13, 52, 585, 35, 157, 173, 157],
 [869, 222],
 [869, 222, 870],
 [869, 222, 870, 158],
 [869, 222, 870, 158, 871],
 [869, 222, 870, 158, 871, 586],
 [869, 222, 870, 158, 871, 586, 16],
 [869, 222, 870, 158, 871, 586, 16, 14],
 [869, 222, 870, 158, 871, 586, 16, 14, 1],
 [64, 1],
 [64, 1, 159],
 [64, 1, 159, 10],
 [13, 34],
 [13, 34, 87],
 [13, 34, 87, 77],
 [13, 34, 87, 77, 587],
 [13, 34, 87, 77, 587, 4],
 [13, 34, 87, 77, 587, 4, 6],
 [13, 34, 87, 77, 587, 4, 6, 246],
 [13, 34, 87, 77, 587, 4, 6, 246, 91],
 [20, 1],
 [3, 65],
 [3, 65, 20],
 [3, 65, 20, 1],
 [3, 65, 20, 1, 372],
 [3, 65, 20, 1, 372, 4],
 [3, 65

### Let's find the length of the longest sentence and pad all the sentences based on it.

In [27]:
max_len_sentence = max([len(x) for x in input_sequences])
max_len_sentence

28

In [28]:
input_sequences = np.array(pad_sequences(input_sequences,maxlen=max_len_sentence,padding='pre'))

In [29]:
input_sequences

array([[  0,   0,   0, ...,   0, 113, 867],
       [  0,   0,   0, ..., 113, 867, 868],
       [  0,   0,   0, ...,   0, 113, 447],
       ...,
       [  0,   0,   0, ...,  53,   4,  23],
       [  0,   0,   0, ...,   4,  23,  85],
       [  0,   0,   0, ...,   0, 853, 467]])

### Preparing X and y. i.e. input features and output labels

In [30]:
X = input_sequences[:,:-1]
y = input_sequences[:,-1]

### We should keep in mind that some input sentences (X) may or may not have same value for label/output (y). Let's check for it

In [31]:
len(y) == len(set(y))

False

### This means that we should convert our labels into a categorical type.

In [32]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [33]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Finally splitting X and y into train and test sets

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Building a model architecture

In [35]:
with tf.device('/GPU:0'):
    
    model = Sequential()
    model.add(Embedding(input_dim=total_words,output_dim=100,input_length = max_len_sentence-1))
    model.add(LSTM(150,return_sequences=True))
    model.add(Dropout(0.2))  #to deal with overfitting
    model.add(LSTM(100))
    model.add(Dense(total_words,activation='softmax'))


    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])


    model_history = model.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Although I am not satisfied with the performance, specially the validation loss and accuracy values, for now, I will save this model and see its predictions. Later, I will look at ways to improve it

In [12]:
import pickle

In [38]:
model.save('lstm_next_word.h5')

In [39]:
#also saving the tokenizer.

with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

### Loading the model and the tokenizer so that I can test on new inputs

In [15]:
from tensorflow.keras.models import load_model

In [16]:
model = load_model("lstm_next_word.h5")

with open("tokenizer.pickle",'rb') as handle:
    tokenizer = pickle.load(handle)

In [17]:
#function to predict the next word


def predict_next_word(predictor_model,tokenizer,input_sentence,max_len_sentence):

    #first, tokenize the input sentence
    tokens = tokenizer.texts_to_sequences([input_sentence])[0]
    if len(tokens)>= max_len_sentence:
        tokens = tokens[-(max_len_sentence-1):]
    
    #apply padding
    tokens = pad_sequences([tokens],maxlen=max_len_sentence-1,padding='pre')

    predicted_words_with_probabilities = model.predict(tokens,verbose=True)

    index_of_word_with_highest_chance = np.argmax(predicted_words_with_probabilities,axis=1)

    for word, index in tokenizer.word_index.items():

        if index == index_of_word_with_highest_chance:
            return word
    return None



In [18]:
next_word_1 = predict_next_word(model,tokenizer=tokenizer,input_sentence="Blimey!",max_len_sentence=model.input_shape[1]+1)



In [19]:
next_word_1

'something'

In [20]:
next_word_2 = predict_next_word(model,tokenizer=tokenizer,input_sentence="i solemnly",max_len_sentence=model.input_shape[1]+1)



In [21]:
next_word_2

'swear'

In [22]:
next_word_3 = predict_next_word(model,tokenizer=tokenizer,input_sentence="he who must not be",max_len_sentence=model.input_shape[1]+1)



In [23]:
next_word_3

'to'

In [28]:
hp

Unnamed: 0,Character,Sentence
0,HARRY,"I can’t let you out, Hedwig."
1,HARRY,I’m not allowed to use magic outside of school.
2,HARRY,"Besides, if Uncle Vernon…"
3,VERNON,Harry Potter!
4,HARRY,Now you’ve done it.
...,...,...
1695,HAGRID,Sorry I'm late.
1696,HAGRID,The owl that delivered my release papers got a...
1697,HAGRID,Some ruddy bird called Errol.
1698,HAGRID,And I'd just like to say that if it hadn't bee...


In [36]:
hp2[hp2['Character']=="HERMOINE"]

Unnamed: 0,Character,Sentence


In [40]:
df[df['Character']=="HERMIONE"]['Sentence']

261                       Harry. Hagrid.
263             It's so good to see you.
265     What did you do to your glasses?
266                       Oculus Reparo.
270                           Okay, bye.
                      ...               
1561                           Bombarda!
1577                      We have to go.
1585     What's he talking about, Harry?
1630                  This came with it.
1633              How fast is it, Harry?
Name: Sentence, Length: 315, dtype: object

In [47]:
test_sentences = ["why dogs? can it be","oculus","expecto","drink the polyjuice","how are you doing Professor","salazar","10 points to","wingardium","harry! do you want to go play some","he who must not be",
                  "all you need is the golden","his name is","Snape is a big"]

In [48]:
for text in test_sentences:
    print(f"Input text: {text}")
    next_word = predict_next_word(model,tokenizer=tokenizer,input_sentence=text,max_len_sentence=model.input_shape[1]+1)
    print(f"Predicted: {next_word}",end="\n\n")
    

Input text: why dogs? can it be
Predicted: careful

Input text: oculus
Predicted: reparo

Input text: expecto
Predicted: patronum

Input text: drink the polyjuice
Predicted: potion

Input text: how are you doing Professor
Predicted: that

Input text: salazar
Predicted: slytherin

Input text: 10 points to
Predicted: you

Input text: wingardium
Predicted: leviosa

Input text: harry! do you want to go play some
Predicted: s

Input text: he who must not be
Predicted: to

Input text: all you need is the golden
Predicted: shrieking

Input text: his name is
Predicted: a

Input text: Snape is a big
Predicted: stuff

