<a href="https://colab.research.google.com/github/e-olang/NLP/blob/main/Language%20Modeling/lstm_lang_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy
import re
import pandas as pd
import keras
import string
import nltk

In [13]:
#!pip install np_utils --quiet
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

string.punctuation = string.punctuation +'“'+'”'+'-'+'’'+'‘'+'—'
string.punctuation = string.punctuation.replace('.', '')

In [14]:
# Loads the data and preprocesses data and stores corpus in raw_text
raw_text = open('text.txt', encoding = 'utf8').read()

file_nl_removed = ""
for line in raw_text:
  line_nl_removed = line.replace("\n", " ")           
#removes newlines
  file_nl_removed += line_nl_removed

file_p = "".join([char for char in file_nl_removed if char not in string.punctuation])   
#removes all special characters
sents = nltk.sent_tokenize(file_p)
print("The number of sentences is", len(sents)) 
#prints the number of sentences

string.punctuation = string.punctuation + '.'
file_q = "".join([char for char in file_p if char not in string.punctuation])   #removes even periods.
words = nltk.word_tokenize(file_q)
print("The number of tokens is", len(words)) 
#prints the number of tokens

average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is", average_tokens) 
#prints the average number of tokens per sentence

unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) 
#prints the number of unique tokens

preprocessed_text = file_p.lower()       
#converts corpus into lowercase

The number of sentences is 981
The number of tokens is 26381
The average number of tokens per sentence is 27
The number of unique tokens are 3037


In [15]:
# Hyperparameters of the model
vocab_size = 2750  #chosen based on statistics of the model
oov_tok = '<OOV>'
embedding_dim = 100
padding_type='post'
trunc_type='post'

# tokenizes sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts([preprocessed_text])
word_index = tokenizer.word_index
seq_length = 50
tokens = tokenizer.texts_to_sequences([preprocessed_text])[0]

In [16]:
dataX = []
dataY = []

for i in range(0, len(tokens) - seq_length-1 , 1):
  seq_in = tokens[i:i + seq_length]
  seq_out = tokens[i + seq_length]

  if seq_out==1: #Skip samples where target word is OOV
    continue
    
  dataX.append(seq_in)
  dataY.append(seq_out)
 
N = len(dataX)
print ("Total training data size is -", N)
X = numpy.array(dataX)

# one hot encodes the output variable
y = numpy.array(dataY)
y = np_utils.to_categorical(dataY)

Total training data size is - 26333


In [17]:
# with embedding
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(vocab_size, activation='softmax')
])

# compiles model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           275000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dense (Dense)               (None, 2750)              354750    
                                                                 
Total params: 714,230
Trainable params: 714,230
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Uses validation split of 0.2 while training
num_epochs = 5
history = model.fit(X, y, epochs=num_epochs, batch_size = 128, verbose=1, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
#Creates word to idx map using tokenizer.word_index
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Returns the next n words greedily
def next_tokens(input_str, n):
    print ("Seed -",  input_str, sep = '\n\n')
    final_string = ''
    for i in range(n):
        token = tokenizer.texts_to_sequences([input_str])[0]
        prediction = model.predict(token, verbose=0)
        final_string = final_string + reverse_word_map[numpy.argmax(prediction[0])] + ' ' 
        input_str = input_str + ' ' + reverse_word_map[numpy.argmax(prediction[0])]
        input_str = ' '.join(input_str.split(' ')[1:])
    return final_string

In [21]:
# picks a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
input_str = ' '.join([reverse_word_map[value] for value in pattern])

#speficies an unseen input string
input_str = "The boy laughed at the fright he had caused. This time, the villagers left angrily. The third day, as the boy went up\
 the small hill, he suddenly saw a wolf attacking his sheep. He cried as hard as he could, “Wolf! Wolf! Wolf!”, but not \
 a single villager came to help him. The villagers thought that he was trying to fool them again and did not come to rescue \
 him or his sheep."
 
output = next_tokens(input_str, 10)
print("\nGenerated string -\n\n", output)


# Uses first 50 tokens from given input_str as input. Since the seq_length is 50, only 50 tokens are taken using the tokenizer.
output = next_tokens(input_str, 10)
print("\nGenerated string -\n\n", output)

Seed -

The boy laughed at the fright he had caused. This time, the villagers left angrily. The third day, as the boy went up the small hill, he suddenly saw a wolf attacking his sheep. He cried as hard as he could, “Wolf! Wolf! Wolf!”, but not  a single villager came to help him. The villagers thought that he was trying to fool them again and did not come to rescue  him or his sheep.


ValueError: ignored