# Importing Libraries

In [None]:
from keras.models import Sequential
import numpy as np
import re
from keras.layers.embeddings import Embedding
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout

# Loading Data

In [None]:
# Training data used here is present in "train.europarl"
# The context manager closes the file as soon as all lines have been read,
# instead of leaving the handle open for the rest of the kernel session.
with open("train.europarl", mode='rt', encoding='utf-8') as file:
    trainlines = file.readlines()
# Show how many training lines were read.
len(trainlines)

20000

In [None]:
# Testing data used here is present in "test.europarl"
# The context manager closes the file as soon as all lines have been read,
# instead of leaving the handle open for the rest of the kernel session.
with open("test.europarl", mode='rt', encoding='utf-8') as file:
    testlines = file.readlines()
# Show how many test lines were read.
len(testlines)

1000

# Data Preprocessing

In [None]:
from collections import Counter

# Normalize every training line (lower-case, every non-word character replaced
# by a space), split it into words, and count how often each word occurs.
#   counts    -- word -> number of occurrences over all training lines
#   traindata -- one list of normalized words per training line
counts = Counter()
traindata = []
for line in trainlines:
    # Raw string: "\W" inside a plain string literal is an invalid escape
    # sequence that newer Python versions warn about; r"\W" is the same regex.
    linesplit = re.sub(r"\W", " ", line.lower()).split()
    counts.update(linesplit)
    traindata.append(linesplit)

In [None]:
# Rebuild each training line as text, swapping every word that occurs 20 times
# or fewer in the training data for the "UNK" placeholder. Every token is
# followed by one space, so a non-empty line ends with a trailing space and an
# empty line stays "".
newdata = [
    "".join(("UNK" if counts[token] <= 20 else token) + " " for token in words)
    for words in traindata
]

# Keep a reference to the raw lines, then make the UNK-substituted text the
# working training set used by the following cells.
oldtrainines = trainlines
trainlines = newdata

In [None]:
# Learn the word -> integer id mapping from the UNK-substituted training text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainlines)

# Turn every training line into its id sequence and emit each prefix of length
# two or more: the last id of a prefix is the word to predict, the ids before
# it are the context.
inputs = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(trainlines)
    for end in range(2, len(token_ids) + 1)
]

In [None]:
# Length of the longest training prefix; all sequences are padded to this length.
max_sent_len = max(map(len, inputs))

In [None]:
# Left-pad every prefix to max_sent_len so the sequences form one rectangular array.
inputs = np.array(pad_sequences(inputs, maxlen=max_sent_len, padding='pre'))

# The last column is the word to predict; everything before it is the context.
train = inputs[:, :-1]
output = inputs[:, -1]

In [None]:
# Vocabulary size used for the embedding table and the output layer. The +1
# accounts for id 0: the Keras Tokenizer numbers words from 1, and 0 is the
# default padding value written by pad_sequences.
total_words = len(tokenizer.word_index) + 1

In [None]:
# One-hot encode the target word ids (one column per vocabulary entry).
# NOTE(review): this rebinds `output`, so re-running only this cell would
# one-hot encode an already one-hot array; re-run the cell that creates
# `output` first.
output = tf.keras.utils.to_categorical(output, num_classes=total_words)

# Model Parameters

In [None]:
# Next-word language model: the padded context ids are embedded, summarized by
# an LSTM, passed through dropout, and mapped to a softmax distribution over
# the vocabulary.
model = Sequential()
# input_length is max_sent_len-1 because the last id of every padded sequence
# was split off as the prediction target.
model.add(Embedding(total_words, 30, input_length=max_sent_len-1, name='embeddings'))
# The LSTM takes its input shape from the embedding layer. The former
# batch_input_shape=(2000, None, None) argument was removed: Keras only
# consults an input-shape argument on the first layer of a Sequential model,
# so it had no effect here (the model summary reports a batch dimension of
# None) and wrongly suggested a fixed batch size.
model.add(LSTM(30))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train, output, epochs=50, verbose=1, batch_size=2000)
model.summary()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embeddings (Embedding)      (None, 149, 30)           68430     
                                                                 
 lstm (LSTM)                 (None, 30)                7320      
                                                           

In [None]:
# Print the vocabulary size: number of words known to the tokenizer plus one.
print(len(tokenizer.word_index) + 1)

# Save/Load Model

In [None]:
# Create the output directory (no error if it already exists) and write the
# trained model into it.
!mkdir -p saved_model_emd
model.save('saved_model_emd/my_model') 
# List the output directory; it should contain the my_model directory.
!ls saved_model_emd

# List the saved model's contents: an assets folder, keras_metadata.pb,
# saved_model.pb, and a variables folder.
!ls saved_model_emd/my_model



INFO:tensorflow:Assets written to: saved_model_emd/my_model/assets


INFO:tensorflow:Assets written to: saved_model_emd/my_model/assets


my_model
assets	keras_metadata.pb  saved_model.pb  variables


In [None]:
# Reload the model from the directory written by model.save; from here on
# `model` refers to the reloaded copy.
model = tf.keras.models.load_model('saved_model_emd/my_model')

# Test/Train Perplexity Calculation

In [None]:
# Map the test lines onto the training vocabulary. Each line is normalized
# exactly like the training lines (lower-cased, every non-word character
# replaced by a space) before the frequency lookup. Without this step,
# capitalized or punctuated test tokens miss in `counts` and are turned into
# "UNK" even when the underlying word is frequent in the training data.
newtestlines = []
for line in testlines:
    linesplit = re.sub(r"\W", " ", line.lower()).split()
    # Words seen 20 times or fewer in training (unseen words count as 0)
    # become "UNK"; every token is followed by one space.
    testtext = "".join(("UNK" if counts[word] <= 20 else word) + " " for word in linesplit)
    newtestlines.append(testtext)

# Keep the raw lines for reporting and make the normalized text the working
# test set used by the following cells.
oldtestlines = testlines
testlines = newtestlines

In [None]:
# Turn every test line into its id sequence and emit each prefix of length two
# or more: the last id of a prefix is the word to predict, the ids before it
# are the context.
inputs = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(testlines)
    for end in range(2, len(token_ids) + 1)
]

In [None]:
# Left-pad the test prefixes to the training sequence length, then drop the
# last column (the word to predict) so only the context is fed to the model.
inputs = pad_sequences(inputs, maxlen=max_sent_len, padding='pre')
inputs = np.array(inputs)[:, :-1]

In [None]:
# Archive the saved-model directory (recursively) into model1.zip.
! zip model1.zip -r saved_model_emd

  adding: saved_model_emd/ (stored 0%)
  adding: saved_model_emd/my_model/ (stored 0%)
  adding: saved_model_emd/my_model/assets/ (stored 0%)
  adding: saved_model_emd/my_model/variables/ (stored 0%)
  adding: saved_model_emd/my_model/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: saved_model_emd/my_model/variables/variables.index (deflated 61%)
  adding: saved_model_emd/my_model/keras_metadata.pb (deflated 87%)
  adding: saved_model_emd/my_model/saved_model.pb (deflated 89%)


In [None]:
# Run the model on every padded test context. Row k of `prediction` is the
# softmax output over the vocabulary for row k of `inputs`.
prediction = model.predict(inputs)

In [None]:
# Score every test sentence with the model and write one
# "<original sentence>\t<perplexity>" line per scored sentence to LM_test.txt,
# followed by the average sentence perplexity on the last line.
#
# Row k of `prediction` belongs to the k-th prefix produced when the test
# inputs were built from the tokenizer's id sequences (ids 1..n-1 of each
# line, in line order). The same id sequences are recomputed here, so
# prediction rows and target ids stay aligned even if the tokenizer drops or
# splits a whitespace-separated word.
index = 0         # next unread row of `prediction`
perpnum = 0       # sum of the per-sentence perplexities
perpnumcount = 0  # number of sentences that received a perplexity

# The context manager closes the file even if scoring fails part-way through.
with open("LM_test.txt", "w") as f:
    for line, rawline in zip(testlines, oldtestlines):
        token_list = tokenizer.texts_to_sequences([line])[0]
        num_predicted = len(token_list) - 1

        # Accumulate log-probabilities instead of multiplying probabilities:
        # the raw product underflows to 0 for long sentences, which used to
        # drop those sentences from the report without notice.
        log_prob = 0.0
        with np.errstate(divide="ignore"):  # log(0) -> -inf, filtered below
            for target in token_list[1:]:
                log_prob += np.log(np.float64(prediction[index][target]))
                index += 1

        # Same filter as 0 < P < 1, expressed in log space. It also skips
        # sentences with fewer than two tokens (nothing predicted, log P == 0).
        if -np.inf < log_prob < 0:
            # Perplexity = P ** (-1 / N) = exp(-log P / N).
            perplexity = np.exp(-log_prob / num_predicted)
            print("{}\t{}".format(rawline.rstrip("\n"), perplexity), file=f)
            perpnum += perplexity
            perpnumcount += 1

    # Average over the scored sentences; nan when no sentence could be scored.
    print(perpnum / perpnumcount if perpnumcount else float("nan"), file=f)

In [None]:
# Reload the saved model.
# NOTE(review): `model` was already reloaded from this same path earlier in
# the notebook, so this looks redundant unless the cell is run in a fresh
# kernel -- confirm before removing.
model = tf.keras.models.load_model('saved_model_emd/my_model')

In [None]:
# Score every training sentence with the model and write one
# "<original sentence>\t<perplexity>" line per scored sentence to LM_train.txt,
# followed by the average sentence perplexity on the last line.

# Tokenize each (UNK-substituted) training line once. The same id lists drive
# both the model inputs and the targets scored below, which keeps prediction
# rows and target ids aligned.
token_lists = tokenizer.texts_to_sequences(trainlines)

# Every prefix of length two or more: context ids followed by the id to predict.
inputs = [
    token_list[:end]
    for token_list in token_lists
    for end in range(2, len(token_list) + 1)
]

input_sequences = np.array(pad_sequences(inputs, maxlen=max_sent_len, padding='pre'))
# Drop the last column (the word to predict); the rest is the model input.
predictors = input_sequences[:, :-1]

prediction = model.predict(predictors)

index = 0         # next unread row of `prediction`
perpnum = 0       # sum of the per-sentence perplexities
perpnumcount = 0  # number of sentences that received a perplexity

# The context manager closes the file even if scoring fails part-way through.
with open("LM_train.txt", "w") as f:
    for token_list, rawline in zip(token_lists, oldtrainines):
        num_predicted = len(token_list) - 1

        # Accumulate log-probabilities instead of multiplying probabilities:
        # the raw product underflows to 0 for long sentences, which used to
        # drop those sentences from the report without notice.
        log_prob = 0.0
        with np.errstate(divide="ignore"):  # log(0) -> -inf, filtered below
            for target in token_list[1:]:
                log_prob += np.log(np.float64(prediction[index][target]))
                index += 1

        # Same filter as 0 < P < 1, expressed in log space. It also skips
        # sentences with fewer than two tokens (nothing predicted, log P == 0).
        if -np.inf < log_prob < 0:
            # Perplexity = P ** (-1 / N) = exp(-log P / N).
            perplexity = np.exp(-log_prob / num_predicted)
            # Write the sentence without its trailing newline. The previous
            # index [-1] wrote only the last character of the raw line (the
            # newline itself) instead of the sentence text.
            print("{}\t{}".format(rawline.rstrip("\n"), perplexity), file=f)
            perpnum += perplexity
            perpnumcount += 1

    # Average over the scored sentences; nan when no sentence could be scored.
    print(perpnum / perpnumcount if perpnumcount else float("nan"), file=f)