In [60]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout

from collections import Counter

In [61]:
def build_vocab(filepath, vocab_size=20000, add_special_tokens=True):
    #Reading Each token in data line by line
    tokens = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens.extend(line.strip().split(" "))

    #Storing frequency of all tokens
    freqs = Counter(tokens)

    #word2idx is something that we would be sending to Embedding layer
    #idx2word is something that we would need to get the word from the predicted index
    word2idx = {}
    idx2word = {}
    start_index=0

    if add_special_tokens:
        specials = ["<pad>", "<unk>", "</s>"]

        for i, tok in enumerate(specials):
            word2idx[tok] = i
            idx2word[i] = tok
        start_index = len(specials)

    #Taking out the most common tokens
    most_common = freqs.most_common(vocab_size-start_index)

    #Adding all the most_common words in word2idx and idx2word dict
    for i, (word, freq) in enumerate(most_common, start=start_index):
        word2idx[word] = i
        idx2word[i] = word

    return idx2word, word2idx

In [62]:
idx2word_train, word2idx_train = build_vocab('/kaggle/input/wikitext/wikitext-103/wiki.train.tokens')

In [63]:
word2idx_valid = word2idx_train
idx2word_valid = idx2word_train

In [64]:
input_sequences_train = []
max_sentences = 35000
count = 0
with open('/kaggle/input/wikitext/wikitext-103/wiki.train.tokens', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()
        if len(tokens) < 2:
            continue

        tokenized_sentences = [word2idx_train.get(tok, word2idx_train["<unk>"]) for tok in tokens]


        for i in range(1,len(tokenized_sentences)):
            input_sequences_train.append(tokenized_sentences[:i+1])

        count += 1
        if count >= max_sentences:
            break

In [65]:
input_sequences_valid = []
count = 0
with open('/kaggle/input/wikitext/wikitext-103/wiki.valid.tokens', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()
        if len(tokens) < 2:
            continue

        tokenized_sentences = [word2idx_valid.get(tok, word2idx_valid["<unk>"]) for tok in tokens]


        for i in range(1,len(tokenized_sentences)):
            input_sequences_valid.append(tokenized_sentences[:i+1])

In [66]:
# max_len_train = max(len(sentence) for sentence in input_sequences_train)
# print(max_len_train)

In [67]:
from keras.preprocessing.sequence import pad_sequences
input_sequences_train = pad_sequences(input_sequences_train, maxlen=50, padding='pre')
input_sequences_valid = pad_sequences(input_sequences_valid, maxlen=50, padding='pre')

In [68]:
X = input_sequences_train[:, :-1]
y = input_sequences_train[:, -1]

print(X.shape, y.shape)

(2985648, 49) (2985648,)


In [69]:
X_valid = input_sequences_valid[:, :-1]
y_valid = input_sequences_valid[:, -1]
print(X_valid.shape, y_valid.shape)

(211425, 49) (211425,)


In [70]:
len(word2idx_train)

19999

In [71]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor = 'val_loss',
    patience = 3,
    restore_best_weights= True
)

In [72]:
model = Sequential()

model.add(Embedding(input_dim=len(word2idx_train)+1, output_dim=256))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2)) 
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(len(word2idx_train)+1, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X, y, verbose=1,validation_data=(X_valid, y_valid), epochs=10, batch_size=128, callbacks=[early_stopping])

Epoch 1/10
[1m23326/23326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m955s[0m 41ms/step - accuracy: 0.1279 - loss: 6.3046 - val_accuracy: 0.2182 - val_loss: 5.3855
Epoch 2/10
[1m23326/23326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m949s[0m 41ms/step - accuracy: 0.2161 - loss: 5.2319 - val_accuracy: 0.2339 - val_loss: 5.1206
Epoch 3/10
[1m23326/23326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m951s[0m 41ms/step - accuracy: 0.2320 - loss: 4.9734 - val_accuracy: 0.2430 - val_loss: 4.9881
Epoch 4/10
[1m23326/23326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m953s[0m 41ms/step - accuracy: 0.2407 - loss: 4.8356 - val_accuracy: 0.2489 - val_loss: 4.9044
Epoch 5/10
[1m23326/23326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m954s[0m 41ms/step - accuracy: 0.2478 - loss: 4.7443 - val_accuracy: 0.2518 - val_loss: 4.8524
Epoch 6/10
[1m23326/23326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m950s[0m 41ms/step - accuracy: 0.2528 - loss: 4.6787 - val_accuracy: 0.2550 - val

In [80]:
model.summary()

In [81]:
idx2word_test = idx2word_train
word2idx_test = word2idx_train

In [82]:
input_sequences_test = []
with open('/kaggle/input/wikitext/wikitext-103/wiki.test.tokens', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()
        if len(tokens) < 2:
            continue

        tokenized_sentences = [word2idx_test.get(tok, word2idx_test["<unk>"]) for tok in tokens]


        for i in range(1,len(tokenized_sentences)):
            input_sequences_test.append(tokenized_sentences[:i+1])

In [84]:
input_sequences_test = pad_sequences(input_sequences_test, maxlen=50, padding='pre')

In [85]:
X_test = input_sequences_test[:, :-1]
y_test = input_sequences_test[:, -1]

print(X_test.shape, y_test.shape)

(238320, 49) (238320,)


In [86]:
loss, acc = model.evaluate(X_test, y_test, batch_size=128)

[1m1862/1862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 18ms/step - accuracy: 0.2657 - loss: 4.6642


In [87]:
perplexity = tf.exp(loss).numpy()
print(f"Test Perplexity: {perplexity:.2f}")

Test Perplexity: 114.46
