NLP Project: Next Token Prediction
I used a dataset from Kaggle.

In [None]:
# Data loading: upload the Kaggle archive, extract it, and read the raw text
import io
import zipfile

from google.colab import files

# files.upload() returns a mapping {uploaded filename: file contents as bytes}.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; please upload archive.zip")

# Colab saves a re-uploaded file under a new name (e.g. "archive (6).zip"),
# so opening the hard-coded "archive.zip" would silently read a stale copy.
# Extract straight from the bytes that were just uploaded instead.
zip_file, archive_bytes = next(iter(uploaded.items()))

with zipfile.ZipFile(io.BytesIO(archive_bytes), 'r') as zip_ref:
    zip_ref.extractall("dataset")

# open and read the file
with open("dataset/Sherlock Holmes.txt", "r", encoding="utf-8") as f:
  text = f.read()

Saving archive.zip to archive (6).zip


**Dataset Preprocessing**
We used the text of *The Adventures of Sherlock Holmes*. The data was cleaned by lowercasing the text and removing punctuation, digits, chapter headings, and extra whitespace.


In [None]:
# preprocessing and data cleaning
import string
import re
import numpy as np

punc = string.punctuation
def remove_punc(text):
  return text.translate(str.maketrans('', '', punc))

def preprocess_text(text):
  text = text.lower()                            # lowercas
  text = remove_punc(text)                       # remove punctions
  text = re.sub(r'\d+', '', text)                # remove digits
  text = re.sub(r'\s+', ' ', text).strip()       # remove extra spaces
  text = re.sub(r'\bchapter\b\s+\w+', '',text, flags=re.IGNORECASE)
  # text contains chapter titles like "CHAPTER III", which are irrelevant
  return text

In [None]:
# Clean the raw corpus and persist the result to disk
cleaned_text = preprocess_text(text)

with open("cleaned_data.txt", mode="w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(cleaned_text)

# Preview the first 500 characters, then report the total character count
print(cleaned_text[:500], len(cleaned_text), sep="\n")

the adventures of sherlock holmes arthur conan doyle table of contents a scandal in bohemia the redheaded league a case of identity the boscombe valley mystery the five orange pips the man with the twisted lip the adventure of the blue carbuncle the adventure of the speckled band the adventure of the engineers thumb the adventure of the noble bachelor the adventure of the beryl coronet the adventure of the copper beeches a scandal in bohemia table of contents   i to sherlock holmes she is always
536327


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
import numpy as np

Word-level tokenization + LSTM

In [None]:
# Word-level tokenization: fit the vocabulary on the cleaned corpus and
# convert the corpus into one long list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])
sequences = tokenizer.texts_to_sequences([cleaned_text])
vocab_size = 1 + len(tokenizer.word_index)

# Sliding n-gram windows: each run of `n` consecutive ids is an input and the
# id that immediately follows it is the target.
n = 5
token_ids = sequences[0]
input_sequences = [token_ids[end - n:end] for end in range(n, len(token_ids))]
output_words = token_ids[n:]

# Windows already share one length; targets become one-hot rows over the vocabulary
X = pad_sequences(input_sequences)
y = to_categorical(output_words, num_classes=vocab_size)

LSTM training to predict next token

In [None]:
# Word-level model: embedding -> bidirectional LSTM -> softmax over the vocabulary
model = Sequential()
# `input_length` is deprecated in Keras 3 (it is ignored and triggers a warning);
# the sequence length is taken from the shape of X when the model is first fit.
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later,
# instead of letting its repr leak out as the cell's output.
history_word = model.fit(X, y, epochs=20, verbose=1)




Epoch 1/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 32ms/step - accuracy: 0.0616 - loss: 6.6020
Epoch 2/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 34ms/step - accuracy: 0.1275 - loss: 5.4647
Epoch 3/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 32ms/step - accuracy: 0.1558 - loss: 4.9359
Epoch 4/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 33ms/step - accuracy: 0.1807 - loss: 4.4469
Epoch 5/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 33ms/step - accuracy: 0.2260 - loss: 3.9188
Epoch 6/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 35ms/step - accuracy: 0.2952 - loss: 3.3800
Epoch 7/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 34ms/step - accuracy: 0.3690 - loss: 2.9091
Epoch 8/20
[1m3265/3265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 41ms/step - accuracy: 0.4475 - loss: 2.4803


<keras.src.callbacks.history.History at 0x787869920610>

**Subword Tokenization using BPE + LSTM**

In [None]:
# Keep a dedicated reference to the word-level model and write it to disk
model_word = model
model_word.save(filepath="model_word.h5")



LSTM training with BPE tokenization

In [None]:
!pip install sentencepiece



In [None]:
# Reload the raw, uncleaned book text into `text`
with open("dataset/Sherlock Holmes.txt", mode="r", encoding="utf-8") as raw_file:
    text = raw_file.read()

In [None]:
# Build the training set for the subword (BPE) model.
# NOTE(review): `sp` is not defined in any visible cell -- presumably a trained
# SentencePiece processor created in a cell that is missing here; confirm it
# exists before running this on a fresh kernel.
bpe_ids = sp.encode(cleaned_text, out_type=int)[:100000]  # keep only the first 100000 ids
n = 5
vocab_size = sp.get_piece_size()

# Sliding windows: `n` consecutive ids are the input, the following id is the target
X = np.array([bpe_ids[end - n:end] for end in range(n, len(bpe_ids))])
y = to_categorical(bpe_ids[n:], num_classes=vocab_size)



In [None]:
# Subword (BPE) model: embedding -> bidirectional LSTM -> softmax over the piece vocabulary
model = Sequential()
# `input_length` is deprecated in Keras 3 (it is ignored and triggers a warning);
# the sequence length is taken from the shape of X when the model is first fit.
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later,
# instead of letting its repr leak out as the cell's output.
history_bpe = model.fit(X, y, epochs=20, batch_size=128, verbose=1)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 40ms/step - accuracy: 0.0582 - loss: 6.5703
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - accuracy: 0.1063 - loss: 5.5895
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.1306 - loss: 5.1805
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 38ms/step - accuracy: 0.1479 - loss: 4.9272
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 39ms/step - accuracy: 0.1668 - loss: 4.6718
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.1865 - loss: 4.4633
Epoch 7/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 38ms/step - accuracy: 0.2022 - loss: 4.2489
Epoch 8/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.2226 - loss: 4.0537
Epoch 9/20
[1m782/782[

<keras.src.callbacks.history.History at 0x7c2f7094c5d0>

In [None]:
# Keep a dedicated reference to the BPE model and write it to disk
model_bpe = model
model_bpe.save(filepath="model_bpe.h5")



In [None]:
# test word model

def predict_next_word(input_text, tokenizer, model, seq_length=5):
    """Predict the word most likely to follow `input_text`.

    Args:
        input_text: prompt string.
        tokenizer: fitted tokenizer providing `texts_to_sequences` and `index_word`.
        model: trained model whose `predict` returns one score per vocabulary id.
        seq_length: number of trailing tokens fed to the model.

    Returns:
        The predicted word, or "<unk>" when the prompt contains no known
        tokens or the predicted id has no entry in `tokenizer.index_word`.
    """
    tokens = tokenizer.texts_to_sequences([input_text])[0][-seq_length:]

    # With no known tokens the model would only see padding and its output
    # would be meaningless, so report "unknown" instead of guessing.
    if not tokens:
        return "<unk>"

    # Uses the notebook-level `pad_sequences` import; short prompts are padded to seq_length.
    padded = pad_sequences([tokens], maxlen=seq_length)
    pred_index = int(model.predict(padded, verbose=0).argmax(axis=-1)[0])

    return tokenizer.index_word.get(pred_index, "<unk>")

predict_next_word("the adventure of", tokenizer=tokenizer, model=model_word)

'bottom'

In [None]:
# test BPE model

def predict_next_bpe(input_text, sp, model, seq_length=5):
    """Predict the subword piece most likely to follow `input_text`.

    Args:
        input_text: prompt string.
        sp: subword tokenizer providing `encode(text, out_type=int)` and
            `id_to_piece` (presumably a SentencePiece processor -- confirm).
        model: trained model whose `predict` returns one score per piece id.
        seq_length: number of trailing piece ids fed to the model.

    Returns:
        The predicted piece as a string, or "<unk>" when the prompt encodes
        to no ids at all.
    """
    # NOTE(review): the model was trained on ids from `cleaned_text`, but the
    # prompt is encoded as-is here -- confirm whether it should be cleaned first.
    tokens = sp.encode(input_text, out_type=int)[-seq_length:]

    # With no ids the model would only see padding and its output would be
    # meaningless, so report "unknown" instead of guessing.
    if not tokens:
        return "<unk>"

    # Uses the notebook-level `pad_sequences` import; short prompts are padded to seq_length.
    padded = pad_sequences([tokens], maxlen=seq_length)
    pred_index = int(model.predict(padded, verbose=0).argmax(axis=-1)[0])

    return sp.id_to_piece(pred_index)

predict_next_bpe("the adventure of", sp=sp, model=model_bpe)

'▁the'