In [2]:
# One-time setup: uncomment the next line in an environment where the
# `opendatasets` package (imported in the following cell) is not installed yet.
# !pip install opendatasets

In [3]:
import opendatasets as od

In [4]:
# Fetch the Kaggle next-word-predictor corpus into the local data folder.
dataset_url = 'https://www.kaggle.com/datasets/ashishpandey2062/next-word-predictor-text-generator-dataset'
download_dir = '/content/sample_data'
od.download_kaggle_dataset(dataset_url, data_dir=download_dir)

Dataset URL: https://www.kaggle.com/datasets/ashishpandey2062/next-word-predictor-text-generator-dataset
Downloading next-word-predictor-text-generator-dataset.zip to /content/sample_data/next-word-predictor-text-generator-dataset


100%|██████████| 61.5k/61.5k [00:00<00:00, 111MB/s]







## Corpus creation: load the raw text

In [5]:
# Optional sanity check (disabled): list the files in the download directory,
# or report that the directory is missing. Uncomment to run.
# import os
# download_path = '/content/sample_data/next-word-predictor-text-generator-dataset'
# if os.path.exists(download_path):
#     print(f"Contents of {download_path}:")
#     for item in os.listdir(download_path):
#         print(item)
# else:
#     print(f"Directory not found: {download_path}")

In [6]:
# Path of the text file inside the downloaded dataset folder.
file_path = '/content/sample_data/next-word-predictor-text-generator-dataset/next_word_predictor.txt'

# Load the whole corpus into memory as one string.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Preview the beginning of the corpus and report its total size.
print("First 500 characters:")
print(text[:500])
print("\nTotal size of the text:", len(text), "characters")

First 500 characters:
The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. Children were playing games, and laughter filled the air.

As the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. Families gathered for picnics, and the smell of barbecues wafted through the air.

Total size of the text: 167445 characters


## Preprocess the data

Clean and prepare the text data for model training, including tokenization and creating sequences.

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Learn the word -> integer index vocabulary from the corpus, which is passed
# to the tokenizer as a single document.
corpus_documents = [text]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_documents)

In [8]:
# Number of distinct words the tokenizer has indexed.
distinct_word_count = len(tokenizer.word_index)
distinct_word_count

4993

In [9]:
# Each non-blank line of the corpus becomes one sequence of word indices.
non_blank_lines = [line for line in text.split('\n') if line.strip()]
tokenized_sentences = tokenizer.texts_to_sequences(non_blank_lines)

# Report the length (in tokens) of the longest line.
longest_line_tokens = max(map(len, tokenized_sentences))
print(f"Maximum length: {longest_line_tokens}")

Maximum length: 325


In [10]:
# Expand every tokenized line into all of its prefixes of length >= 2.
# In each prefix the last token is the word to predict and the tokens before
# it are the context.
input_sequences = [
    tokens[:prefix_len]
    for tokens in tokenized_sentences
    for prefix_len in range(2, len(tokens) + 1)
]

In [11]:
# Length of the longest prefix; all sequences are padded to this length later.
max_len = max(map(len, input_sequences))
print(f"Maximum sequence length: {max_len}")

Maximum sequence length: 325


In [12]:
# Left-pad every prefix with zeros to a common length so the last column
# always holds the word to predict.
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)
padded_input_sequences

array([[   0,    0,    0, ...,    0,    1,  155],
       [   0,    0,    0, ...,    1,  155,   21],
       [   0,    0,    0, ...,  155,   21, 2368],
       ...,
       [   0,    0,    0, ..., 2331,  290,   19],
       [   0,    0,    0, ...,  290,   19,   54],
       [   0,    0,    0, ...,   19,   54, 1535]], dtype=int32)

In [13]:
# (number of training sequences, padded sequence length)
sequence_count, padded_length = padded_input_sequences.shape
(sequence_count, padded_length)

(26383, 325)

In [14]:
# Features are every column except the last; the label is the last column
# (the index of the word that follows the context).
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]

In [15]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the target word indices for use with categorical cross-entropy.
# The labels are taken from the last column of `padded_input_sequences` rather
# than from the current `y`, so re-running this cell does not try to one-hot
# encode an already one-hot matrix.
# The +1 accounts for index 0, which is used for padding and never assigned
# to a word by the tokenizer.
num_classes = len(tokenizer.word_index) + 1
y = to_categorical(padded_input_sequences[:, -1], num_classes=num_classes)
y.shape, X.shape

((26383, 4994), (26383, 324))

## Model

In [17]:
## Now the Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [39]:
# Next-word model: embedding -> single LSTM layer -> softmax over the vocabulary.
# The input shape is declared with an explicit Input object instead of the
# deprecated `input_length` argument of Embedding, so the model is built
# immediately and `model.summary()` can report layer shapes and parameters.
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
model = Sequential([
    tf.keras.Input(shape=(max_len - 1,)),  # context length = padded length minus the label column
    Embedding(input_dim=vocab_size, output_dim=150),
    LSTM(150),
    Dense(vocab_size, activation='softmax'),
])



In [41]:
# Configure training: Adam optimizer, categorical cross-entropy on the one-hot
# targets, and accuracy as the reported metric; then print the architecture.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [42]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sequences for validation; the seed is fixed so the
# split is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [43]:
# Train for 20 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/20
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - accuracy: 0.0512 - loss: 7.3182 - val_accuracy: 0.0606 - val_loss: 6.8793
Epoch 2/20
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19ms/step - accuracy: 0.0718 - loss: 6.4360 - val_accuracy: 0.0758 - val_loss: 6.7521
Epoch 3/20
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - accuracy: 0.0936 - loss: 5.9511 - val_accuracy: 0.0881 - val_loss: 6.7104
Epoch 4/20
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - accuracy: 0.1130 - loss: 5.5442 - val_accuracy: 0.0995 - val_loss: 6.7038
Epoch 5/20
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.1331 - loss: 5.1693 - val_accuracy: 0.1110 - val_loss: 6.7347
Epoch 6/20
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - accuracy: 0.1595 - loss: 4.7704 - val_accuracy: 0.1133 - val_loss: 6.7972
Epoch 7/20
[1m6

<keras.src.callbacks.history.History at 0x7f79d794fed0>

In [44]:
# Single-step test: predict the word that follows a one-word prompt.
# The prompt lives in its own variable so the corpus held in `text` is not
# overwritten, which keeps the earlier preprocessing cells re-runnable.
seed_text = 'temperature'
# Tokenize the prompt into word indices.
token_text = tokenizer.texts_to_sequences([seed_text])[0]
# Pad to the model's input length (padded length minus the label column).
padded_text = pad_sequences([token_text], maxlen=max_len - 1, padding='pre')
# The predicted word index is the position of the highest softmax probability.
y_pred = np.argmax(model.predict(padded_text))
y_pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step


np.int64(21)

In [56]:
# Translate the predicted index back into a word using the tokenizer's
# index -> word mapping instead of scanning `word_index` item by item.
predicted_word = tokenizer.index_word.get(int(y_pred))
if predicted_word is not None:
    print(f'The predicted word is: {predicted_word}')
else:
    # Index 0 is the padding value and has no vocabulary entry.
    print(f'No vocabulary word is mapped to index {int(y_pred)}')

The predicted word is: where


In [61]:
import time
# Greedy text generation: starting from a one-word prompt, repeatedly predict
# the most likely next word and append it, for at most 10 words.
# The running sentence is kept in `generated_text` so the corpus held in
# `text` is not overwritten.
generated_text = 'blue'
for _ in range(10):
    # Tokenize the sentence generated so far.
    token_text = tokenizer.texts_to_sequences([generated_text])[0]
    # Pad (or left-truncate) to the model's input length.
    padded_text = pad_sequences([token_text], maxlen=max_len - 1, padding='pre')
    # Pick the index with the highest predicted probability.
    y_pred = np.argmax(model.predict(padded_text))
    # Direct index -> word lookup; None means the padding index was predicted.
    next_word = tokenizer.index_word.get(int(y_pred))
    if next_word is None:
        # The sentence would not change, so every further step would repeat
        # the same prediction; stop early.
        break
    generated_text = generated_text + " " + next_word  # Append the predicted word
    print(f'The next sentence is: {generated_text}')
    time.sleep(1)  # pause so the sentence can be watched growing word by word
print("\nGenerated text:", generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
The next sentence is: blue spirit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
The next sentence is: blue spirit of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
The next sentence is: blue spirit of innovation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
The next sentence is: blue spirit of innovation resonates
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
The next sentence is: blue spirit of innovation resonates its
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
The next sentence is: blue spirit of innovation resonates its challenges
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
The next sentence is: blue spirit of innovation resonates its challenges for
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
The next sentence is: blue spirit o