## Import Tensorflow and Check Version

In [1]:
import tensorflow as tf
print(tf.__version__)


2.17.0


## Import Necessary Libraries

In [2]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests

The code imports necessary libraries:
* numpy for numerical operations.
* Tokenizer and pad_sequences from tensorflow.keras.preprocessing.text to tokenize and pad sequences of text.
* requests for fetching data from a URL.

## Fetch Shakespeare Data

In [3]:
response = requests.get('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
shakespeare_data = response.text

The code requests to get a dataset of Shakespeare's text from the provided URL.

## Store the Fetched Data

In [4]:
input_text = shakespeare_data

The code assigns the fetched data in input_text

## Tokenization

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([input_text])

A tokenizer object is created, and it is trained on the Shakespearean text. It converts words into a numeric representation.

## Calculate Number of Words

In [6]:
num_words = len(tokenizer.word_index) + 1

num_words is calculated based on the number of unique words in the text, plus 1 for padding.

## Create Input Sequence

In [7]:
input_sequences = tokenizer.texts_to_sequences([input_text])
input_seq_length = 10
step = 1

Input Sequences: The text is tokenized into a list of integers using texts_to_sequences.

In [8]:
X = []
y = []
for i in range(0, len(input_sequences[0])-input_seq_length, step):
  input_seq = input_sequences[0][i:i+input_seq_length]
  output_seq = input_sequences[0][i+input_seq_length]
  X.append(input_seq)
  y.append(output_seq)

* Sliding Window: A sliding window of length 10 (input_seq_length) moves over the tokenized text with step size 1 (step), generating overlapping sequences. These sequences represent the training data (X).
* Next Word Prediction: For each input sequence, the next word in the text (y) is used as the output label.

In [9]:
X = np.array(X)
y = np.array(y)
y = to_categorical(y, num_classes=num_words)

* The training data X and output label y are converted to numpy arrays.
* The labels y are then one-hot encoded using to_categorical, converting the output into a vector format.

## Define the Model

In [10]:
model = Sequential([
    Embedding(input_dim=num_words,output_dim=100,input_length=input_seq_length),
    LSTM(128),
    Dense(num_words, activation='softmax')
])



* Embedding Layer: This converts each word in the input sequence into a 100-dimensional vector representation. The input_dim is num_words (the vocabulary size), and the input_length is input_seq_length.
* LSTM Layer: A layer with 128 hidden units to process the sequential data and capture long-term dependencies in the text.
* Dense Layer: A dense output layer with num_words neurons and a softmax activation function, which outputs the probability distribution over the vocabulary for the next word.

## Compile the Model

In [11]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

The model is compiled with the categorical crossentropy loss function and the Adam optimizer.

## Print the Training Data and Output Label Shapes

In [12]:
print(X.shape, y.shape)

(204079, 10) (204079, 12633)


The shapes of X and y are printed for observation.

## Train the Model

In [13]:
epochs = 10
batch_size = 4096
model.fit(X, y, epochs=epochs, batch_size=batch_size)

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 555ms/step - loss: 8.8226
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 564ms/step - loss: 6.8037
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 496ms/step - loss: 6.7886
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 555ms/step - loss: 6.7882
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 484ms/step - loss: 6.7687
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 504ms/step - loss: 6.7165
Epoch 7/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 503ms/step - loss: 6.6499
Epoch 8/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 541ms/step - loss: 6.5891
Epoch 9/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 531ms/step - loss: 6.5316
Epoch 10/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 557ms

<keras.src.callbacks.history.History at 0x1ab8999ff40>

The model is trained on the input data X and corresponding output y for 10 epochs with a batch size of 4096. The goal is for the model to predict the next word given a sequence of 10 words.

## Text Generation

In [14]:
def generate_text(model, seed_text, num_words_to_generate=50):
  generated_text = seed_text
  for _ in range(num_words_to_generate):
      input_seq = tokenizer.texts_to_sequences([generated_text])[0]
      input_seq = pad_sequences([input_seq], maxlen=input_seq_length, padding='pre')

      preds = model.predict(input_seq)[0]
      next_word_idx = np.argmax(preds)
      next_word = tokenizer.index_word[next_word_idx]
      generated_text += " " + next_word
  return generated_text

The generate_text function generates new text based on a seed text.
* The seed text is tokenized and padded to match the input sequence length.
* The trained model predicts the probability distribution of the next word.
* The most probable next word is chosen (np.argmax(preds)), and it is added to the generated text.
* The process repeats until num_words_to_generate new words have been generated.

## Run Text Generation:

In [16]:
seed_text = "To be or not to be"
generated_text = generate_text(model, seed_text, num_words_to_generate=50)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 526us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

* A seed text is provided ("To be or not to be"), and the model generates 50 more words based on this seed.
* The generated text is printed.

## Summary
This code builds a word-based LSTM language model that learns from Shakespeare’s text and can generate new text. After tokenizing the data, it creates input-output sequences and trains the model. The model can then generate text given a seed, by predicting the next word in the sequence.

## Limitations of the Model
* Limited context window leading to more superficial text generation
* Word-level tokenization which can be limiting when dealing with rare or unseen words
* Comparatively smaller and simple than existing transformer-based pre-trained models

## Ways to Improve
* Using a large dataset with a variety of contexts
* Designing complex model architecture
* Using transformer-based models