In [5]:
# !pip install 'tensorflow[and-cuda]' 
# !pip install datasets nltk 

In [6]:
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.layers import LSTM , Input , Embedding , Dropout , Dense
from tensorflow.keras import Model
import re
import logging
import numpy as np

In [7]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip
# nltk.download('stopwords')
# stopwords = nltk.corpus.stopwords.words('english')

In [8]:
ds = load_dataset("Trelis/tiny-shakespeare")
maxInDs = 0
for x in ds['train']:
  sentence_array = re.findall(r"\b\w+(?:'\w+)?\b|[.!?,-]", x['Text'].lower())
  maxInDs = max(len(sentence_array),maxInDs)
maxInDs

739

In [9]:
# Making my own dicrtionary from the dataset
newVocab = []

def create_new_vocab(sentence,newVocab):
  sentence_array = re.findall(r"\b\w+(?:'\w+)?\b|[.!?,-]", sentence.lower())
  new_words = [x for x in sentence_array if x not in newVocab]
  new_words_set = set(new_words)
  new_words = list(new_words_set)
  newVocab += new_words

ds['train'].map(lambda x : create_new_vocab(x["Text"],newVocab))
ds['test'].map(lambda x : create_new_vocab(x["Text"],newVocab))
newVocab.insert(0,"<UNK>")
newVocab.insert(0,"<PAD>")

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [None]:
vocab_size = len(newVocab)
index_to_word = {i:k for i,k in enumerate(newVocab)}
word_to_index = {k:i for i,k in enumerate(newVocab)}

In [11]:
def preprocessing_layer(sentence):
  sentence_array = re.findall(r"\b\w+(?:'\w+)?\b|[.!?,-]", sentence.lower())
  # print(sentence_array)
  wor_to_in = [word_to_index[x] for x in sentence_array]
  # print(wor_to_in)
  pad_len = maxInDs - len(wor_to_in)
  zero_array = [0]*pad_len
  # print(len(wor_to_in))
  padded_wor_to_in = wor_to_in+zero_array
  # print(len(padded_wor_to_in))
  if(len(padded_wor_to_in) > maxInDs):
    logging.log("BRUHHHHH")
    return "STOP"
  labels = padded_wor_to_in[1:] + [0]
  return {"features":padded_wor_to_in,"labels":labels}

processed_ds_train = ds['train'].map(lambda x : preprocessing_layer(x["Text"]))
processed_ds_test = ds['test'].map(lambda x : preprocessing_layer(x["Text"]))

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [12]:
tf_ds_train = processed_ds_train.to_tf_dataset(
    columns = ['features'],
    label_cols=['labels'],
    batch_size = 10,
    shuffle = True
)
tf_ds_test = processed_ds_test.to_tf_dataset(
    columns = ['features'],
    label_cols=['labels'],
    batch_size = 10,
    shuffle = True
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
I0000 00:00:1757099906.433476   48600 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1171 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [13]:
def embedding_layer_generator():
  embedding_layer = Embedding(
      input_dim = vocab_size,
      output_dim = 300,
      mask_zero = True
  )
  return embedding_layer
emb_layer = embedding_layer_generator()

In [14]:
# import numpy as np

# embeddings_index = {}
# with open("glove.6B.200d.txt", encoding="utf8") as f:  # pick 50d/100d/200d/300d
#     for line in f:
#         values = line.split()
#         word = values[0]
#         vector = np.asarray(values[1:], dtype="float32")
#         embeddings_index[word] = vector

# print("Loaded %s word vectors." % len(embeddings_index))

In [15]:
# for x,y in embeddings_index.items():
#   print(x)
#   break

In [16]:
# lastIndexEmb = len(list(embeddings_index.keys()))
# lastIndexEmb

In [17]:
# embDim = len(embeddings_index['random'])
# embDim

In [18]:
# # Reserve 0 for <PAD>
# word_to_index = {"<PAD>": 0, "<UNK>": 1}

# for i, word in enumerate(embeddings_index.keys(), start=2):
#     word_to_index[word] = i

# word_to_index['the']

In [19]:
# vocab_size = len(word_to_index)
# embedding_matrix = np.zeros((vocab_size, embDim))

# for word, idx in word_to_index.items():
#     if word == "<PAD>":
#         embedding_matrix[idx] = np.zeros(embDim)
#     elif word == "<UNK>":
#         embedding_matrix[idx] = np.random.uniform(-0.05, 0.05, embDim)
#     else:
#         embedding_matrix[idx] = embeddings_index.get(word, np.random.uniform(-0.05, 0.05, embDim))


In [20]:
# This is the embedding layer with glove

# def embedding_layer():
#   emb_layer = Embedding(
#       input_dim=vocab_size,
#       output_dim=embDim,
#       weights=[embedding_matrix],
#       trainable=False,
#       mask_zero=True
#   )

#   return emb_layer
# emb_layer = embedding_layer()

In [21]:
# Preprocessing layer for glove

# def preprocessing_layer(sentence):
#     split = re.findall(r"\b\w+\b", sentence.lower())
#     sen_to_in = [word_to_index.get(x, word_to_index["<UNK>"]) for x in split]

#     # --- Features ---
#     features = sen_to_in + [0] * (maxInDs - len(sen_to_in))
#     features = features[:maxInDs]

#     # --- Labels (shifted raw sequence, not padded features) ---
#     shifted = sen_to_in[1:] + [0]   # one step ahead
#     labels = shifted + [0] * (maxInDs - len(shifted))
#     labels = labels[:maxInDs]

#     return {"features": features, "labels": labels}


In [22]:
# ds['train'] = ds['train'].map(lambda x : preprocessing_layer(x['Text']))

In [23]:
# for x in ds['train']:
#   print(x)
#   break

In [24]:
# tf_ds = ds["train"].to_tf_dataset(
#     columns=["features"],
#     label_cols=["labels"],
#     shuffle=True,
#     batch_size=1
# )


In [25]:
# bad_rows = []
# for i, x in enumerate(ds["train"]):
#     if len(x["features"]) != maxInDs or len(x["labels"]) != maxInDs:
#         bad_rows.append((i, len(x["features"]), len(x["labels"])))

# if bad_rows:
#     print("❌ Found mismatched rows:")
#     for row in bad_rows[:10]:  # sirf first 10 print karo
#         print(f"Index={row[0]}, features={row[1]}, labels={row[2]}, expected={maxInDs}")
# else:
#     print("✅ All rows match expected length =", maxInDs)


In [26]:
def shakesphereModel():
  inputs = tf.keras.layers.Input(shape = (maxInDs,))
  X = emb_layer(inputs)
  # LSTM Block 1
  X , _ , _ = LSTM(units = 256,activation = 'tanh' , return_sequences=True , return_state = True)(X)
  X = Dropout(0.4)(X)
  X , _ , _ = LSTM(units = 512,activation = 'tanh' , return_sequences=True ,return_state = True)(X)
  X = Dropout(0.4)(X)
  # X = LSTM(units = 256,activation = 'tanh' , return_sequences=True)(X)
  # X = Dropout(0.4)(X)
  X = Dense(units = vocab_size , activation = 'softmax')(X)
  outputs = X
  model = Model(inputs = inputs , outputs = outputs)
  return model
model = shakesphereModel()

In [28]:
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),optimizer = "adam" , metrics = [tf.keras.metrics.SparseCategoricalCrossentropy()])
model.fit(tf_ds_train,epochs=40)

Epoch 1/40


2025-09-06 00:49:41.569339: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:501] Allocator (GPU_0_bfc) ran out of memory trying to allocate 348.77MiB (rounded to 365716480)requested by op StatefulPartitionedCall/compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2025-09-06 00:49:41.569386: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1049] BFCAllocator dump for GPU_0_bfc
2025-09-06 00:49:41.569403: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1056] Bin (256): 	Total Chunks: 76, Chunks in use: 76. 19.0KiB allocated for chunks. 19.0KiB in use in bin. 465B client-requested in use in bin.
2025-09-06 00:49:41.569412: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1056] Bin (512): 	Tota

ResourceExhaustedError: Graph execution error:

Detected at node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 211, in start

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 519, in dispatch_queue

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 508, in process_one

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 400, in dispatch_shell

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 368, in execute_request

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 455, in do_execute

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 577, in run_cell

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3116, in run_cell

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3171, in _run_cell

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3394, in run_cell_async

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3639, in run_ast_nodes

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3699, in run_code

  File "/tmp/ipykernel_48600/627538764.py", line 2, in <module>

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 377, in fit

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 220, in function

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 133, in multi_step_on_iterator

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 114, in one_step_on_data

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 61, in train_step

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/trainers/trainer.py", line 383, in _compute_loss

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/trainers/trainer.py", line 351, in compute_loss

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/trainers/compile_utils.py", line 690, in __call__

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/trainers/compile_utils.py", line 699, in call

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/losses/loss.py", line 67, in __call__

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/losses/losses.py", line 33, in call

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/losses/losses.py", line 2330, in sparse_categorical_crossentropy

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/ops/nn.py", line 2008, in sparse_categorical_crossentropy

  File "/home/garvthakral/miniconda3/envs/shakespere/lib/python3.11/site-packages/keras/src/backend/tensorflow/nn.py", line 753, in sparse_categorical_crossentropy

OOM when allocating tensor with shape[7390,12372] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_multi_step_on_iterator_6198]

In [None]:
import numpy as np
import tensorflow as tf

# Generated array init
generated_array = []

# States Init
a0 = c0 = np.zeros((1,256))
a1 = c1 = np.zeros((1,512))

# Pulling trained layers from the model
embedding_layer = model.layers[1]
lstm_layer_1 = model.layers[2]
lstm_layer_2 = model.layers[4]
dense_layer_1 = model.layers[6]

# Start tokens
inputs = tf.constant([[12]], dtype=tf.int32)  # start with first token

temperature = 0.7  # controls randomness (higher = more diverse)
top_k = 10         # optional: only consider top-k tokens

for i in range(40):
    # Embedding
    X = embedding_layer(inputs)

    # LSTM layers
    X, a0, c0 = lstm_layer_1(X, initial_state=[a0, c0])
    X, a1, c1 = lstm_layer_2(X, initial_state=[a1, c1])

    # Dense
    X = dense_layer_1(X)

    # Convert tensor to numpy
    logits = X.numpy()  # shape: (batch, seq_len, vocab_size)
    
    # Apply temperature and numerical stability
    scaled_logits = logits / temperature
    scaled_logits -= np.max(scaled_logits, axis=-1, keepdims=True)
    probs = np.exp(scaled_logits)
    probs /= probs.sum(axis=-1, keepdims=True)

    # Top-k sampling
    top_k_indices = np.argsort(probs[0, -1, :])[-top_k:]
    top_k_probs = probs[0, -1, :][top_k_indices]
    top_k_probs /= top_k_probs.sum()

    # Sample next token
    next_token = np.random.choice(top_k_indices, p=top_k_probs)

    # Append to generated array
    generated_array.append(next_token)

    # Prepare input for next step (last token only)
    inputs = tf.constant([[next_token]], dtype=tf.int32)

generated_array


[np.int64(131),
 np.int64(50),
 np.int64(84),
 np.int64(253),
 np.int64(1129),
 np.int64(253),
 np.int64(1050),
 np.int64(55),
 np.int64(103),
 np.int64(249),
 np.int64(253),
 np.int64(350),
 np.int64(586),
 np.int64(196),
 np.int64(53),
 np.int64(297),
 np.int64(120),
 np.int64(257),
 np.int64(179),
 np.int64(120),
 np.int64(407),
 np.int64(9935),
 np.int64(103),
 np.int64(4068),
 np.int64(50),
 np.int64(77),
 np.int64(200),
 np.int64(18),
 np.int64(251),
 np.int64(760),
 np.int64(75),
 np.int64(151),
 np.int64(583),
 np.int64(179),
 np.int64(18),
 np.int64(30),
 np.int64(27),
 np.int64(175),
 np.int64(75),
 np.int64(70)]

In [None]:
generated_string = [index_to_word[x] for x in generated_array]
" ".join(generated_string)

'them . what , ho , fie of the people , or else we have heard you know that you should arraign the tower . i think he hath been in a world that he shall not stay in his'

In [None]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1
