## Import Libraries

In [1]:
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Activation, Dense, LSTM
import tensorflow_datasets as tfds

2024-11-20 20:20:03.205259: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-20 20:20:03.542925: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-20 20:20:03.813380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732134004.131286    5644 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732134004.220277    5644 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-20 20:20:04.665662: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

## Acquire Training Data

In [2]:
filepath = tf.keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(filepath, 'rb').read().decode(encoding='utf-8').lower()
text = text[0:len(text)//2] # only use half of the dataset for speed

In [87]:
ds = tfds.load('imdb_reviews', split='train', shuffle_files=True)
ds = ds.take(250)  # Only take a single example

text = ""
for review in ds:
    text += review['text'].numpy().decode(encoding='utf-8').lower()
  
print(type(text), len(text), text)



2024-11-20 21:51:19.646369: W tensorflow/core/kernels/data/cache_dataset_ops.cc:914] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Data Processing

In [88]:
filtered_text = ''.join([char for char in text if char.isalpha() or char == ' '])

words = sorted(set(word for word in filtered_text.split()))
wordToIndex = dict((w, i) for i, w in enumerate(words))
indexToWord = dict((i, w) for i, w in enumerate(words))
wordToIndex[' '] = len(words) # add empty space as a possible word
indexToWord[len(words)] = ' '

characters = sorted(set(filtered_text))
charToIndex = dict((c, i) for i, c in enumerate(characters))
indexToChar = dict((i, c) for i, c in enumerate(characters))

SEQ_LENGTH = 50 # setence length
STEP_SIZE = 10 # num of characters to step when creating a new sentence

sentences = []
nextWord = []

# return word that contains the given index of a given string
def findWord(s, i):
    l, r = i, i
    
    while l >= 0 and s[l] != ' ': l -= 1
    while r < len(s) and s[r] != ' ': r += 1
    
    return s[l+1:r]
    
    

for i in range(0, len(filtered_text)-SEQ_LENGTH, STEP_SIZE):
    sentences.append(filtered_text[i:i+SEQ_LENGTH])
    
    # nextWord can either be a space or a complete word
    if filtered_text[i+SEQ_LENGTH] == " ":
        nextWord.append(' ')
    else:
        nextWord.append(findWord(filtered_text, i+SEQ_LENGTH))

print(sentences[:10])
print(nextWord[:10])
print(f'text length: {len(filtered_text)}, num of distinct words: {len(words)}, num of distinct chars: {len(characters)}')

['this was an absolutely terrible movie dont be lure', 'n absolutely terrible movie dont be lured in by ch', 'ly terrible movie dont be lured in by christopher ', 'e movie dont be lured in by christopher walken or ', 'nt be lured in by christopher walken or michael ir', 'd in by christopher walken or michael ironside bot', 'ristopher walken or michael ironside both are grea', 'walken or michael ironside both are great actors b', 'michael ironside both are great actors but this mu', 'onside both are great actors but this must simply ']
['lured', 'christopher', 'walken', 'michael', 'ironside', 'both', 'great', 'but', 'must', 'be']
text length: 283345, num of distinct words: 8569, num of distinct chars: 36


Predicting next word (instead of character) with sentences:

In [41]:
filtered_text = ''.join([char for char in text if char.isalpha() or char == ' '])
words = set(word for word in filtered_text.split() if len(word) < 5)

print()
print(words)
print(len(words))


{'mind', 'take', 'lets', 'life', 'lame', 'rate', 'sim', 'guts', 'hell', 'sets', 'gore', 'lage', 'goes', 'art', 'loss', 'that', 'miss', 'best', 'yet', 'whom', 'bit', 'all', 'even', 'plug', 'we', 'team', 'swat', 'ears', 'iti', 'cage', 'a', 'king', 'put', 'boy', 'wild', 'were', 'by', 'new', 'any', 'real', 'sexy', 'must', 'my', 'col', 'have', 'turn', 'br', 'lots', 'know', 'here', 'llbr', 'lady', 'make', 'than', 'its', 'half', 'ask', 'so', 'view', 'back', 'word', 'air', 'job', 'own', 'gave', 'oh', 'bar', 'this', 'cast', 'gold', 'love', 'full', 'high', 'has', 'year', 'for', 'what', 'they', 'face', 'fans', 'find', 'red', 'in', 'hits', 'name', 'gets', 'note', 'ive', 'tag', 'law', 'man', 'run', 'stop', 'play', 'say', 'go', 'may', 'c', 'rees', 'why', 'did', 'sign', 'shea', 'ably', 'him', 'g', 'sort', 'show', 'car', 'paid', 'then', 'type', 'body', 'was', 'fare', 'us', 'lead', 'want', 'walk', 'edge', 'out', 'abe', 'fan', 'ol', 'idea', 'mann', 'and', 'yawk', 'mom', 'boat', 'yes', 'two', 'jab', 'no

In [89]:
x = np.zeros((len(sentences), SEQ_LENGTH, len(characters)), dtype=np.bool)
y = np.zeros((len(sentences), len(words)+1), dtype=np.bool)
print(len(words))
for i in range(len(sentences)):
    for j in range(SEQ_LENGTH):
        x[i][j][charToIndex[sentences[i][j]]] = 1
    y[i][wordToIndex[nextWord[i]]] = 1

# print(x)
# print(y)

8569


## Model Building

In [91]:
model = Sequential()
model.add(LSTM(128, input_shape=(SEQ_LENGTH, len(characters))))
model.add(Dense(len(words)+1))
model.add(Activation('softmax'))

model.compile(loss="categorical_crossentropy", optimizer=RMSprop(learning_rate=0.01), metrics=["accuracy"])
model.fit(x, y, batch_size=256, epochs=4)

2024-11-20 21:54:45.269233: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50994000 exceeds 10% of free system memory.
2024-11-20 21:54:45.364015: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 242788100 exceeds 10% of free system memory.


Epoch 1/4
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 151ms/step - accuracy: 0.1682 - loss: 7.3621
Epoch 2/4
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 148ms/step - accuracy: 0.1919 - loss: 6.6353
Epoch 3/4
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 146ms/step - accuracy: 0.2306 - loss: 6.1798
Epoch 4/4
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 146ms/step - accuracy: 0.2494 - loss: 5.7261


<keras.src.callbacks.history.History at 0x7f74eb8124b0>

## Generate Text

In [92]:
def sample(preds, temparture=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temparture
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_text(length, temperature):
    start_index = random.randint(0, len(filtered_text) - SEQ_LENGTH - 1)
    generated = ""
    sentence = filtered_text[start_index: start_index + SEQ_LENGTH]
    generated += sentence
    
    for i in range(length//5):
        x_predictions = np.zeros((1, SEQ_LENGTH, len(characters)))

        for t, char in enumerate(sentence):
            if t < SEQ_LENGTH:
                x_predictions[0, t, charToIndex[char]] = 1
            
        predictions = model.predict(x_predictions, verbose=0)[0]
        next_index = sample(predictions, temperature)
        next_word = indexToWord[next_index]
        
        generated += next_word + ' '
        sentence = sentence[5:] + next_word
        
    return generated
        

In [93]:
print(generate_text(300, 1.0))
print(generate_text(300, 0.1))


dd has gone back to the boat to check on the young  amazon stunning   say lovethis killed pseudolove compelled films discovery movies tension   movie they the   editing episodes   spirited animation people rulezzz dull certainly boyfriend   confusing hudsons   used only wouldnt   not constantly out super work wellacquainted scary scenes   putsch it guys         character and actors     james were rage 
 holmes but i found myself forgetting that it wasn                                                                                                                        


In [84]:
print(sorted(words, key=len, reverse=True))

