<a href="https://colab.research.google.com/github/Monsterglitch/Text-Generation/blob/master/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#First Code

In [10]:
# Load Larger LSTM network and generate text
import sys
import tensorflow as tf
import numpy as np
import os
import pickle
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from string import punctuation
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

In [18]:
# importing data as text
sequence_length = 100
BATCH_SIZE = 128
EPOCHS = 50
# read the data
text = open("/content/drive/MyDrive/NN/wonderland.txt", encoding="utf-8").read()
# remove caps, comment this code if you want uppercase characters as well
text = text.lower()
# remove punctuation
text = text.translate(str.maketrans("", "", punctuation))
spl = ['™', 'ù', '—', '•']
for item in spl:
  text = text.replace(item, '')

# print some stats
n_chars = len(text)
vocab = ''.join(sorted(set(text)))
print("unique_chars:", vocab)
n_unique_chars = len(vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

# converting data into numbers manually

# dictionary that converts characters to integers
char2int = {c: i for i, c in enumerate(vocab)}
# dictionary that converts integers to characters
int2char = {i: c for i, c in enumerate(vocab)}

# save these dictionaries for later generation
pickle.dump(char2int, open("/content/char2int.pickle", "wb"))
pickle.dump(int2char, open("/content/int2char.pickle", "wb"))

# convert all text into integers
encoded_text = np.array([char2int[c] for c in text])  # numpy array

# construct tf.data.Dataset object
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text) # for efficient data-handling use tf.data API

# print first 5 characters
# for char in char_dataset.take(8):
#     print(char.numpy(), int2char[char.numpy()])

# build sequences by batching
sequences = char_dataset.batch(2*sequence_length + 1, drop_remainder=True)

# print sequences
# for sequence in sequences.take(2):
#     print(''.join([int2char[i] for i in sequence.numpy()]))

def split_sample(sample):
    # example :
    # sequence_length is 10
    # sample is "python is a great pro" (21 length)
    # ds will equal to ('python is ', 'a') encoded as integers
    ds = tf.data.Dataset.from_tensors((sample[:sequence_length], sample[sequence_length]))
    for i in range(1, (len(sample)-1) // 2):
        # first (input_, target) will be ('ython is a', ' ')
        # second (input_, target) will be ('thon is a ', 'g')
        # third (input_, target) will be ('hon is a g', 'r')
        # and so on
        input_ = sample[i: i+sequence_length]
        target = sample[i+sequence_length]
        # extend the dataset with these samples by concatenate() method
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
    return ds

# prepare inputs and targets
dataset = sequences.flat_map(split_sample)

def one_hot_samples(input_, target):
    # onehot encode the inputs and the targets
    # Example:
    # if character 'd' is encoded as 3 and n_unique_chars = 5
    # result should be the vector: [0, 0, 0, 1, 0], since 'd' is the 4th character
    return tf.one_hot(input_, n_unique_chars), tf.one_hot(target, n_unique_chars)

dataset = dataset.map(one_hot_samples)

# print first 2 samples
for element in dataset.take(2):
    print("Input:", ''.join([int2char[np.argmax(char_vector)] for char_vector in element[0].numpy()]))
    print("Target:", int2char[np.argmax(element[1].numpy())])
    print("Input shape:", element[0].shape)
    print("Target shape:", element[1].shape)
    print("="*50, "\n")

# repeat, shuffle and batch the dataset
ds = dataset.repeat().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True)

model = Sequential([
    LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dense(n_unique_chars, activation="softmax"),
])
# define the model path
model_weights_path = f"/content/wonderland-weights.h5"
model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

#TRAINING THE MODEL

# make results folder if does not exist yet
if not os.path.isdir("results"):
    os.mkdir("results")

unique_chars: 
 0123456789abcdefghijklmnopqrstuvwxyz‘’“”
Number of characters: 157893
Number of unique characters: 42
Input: the project gutenberg ebook of alices adventures in wonderland
    
this ebook is for the use of any
Target: o
Input shape: (100, 42)
Target shape: (42,)

Input: he project gutenberg ebook of alices adventures in wonderland
    
this ebook is for the use of anyo
Target: n
Input shape: (100, 42)
Target shape: (42,)

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_20 (LSTM)              (None, 100, 256)          306176    
                                                                 
 dropout_10 (Dropout)        (None, 100, 256)          0         
                                                                 
 lstm_21 (LSTM)              (None, 256)               525312    
                                                                 
 dense_

In [6]:
# train the model
model.fit(ds, steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE, epochs=EPOCHS)
# save the model
model.save(model_weights_path)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [None]:
# for sequence in sequences.take(1):
#     # print(''.join([int2char[i] for i in sequence.numpy()]))
#     print(sequence)
# print()
# for sequence in sequences.take(1):
#     print(''.join([int2char[i] for i in sequence.numpy()]))
for e in dataset.take(1):
  print([np.argmax(char_vector) for char_vector in element[0].numpy()])

[21, 16, 14, 31, 1, 18, 32, 31, 16, 25, 13, 16, 29, 18, 30, 1, 12, 23, 20, 14, 16, 30, 1, 12, 15, 33, 16, 25, 31, 32, 29, 16, 30, 1, 20, 25, 1, 34, 26, 25, 15, 16, 29, 23, 12, 25, 15, 1, 13, 36, 1, 23, 16, 34, 20, 30, 1, 14, 12, 29, 29, 26, 23, 23, 0, 0, 0, 0, 31, 19, 20, 30, 1, 16, 13, 26, 26, 22, 1, 20, 30, 1, 17, 26, 29, 1, 31, 19, 16, 1, 32, 30, 16, 1, 26, 17, 1, 12, 25, 36]


In [12]:
#GENERATING TEXT

import tqdm
from keras.layers import Activation

sequence_length = 100
# dataset file path
FILE_PATH = "/content/wonderland.txt"
# FILE_PATH = "data/python_code.py"
# BASENAME = os.path.basename(FILE_PATH)

seed = "chapter xi"

# load vocab dictionaries
char2int = pickle.load(open(f"/content/char2int.pickle", "rb"))
int2char = pickle.load(open(f"/content/int2char.pickle", "rb"))
vocab_size = len(char2int)

# building the model
model1 = Sequential([
    LSTM(256, input_shape=(sequence_length, vocab_size), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(vocab_size, activation="softmax"),
])

# load the optimal weights
model.load_weights(f"/content/wonderland-weights.h5")


s = seed
n_chars = 300
# generate 400 characters
generated = ""
for i in tqdm.tqdm(range(n_chars), "Generating text"):
    # make the input sequence
    X = np.zeros((1, sequence_length, vocab_size))
    for t, char in enumerate(seed):
        X[0, (sequence_length - len(seed)) + t, char2int[char]] = 1
    # predict the next character
    predicted = model.predict(X, verbose=0)[0]
    # converting the vector to an integer
    next_index = np.argmax(predicted)
    # converting the integer to a character
    next_char = int2char[next_index]
    # add the character to results
    generated += next_char
    # shift seed and the predicted character
    seed = seed[1:] + next_char

print("Seed:", s)
print("Generated text:")
print(generated)

Generating text: 100%|██████████| 300/300 [00:20<00:00, 14.62it/s]

Seed: chapter xi
Generated text:
xs she’s looking at the shater of a long as if you find as feeling and denion compriance to see there as indeed very close
by feeling very much crossed up and read that it here your mands” and she spread out her can follow and deristen to me and for any one we
would be one finger and belong about it





#Second Code

In [19]:
# load ascii text and covert to lowercase
filename = "/content/drive/MyDrive/NN/wonderland.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()
raw_text = raw_text.translate(str.maketrans("", "", punctuation))
spl = ['™', 'ù', '—', '•']
for item in spl:
  raw_text = raw_text.replace(item, '')
vocab = ''.join(sorted(set(raw_text)))
print("Unique characters: ", vocab)
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
 seq_in = raw_text[i:i + seq_length]
 seq_out = raw_text[i + seq_length]
 dataX.append([char_to_int[char] for char in seq_in])
 dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = to_categorical(dataY)

Unique characters:  
 0123456789abcdefghijklmnopqrstuvwxyz‘’“”
Total Characters:  157893
Total Vocab:  42
Total Patterns:  157793


In [None]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(256))
# model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# define the checkpoint
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# fit the model
model.fit(X, y, epochs=50, batch_size=64, callbacks=callbacks_list)

In [22]:
int_to_char = dict((i, c) for i, c in enumerate(chars))
# define another LSTM model
model1 = Sequential()
model1.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model1.add(Dropout(0.3))
model1.add(LSTM(256))
# model1.add(Dropout(0.2))
model1.add(Dense(y.shape[1], activation='softmax'))
# load the network weights
filename = "weights.hdf5"
model1.load_weights(filename)
model1.compile(loss='categorical_crossentropy', optimizer='adam')
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
 x = np.reshape(pattern, (1, len(pattern), 1))
 x = x / float(n_vocab)
 prediction = model1.predict(x, verbose=0)
 index = np.argmax(prediction)
 result = int_to_char[index]
 seq_in = [int_to_char[value] for value in pattern]
 sys.stdout.write(result)
 pattern.append(index)
 pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
"  terror “oh there goes his precious nose” as an
unusually large saucepan flew close by it and very n "
early anl the way was the white rabbit with a sree 
“not i was not a rerpent” scid the caterpillar 
“i’ve hately know yhat is the same thing” said the mock turtle “seals throw then i’ve got to grow up and the other side with the tor of the court with one finger and had been of her head to she set to work the lock turtle seplied rather shat she had not a moment to she was not and the words “drink me” but the door she rat down and looked at alice
and looked at alice

“wou couldn’t halds the sreess” said the mock turtle “seals throw then i’d cettainly to shis it was a cat and the others sook them of course you know what it was shat i goow she world gate it in a lou to the cook the cook the cook the cook to she was not and the words “drink me” but the door she rat down and looked at alice
and looked at alice

“wou couldn’t halds the sreess” said the mock turtle “seals throw then i