# Text Generation with LSTM Deep Neural Networks

#### First things first

To prepare the notebook, Google Drive must be mounted, and the working directory must be changed to the folder that holds the relevant files (weights, modules, data, etc.).


In [None]:
# Mount Google Drive at /content/drive so files stored there can be reached from this notebook.
from google.colab import  drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the project folder on the mounted drive, then make it the working directory
# so that later cells can open the data and weight files by bare file name.
!ls "/content/drive/My Drive/Colab Notebooks"
%cd "/content/drive/My Drive/Colab Notebooks"

 custom-weights-01-1.2951.hdf5	 second_collab.ipynb
 first_collab.ipynb		 starting-weight.hdf5
 get_docx_text.py		'Summarizing word docs.py'
'Processing word docs.py'	'Tutorial Document Summary.py'
 __pycache__			'Tutorial Summary Generation.py'
 README.md			 wonderland.txt
/content/drive/My Drive/Colab Notebooks


## The Project Aims


*   To create a contextual summary of a given document automatically
*   To compare the abilities of deep learning on control and real-world data
*   To understand deep learning's abilities and limitations






#### The Project Ingredients


*   A set of control data (the well-known and widely used NLTK corpus for Alice in Wonderland was chosen)

*   A set of 'real world' data (some essays on a given topic were used for this experiment)
*   The Spyder IDE, numpy, system modules (later transferred to Google Colab)
*   Keras and related modules
*   A decent laptop (16 GB RAM, Ryzen 7 CPU, Radeon Vega GPU)

In [None]:
%%time
#data manipulation
import numpy
import sys

#keras modules
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils



Using TensorFlow backend.


CPU times: user 1.44 s, sys: 150 ms, total: 1.59 s
Wall time: 1.67 s


In [None]:
def txtfile2txt(textfile):
    """Read a UTF-8 text file and return its contents in lower case.

    Parameters
    ----------
    textfile : str or path-like
        Path of the text file to read.

    Returns
    -------
    str
        The complete file content, converted to lower case.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed, even when
    # reading or decoding fails part-way through.
    with open(textfile, 'r', encoding='utf-8') as handle:
        return handle.read().lower()
    

In [None]:
# Load the corpus file as one string; txtfile2txt returns it already lower-cased.
text_file = "wonderland.txt"
raw_text = txtfile2txt(text_file)

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(raw_text))

# Lookup tables in both directions between characters and integer ids.
char_to_int = {symbol: idx for idx, symbol in enumerate(chars)}
int_to_char = dict(enumerate(chars))

# Vocabulary size and corpus length (in characters).
n_vocab = len(chars)
n_chars = len(raw_text)

In [None]:
%%time
seq_length = 100

# Map the whole corpus to integer ids once. Each training window is then a
# plain slice, instead of looking every character up seq_length times.
encoded_text = [char_to_int[char] for char in raw_text]

# Each input is seq_length consecutive ids; its target is the id that follows.
# Slicing creates an independent list per window.
dataX = [encoded_text[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded_text[seq_length:n_chars]
n_patterns = len(dataX)


CPU times: user 1.57 s, sys: 73.9 ms, total: 1.64 s
Wall time: 1.65 s


In [None]:
# Arrange the integer windows as a 3-D array: [samples, time steps, features].
X = numpy.asarray(dataX).reshape(n_patterns, seq_length, 1)

# Normalize: divide the integer ids by the vocabulary size.
X = X / float(n_vocab)

# Turn every target id into a one-hot vector.
y = np_utils.to_categorical(dataY)


In [None]:
# define the LSTM model
# Character model: one LSTM layer, dropout, then a softmax over the output classes.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
# Start from the weights saved in starting-weight.hdf5, then compile for training.
model.load_weights('starting-weight.hdf5')
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
#define checkpoint
#filepath="tutorial-weights-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]


In [None]:
%%time
# Fit the model.
# The commented-out call below is the longer 50-epoch run that also passes the
# checkpoint callbacks (callbacks_list); it is left disabled here.
#model.fit(X, y, epochs=50, batch_size=128, callbacks=callbacks_list)


# Active run: 4 epochs with batches of 128 samples and no callbacks.
model.fit(X, y, epochs=4, batch_size=128)


Epoch 1/4

In [None]:
# pick a random seed
# numpy.random.randint excludes its upper bound, so len(dataX) itself is passed:
# every pattern, including the last one, can be chosen, and a single-pattern
# data set no longer raises.
start = numpy.random.randint(0, len(dataX))
# Copy the chosen window so that later changes to the seed never touch dataX.
pattern = list(dataX[start])
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")


In [None]:
# generate characters
for i in range(500):
	x = numpy.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = numpy.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print ("\nDone.")