# *Text Generation using LSTM and Keras*

## Functions for Processing Text

### Reading in files as a string text

In [None]:
!pip install tensorflow==2.5
# Pin TensorFlow 2.5 for this project: it is the last release that still provides Sequential.predict_classes (removed in TensorFlow 2.6), which this notebook was written against.

In [None]:
def read_file(filepath, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
read_file('moby_dick_four_chapters.txt')

### Tokenize and Clean Text

In [None]:

import spacy

In [None]:
# Load spaCy's small English pipeline with the parser, tagger and NER
# components disabled; separate_punc below only reads each token's text.
nlp=spacy.load('en_core_web_sm',disable=['parser','tagger','ner'])

In [None]:
# Set the maximum text length the pipeline accepts to 1,198,623 characters
# (presumably chosen to fit the full source text -- TODO confirm).
nlp.max_length=1198623


In [None]:
def separate_punc(doc_test):
    """Tokenize ``doc_test`` with the spaCy pipeline and return lowercase tokens.

    A token is dropped when its text occurs as a *substring* of the
    punctuation/whitespace string below (a substring test, not a per-character
    set lookup).
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for tok in nlp(doc_test):
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

In [None]:
# Read the text that is tokenized below.
d=read_file('moby_dick_four_chapters.txt')

In [None]:
# Lowercase tokens of the text with punctuation/whitespace tokens filtered out.
tokens=separate_punc(d)

In [None]:
tokens

In [None]:
len(tokens)

## Create Sequences of Tokens

In [None]:

# Slide a window of train_len tokens over the text: 25 input words plus the
# 1 word that follows them (the prediction target).
train_len=25+1
# The stop value is len(tokens)+1 so that the final window, the one ending on
# the last token, is included as well.
text_sequence=[tokens[i-train_len:i] for i in range(train_len,len(tokens)+1)]

In [None]:
type(text_sequence)

In [None]:
text_sequence[0]

In [None]:
text_sequence[1]

In [None]:
' '.join(text_sequence[0])

In [None]:
' '.join(text_sequence[1])

In [None]:
' '.join(text_sequence[2])

### Keras Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Fit a Keras Tokenizer on the token windows so that each distinct word is
# assigned an integer index.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(text_sequence)

In [None]:
# Replace every word of each window by its integer index from the fitted tokenizer.
sequences=tokenizer.texts_to_sequences(text_sequence)

In [None]:
sequences[0]

In [None]:
sequences[1]

In [None]:
tokenizer.index_word

In [None]:
# Print every index of the first sequence next to the word it maps to.
for i in sequences[0]:
    print(f"{i}:{tokenizer.index_word[i]}")

In [None]:
tokenizer.word_counts

In [None]:
# Number of distinct words the tokenizer has counted.
vocabulary_size=len(tokenizer.word_counts)
vocabulary_size

In [None]:
type(sequences) 

### Convert to Numpy Matrix

In [None]:
import numpy as np


In [None]:
# Convert the list of index sequences to a NumPy array so it can be sliced by
# column when splitting features from labels.
sequences=np.array(sequences)
sequences

# Create LSTM Model

In [None]:
# Split the data into features and labels:
#   1] X features (the first n words of each sequence)
#   2] y label (the next word after those n words)
# The split takes every column but the last as the features and the last
# column as the target we want to predict; the model is then fit on X and y.

from tensorflow.keras.utils import to_categorical


In [None]:
# Features: every column of each sequence except the last one.
X=sequences[:,:-1]

In [None]:
# Labels: the last column of each sequence (the word to predict).
y=sequences[:,-1]

In [None]:
# One-hot encode the labels. num_classes is vocabulary_size+1 -- presumably
# because Keras Tokenizer indices start at 1, leaving index 0 unused; confirm.
y=to_categorical(y,num_classes=vocabulary_size+1)

In [None]:
y

In [None]:
# Number of input tokens per sample (the number of columns in X).
seq_len=X.shape[1]


In [None]:
X.shape

In [None]:
#to create model
from tensorflow.keras.models import Sequential

In [None]:
from tensorflow.keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocabulary_size,seq_len):
    """Build, compile and summarise the next-word prediction network.

    Architecture: Embedding -> LSTM(50, returning sequences) -> LSTM(50) ->
    Dense(50, relu) -> Dense(vocabulary_size, softmax).

    Args:
        vocabulary_size: Input dimension of the embedding and number of units
            of the softmax output layer.
        seq_len: Length of each input sequence; also used as the embedding
            output dimension.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    layers = [
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        LSTM(50, return_sequences=True),
        LSTM(50),  # second recurrent layer, returns only the final state
        Dense(50, activation="relu"),
        # One softmax output per vocabulary index.
        Dense(vocabulary_size, activation="softmax"),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    # Each vocabulary word is treated as its own category.
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [None]:
# Build the model; vocabulary_size+1 matches the num_classes used when
# one-hot encoding y.
model=create_model(vocabulary_size+1,seq_len)

### Training the Model

In [None]:
from pickle import dump,load 

In [None]:
# Train on all of X/y for 2 epochs in batches of 128 samples, printing progress.
model.fit(X,y,batch_size=128,epochs=2,verbose=1)

In [None]:
# Save the trained model to 'my_model.h5'.
model.save('my_model.h5')

In [None]:

# Pickle the fitted tokenizer to 'my_tokenizer'; the with-block guarantees the
# file is flushed and closed once the dump has finished.
with open('my_tokenizer','wb') as tokenizer_file:
    dump(tokenizer,tokenizer_file)

# Generate New Text Based off a Seed

In [None]:
from  tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:

def generate_new_text(model,tokenizer,seq_len,seed_text,num_gen_works):
    """Generate words one at a time, feeding each prediction back as input.

    Args:
        model: Trained Keras model whose ``predict`` returns, for each input
            row, one score per vocabulary index.
        tokenizer: Fitted Keras ``Tokenizer`` used to encode the text and to
            map predicted indices back to words through ``index_word``.
        seq_len: Number of token ids fed to the model per prediction.
        seed_text: Text the generation starts from.
        num_gen_works: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed itself is not
        included).

    Raises:
        KeyError: If a predicted index has no entry in ``tokenizer.index_word``.
    """
    output_text = []
    input_text = seed_text
    for _ in range(num_gen_works):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Keep only the last seq_len ids; shorter inputs are zero-padded.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Sequential.predict_classes is deprecated in TensorFlow 2.5 and gone
        # in 2.6; argmax over predict() is its documented replacement for a
        # softmax output layer.
        probabilities = model.predict(pad_encoded, verbose=0)
        pred_word_ind = int(np.argmax(probabilities, axis=-1)[0])
        pred_word = tokenizer.index_word[pred_word_ind]
        # Append the prediction so it becomes part of the next input window.
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [None]:
text_sequence[0]

### Grab a random seed sequence

In [None]:
import random
random.seed(101)
# random.randint includes both endpoints, so the upper bound must be the last
# valid index; len(text_sequence) itself would raise IndexError when used to
# index text_sequence.
random_pick=random.randint(0,len(text_sequence)-1)

In [None]:
# Use the randomly chosen token window as the seed.
random_seed_text=text_sequence[random_pick]
random_seed_text

In [None]:
# Join the seed tokens into one space-separated string, the form that
# generate_new_text passes to the tokenizer.
seed_text=' '.join(random_seed_text)
seed_text

In [None]:
# Generate 25 new words from the seed with the model trained above.
generate_new_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_works=25)

# Loading another model

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# Load a previously saved model from 'epochBIG.h5'.
model_fin=load_model('epochBIG.h5')

In [None]:
# Load the pickled tokenizer from 'epochBIG' (presumably the one fitted for
# the 'epochBIG.h5' model -- confirm); the with-block closes the file.
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open("epochBIG","rb") as tokenizer_file:
    tokenizer=load(tokenizer_file)


In [None]:
# Generate with the model loaded from 'epochBIG.h5' (model_fin) together with
# the tokenizer loaded from 'epochBIG'; passing the earlier `model` would pair
# the reloaded tokenizer with a network trained on a different vocabulary.
generate_new_text(model_fin,tokenizer,seq_len,seed_text=seed_text,num_gen_works=25)