___

<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>
___
# Text Generation with Neural Networks

## Functions for Processing Text

### Reading in files as a string text

In [26]:
def read_file(filepath):
    
    with open(filepath, encoding = 'utf8') as f:
        str_text = f.read()
    
    return str_text

In [27]:
read_file('giant_introduction.txt')

" The ubiquitin/proteasome pathway, responsible for mediating the majority of intracellular proteolysis, plays a crucial role in the regulation of many normal cellular processes, including the cell cycle, differentiation and apoptosis !!!REFs!!!. Defects in proteasome function have been suggested to be involved in the pathogenesis of neurodegenerative diseases, including Alzheimer's disease and Parkinson's disease !!!REF!!!. Proteasome inhibitors have been shown to induce apoptosis in neuronal cells characterized by nuclear fragmentation, loss of mitochondrial membrane potential, cytochrome release and caspase activation !!!REFs!!!. In contrast, proteasome inhibitors lactacystin and AcLLNaI induce apoptotic death in human glioma cells by a mitochondria-independent mechanism !!!REF!!!. In addition, it is uncertain whether the cytotoxicity of MG132 is mediated by elevation of the intracellular Ca2+ levels !!!REFs!!!. Disruption of intracellular calcium homeostasis and defects in mitochon

### Tokenize and Clean Text

In [67]:
import spacy
nlp = spacy.load('en',disable=['parser', 'tagger','ner'])

nlp.max_length = 1398623

In [68]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n !!!refs!!! !!! ref!!! !!!ref !!! refs refs!!! !!!refs !!!REF!!! \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [69]:
d = read_file('giant_introduction.txt')
tokens = separate_punc(d)

In [70]:
tokens

['the',
 'ubiquitin',
 'proteasome',
 'pathway',
 'responsible',
 'for',
 'mediating',
 'the',
 'majority',
 'of',
 'intracellular',
 'proteolysis',
 'plays',
 'a',
 'crucial',
 'role',
 'in',
 'the',
 'regulation',
 'of',
 'many',
 'normal',
 'cellular',
 'processes',
 'including',
 'the',
 'cell',
 'cycle',
 'differentiation',
 'and',
 'apoptosis',
 'refs!!!.',
 'defects',
 'in',
 'proteasome',
 'function',
 'have',
 'been',
 'suggested',
 'to',
 'be',
 'involved',
 'in',
 'the',
 'pathogenesis',
 'of',
 'neurodegenerative',
 'diseases',
 'including',
 'alzheimer',
 "'s",
 'disease',
 'and',
 'parkinson',
 "'s",
 'disease',
 'ref!!!.',
 'proteasome',
 'inhibitors',
 'have',
 'been',
 'shown',
 'to',
 'induce',
 'apoptosis',
 'in',
 'neuronal',
 'cells',
 'characterized',
 'by',
 'nuclear',
 'fragmentation',
 'loss',
 'of',
 'mitochondrial',
 'membrane',
 'potential',
 'cytochrome',
 'release',
 'and',
 'caspase',
 'activation',
 'refs!!!.',
 'in',
 'contrast',
 'proteasome',
 'inhibi

In [71]:
len(tokens)

159900

In [72]:
4431/25

177.24

## Create Sequences of Tokens

In [73]:
# organize into sequences of tokens
train_len = 25+1 # 50 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [74]:
' '.join(text_sequences[0])

'the ubiquitin proteasome pathway responsible for mediating the majority of intracellular proteolysis plays a crucial role in the regulation of many normal cellular processes including the'

In [75]:
' '.join(text_sequences[1])

'ubiquitin proteasome pathway responsible for mediating the majority of intracellular proteolysis plays a crucial role in the regulation of many normal cellular processes including the cell'

In [76]:
' '.join(text_sequences[2])

'proteasome pathway responsible for mediating the majority of intracellular proteolysis plays a crucial role in the regulation of many normal cellular processes including the cell cycle'

In [77]:
len(text_sequences)

159874

# Keras

### Keras Tokenization

In [78]:
from keras.preprocessing.text import Tokenizer

In [79]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [80]:
sequences[0]

[1,
 5020,
 2162,
 496,
 606,
 13,
 821,
 1,
 1638,
 2,
 605,
 4177,
 514,
 6,
 820,
 56,
 3,
 1,
 160,
 2,
 170,
 162,
 310,
 175,
 87,
 1]

In [81]:
tokenizer.index_word

{1: 'the',
 2: 'of',
 3: 'in',
 4: 'and',
 5: 'to',
 6: 'a',
 7: 'is',
 8: 'that',
 9: 'ref!!!.',
 10: 'refs!!!.',
 11: 'by',
 12: 'with',
 13: 'for',
 14: 'as',
 15: 'are',
 16: 'on',
 17: 'been',
 18: 'be',
 19: 'this',
 20: 'have',
 21: 'or',
 22: 'has',
 23: 'an',
 24: 'brain',
 25: 'was',
 26: 'refs',
 27: 'which',
 28: 'these',
 29: 'from',
 30: 'it',
 31: 'we',
 32: 'neurons',
 33: 'also',
 34: 'studies',
 35: 'rats',
 36: 'were',
 37: 'receptors',
 38: 'effects',
 39: 'study',
 40: 'not',
 41: 'activity',
 42: 'receptor',
 43: 'may',
 44: 'can',
 45: 'such',
 46: 'during',
 47: 'induced',
 48: 'cells',
 49: 'sleep',
 50: 'between',
 51: 'at',
 52: 'expression',
 53: 'however',
 54: 'both',
 55: 'system',
 56: 'role',
 57: 'changes',
 58: 'cell',
 59: 'effect',
 60: 'other',
 61: 'task',
 62: 'different',
 63: 'present',
 64: 'but',
 65: 'neuronal',
 66: 'response',
 67: 'stress',
 68: 'after',
 69: 'activation',
 70: 'shown',
 71: 'used',
 72: 'no',
 73: 'memory',
 74: 'reporte

In [82]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

1 : the
5020 : ubiquitin
2162 : proteasome
496 : pathway
606 : responsible
13 : for
821 : mediating
1 : the
1638 : majority
2 : of
605 : intracellular
4177 : proteolysis
514 : plays
6 : a
820 : crucial
56 : role
3 : in
1 : the
160 : regulation
2 : of
170 : many
162 : normal
310 : cellular
175 : processes
87 : including
1 : the


In [83]:
tokenizer.word_counts

OrderedDict([('the', 261844),
             ('ubiquitin', 54),
             ('proteasome', 237),
             ('pathway', 1122),
             ('responsible', 967),
             ('for', 27462),
             ('mediating', 735),
             ('majority', 347),
             ('of', 191312),
             ('intracellular', 982),
             ('proteolysis', 90),
             ('plays', 1105),
             ('a', 66548),
             ('crucial', 743),
             ('role', 6932),
             ('in', 143339),
             ('regulation', 3087),
             ('many', 2907),
             ('normal', 3038),
             ('cellular', 1741),
             ('processes', 2806),
             ('including', 4809),
             ('cell', 6864),
             ('cycle', 936),
             ('differentiation', 1118),
             ('and', 122712),
             ('apoptosis', 1300),
             ('refs!!!.', 33956),
             ('defects', 182),
             ('function', 3614),
             ('have', 19240),
           

In [84]:
vocabulary_size = len(tokenizer.word_counts)

### Convert to Numpy Matrix

In [85]:
import numpy as np

In [86]:
sequences = np.array(sequences)

In [87]:
sequences

array([[   1, 5020, 2162, ...,  175,   87,    1],
       [5020, 2162,  496, ...,   87,    1,   58],
       [2162,  496,  606, ...,    1,   58,  626],
       ...,
       [2823,   48,   19, ...,    2,  717,    4],
       [  48,   19,   39, ...,  717,    4,  310],
       [  19,   39,  264, ...,    4,  310, 6635]])

# Creating an LSTM based model

In [88]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [89]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

### Train / Test Split

In [90]:
from keras.utils import to_categorical

In [91]:
sequences

array([[   1, 5020, 2162, ...,  175,   87,    1],
       [5020, 2162,  496, ...,   87,    1,   58],
       [2162,  496,  606, ...,    1,   58,  626],
       ...,
       [2823,   48,   19, ...,    2,  717,    4],
       [  48,   19,   39, ...,  717,    4,  310],
       [  19,   39,  264, ...,    4,  310, 6635]])

In [92]:
# First 49 words
sequences[:,:-1]

array([[   1, 5020, 2162, ...,  310,  175,   87],
       [5020, 2162,  496, ...,  175,   87,    1],
       [2162,  496,  606, ...,   87,    1,   58],
       ...,
       [2823,   48,   19, ...,  285,    2,  717],
       [  48,   19,   39, ...,    2,  717,    4],
       [  19,   39,  264, ...,  717,    4,  310]])

In [93]:
# last Word
sequences[:,-1]

array([   1,   58,  626, ...,    4,  310, 6635])

In [94]:
X = sequences[:,:-1]

In [95]:
y = sequences[:,-1]

In [96]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [97]:
seq_len = X.shape[1]

In [98]:
seq_len

25

### Training the Model

In [99]:
# define model
model = create_model(vocabulary_size+1, seq_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 25)            277650    
_________________________________________________________________
lstm_5 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_6 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_5 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_6 (Dense)              (None, 11106)             1677006   
Total params: 2,263,506
Trainable params: 2,263,506
Non-trainable params: 0
_________________________________________________________________


---

----

In [100]:
from pickle import dump,load

In [103]:
# fit model
model.fit(X, y, batch_size=128, epochs=150,verbose=2)

Epoch 1/150
 - 141s - loss: 6.7272 - acc: 0.0967
Epoch 2/150
 - 140s - loss: 6.3824 - acc: 0.1184
Epoch 3/150
 - 141s - loss: 6.1500 - acc: 0.1363
Epoch 4/150
 - 140s - loss: 5.9735 - acc: 0.1491
Epoch 5/150
 - 140s - loss: 5.8279 - acc: 0.1573
Epoch 6/150
 - 140s - loss: 5.7055 - acc: 0.1629
Epoch 7/150
 - 140s - loss: 5.6027 - acc: 0.1677
Epoch 8/150
 - 139s - loss: 5.4969 - acc: 0.1730
Epoch 9/150
 - 139s - loss: 5.4040 - acc: 0.1774
Epoch 10/150
 - 141s - loss: 5.3187 - acc: 0.1815
Epoch 11/150
 - 140s - loss: 5.2407 - acc: 0.1866
Epoch 12/150
 - 139s - loss: 5.1697 - acc: 0.1902
Epoch 13/150
 - 139s - loss: 5.1035 - acc: 0.1943
Epoch 14/150
 - 139s - loss: 5.0384 - acc: 0.1976
Epoch 15/150
 - 140s - loss: 4.9789 - acc: 0.2013
Epoch 16/150
 - 139s - loss: 5.2744 - acc: 0.1809
Epoch 17/150
 - 139s - loss: 5.0134 - acc: 0.1963
Epoch 18/150
 - 139s - loss: 4.9330 - acc: 0.2017
Epoch 19/150
 - 140s - loss: 4.8690 - acc: 0.2062
Epoch 20/150
 - 142s - loss: 4.8063 - acc: 0.2108
Epoch 21/

<keras.callbacks.History at 0x1d7fa5e9a58>

In [104]:
# save the model to file
model.save('epochBIG.h5')
# save the tokenizer
dump(tokenizer, open('epochBIG', 'wb'))

# Generating New Text

In [105]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [106]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [107]:
text_sequences[0]

['the',
 'ubiquitin',
 'proteasome',
 'pathway',
 'responsible',
 'for',
 'mediating',
 'the',
 'majority',
 'of',
 'intracellular',
 'proteolysis',
 'plays',
 'a',
 'crucial',
 'role',
 'in',
 'the',
 'regulation',
 'of',
 'many',
 'normal',
 'cellular',
 'processes',
 'including',
 'the']

In [108]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))

In [109]:
random_seed_text = text_sequences[random_pick]

In [110]:
random_seed_text

['spontaneously',
 'after',
 'systemic',
 'administration',
 'of',
 '3-np',
 'in',
 'spite',
 'of',
 'extensive',
 'research',
 'this',
 'devastating',
 'hereditary',
 'disease',
 'remains',
 'incurable',
 'warranting',
 'further',
 'studies',
 'to',
 'determine',
 'the',
 'causes',
 'and',
 'cure']

In [115]:
seed_text = ' '.join(random_seed_text[:-4])

In [116]:
seed_text

'spontaneously after systemic administration of 3-np in spite of extensive research this devastating hereditary disease remains incurable warranting further studies to determine'

In [132]:
seed_text = 'In the current study we investigated whether '

In [133]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=3)

'there are predominantly'

In [66]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain and the brain'

### Exploring Generated Sequence

In [118]:
full_text = read_file('giant_introduction.txt')

In [None]:
for i,word in enumerate(full_text.split()):
    if word == 'inkling':
        print(' '.join(full_text.split()[i-20:i+20]))
        print('\n')

# Great Job!