### Trial 1: Markov model

In [17]:
import numpy as np
import string

np.random.seed(1234)#to get consistent results

In [18]:
initial = {} 
first_order = {} 
second_order = {} 

In [19]:
# def remove_punctuation(s):
#     return s.translate(str.maketrans('','',string.punctuation))
def remove_punctuation(s):
    return s

In [20]:
def add2dict(d, k, v):
    if k not in d:
        d[k] = []
    d[k].append(v)

In [21]:
for line in open("Poems\Emily Dickinson\Emily Dickinsons' poems.txt"):
    tokens = remove_punctuation(line.rstrip().lower()).split()

    T = len(tokens)
    for i in range(T):
        t = tokens[i]
        if i == 0:
            initial[t] = initial.get(t, 0.) + 1
        else:
            t_1 = tokens[i-1]
            if i == T - 1:

                add2dict(second_order, (t_1, t), 'END')
            if i == 1:

                add2dict(first_order, t_1, t)
            else:
                t_2 = tokens[i-2]
                add2dict(second_order, (t_2, t_1), t)

In [22]:

initial_total = sum(initial.values())
for t, c in initial.items():
    initial[t] = c / initial_total

In [23]:
def list2pdict(ts):
    d = {}
    n = len(ts)
    for t in ts:
        d[t] = d.get(t, 0.) + 1
    for t, c in d.items():
        d[t] = c / n
    return d

In [24]:
for t_1, ts in first_order.items():
    first_order[t_1] = list2pdict(ts)

In [25]:
for k, ts in second_order.items():
    second_order[k] = list2pdict(ts)

In [26]:
def sample_word(d):
    p0 = np.random.random()
    cumulative = 0
    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t
    assert(False)

In [27]:
def generate():
    for i in range(6): 
        sentence = []

        w0 = sample_word(initial)
        sentence.append(w0)

        w1 = sample_word(first_order[w0])
        sentence.append(w1)

        while True:
            w2 = sample_word(second_order[(w0, w1)])
            if w2 == 'END':
                break
            sentence.append(w2)
            w0 = w1
            w1 = w2
        print(' '.join(sentence))

In [28]:
generate()

i'm so accustomed to the heart that broke so long—
then thought of us, and return—
those boys and girls
but internal difference
eden—a legend—dimly told—
or what circassian land?


### Trial 2: LSTM

In [33]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [34]:
with open("Poems\Emily Dickinson\Emily Dickinsons' poems.txt", 'r', encoding='utf-8') as file:
    text = file.read()

text = text.lower()  
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(predictors, label, epochs=100, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 11, 100)           1325500   
                                                                 
 lstm (LSTM)                 (None, 11, 150)           150600    
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 13255)             1338755   
                                                                 
Total params: 2,915,255
Trainable params: 2,915,255
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch

In [36]:
seed_text = "I am"
next_words = 20  

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)

I am alive—because bold so far— suspect me with him the sea of that— by steel he not be ended— – along


### Implementing Bidirectional LSTM

In [66]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional

In [67]:
with open("Poems\Emily Dickinson\Emily Dickinsons' poems.txt", 'r', encoding='utf-8') as file:
    text = file.read()

text = text.lower()  
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(predictors, label, epochs=10, verbose=1)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 11, 100)           1325500   
                                                                 
 lstm_15 (LSTM)              (None, 11, 150)           150600    
                                                                 
 bidirectional_3 (Bidirectio  (None, 200)              200800    
 nal)                                                            
                                                                 
 dense_11 (Dense)            (None, 13255)             2664255   
                                                                 
Total params: 4,341,155
Trainable params: 4,341,155
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/1

<keras.callbacks.History at 0x1d750caba30>

In [75]:
seed_text = "Summer is"
next_words = 20  

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word



In [76]:
print(seed_text)

Summer is the sun of the sun of the sun air come— swain i come— come— swain swain swain one santa workman—


In [70]:
# Generate poetry in multiple lines
seed_text = "Summer is"
next_lines = 5 
words_per_line = 5  
for _ in range(next_lines):
    generated_words = []
    for _ in range(words_per_line):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
        generated_words.append(output_word)
    generated_line = " ".join(generated_words)
    print(generated_line)


the sun of the sun
of the sun air come—
swain i come— come— swain
swain swain one santa workman—
workman— santa santa workman— workman—


### Bidirectional LSTM without punctuations

In [64]:
import numpy as np
import tensorflow as tf
import string
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional

# Read the text file
with open("Poems\Emily Dickinson\Emily Dickinsons' poems.txt", 'r', encoding='utf-8') as file:
    text = file.read()

# Convert text to lowercase
text = text.lower()

# Remove punctuation
text = text.translate(str.maketrans("", "", string.punctuation))

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(predictors, label, epochs=10, verbose=1)

seed_text = "I am"
next_words = 20  

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

    if output_word == '\n':
        print(seed_text.strip())
        seed_text = ""

print(seed_text.strip())


Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 11, 100)           1321000   
                                                                 
 lstm_13 (LSTM)              (None, 11, 150)           150600    
                                                                 
 bidirectional_2 (Bidirectio  (None, 200)              200800    
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 13210)             2655210   
                                                                 
Total params: 4,327,610
Trainable params: 4,327,610
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
 258/1547 [====>.........................] - ETA: 1:17 - loss: 8.1571 - accuracy: 0.0624

KeyboardInterrupt: 

### With attention layer

In [44]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, Layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical



"Poems\Emily Dickinson\Emily Dickinsons' poems.txt"

In [46]:
class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, inputs):
        query, value = inputs
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(value)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * value
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

with open("Poems\Emily Dickinson\Emily Dickinsons' poems.txt", 'r') as file:
    poems_text = file.read()

poems_text = poems_text.lower()
poems_text = poems_text.replace('\n', ' \n ')
poems_text = poems_text.replace('\r', ' ')
poems_text = ' '.join(poems_text.split())

# Creating the vocabulary
words = poems_text.split()
word_to_index = {word: i for i, word in enumerate(set(words))}
index_to_word = {i: word for word, i in word_to_index.items()}
vocab_size = len(word_to_index)

# Generate input-output pairs
sequences = []
next_words = []
sequence_length = 10

for i in range(len(words) - sequence_length):
    sequence = words[i:i+sequence_length]
    target = words[i+sequence_length]
    sequences.append([word_to_index[word] for word in sequence])
    next_words.append(word_to_index[target])

sequences = np.array(sequences)
next_words = np.array(next_words)

embedding_dim = 100
hidden_units = 256

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=sequence_length))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(BahdanauAttention(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(sequences, next_words, epochs=10, batch_size=32)

OperatorNotAllowedInGraphError: Exception encountered when calling layer "bahdanau_attention_1" (type BahdanauAttention).

in user code:

    File "C:\Users\lmbmo\AppData\Local\Temp\ipykernel_7876\4115574913.py", line 10, in call  *
        query, value = inputs

    OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.


Call arguments received by layer "bahdanau_attention_1" (type BahdanauAttention):
  • inputs=tf.Tensor(shape=(None, 10, 256), dtype=float32)

In [None]:
start_sequence = "i"
generated_poem = [start_sequence]
num_lines = 10

for _ in range(num_lines):
    encoded_input = [word_to_index[word] for word in generated_poem]
    encoded_input = pad_sequences([encoded_input], maxlen=sequence_length)
    predicted_index = np.argmax(model.predict(encoded_input))
    predicted_word = index_to_word[predicted_index]
    generated_poem.append(predicted_word)

for line in generated_poem:
    print(line)
