In [2]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\justf\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.




In [3]:
# !mkdir .kaggle
# !copy kaggle.json .kaggle
# !chmod 600 .kaggle/kaggle.json

In [4]:
!kaggle datasets download johnhallman/complete-poetryfoundationorg-dataset

complete-poetryfoundationorg-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# !unzip complete-poetryfoundationorg-dataset.zip

In [6]:
!kaggle datasets download rtatman/glove-global-vectors-for-word-representation

glove-global-vectors-for-word-representation.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
! unzip glove-global-vectors-for-word-representation.zip

'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [14]:
import numpy as np
import pandas as pd 
from scipy import spatial
from collections import Counter

ds_path = "./complete-poetryfoundationorg-dataset/kaggle_poem_dataset.csv"
glove_path = "./glove-global-vectors-for-word-representation/glove.6B.200d.txt"

In [9]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\justf\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [15]:
import keras.backend as K
from keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, Input, LSTM, GRU

import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
poems_df = pd.read_csv(ds_path)
poems_df.head(5)

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [17]:
poems_df.groupby("Author").agg({"Content": "count"}).sort_values("Content", ascending=False).head(5)

Unnamed: 0_level_0,Content
Author,Unnamed: 1_level_1
William Shakespeare,85
Anonymous,82
"Alfred, Lord Tennyson",78
Rae Armantrout,62
William Wordsworth,59


In [18]:
#submission is from Alfred, Lord Tennyson, for his semi-modern poetry
anonymous_poems = (poems_df[poems_df["Author"] == "Alfred, Lord Tennyson"])
print("Line examples: ")
print(anonymous_poems.iloc[0,4].split('\n')[:10])

Line examples: 
['Break, break, break,', 'On thy cold gray stones, O Sea!', 'And I would that my tongue could utter', 'The thoughts that arise in me.', '', "O, well for the fisherman's boy,", 'That he shouts with his sister at play!', 'O, well for the sailor lad,', 'That he sings in his boat on the bay!', '']


In [19]:
poems_combined = "\n".join(anonymous_poems.iloc[:,4].values)
print("Total number of characters: ", len(poems_combined))

Total number of characters:  186569


In [20]:
poem_lines = poems_combined.split('\n')
print("Number of lines: ", len(poem_lines))

Number of lines:  5246


In [21]:
# prepare the input and target lines
input_lines = ["<sos> "+line for line in poem_lines] # in each of the input we add <sos> token idicating the begining of a line
target_lines = [line+ " <eos>" for line in poem_lines] # while target lines are appended with with <eos> token indicating end of the line

In [24]:
EPOCHS = 250 # number of times the model is trained on the entire training dataset
BATCH_SIZE = 64 # number of data points to consider to train at a single point of time
LATENT_DIM = 200 # the size of the hidden state/vector
EMBEDDING_DIM = 200 # size of the word embeddings 
MAX_VOCAB_SIZE = 30000 # the maximum number of words to consider
VALIDATION_SPLIT = 0.2 # % of validation dataset

In [25]:
class SequenceGenerator():
    
    # takes as input an input and output sequence
    def __init__(self, input_lines, target_lines, max_seq_len=None, max_vocab_size=10000, embedding_dim=200):        
        self.input_lines = input_lines
        self.target_lines = target_lines
        
        self.MAX_SEQ_LEN = max_seq_len
        self.MAX_VOCAB_SIZE = max_vocab_size
        self.EMBEDDING_DIM = embedding_dim
    
    
    def initialize_embeddings(self):
        """Reads the GloVe word-embeddings and creates embedding matrix and word to index and index to word mapping."""
        
        # load the word embeddings
        self.word2vec = {}
        with open(glove_path, 'r', encoding="utf8") as file:
            for line in file:
                vectors = line.split()
                self.word2vec[vectors[0]] = np.asarray(vectors[1:], dtype="float32")
                
        # get the embeddings matrix
        self.num_words = min(self.MAX_VOCAB_SIZE, len(self.word2idx)+1)
        self.embeddings_matrix = np.zeros((self.num_words, self.EMBEDDING_DIM))
        
        for word, idx in self.word2idx.items():
            if idx <= self.num_words:
                word_embeddings = self.word2vec.get(word)
                if word_embeddings is not None:
                    self.embeddings_matrix[idx] = word_embeddings
                    
        self.idx2word = {v:k for k,v in self.word2idx.items()}
    
    
    def prepare_sequences(self, filters=''):
        # train the tokenizer
        self.tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE, filters='')
        self.tokenizer.fit_on_texts(self.input_lines+self.target_lines)
        
        # get the word-index mapping and initialize embeddings
        self.word2idx = self.tokenizer.word_index
        self.initialize_embeddings()
        
        # tokenize the input and target lines
        self.input_sequences = self.tokenizer.texts_to_sequences(self.input_lines)
        self.target_sequences = self.tokenizer.texts_to_sequences(self.target_lines)
        
        # get the max sequence len from the data
        max_seq_len = max(list(map(len, self.input_lines+self.target_lines)))
        if self.MAX_SEQ_LEN:
            self.MAX_SEQ_LEN = min(self.MAX_SEQ_LEN, max_seq_len)
        else:
            self.MAX_SEQ_LEN = max_seq_len
            
        # pad the sequences
        self.input_sequences = pad_sequences(self.input_sequences, maxlen=self.MAX_SEQ_LEN, padding="post")
        self.target_sequences = pad_sequences(self.target_sequences, maxlen=self.MAX_SEQ_LEN, padding="post")
        
        print("1st input sequence: ", self.input_sequences[0])
        print("1st target sequence: ", self.target_sequences[0])
        
        
    def one_hot_encoding(self):
        
        # it will be a 3 dimensional array where
        # first-dim is the number of target lines
        # second-dim is the size of the sequences
        # third-dim is the number of words in the dataset
        self.one_hot_targets = np.zeros((len(self.target_sequences), self.MAX_SEQ_LEN, self.num_words))
        
        for seq_idx, seq in enumerate(self.target_sequences):
            for word_idx, word_id in enumerate(self.target_sequences[seq_idx]):
                if word_id > 0:
                    self.one_hot_targets[seq_idx, word_idx, word_id] = 1
    
    
    def get_closest_word(self, word_vec):
        max_dist = 9999999999
        closest_word = "NULL"
        
        # iterate overall the words and find the closest one
        for word, vec in self.word2vec.items():
            
            # get the cosine distance between the words
            dist = spatial.distance.cosine(word_vec, vec)
            
            # compare the distance and keep the minimum
            if dist < max_dist:
                max_dist = dist
                closest_word = word
        
        return closest_word


# create an object of the class
sg_obj = SequenceGenerator(input_lines, target_lines, max_seq_len=12, 
                           max_vocab_size=MAX_VOCAB_SIZE, embedding_dim=EMBEDDING_DIM)

# prepare the input & target sequences
sg_obj.prepare_sequences()
# create the One-hot encoding on the target sequences
sg_obj.one_hot_encoding()

# make sure the tokenized words contains <sos> & <eos>
assert '<sos>' in sg_obj.word2idx
assert '<eos>' in sg_obj.word2idx


1st input sequence:  [  1 742 742 742   0   0   0   0   0   0   0   0]
1st target sequence:  [742 742 742   2   0   0   0   0   0   0   0   0]


In [26]:
# add the embedding layer
# weights: is the embedding_matrix we created in the SequenceGenerator class
embedding = Embedding(
    input_dim=sg_obj.num_words,
    output_dim=sg_obj.EMBEDDING_DIM,
    weights=[sg_obj.embeddings_matrix]
)

state_h = Input(shape=(LATENT_DIM,)) #hidden state
state_c = Input(shape=(LATENT_DIM,))#cell state

sequence_input = Input(shape=(sg_obj.MAX_SEQ_LEN,))

# the below layer gets the embeddings for the words in the sequence
embedding_ = embedding(sequence_input)

lstm = LSTM(LATENT_DIM, return_state=True, return_sequences=True)

x, h_, c_ = lstm(embedding_, initial_state=[state_h, state_c])

dense = Dense(sg_obj.num_words, activation="softmax")
output = dense(x)

Encoder = Model([sequence_input, state_h, state_c], output)

In [27]:
deco_inp = Input(shape=(1,))

deco_embed = embedding(deco_inp)

deco_x, h, c = lstm(deco_embed, initial_state=[state_h, state_c])
deco_output = dense(deco_x)

Decoder = Model([deco_inp, state_h, state_c], [deco_output, h, c])

In [28]:
Encoder.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.1),
    metrics=['accuracy']
)

# initial hidden/cell state vector containing all zeros
# this will be passed into the LSTM model
initial_state = np.zeros((len(sg_obj.input_sequences), LATENT_DIM))

# train the model
history = Encoder.fit(
    [sg_obj.input_sequences, initial_state, initial_state], # pass the input sequences and the state vectors
    sg_obj.one_hot_targets, # the one-hot encoding of the target sequences
    batch_size=BATCH_SIZE, # the batch size
    epochs=EPOCHS, # number of times to train the model
    validation_split=VALIDATION_SPLIT, # % of data for validation
    verbose=0 # to suppress the information printed for each epoch
)

In [29]:
def get_context(sequences, query_word):
    assert query_word in sg_obj.word2idx
    
    # null vector containing all zeroes
    query_word_embed = sg_obj.word2vec.get(query_word, np.zeros(shape=(EMBEDDING_DIM)))
    
    if sequences == []:
        return query_word_embed
    
    # to keep all the sentence embeddings
    seq_embeddings = []
    for seq in sequences:
        
        # add up all the word embeddings of a sequence
        zero_vector = np.zeros(shape=(EMBEDDING_DIM))
        for word in seq:
            zero_vector += sg_obj.word2vec.get(word, np.zeros(shape=(EMBEDDING_DIM)))
            
        seq_embeddings.append(zero_vector)
    seq_embeddings = np.array(seq_embeddings)
            
    weights = []
    for seq_embed in seq_embeddings:
        # get the distance between the query word and the sentence embeddings
        dist = spatial.distance.cosine(seq_embed, query_word_embed)
        weights.append(np.array([dist]))
        
    # normalize the distances
    weights = np.array(weights/max(weights))
        
    # get the final weighted context
    context = sum(weights * seq_embeddings)
    
    return context


In [30]:
def get_sample_line(context):
    # sentence start token
    sos_token = np.array([[sg_obj.word2idx.get("<sos>")]])
    
    # create the empty lstm state vectors
    h = np.array([context])    
    c = np.zeros(shape=(1, LATENT_DIM))
    
    # so we know when to quit
    eos_token = sg_obj.word2idx['<eos>']
    
    output_sequence = []
    
    # limit the length of the generated line
    for i in range(sg_obj.MAX_SEQ_LEN):
        
        # predict the first word
        # the outputed stated are passed to the lstm to generate the next word in the sequence
        o, h, c = Decoder.predict([sos_token, h, c])
        
        # get the probabilities generated from the dense layer
        probs = o[0,0]
        
        if np.argmax(probs) ==0:
            print("Something went wrong!!")
        
        probs = np.nan_to_num(probs)
        # the word-indices starts from 1 so 1st value does not count
        probs[0] = 0 
        
        # normalize the probabilities
        probs /= probs.sum()
        
        # select a random word with provided probability of being selected
        selected_idx = np.random.choice(len(probs), p=probs)
        
        # if the generated word is equal to eos_token, terminate
        if selected_idx == eos_token:
            break
        
        # append the generated word to the output_sequence
        output_sequence.append(sg_obj.idx2word.get(selected_idx, "Error <%d>" % selected_idx))
        
        # the word generated will be used as an input to generated the new word
        sos_token[0][0] = selected_idx
    
    # return the sequence
    return output_sequence

In [36]:
# the theme of the poem - only single word (for simplicity)
query_word = "love"

# to append the generated poem lines
poem_lines = []

# first sequence containing only ones, this will be used to generate the context
sequences = []

# we will be generating 8 lines, you can play around with this
for line_no in range(10):
    
    # get the context, for the first line the context will contain the embeddings of the theme words itself
    context = get_context(sequences, query_word)
    
    try:
        # generate a new line and append it
        sequences.append(get_sample_line(context))
    except:
        pass
    
    poem_lines.append(" ".join(sequences[-1]))
    
print("\n\n")
print("\n".join(poem_lines))


Something went wrong!!
Something went wrong!!
Something went wrong!!
Something went wrong!!



and die.
but want and weeping with sharp a moaning him, roar'd
myriads but it? with toil,
among with me?"
'i as came dangling by, was ringing to sport of shalott."







