In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import random
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from google.colab import drive

In [3]:
using_drive = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Problem A: Pre-Processing the Data

In [4]:
# Define helper load_doc function
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [5]:
# Import shakespeare.txt
if (using_drive):
    drive.mount("/content/drive/")
    cwd = "drive/MyDrive/HW_03"
    raw_text = load_doc(f"{cwd}/poem_data/shakespeare.txt")
else:
    raw_text = open("poem_data/shakespeare.txt", 'r').read()

# Clean raw text data and split text by poem
raw_text = "".join(filter(lambda x: not x.isdigit(), raw_text)) 
raw_text = raw_text.lower().strip()
raw_text = re.sub(r'(\n\s*)+\n', '\n\n', raw_text)
raw_poems = raw_text.split("\n\n")

Mounted at /content/drive/


In [6]:
# Import Syllable dictionary data 
if (using_drive):
    syllable_dict = pd.read_table(
        f"{cwd}/poem_data/Syllable_dictionary.txt", 
        header=None, 
        names = ["word"]
    )
else:
    syllable_dict = pd.read_table(
        "poem_data/Syllable_dictionary.txt", 
        header=None, 
        names = ["word"]
    )

# Clean Syllable dictionary and seperate syllables into distinct columns 
syllable_dict['non_end'] = syllable_dict['word'].apply(
    lambda x: np.array(re.findall(" (\d)", x), dtype=int)
)
syllable_dict['end'] = syllable_dict['word'].apply(
    lambda x: np.array(re.findall("\d", x), dtype=int)
)
syllable_dict['word'] =  syllable_dict['word'].apply(
    lambda x: x.split(' ')[0]
)

# Add punctuation with 0 syllables
for punctuation in ", . ? ; : ) ( \n".split(" "):
    syllable_dict.loc[len(syllable_dict.index)] = [punctuation, np.array([0]), np.array([0])]

syllable_dict.tail()

Unnamed: 0,word,non_end,end
3208,;,[0],[0]
3209,:,[0],[0]
3210,),[0],[0]
3211,(,[0],[0]
3212,\n,[0],[0]


In [7]:
# Create word map to tokenize words based on Syllable Dictionary
word_to_index = {w:i for i,w in enumerate(syllable_dict['word'])}
index_to_word = {i:w for i,w in enumerate(syllable_dict['word'])}
word_to_nonend = {w:s for w,s in zip(syllable_dict['word'], syllable_dict['non_end'])}
word_to_end = {w:s for w,s in zip(syllable_dict['word'], syllable_dict['end'])}

In [8]:
# Goal: tokenize each word in the sentence with the structure: (word, syllable)
# Create helper function to get correct sequence of syllables
def correct_syl(words, target, pos, prev_sum):
    if (pos == len(words)): 
        return False
    
    for syl in words[pos]:
      cur_sum = prev_sum + syl
      if ((pos == (len(words) - 1)) and (cur_sum == target)):
          words[pos] = syl
          return True
      if(correct_syl(words, target, pos+1, cur_sum)):
          words[pos] = syl
          return True
    
    return False

# Function to correctly tokenize syllable info about a single line of poetry
def syllable_tokenize(line):
    # tokenize words
    words = list(filter(lambda x: x in word_to_index.keys(), TweetTokenizer().tokenize(line)))
    words.append("\n")

    # add list of possible syllables for each word
    word_syl = [word_to_nonend.get(word, [1]) for word in words]

    # change last word to syllable list with alternate ending syllable count
    # i.e. [2, 3] for "acquainted" instead of just [3]
    for i in range(len(word_syl)-1, -1, -1):
        if word_syl[i][0] > 0:
            word_syl[i] = word_to_end.get(words[i], [1])
            break
    
    # replace list of syllables with correct syllable used for each word
    if (not (correct_syl(word_syl, target=10, pos=0, prev_sum=0))):
        word_syl = [syl[0] for syl in word_syl]

    # return 2D matrix of tokenized line
    return np.array(
        [[word_to_index[word], syl] for word,syl in zip(words, word_syl)]
    )

In [9]:
tokenized_poems = []
for poem in raw_poems:
    lines = poem.split("\n")
    tokenized_lines = [syllable_tokenize(line) for line in lines]
    tokenized_lines = np.concatenate(tokenized_lines, axis=0)

    tokenized_poems.append(torch.tensor(tokenized_lines).to(device))

In [10]:
tokenized_poems[0][:10]

tensor([[1109,    1],
        [ 936,    2],
        [ 574,    2],
        [3025,    1],
        [ 692,    2],
        [1403,    2],
        [3205,    0],
        [3212,    0],
        [2719,    1],
        [2733,    2]], device='cuda:0')

# Problem B:
## Preprocessing the Data

In [11]:
chars = sorted(list(set(raw_text)))
char_map = dict((c, i) for i, c in enumerate(chars))
index_map = dict((i, c) for i, c in enumerate(chars))

In [12]:
data = []
for poem in raw_poems:
    poem_list = list(poem)
    for i, ch in enumerate(poem):
        poem_list[i] = char_map[ch]
    data.append(torch.tensor(poem_list).to(device))
data[0][:10]

tensor([17, 29, 26, 24,  1, 17, 12, 20, 29, 16], device='cuda:0')

## Model

In [13]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, output_size, hidden_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)
    
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

## Training Process

In [14]:
def train(model, poems, epochs=10, seq_length=40, step=3, lr=0.001):
    model.train()
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i_epoch in range(1, epochs+1):
        n = 0
        running_loss = 0
        
        for poem_i in range(len(poems)):
            poem = poems[poem_i]
            for i in range(seq_length,len(poem)-1, step):
                hidden_state = None
                input_seq = poem[i-seq_length : i]
                target_seq = poem[i-seq_length+1 : i+1]
                
                # forward pass
                output, _ = model(input_seq, hidden_state)
                
                # compute loss
                loss = loss_fn(torch.squeeze(output), torch.squeeze(target_seq))
                running_loss += loss.item()
                n += 1
                
                # compute gradients and take optimizer step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            print(f"\rEpoch: {i_epoch} " + 
                  f"\t Progress: {100 * poem_i / len(poems):.2f}% " + 
                  f"\tLoss: {loss.item():.4f}", 
                  end="")
            
        # print loss after every epoch
        print(f"\rEpoch: {i_epoch} \tLoss: {loss.item():.4f}")

In [15]:
vocab_size = len(chars)
model = RNN(
    input_size=vocab_size, 
    embedding_size=vocab_size, 
    output_size=vocab_size, 
    hidden_size=200
    ).to(device)

seq_length = 40
n_epochs = 15

# train the model
train(
    model, 
    data,
    epochs=n_epochs, 
    seq_length=seq_length, 
    lr=0.001,
    step=5
)

Epoch: 1 	Loss: 1.6265
Epoch: 2 	Loss: 1.5611
Epoch: 3 	Loss: 1.5270
Epoch: 4 	Loss: 1.4730
Epoch: 5 	Loss: 1.4531
Epoch: 6 	Loss: 1.6952
Epoch: 7 	Loss: 1.5129
Epoch: 8 	Loss: 1.4741
Epoch: 9 	Loss: 1.5668
Epoch: 10 	Loss: 1.4903
Epoch: 11 	Loss: 1.3710
Epoch: 12 	Loss: 1.5388
Epoch: 13 	Loss: 1.5875
Epoch: 14 	Loss: 1.4408
Epoch: 15 	Loss: 1.5600


## Poem Generation

In [16]:
def generate_poem(model, temp=1, poem_length=1000, prompt="shall i compare thee to a summers day\n"):
    print(prompt, end="")

    model.eval()
    prompt = list(prompt)
    for i, ch in enumerate(prompt):
        prompt[i] = char_map[ch]

    with torch.no_grad():
        prompt = torch.tensor(prompt).to(device).long()

        hidden_init = None
        output, hidden = model(prompt, hidden_init)

        for _ in range(poem_length):
            output = output[-1]
            output = nn.Softmax(dim=0)(output / temp)

            if (torch.cuda.is_available()):
              output = output.detach().cpu().numpy()
            else:
              output = output.detach().numpy()

            prediction = np.random.choice(np.arange(len(index_map)), p=output)
            print(index_map[int(prediction)],end="")
            if (torch.cuda.is_available()):
              input = torch.tensor([prediction]).cuda()
            else:
              input = torch.tensor([prediction])
            output, hidden = model(input, hidden)

In [17]:
generate_poem(model, 1.5)

shall i compare thee to a summers day
sweetlt-greateth falounting i ne'eehprisic!
 o lame from yet no corl,
double this erreunity angeing me with admitted dear words,
aptoon arrand days,
dever brund,
as give propouting mora not
onsore mered tempticed, nexcueth, thus my, angornd?
again tolatness misers curetts of nto vowved cure,
which can if grachates pray your creatune this love's life woe,
last femeing a pigamer fair-nught of the walssfs of his
needs' herns bosom's sighs hushine too deat trmys
less of butly likiens of swory,
but love touch doty of by,
who mad dilnessed, heit thinding thymed
and soul being dwell my geed
yeab against  unkeep, i sleep, but with hits of all mt tyrann's ids is my brain.
becaily hymn thy weary by hof sweeter for proud ill,
of healt-pyrss,
a love did which .uf trutt tellsatury be,
nor from thee brouchalt kean layig.
thipy are withed must bath my ttargf,
he black stetay 'rished he datered sins's, 'tis,
  than all bond her for lipse oncetened'st, so en better

In [18]:
generate_poem(model, 0.75)

shall i compare thee to a summers day
dring of my vows still my body be steal brand:
that heat that thou deserved i brand like hell proved,
and i am the truth the stansual seemn see and brand of hate.
  are and this his stand then the world love's chise a mand,
when found by ill, fire to thee bad the worthence for my body statee,
and my self speak to mistress be.
  and love by thy heart brings of mistress's,
that hear thy deep my self a drudge thee,
  and the treasure and grant and build thy bristred by, love's sweets,
a drowned prize be fire the heart with friend,
the truth as then call my friend,
which is in grown to love's head not breed i am thou lov'st they discolled minding,
for fire to thy foes them this landered still brand accripses exchers,
nest of death in my friend,
but and cheater by of worthing a from my brandle on crown, and fairers bandress in thy worst nood descredping,
learse and heart's mad mine eyes tell my saked reason hand many self all large,
i may fell becove th

In [19]:
generate_poem(model, 0.25)

shall i compare thee to a summers day
and the found the fairest with the bath fair, and there is a seet?
which in the motions of heart thee be state,
and this i love to break to breathers's brand,
  if the bath my self by thy sweet self i am that i am stand
which i am the bath fair state,
which heart thee they hate and the lily by thy heart by thy sweet betrictance thee,
which in my heart the bath fall by thy breast,
  lest see thee be state thee and this by thy sweet brand doth bath lies,
and sick of heart-pity love's brand that i have swear thee,
  and they hate the breast the world by heart-pirting thee i am seemed,
which in the treasure of love's brand my self a bath fair,
the for my self be state of thy sweet brand,
  and that is a dranges of this care and this,
  and my heart the state and so despise,
the bath a coldence thee a better and this by thee thee to me,
  but when i am the bath fairest thee be true strange,
which in thee but swear thee be true strange,
the bath fire to 

# Problem C
## Model

In [58]:
# Used starter code from Udacity Char-Level LSTM exercise:
# https://github.com/udacity/deep-learning-v2-pytorch/blob/master/recurrent-neural-networks/char-rnn/Character_Level_RNN_Exercise.ipynb
def get_batches(data, batch_size, seq_length):
    # get total batches for data
    batch_area = batch_size * seq_length
    num_batches = len(data) // batch_area

    # split data into batches
    batch_data = data[:num_batches * batch_area]
    batch_data = batch_data.reshape((batch_size, -1))

    for n in range(0, batch_data.shape[1], seq_length):
        X = batch_data[:, n:n + seq_length]

        # Same as X, but shifted over by 1
        y = np.zeros_like(X)
        y[:,:-1] = X[:, 1:]

        try:
            y[:, -1] = batch_data[:, n + seq_length]
        except IndexError:
            y[:, -1] = batch_data[:, 0]

        yield X, y



In [136]:
class BatchRNN(nn.Module):
    def __init__(self, input_size, embedding_size, output_size, hidden_size, drop_prob=0.25):
        super(BatchRNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=input_size, 
            hidden_size=hidden_size,
            dropout=drop_prob,
            batch_first=True
        )
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(drop_prob)
        self.decoder = nn.Linear(hidden_size, output_size)
    
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        dropout = self.dropout(embedding)
        output, hidden_state = self.rnn(dropout, hidden_state)
        output = output.contiguous().view(-1, self.hidden_size)
        output = self.decoder(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())
    

## Training Process

In [137]:
def batch_train(model, poems, epochs=10, batch_size=10, seq_length=40, lr=0.001):
    model.train()
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i_epoch in range(1, epochs+1):
        n = 0
        running_loss = 0
        
        for poem_i in range(len(poems)):
            poem = poems[poem_i]
            for X, y in get_batches(poem.cpu().numpy(), batch_size, seq_length):
                hidden_state = None
                input_seq, target_seq = torch.from_numpy(X), torch.from_numpy(y)
                
                if(torch.cuda.is_available()):
                    input_seq, target_seq = input_seq.cuda(), target_seq.cuda()
                
                # forward pass
                output, _ = model(input_seq, hidden_state)
                
                # compute loss
                loss = loss_fn(
                    output, 
                    target_seq.view(batch_size*seq_length).long()
                    )
                running_loss += loss.item()
                n += 1
                
                # compute gradients and take optimizer step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            print(f"\rEpoch: {i_epoch} " + 
                  f"\t Progress: {100 * poem_i / len(poems):.2f}% " + 
                  f"\tLoss: {loss.item():.4f}", 
                  end="")
            
        # print loss after every epoch
        print(f"\rEpoch: {i_epoch} \tLoss: {loss.item():.4f}")

In [142]:
vocab_size = len(chars)
model2 = BatchRNN(
    input_size=vocab_size, 
    embedding_size=vocab_size, 
    output_size=vocab_size, 
    hidden_size=256
    ).to(device)

seq_length = 80
n_epochs = 100

# train the model
batch_train(
    model2, 
    data,
    epochs=n_epochs, 
    batch_size=7,
    seq_length=seq_length, 
    lr=0.001
)

Epoch: 1 	Loss: 2.4015
Epoch: 2 	Loss: 2.2400
Epoch: 3 	Loss: 2.1297
Epoch: 4 	Loss: 2.0625
Epoch: 5 	Loss: 2.0121
Epoch: 6 	Loss: 1.9814
Epoch: 7 	Loss: 1.9307
Epoch: 8 	Loss: 1.9055
Epoch: 9 	Loss: 1.8867
Epoch: 10 	Loss: 1.8481
Epoch: 11 	Loss: 1.8103
Epoch: 12 	Loss: 1.8128
Epoch: 13 	Loss: 1.8159
Epoch: 14 	Loss: 1.8028
Epoch: 15 	Loss: 1.7470
Epoch: 16 	Loss: 1.7472
Epoch: 17 	Loss: 1.7250
Epoch: 18 	Loss: 1.7303
Epoch: 19 	Loss: 1.6992
Epoch: 20 	Loss: 1.6914
Epoch: 21 	Loss: 1.6672
Epoch: 22 	Loss: 1.6836
Epoch: 23 	Loss: 1.6261
Epoch: 24 	Loss: 1.6380
Epoch: 25 	Loss: 1.5884
Epoch: 26 	Loss: 1.5695
Epoch: 27 	Loss: 1.5724
Epoch: 28 	Loss: 1.5485
Epoch: 29 	Loss: 1.5724
Epoch: 30 	Loss: 1.5374
Epoch: 31 	Loss: 1.5248
Epoch: 32 	Loss: 1.5159
Epoch: 33 	Loss: 1.5438
Epoch: 34 	Loss: 1.4830
Epoch: 35 	Loss: 1.4842
Epoch: 36 	Loss: 1.4765
Epoch: 37 	Loss: 1.4733
Epoch: 38 	Loss: 1.4714
Epoch: 39 	Loss: 1.4485
Epoch: 40 	Loss: 1.4039
Epoch: 41 	Loss: 1.4731
Epoch: 42 	Loss: 1.4434
E

## Poem Generation

In [139]:
generate_poem(model2, 1.5)

shall i compare thee to a summers day
th't pursuens take my faces,
si poery sparlets grief, nor and i'fat every pwiarion spate,
wo therewour aboverge am confwer sluil-not syet bwelclite;
thee vaulied not, and byhatch, with bebuadry what gixeoter were
should evinate a alter possess, lov'lightst so 'nuse ofle to lie),
  but live you it tereh induridater yet en, my ainu of niggard wrong as,
pityin  mac the rur'st entoenty:
as others' seeing on me! many be sick swiftee,
sily seaves alodreds death, appety, and, nor, no, hads defised,
  and you it mbration loving dhame edein:
many; loved it hot woremm, andseds in odoug?
where yet carelier thinking one,s eyef worth do i frobi the heat spect,
faitiers that yet love? thou goders thy pows inwermienced,
yet not doving other me brought, mignch thou tade ogers recues.
and that love's quents, father's lrats arl, to dull,
as take thou art is mesc eorer is afatrlet's rughing covnhatch,
or it (this costarit mer;, and by my wirous thy niffern bloich? ef

In [140]:
generate_poem(model2, 0.75)

shall i compare thee to a summers day
do my time's chilfing thy sweet self to me.
  and yet this sweet self to thee virtue not of thee, return and beauties and thou art true as a winded a byrane,
and to wit as make my joy, and write do not be forcoand,
like not be, upon thy dear heart thou so poor self of mine oen suns excuse new, could make me.
  and i do not love thou more my self or nought ipverse of thy dids,
and such a for thy worsh than their in a love
i ne'er knows i not love,
  and into my dear heart to make me not my self all my love and proved,
  once i am that my self in thee i say my self and space should on thy humoures' lasted before,
though not the some words have with true longed and where therefore live in this prove,
that as both to be disprisiase of a mortal kentle come,
crowing of thee my self alone seem but drame.
  hen i do not love thee against my self thy carcoul my friend's husband,
in all all with the string she not the learned in my self alone,
and summer's b

In [141]:
generate_poem(model2, 0.25)

shall i compare thee to a summers day
be thy record the world with me,
the world with the world and lived and sorreth can be such far the painter to the care,
so the sweet self to be dispraise,
and thou hast the strength of beauty self i do not love thee with thee more delight.
  and yet the best is thy self thy heart that i do not love thee that thou art too my love thou shouldst thy self word of praise,
thou art the praise to show thy self thy praise to my love thee that i do not so,
of thou shouldst thou dost too much conscience a face,
and thou wilt thou dost to my love thou art to pay as for thee is to my sinful eyes thee frown on the world did stand
hath not first i love thee that i calls not so much come prove,
  and thou shouldst thy self in their birth (for thy self at the look what wear,
thou art the proud fair from my self thy praise thee against my self and strong,
to suffer upon thy praise to set the world must be to me.
  this to thee that i was not love, thy love is see 