# 20. LSTM with Moby Dick

In [1]:
import torch
import torch.nn as nn

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

import random
import numpy as np

## 20.1 Prepare Data

In [2]:
# Fetch the Gutenberg corpus plus the Punkt tokenizer models that
# word_tokenize relies on, so the notebook runs on a fresh environment.
# Newer NLTK releases ship the models as "punkt_tab", older ones as "punkt";
# a name unknown to the installed release is reported and skipped, not raised.
for resource in ("gutenberg", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\slcf\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
# Keep only characters 21945..200000 of the novel (the excerpt starts at
# "CHAPTER 1"), then preview the first 23000 - 21945 characters of that excerpt.
raw = nltk.corpus.gutenberg.raw("melville-moby_dick.txt")[21945:200000]
print(raw[:23000 - 21945])

CHAPTER 1

Loomings.


Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up the rear of every
funeral I meet; and especially whenever my hypos get such an upper
hand of me, that it requires a strong moral principle to prevent me
from deliberately stepping into the street, and methodically knocking
people's hats off--then, I account it high time to get to sea as soon
as I can.  This is my substitute for pistol and ball.  With a
philosophical flourish Cato throws himself upon his sword; I quietly
take to the ship.  There is nothing

## 20.2 BoW

In [4]:
# Split the excerpt into word and punctuation tokens with NLTK.
tokens = word_tokenize(raw)
# Preview the first 50 tokens as a sanity check.
print(tokens[:50])

['CHAPTER', '1', 'Loomings', '.', 'Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago', '--', 'never', 'mind', 'how', 'long', 'precisely', '--', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of']


In [5]:
# Every token is fed to the vectorizer as its own one-word "document", so each
# row of `data` is the count vector of a single token (effectively one-hot).
cnv = CountVectorizer(analyzer = 'word',
                      tokenizer=lambda x: x.split(', '),
                      # Unused once a tokenizer is supplied; None avoids the
                      # "token_pattern will not be used" warning.
                      token_pattern=None,
                      preprocessor = None, 
                      stop_words = None,
                      ngram_range=(1, 1), 
                      lowercase=False,
                      # Entries are 0/1, so one byte per cell is enough; the
                      # default int64 makes the dense matrix 8x larger.
                      dtype=np.uint8
                      )

# Dense (num_tokens, vocabulary_size) matrix, indexed by token position.
data = cnv.fit_transform(tokens).toarray()

In [6]:
# Number of rows in `data`, i.e. one row per token passed to fit_transform.
len(data)

36924

In [7]:
# Sanity check: decode only the first rows back to tokens. Decoding and
# displaying every row floods the output without adding information.
cnv.inverse_transform(data[:30])

[array(['CHAPTER'], dtype='<U23'),
 array(['1'], dtype='<U23'),
 array(['Loomings'], dtype='<U23'),
 array(['.'], dtype='<U23'),
 array(['Call'], dtype='<U23'),
 array(['me'], dtype='<U23'),
 array(['Ishmael'], dtype='<U23'),
 array(['.'], dtype='<U23'),
 array(['Some'], dtype='<U23'),
 array(['years'], dtype='<U23'),
 array(['ago'], dtype='<U23'),
 array(['--'], dtype='<U23'),
 array(['never'], dtype='<U23'),
 array(['mind'], dtype='<U23'),
 array(['how'], dtype='<U23'),
 array(['long'], dtype='<U23'),
 array(['precisely'], dtype='<U23'),
 array(['--'], dtype='<U23'),
 array(['having'], dtype='<U23'),
 array(['little'], dtype='<U23'),
 array(['or'], dtype='<U23'),
 array(['no'], dtype='<U23'),
 array(['money'], dtype='<U23'),
 array(['in'], dtype='<U23'),
 array(['my'], dtype='<U23'),
 array(['purse'], dtype='<U23'),
 array([','], dtype='<U23'),
 array(['and'], dtype='<U23'),
 array(['nothing'], dtype='<U23'),
 array(['particular'], dtype='<U23'),
 array(['to'], dtype='<U23'),
 array(

In [8]:
# Vocabulary size learned by the vectorizer; it is the width of every row of
# `data` and is used as both the model's input and output size.
dim = len(cnv.vocabulary_)
print("Dimension of input & output :", dim)

Dimension of input & output : 6069


In [9]:
# index -> token lookup. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on old releases.
# list(...) keeps index2word a plain Python list in both cases.
if hasattr(cnv, "get_feature_names_out"):
    index2word = list(cnv.get_feature_names_out())
else:
    index2word = list(cnv.get_feature_names())

# token -> column index lookup.
word2index = cnv.vocabulary_

## 20.3 Define Model

In [10]:
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear projection, with carried state.

    The hidden and cell states are kept on the instance between calls to
    ``forward``, so consecutive calls continue the same sequence until
    ``init_hidden_cell`` resets them to zeros.

    Args:
        input_size: Width of each input vector.
        hidden_size: Number of LSTM hidden units.
        output_size: Width of each output vector produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

        # Create the initial state on the same device as the parameters
        # instead of assuming a GPU is present.
        self.init_hidden_cell()

    def forward(self, x):
        """Run the LSTM over ``x`` and project every step to ``output_size``.

        Args:
            x: Tensor of shape (seq_len, 1, input_size).

        Returns:
            Tensor of shape (seq_len, 1, output_size).
        """
        # The state is a plain attribute, so Module.cuda()/.to() does not move
        # it; align it with the input here so the module works on any device.
        state = (self.hidden.to(device=x.device, dtype=x.dtype),
                 self.cell.to(device=x.device, dtype=x.dtype))

        # out of shape (seq_len, 1, hidden_size)
        out, (hidden, cell) = self.lstm(x, state)

        # Stored as returned (still attached to the autograd graph); callers
        # reset it with init_hidden_cell() before an unrelated sequence.
        self.hidden = hidden
        self.cell = cell

        # fc_output of shape (seq_len, 1, output_size)
        out = self.fc(out)
        return out

    def init_hidden_cell(self):
        """Reset the carried hidden and cell states to zeros."""
        # h, c of shape (1, 1, hidden_size), matching the parameters'
        # current device and dtype.
        ref = self.fc.weight
        self.hidden = torch.zeros(1, 1, self.hidden_size, device=ref.device, dtype=ref.dtype)
        self.cell = torch.zeros(1, 1, self.hidden_size, device=ref.device, dtype=ref.dtype)

In [11]:
# Input and output widths both equal the vocabulary size; 500 hidden units.
model = LSTM(dim, 500, dim)

# Move to the GPU only when one is actually available.
if torch.cuda.is_available():
    model = model.cuda()

## 20.4 Train Model

In [12]:
# Adam over all model parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# `loss` is the criterion object; the per-step value is stored in `cost`
# inside the training loop.
loss = nn.CrossEntropyLoss()

In [13]:
# Number of consecutive tokens in each training window.
seq_len = 5
# Number of full passes over the training windows.
num_epochs = 3

In [14]:
# Train on whichever device the model's parameters live on rather than
# assuming CUDA.
device = next(model.parameters()).device

# Make sure the model is in training mode even if this cell is re-run after
# the generation cell switched it to eval().
model.train()

for epoch in range(num_epochs):

    # Start positions of non-overlapping windows, shifted by one random offset
    # per epoch so window boundaries differ between epochs. The 2 * seq_len
    # margin keeps the shifted input and target slices inside `data`.
    sp = list(range(0, len(data) - 2 * seq_len, seq_len))
    sp = np.add(sp, random.randint(0, seq_len))
    random.shuffle(sp)

    for i in range(len(sp)) :

        # Each window is treated as an independent sequence.
        model.init_hidden_cell()

        start = sp[i]

        # Inputs: seq_len token vectors, shaped (seq_len, 1, dim).
        X = torch.from_numpy(data[start : start + seq_len]).type(torch.FloatTensor)
        X = X.reshape(seq_len, 1, dim).to(device)

        # Targets: the same window shifted by one token, converted from
        # per-token vectors to class indices.
        y = torch.from_numpy(data[start + 1 : start + seq_len + 1]).to(device)
        y = y.argmax(dim=1)

        pre = model(X)
        cost = loss(pre.reshape(seq_len, dim), y.reshape(seq_len))

        optimizer.zero_grad()
        cost.backward()

        # Clip the gradient norm to 5 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 5)

        optimizer.step()

        if (i + 1) % 2000 == 0 :
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                     %(epoch+1, num_epochs, i + 1, len(sp), cost.item()))

Epoch [1/3], Iter [2000/7383] Loss: 6.5583
Epoch [1/3], Iter [4000/7383] Loss: 7.3252
Epoch [1/3], Iter [6000/7383] Loss: 6.0903
Epoch [2/3], Iter [2000/7383] Loss: 6.1044
Epoch [2/3], Iter [4000/7383] Loss: 6.0418
Epoch [2/3], Iter [6000/7383] Loss: 5.0151
Epoch [3/3], Iter [2000/7383] Loss: 4.0048
Epoch [3/3], Iter [4000/7383] Loss: 4.9516
Epoch [3/3], Iter [6000/7383] Loss: 5.7380


## 20.5 Test Model

In [15]:
def get_top_index(x, num) :
    """Sample one index from the `num` highest-scoring entries of `x`.

    The `num` largest scores are turned into probabilities with a softmax
    restricted to those entries, and one of their indices is drawn at random
    according to these probabilities.

    Args:
        x: 1-D NumPy array of scores (e.g. logits).
        num: How many of the top-scoring entries to sample from.

    Returns:
        The sampled index into `x`.
    """
    top_index = np.argsort(x)[::-1][:num]

    # Work in float64 so the normalised probabilities sum to 1 within the
    # tolerance np.random.choice enforces, whatever dtype `x` has.
    top_score = x[top_index].astype(np.float64)

    # Softmax, shifted by the maximum so np.exp cannot overflow on large
    # scores (the shift cancels out in the normalisation).
    top_prob = np.exp(top_score - top_score.max())
    top_prob = top_prob / top_prob.sum()

    random_index = np.random.choice(top_index, 1, p = top_prob)[0]

    return random_index

In [16]:
text = "CHAPTER"

model.eval()
model.init_hidden_cell()

# Generate on whichever device the model's parameters live on.
device = next(model.parameters()).device

# Encode the seed word through the fitted vectorizer so the first input always
# matches `text`, instead of relying on data[0] happening to be that word.
X_test = torch.from_numpy(cnv.transform([text]).toarray()).type(torch.FloatTensor)
X_test = X_test.reshape(1, 1, dim).to(device)

# No gradients are needed for sampling; without no_grad the carried hidden
# state would keep the autograd graph of every previous step alive.
with torch.no_grad():
    for pos in range(100) :

        # Scores over the vocabulary for the next word.
        logits = model(X_test).reshape(-1).cpu().numpy()

        # Sample among the 10 most likely words.
        next_index = get_top_index(logits, 10)
        new_word = index2word[next_index]
        text += " " + new_word

        # Feed the sampled word back in as the next input.
        X_test = torch.from_numpy(cnv.transform([new_word]).toarray()).type(torch.FloatTensor)
        X_test = X_test.reshape(1, 1, dim).to(device)

print("* Generated Text : \n", text)

* Generated Text : 
 CHAPTER 8 , and all the same way that his face to the bed ; and the same time he did not seem to Feegeeans with the sea and all the Pequod is the Pequod 's the picture of the sea . The Sword-Fish of his eyes , and then went to the sea , which I could not have no doubt he must be me to see the ship , '' said I am . I do n't n't you do n't you ? '' said a little -- a white man -- a very man . '' `` What
