# 20. LSTM with Moby Dick

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

import random
import numpy as np

## 20.1 Prepare Data

In [2]:
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\slcf\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
raw = nltk.corpus.gutenberg.raw("melville-moby_dick.txt")
print(raw[21945:23000])
raw = raw[21945:200000]

CHAPTER 1

Loomings.


Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up the rear of every
funeral I meet; and especially whenever my hypos get such an upper
hand of me, that it requires a strong moral principle to prevent me
from deliberately stepping into the street, and methodically knocking
people's hats off--then, I account it high time to get to sea as soon
as I can.  This is my substitute for pistol and ball.  With a
philosophical flourish Cato throws himself upon his sword; I quietly
take to the ship.  There is nothing

## 20.2 BoW

In [4]:
tokens = word_tokenize(raw)
print(tokens[:50])

['CHAPTER', '1', 'Loomings', '.', 'Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago', '--', 'never', 'mind', 'how', 'long', 'precisely', '--', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of']


In [5]:
def tokenizer(doc):
    return ["/".join(p) for p in pos_tag(doc)]

tokens = tokenizer(tokens)

print(tokens[:50])

['CHAPTER/NN', '1/CD', 'Loomings/NNP', './.', 'Call/VB', 'me/PRP', 'Ishmael/NNP', './.', 'Some/DT', 'years/NNS', 'ago/RB', '--/:', 'never/RB', 'mind/VB', 'how/WRB', 'long/JJ', 'precisely/RB', '--/:', 'having/VBG', 'little/JJ', 'or/CC', 'no/DT', 'money/NN', 'in/IN', 'my/PRP$', 'purse/NN', ',/,', 'and/CC', 'nothing/NN', 'particular/JJ', 'to/TO', 'interest/NN', 'me/PRP', 'on/IN', 'shore/NN', ',/,', 'I/PRP', 'thought/VBD', 'I/PRP', 'would/MD', 'sail/VB', 'about/IN', 'a/DT', 'little/JJ', 'and/CC', 'see/VB', 'the/DT', 'watery/JJ', 'part/NN', 'of/IN']


In [6]:
cnv = CountVectorizer(analyzer = 'word',
                             tokenizer=lambda x: x.split(', '),
                             preprocessor = None, 
                             stop_words = None,
                             ngram_range=(1, 1),
                             lowercase=False
                            )

data = cnv.fit_transform(tokens).toarray()

In [7]:
len(data)

36924

In [8]:
cnv.inverse_transform(data)

[array(['CHAPTER/NN'], dtype='<U26'),
 array(['1/CD'], dtype='<U26'),
 array(['Loomings/NNP'], dtype='<U26'),
 array(['./.'], dtype='<U26'),
 array(['Call/VB'], dtype='<U26'),
 array(['me/PRP'], dtype='<U26'),
 array(['Ishmael/NNP'], dtype='<U26'),
 array(['./.'], dtype='<U26'),
 array(['Some/DT'], dtype='<U26'),
 array(['years/NNS'], dtype='<U26'),
 array(['ago/RB'], dtype='<U26'),
 array(['--/:'], dtype='<U26'),
 array(['never/RB'], dtype='<U26'),
 array(['mind/VB'], dtype='<U26'),
 array(['how/WRB'], dtype='<U26'),
 array(['long/JJ'], dtype='<U26'),
 array(['precisely/RB'], dtype='<U26'),
 array(['--/:'], dtype='<U26'),
 array(['having/VBG'], dtype='<U26'),
 array(['little/JJ'], dtype='<U26'),
 array(['or/CC'], dtype='<U26'),
 array(['no/DT'], dtype='<U26'),
 array(['money/NN'], dtype='<U26'),
 array(['in/IN'], dtype='<U26'),
 array(['my/PRP$'], dtype='<U26'),
 array(['purse/NN'], dtype='<U26'),
 array([',/,'], dtype='<U26'),
 array(['and/CC'], dtype='<U26'),
 array(['nothing/NN'], 

## 20.3 Define Model

In [9]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size,hidden_size,num_layers, dropout = 0.5)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, hidden, cell):
        out, (hidden, cell) = self.lstm(input.view(1,1,-1),(hidden,cell))
        out = self.fc(out.view(1,-1))
        return out,hidden, cell

    def init_hidden_cell(self):
        hidden = Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda()
        cell = Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda()
        return hidden, cell

In [10]:
model = LSTM(len(data[0]), 1000, len(data[0]), 1).cuda()

## 20.4 Train Model

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

In [12]:
step = 20
num_epochs = 5

In [13]:
for epoch in range(num_epochs):
    
    sp = list(range(0, len(data) - 2 * step, step))
    sp = np.add(sp, random.randint(0, step))
    random.shuffle(sp)
    
    for i in range(len(sp)) :
    
        (hidden, cell) = model.init_hidden_cell()

        cost = 0

        for pos in range(sp[i], sp[i] + step):
            X = Variable(torch.from_numpy(data[pos]).type(torch.FloatTensor)).cuda()
            y = torch.from_numpy(data[pos+1]).cuda()
            _, y = y.max(dim=0)

            pre, hidden, cell = model(X,hidden,cell)
            cost += loss(pre,Variable(y).cuda())

        cost.backward()
        optimizer.step()

        if (i + 1) % 100 == 0 :
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                     %(epoch+1, num_epochs, i + 1, len(sp), cost.data[0]))

Epoch [1/5], Iter [100/1845] Loss: 150.3479
Epoch [1/5], Iter [200/1845] Loss: 169.9080
Epoch [1/5], Iter [300/1845] Loss: 164.2813
Epoch [1/5], Iter [400/1845] Loss: 135.7209
Epoch [1/5], Iter [500/1845] Loss: 211.2850
Epoch [1/5], Iter [600/1845] Loss: 131.7099
Epoch [1/5], Iter [700/1845] Loss: 135.3472
Epoch [1/5], Iter [800/1845] Loss: 166.1768
Epoch [1/5], Iter [900/1845] Loss: 140.0419
Epoch [1/5], Iter [1000/1845] Loss: 143.5765
Epoch [1/5], Iter [1100/1845] Loss: 151.6849
Epoch [1/5], Iter [1200/1845] Loss: 128.8878
Epoch [1/5], Iter [1300/1845] Loss: 164.5333
Epoch [1/5], Iter [1400/1845] Loss: 130.0825
Epoch [1/5], Iter [1500/1845] Loss: 149.1143
Epoch [1/5], Iter [1600/1845] Loss: 147.1165
Epoch [1/5], Iter [1700/1845] Loss: 151.7488
Epoch [1/5], Iter [1800/1845] Loss: 148.7168
Epoch [2/5], Iter [100/1845] Loss: 122.2937
Epoch [2/5], Iter [200/1845] Loss: 138.7654


KeyboardInterrupt: 

## 20.5 Test Model

Multinomial :
Returns a tensor where each row contains num_samples indices sampled from the multinomial probability distribution located in the corresponding row of tensor input.

In [None]:
start_num = 5
text = cnv.inverse_transform(data[start_num])[0][0].split('/')[0]

model.eval()
hidden, cell = model.init_hidden_cell()

X_test = Variable(torch.from_numpy(data[start_num]).type(torch.FloatTensor)).cuda()

for pos in range(100) :
    
    pre, hidden, cell = model(X_test, hidden, cell)
    
    m = torch.nn.Softmax(dim = pre.shape[0])
    
    pre = m(pre)
    
    pre = torch.multinomial(pre, 1).data[0][0]
    
    temp = np.zeros(len(data[0]))
    
    temp[pre] = 1
    
    text += " " + cnv.inverse_transform(temp)[0][0].split('/')[0]
    
    X_test = Variable(torch.from_numpy(temp).type(torch.FloatTensor)).cuda()
    
print("* Generated Text : \n", text)