# 20. LSTM with Moby Dick

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

import random
import numpy as np

## 20.1 Preparing Data

In [2]:
nltk.download("gutenberg")
nltk.download("stopwords")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\slcf\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slcf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
raw = nltk.corpus.gutenberg.raw("melville-moby_dick.txt")
print(raw[21945:23000])
raw = raw[21945:200000]

CHAPTER 1

Loomings.


Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up the rear of every
funeral I meet; and especially whenever my hypos get such an upper
hand of me, that it requires a strong moral principle to prevent me
from deliberately stepping into the street, and methodically knocking
people's hats off--then, I account it high time to get to sea as soon
as I can.  This is my substitute for pistol and ball.  With a
philosophical flourish Cato throws himself upon his sword; I quietly
take to the ship.  There is nothing

## 20.2 BoW

In [4]:
tokens = word_tokenize(raw)
print(tokens[:50])

['CHAPTER', '1', 'Loomings', '.', 'Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago', '--', 'never', 'mind', 'how', 'long', 'precisely', '--', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of']


In [5]:
def tokenizer(doc):
    return ["/".join(p) for p in pos_tag(doc)]

tokens = tokenizer(tokens)

print(tokens[:50])

['CHAPTER/NN', '1/CD', 'Loomings/NNP', './.', 'Call/VB', 'me/PRP', 'Ishmael/NNP', './.', 'Some/DT', 'years/NNS', 'ago/RB', '--/:', 'never/RB', 'mind/VB', 'how/WRB', 'long/JJ', 'precisely/RB', '--/:', 'having/VBG', 'little/JJ', 'or/CC', 'no/DT', 'money/NN', 'in/IN', 'my/PRP$', 'purse/NN', ',/,', 'and/CC', 'nothing/NN', 'particular/JJ', 'to/TO', 'interest/NN', 'me/PRP', 'on/IN', 'shore/NN', ',/,', 'I/PRP', 'thought/VBD', 'I/PRP', 'would/MD', 'sail/VB', 'about/IN', 'a/DT', 'little/JJ', 'and/CC', 'see/VB', 'the/DT', 'watery/JJ', 'part/NN', 'of/IN']


In [6]:
cnv = CountVectorizer(analyzer = 'word',
                             tokenizer=lambda x: x.split(', '),
                             preprocessor = None, 
                             stop_words = None,
                             ngram_range=(1, 1),
                             lowercase=False
                            )

data = cnv.fit_transform(tokens).toarray()

In [7]:
len(data)

36924

In [8]:
cnv.inverse_transform(data)

[array(['CHAPTER/NN'], dtype='<U26'),
 array(['1/CD'], dtype='<U26'),
 array(['Loomings/NNP'], dtype='<U26'),
 array(['./.'], dtype='<U26'),
 array(['Call/VB'], dtype='<U26'),
 array(['me/PRP'], dtype='<U26'),
 array(['Ishmael/NNP'], dtype='<U26'),
 array(['./.'], dtype='<U26'),
 array(['Some/DT'], dtype='<U26'),
 array(['years/NNS'], dtype='<U26'),
 array(['ago/RB'], dtype='<U26'),
 array(['--/:'], dtype='<U26'),
 array(['never/RB'], dtype='<U26'),
 array(['mind/VB'], dtype='<U26'),
 array(['how/WRB'], dtype='<U26'),
 array(['long/JJ'], dtype='<U26'),
 array(['precisely/RB'], dtype='<U26'),
 array(['--/:'], dtype='<U26'),
 array(['having/VBG'], dtype='<U26'),
 array(['little/JJ'], dtype='<U26'),
 array(['or/CC'], dtype='<U26'),
 array(['no/DT'], dtype='<U26'),
 array(['money/NN'], dtype='<U26'),
 array(['in/IN'], dtype='<U26'),
 array(['my/PRP$'], dtype='<U26'),
 array(['purse/NN'], dtype='<U26'),
 array([',/,'], dtype='<U26'),
 array(['and/CC'], dtype='<U26'),
 array(['nothing/NN'], 

## 20.3 Define Model

In [9]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size,hidden_size,num_layers, dropout = 0.5)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, hidden, cell):
        out, (hidden, cell) = self.lstm(input.view(1,1,-1),(hidden,cell))
        out = self.fc(out.view(1,-1))
        return out,hidden, cell

    def init_hidden_cell(self):
        hidden = Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda()
        cell = Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda()
        return hidden, cell

In [10]:
model = LSTM(len(data[0]), 1000, len(data[0]), 2).cuda()

## 20.4 Model Training

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

In [12]:
step = 10
num_epochs = 10

In [13]:
for epoch in range(num_epochs):
    
    sp = list(range(0, len(data) - 2 * step, step))
    sp = np.add(sp, random.randint(0, step))
    random.shuffle(sp)
    
    for i in range(len(sp)) :
    
        (hidden, cell) = model.init_hidden_cell()

        cost = 0

        for pos in range(sp[i], sp[i] + step):
            X = Variable(torch.from_numpy(data[pos]).type(torch.FloatTensor)).cuda()
            y = torch.from_numpy(data[pos+1]).cuda()
            _, y = y.max(dim=0)

            pre, hidden, cell = model(X,hidden,cell)
            cost += loss(pre,Variable(y).cuda())

        cost.backward()
        optimizer.step()

        if (i + 1) % 100 == 0 :
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                     %(epoch+1, num_epochs, i + 1, len(sp), cost.data[0]))

Epoch [1/10], Iter [100/3691] Loss: 63.3960
Epoch [1/10], Iter [200/3691] Loss: 100.2390
Epoch [1/10], Iter [300/3691] Loss: 83.4005
Epoch [1/10], Iter [400/3691] Loss: 119.0685
Epoch [1/10], Iter [500/3691] Loss: 140.6733
Epoch [1/10], Iter [600/3691] Loss: 111.3818
Epoch [1/10], Iter [700/3691] Loss: 118.6159
Epoch [1/10], Iter [800/3691] Loss: 129.9223
Epoch [1/10], Iter [900/3691] Loss: 173.4347
Epoch [1/10], Iter [1000/3691] Loss: 178.0553
Epoch [1/10], Iter [1100/3691] Loss: 250.2615
Epoch [1/10], Iter [1200/3691] Loss: 165.1613
Epoch [1/10], Iter [1300/3691] Loss: 239.2156
Epoch [1/10], Iter [1400/3691] Loss: 197.4339
Epoch [1/10], Iter [1500/3691] Loss: 225.6682
Epoch [1/10], Iter [1600/3691] Loss: 186.3174
Epoch [1/10], Iter [1700/3691] Loss: 265.4194
Epoch [1/10], Iter [1800/3691] Loss: 261.4159
Epoch [1/10], Iter [1900/3691] Loss: 285.1147
Epoch [1/10], Iter [2000/3691] Loss: 280.6012
Epoch [1/10], Iter [2100/3691] Loss: 270.9442
Epoch [1/10], Iter [2200/3691] Loss: 283.5450

Epoch [6/10], Iter [3300/3691] Loss: 623.5885
Epoch [6/10], Iter [3400/3691] Loss: 794.7583
Epoch [6/10], Iter [3500/3691] Loss: 745.4621
Epoch [6/10], Iter [3600/3691] Loss: 858.6005
Epoch [7/10], Iter [100/3691] Loss: 556.8375
Epoch [7/10], Iter [200/3691] Loss: 479.4013
Epoch [7/10], Iter [300/3691] Loss: 349.9353
Epoch [7/10], Iter [400/3691] Loss: 748.7648
Epoch [7/10], Iter [500/3691] Loss: 633.2181
Epoch [7/10], Iter [600/3691] Loss: 704.1283
Epoch [7/10], Iter [700/3691] Loss: 321.8327
Epoch [7/10], Iter [800/3691] Loss: 291.7192
Epoch [7/10], Iter [900/3691] Loss: 617.5550
Epoch [7/10], Iter [1000/3691] Loss: 863.3464
Epoch [7/10], Iter [1100/3691] Loss: 442.3184
Epoch [7/10], Iter [1200/3691] Loss: 841.8356
Epoch [7/10], Iter [1300/3691] Loss: 477.3044
Epoch [7/10], Iter [1400/3691] Loss: 949.3643
Epoch [7/10], Iter [1500/3691] Loss: 757.1786
Epoch [7/10], Iter [1600/3691] Loss: 688.6234
Epoch [7/10], Iter [1700/3691] Loss: 392.3531
Epoch [7/10], Iter [1800/3691] Loss: 646.60

## 20.5 Model Test

Multinomial :
Returns a tensor where each row contains num_samples indices sampled from the multinomial probability distribution located in the corresponding row of tensor input.

In [20]:
start_num = 5
text = cnv.inverse_transform(data[start_num])[0][0].split('/')[0]

model.eval()
hidden, cell = model.init_hidden_cell()

X_test = Variable(torch.from_numpy(data[start_num]).type(torch.FloatTensor)).cuda()

for pos in range(100) :
    
    pre, hidden, cell = model(X_test, hidden, cell)
    
    m = torch.nn.Softmax(dim = pre.shape[0])
    
    pre = m(pre)
    
    pre = torch.multinomial(pre, 1).data[0][0]
    
    temp = np.zeros(len(data[0]))
    
    temp[pre] = 1
    
    text += " " + cnv.inverse_transform(temp)[0][0].split('/')[0]
    
    X_test = Variable(torch.from_numpy(temp).type(torch.FloatTensor)).cuda()
    
print("* Generated Text : \n", text)

* Generated Text : 
 Call me must , so you could so want worthy the heels cried spurs following colt I grunt interminable you cabin-boy feelings find , satin and gradually , Whitehall could hope became little could I could alarmed feelings called find hope alarmed engage goblets high heels order could you willingness calculating find peered , engage knows other marchant right peered 've the hell spurs matrimonial gradually staring find could you feelings peered knows could cabin-boy broiled you so so hell eh closed have cabin-boy philosophers closed you closed spurs knows following want crowd engage Remembering 'Oh any you I says mark
