https://machinetalk.org/2019/02/08/text-generation-with-pytorch/
https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#example-an-lstm-for-part-of-speech-tagging
https://blog.exxactcorp.com/getting-started-with-natural-language-processing-using-pytorch/

In [1]:
import numpy as np
import torch 
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn.functional as F

import numpy as np
from collections import Counter
import os
from argparse import Namespace

torch.manual_seed(1)

<torch._C.Generator at 0x26645a01190>

In [2]:
flags = Namespace(
    train_file='Oliver.txt',
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)

In [3]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r') as f:
        text = f.read()
    text = text.split()
    print("length = ",len(text))
    word_counts = Counter(text)
    
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [4]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [5]:
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(flags.train_file,flags.batch_size,flags.seq_size)

length =  9873
Vocabulary size 3187


In [6]:

print(in_text.shape[0]*in_text.shape[1])

9728


In [7]:
class TransformerToTensor():
    def __init__(self):
        pass
    def __call__(self,x):
        return torch.tensor(x,dtype=torch.long)

In [8]:
class dataset(Dataset):
    def __init__(self,corpus_file,sequance_length=32,batch_size=16,transform=None):
        self.transform = transform
        self.sequance_length = sequance_length
        self.batch_size = batch_size
        self.text = self.read_from_file(corpus_file)
        self.tokens = self.tokenize_corpus(self.text)
        self.vocabulary = self.get_vocabulary(self.tokens)
        self.word2idx = self.get_word2idx(self.vocabulary)
        self.idx2word = self.get_idx2word(self.vocabulary)
        self.indices= self.tokens_to_indices(self.tokens)
        self.dataset = self.idx_pair(self.tokens)
        self.data_loader = self.batchify()
    def read_from_file(self,file):
        with open(file, 'r') as f:
            text = f.read()
        return text
    def tokens_to_indices(self,tokens):
        return [self.word2idx[token] for token in tokens]
        
    def tokenize_corpus(self,corpus):
        tokens = corpus.split()
        return tokens
    
    def get_vocabulary(self,tokens):
        vocabulary = []
        for word in tokens:
            if word not in vocabulary:
                vocabulary.append(word)
        return vocabulary
    
    def get_word2idx(self,vocabulary):
        return {w: idx for (idx, w) in enumerate(vocabulary)}
    
    def get_idx2word(self,vocabulary):
        return {idx: w for (idx, w) in enumerate(vocabulary)}
    
    
    def idx_pair(self,data):
        dataset = []
        x_shape = len(data)//self.sequance_length
        for i in range(0,x_shape-1): 
            x_start_idx = i*self.sequance_length
            x_end_idx = (i+1)*self.sequance_length
            y_start_idx = x_start_idx + 1
            y_end_idx = x_end_idx + 1
            x_i = [self.word2idx[word] for word in data[x_start_idx:x_end_idx]]
            y_i = [self.word2idx[word] for word in data[y_start_idx:y_end_idx]]
            dataset.append(([x_i,y_i]))
        return dataset
    
    def show_dataset(self):
        for input_word,output_word in self.dataset:
            print(self.idx2word[input_word],self.idx2word[output_word])
            
    def batchify(self):
        batches_length = len(self.dataset)//self.batch_size
        data_loader = []
        self.dataset = np.array(self.dataset)
        for i in range(0,batches_length):
            start_idx = i*self.batch_size
            end_idx = (i+1)*self.batch_size
            data_loader.append([self.dataset[start_idx:end_idx,0],self.dataset[start_idx:end_idx,1]])
        return data_loader

    def __getitem__(self,idx):
        batch_x,batch_y = self.data_loader[idx]
        if(self.transform):
            batch_x = self.transform(batch_x)
            batch_y = self.transform(batch_y)
        return batch_x,batch_y 
        #it must be returned like to to be casting as set for dataLoader

    def __len__(self):
        return len(self.dataset)
    
        

In [9]:
batch_size =307
sequance_length = 32
data_set = dataset(flags.train_file,sequance_length,batch_size,transform=TransformerToTensor()) 
assert data_set[0][0].shape == (batch_size,sequance_length)
print(len(data_set.vocabulary))

3187


In [10]:
print(len(data_set))
batches = data_set.data_loader
print("shape of one batch :",data_set[0][0].shape)
for x,y in batches:
    print(x)

307
shape of one batch : torch.Size([307, 32])
[[   0    1    2 ...    2   25   26]
 [  27   28   29 ...   51   52   53]
 [  54   55   56 ...   79   66   80]
 ...
 [ 547  141 1949 ...  673 3159   83]
 [3160   59  103 ...  919 2503 1255]
 [  99  152 3167 ...   66 1340 2223]]


In [11]:
class TextGenerationModel(nn.Module):
    def __init__(self,batch_size,sequance_length,embedd_dim,hidden_dim,vocab_size,bidirectional=False):
        super(TextGenerationModel,self).__init__()
        self.bidirectional = bidirectional
        self.num_of_directions = 1
        if(self.bidirectional):
            self.num_of_directions = 2
        self.batch_size = batch_size
        self.sequance_length = sequance_length
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size,embedd_dim)
        self.lstm = nn.LSTM(embedd_dim,hidden_dim//self.num_of_directions,bidirectional=self.bidirectional ,batch_first=True)
        self.dense = nn.Linear(hidden_dim,vocab_size)

    def forward(self,x,prev_state):
        embedded_vector = self.embedding(x)
        a_out ,prev_state = self.lstm(embedded_vector,prev_state)
        #don't specify a_out ,self.hidden = self.lstm(embedded_vector.view(self.batch_size,self.sequance_length, -1),self.hidden)
        dense = self.dense(a_out)
        output_scores = F.log_softmax(dense.view(embedded_vector.shape[0],-1,embedded_vector.shape[1]),dim = 1)        
        return output_scores,prev_state
        
        

In [12]:
def init_states(num_of_directions,batch_size,hidden_dim):
    (torch.zeros(num_of_directions, batch_size, hidden_dim//num_of_directions).cuda(),
     torch.zeros(num_of_directions, batch_size, hidden_dim//num_of_directions).cuda())

In [13]:
def predict(init_words,num_of_words_predict,top_k=3):
    model.eval()
    states = init_states(num_of_directions=1,batch_size=1,hidden_dim=hidden_dim)
    words = []
    for word in init_words:
        word_to_idx = torch.tensor([[data_set.word2idx[word]]])
        out,states= model(word_to_idx.cuda(),states)
    _, top_ix = torch.topk(out[0,:,0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])
    words.append(data_set.idx2word[choice])
    for _ in range(num_of_words_predict):
        word_to_idx = torch.tensor([[data_set.word2idx[word]]])
        out,states = model(word_to_idx.cuda(),states)
        
        _, top_ix = torch.topk(out[0,:,0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(data_set.idx2word[choice])
    print(' '.join(words))

In [14]:
vocab_size = len(data_set.word2idx)
embedding_dims = flags.embedding_size
hidden_dim = flags.lstm_size
model = TextGenerationModel(batch_size,sequance_length,embedding_dims,hidden_dim,vocab_size,True).cuda()
optimizer = optim.Adam(model.parameters(),lr=0.001)
criterion = nn.NLLLoss()
prev_state = init_states(num_of_directions=1,batch_size=batch_size,hidden_dim=hidden_dim)

def train_model(prev_state,model,optimizer,criterion,dataset,iter = 10):
    for i in range(iter):     
        for x,y in dataset:
            model.train()
            yhat,prev_state = model(x.cuda(),prev_state)
            loss = criterion(yhat,y.cuda())
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
        print("loss at epoch :",i ," = ",loss.item())
        print("SAMPLE :")
        predict("the".split(),10)

In [15]:
trained_model = train_model(prev_state,model,optimizer,criterion,data_set,iter = 300)

loss at epoch : 0  =  8.075037002563477
SAMPLE :
little garden-gate. engendered, perhaps fed, did Among trouble, Vilkins. coverlet death
loss at epoch : 1  =  8.068414688110352
SAMPLE :
indubitably end wretched badged troublesome. this accident; floor sitting somewhat direction
loss at epoch : 2  =  8.06159782409668
SAMPLE :
proceeded did only end PLACE think,' 'Just darkly offered quite be.
loss at epoch : 3  =  8.054750442504883
SAMPLE :
refusal, mother, hungry, With children! Apparently dignity was beadle. human Wrapped
loss at epoch : 4  =  8.04787826538086
SAMPLE :
beadle, both sickened hungry putting glanced did merit implicitly about estimate
loss at epoch : 5  =  8.04096508026123
SAMPLE :
necessary fire: one, 'Oliver father, buildings washing. for, perception destitute accurate
loss at epoch : 6  =  8.033980369567871
SAMPLE :
not time 'I, consolatory That chair hungry, AND deposited Ah! good,
loss at epoch : 7  =  8.0269136428833
SAMPLE :
comprised dear mind opportunity nothing

loss at epoch : 66  =  7.116554260253906
SAMPLE :
certain stage Among (2) TWIST reasons of was CIRCUMSTANCES This possible
loss at epoch : 67  =  7.098197937011719
SAMPLE :
anciently TREATS no (2) small: not Part be consequence Edition which
loss at epoch : 68  =  7.079169750213623
SAMPLE :
HIS which most this is this OF be Dickens buildings which
loss at epoch : 69  =  7.059575080871582
SAMPLE :
prudent Oliver OLIVER consequence OF ATTENDING buildings Index assign is which
loss at epoch : 70  =  7.039809226989746
SAMPLE :
anciently (2) part Leigh Gaugy. need towns, and myself consequence This
loss at epoch : 71  =  7.020282745361328
SAMPLE :
BORN trouble BY public Among (2) buildings reader, name, Dickens reasons
loss at epoch : 72  =  7.00117301940918
SAMPLE :
WHERE town, wit, this AND myself Leigh reader, wit, PROGRESS workhouse
loss at epoch : 73  =  6.982322692871094
SAMPLE :
OR at TREATS HIS is other workhouse; editing Little. buildings or
loss at epoch : 74  =  6.963362216949463

loss at epoch : 137  =  5.412140369415283
SAMPLE :
no DICKENS BY PROGRESS no not 1 date in ATTENDING most
loss at epoch : 138  =  5.384805202484131
SAMPLE :
wit, this Dickens Next day Leigh which mentioning, is Leigh great
loss at epoch : 139  =  5.3546600341796875
SAMPLE :
on mentioning, out part mentioning, by can from CHAPTER name, this
loss at epoch : 140  =  5.324007034301758
SAMPLE :
CIRCUMSTANCES small: name, trouble AND BORN PARISH DICKENS a fictitious Gaugy.
loss at epoch : 141  =  5.295896530151367
SAMPLE :
was WHERE other refrain common CIRCUMSTANCES not PARISH all Twist DICKENS
loss at epoch : 142  =  5.270724296569824
SAMPLE :
1 inasmuch created CHARLES wit, one OF date PARISH in reasons
loss at epoch : 143  =  5.243423938751221
SAMPLE :
PARISH Little. public CIRCUMSTANCES no 1 OR OF repeat, which OF
loss at epoch : 144  =  5.209590911865234
SAMPLE :
(2) PLACE Leigh Index a WAS homepage FullBooks.com Gaugy. myself PROGRESS
loss at epoch : 145  =  5.173271179199219
SAMPLE :

loss at epoch : 208  =  3.534588098526001
SAMPLE :
1 reader, in THE assign reader, be at or small: no


RuntimeError: CUDA out of memory. Tried to allocate 120.00 MiB (GPU 0; 6.00 GiB total capacity; 3.87 GiB already allocated; 119.06 MiB free; 4.19 GiB reserved in total by PyTorch)

In [19]:
print(trained_model.parameters())
print("SAMPLE : ",predict("the".split(),100))

NameError: name 'trained_model' is not defined