In [1]:
str_1 = "Hi,there!"

In [2]:
list(str_1)

['H', 'i', ',', 't', 'h', 'e', 'r', 'e', '!']

In [3]:
str_2 = "Hi, how are you?"

In [4]:
str_2.split()

['Hi,', 'how', 'are', 'you?']

In [5]:
# Load the complete novel into memory as a single string.
with open("files/anna.txt", mode="r") as novel_file:
    text = novel_file.read()
# Naive tokenization on single spaces only: newlines and punctuation stay
# attached to neighbouring words, which the printed sample makes visible.
words = text.split(" ")
print(words[:20])

['Chapter', '1\n\n\nHappy', 'families', 'are', 'all', 'alike;', 'every', 'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own\nway.\n\nEverything', 'was', 'in', 'confusion', 'in', 'the', "Oblonskys'"]


In [6]:
# One translation table does all the character-level cleanup in a single pass:
#   * newlines and hyphens become plain spaces
#   * every punctuation mark is padded with spaces so it splits off as a token
_punctuation = ",.:;?!$()/_&%*@'`" + '"'
_char_map = {ord("\n"): " ", ord("-"): " "}
_char_map.update({ord(mark): f" {mark} " for mark in _punctuation})
clean_text = text.lower().translate(_char_map)
# From here on `text` is the list of lower-cased tokens, not the raw string.
text = clean_text.split()

In [7]:
from collections import Counter   
# Tally every token, then list the vocabulary from most to least frequent.
# most_common() keeps first-seen order among tokens with equal counts.
word_counts = Counter(text)
words = [token for token, _count in word_counts.most_common()]
print(words[:10])

[',', '.', 'the', '"', 'and', 'to', 'of', 'he', "'", 'a']


In [11]:
len(words)

12778

In [13]:
len(text)

437098

In [14]:
# Corpus size (all tokens) versus vocabulary size (distinct tokens).
text_length, num_unique_words = len(text), len(words)
print(f"the text contains {text_length} words")
print(f"there are {num_unique_words} unique tokens")

the text contains 437098 words
there are 12778 unique tokens


In [15]:
# Two lookup tables built from the frequency-ordered vocabulary:
# token -> index and index -> token (index = position in `words`).
word_to_int = {token: rank for rank, token in enumerate(words)}
int_to_word = dict(enumerate(words))
# Preview the first ten entries of each mapping.
top_ten = words[:10]
print({token: word_to_int[token] for token in top_ten})
print({rank: token for rank, token in enumerate(top_ten)})

{',': 0, '.': 1, 'the': 2, '"': 3, 'and': 4, 'to': 5, 'of': 6, 'he': 7, "'": 8, 'a': 9}
{0: ',', 1: '.', 2: 'the', 3: '"', 4: 'and', 5: 'to', 6: 'of', 7: 'he', 8: "'", 9: 'a'}


In [16]:
### Converting text to indexes
# Show the first tokens, encode the whole corpus as integer ids, then show
# the ids of those same first tokens.
print(text[:20])
wordidx = list(map(word_to_int.__getitem__, text))
print(wordidx[:20])

['chapter', '1', 'happy', 'families', 'are', 'all', 'alike', ';', 'every', 'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own', 'way', '.', 'everything', 'was']
[208, 2755, 280, 2981, 83, 31, 2419, 35, 202, 685, 362, 38, 685, 10, 236, 147, 166, 1, 149, 12]


In [17]:
word_to_int["anna"]

62

In [18]:
len(wordidx)

437098

In [19]:
### Creating training batches
import torch
seq_len = 100
xys = []
# Each sample pairs a window of seq_len tokens (x) with the same window
# shifted right by one token (y), so the target at every position is the
# token that follows it in the corpus.
# The last usable start index is len(wordidx) - seq_len - 1 (its target window
# ends exactly at the final token); range(len - seq_len) includes it, whereas
# the previous bound of len - seq_len - 1 silently dropped that sample.
for n in range(len(wordidx) - seq_len):
    x = wordidx[n:n + seq_len]
    y = wordidx[n + 1:n + seq_len + 1]
    xys.append((torch.tensor(x), torch.tensor(y)))

In [20]:
# Inspect a single (input, target) pair; echoing the whole list would print
# one tensor pair per training window and flood the notebook output.
xys[0]

[(tensor([ 208, 2755,  280, 2981,   83,   31, 2419,   35,  202,  685,  362,   38,
           685,   10,  236,  147,  166,    1,  149,   12,   10, 1462,   10,    2,
          2269,    8,  223,    1,    2,  128,   18, 2270,   11,    2,  159,   12,
          1054,   30,   67, 3562,   19,    9,  476,  347,    0,   63,   18,   52,
             9,  925,   10,   70,  362,    0,    4,   17,   18, 1644,    5,   14,
           159,   11,   17,   55,   20,   72,   30,  399,   10,    2,  115,  223,
            19,   23,    1,   40,  184,    6,  956,   18,   58, 2153,  266,  348,
             0,    4,   20,   78,    2,  159,    4,  128,  553,    0,   24,   31,
             2, 1583,    6,   70]),
  tensor([2755,  280, 2981,   83,   31, 2419,   35,  202,  685,  362,   38,  685,
            10,  236,  147,  166,    1,  149,   12,   10, 1462,   10,    2, 2269,
             8,  223,    1,    2,  128,   18, 2270,   11,    2,  159,   12, 1054,
            30,   67, 3562,   19,    9,  476,  347,    0,   63

In [21]:
from torch.utils.data import DataLoader
# Seed torch's global RNG before building the shuffling loader.
batch_size = 32
torch.manual_seed(42)
loader = DataLoader(dataset=xys, batch_size=batch_size, shuffle=True)

### Building LSTM model

In [22]:
from torch import nn
device = "cuda" if torch.cuda.is_available() else "cpu"

In [35]:
class WordLSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer.

    Args:
        input_size: width of the vectors fed into the LSTM, i.e. the
            embedding dimension.
        n_embed: width of the LSTM hidden/cell state (the size init_hidden
            allocates and the LSTM's ``hidden_size``).
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability passed to ``nn.LSTM``.
        vocab_size: number of distinct tokens. When omitted, falls back to
            ``len(word_to_int)`` from the notebook namespace, as before.

    With the default sizes (both 128) every layer has the same shape as in the
    previous version, so existing checkpoints still load. Previously the
    embedding width and the final linear layer's input width were swapped, so
    the model only worked when ``input_size == n_embed``.
    """

    def __init__(self, input_size=128, n_embed=128, n_layers=3, drop_prob=0.2,
                 vocab_size=None):
        super().__init__()
        self.input_size = input_size
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_embed = n_embed
        if vocab_size is None:
            # Backward-compatible default: size the model from the notebook's
            # global vocabulary mapping.
            vocab_size = len(word_to_int)

        # Lookup table: token id -> vector of width input_size, which is what
        # the LSTM below consumes.
        self.embedding = nn.Embedding(vocab_size, self.input_size)
        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.n_embed,
                            num_layers=self.n_layers,
                            dropout=self.drop_prob,
                            batch_first=True)
        # The LSTM emits hidden_size (= n_embed) features per position, so the
        # projection onto the vocabulary must take n_embed inputs.
        self.fc = nn.Linear(self.n_embed, vocab_size)

    def forward(self, x, hc):
        """Run token ids through the network.

        Args:
            x: integer token ids; the LSTM is batch-first, so the leading
                dimension is the batch.
            hc: tuple ``(h, c)`` of LSTM states, e.g. from ``init_hidden``.

        Returns:
            ``(logits, hc)``: per-position scores over the vocabulary and the
            updated LSTM state tuple.
        """
        embed = self.embedding(x)
        x, hc = self.lstm(embed, hc)
        x = self.fc(x)
        return x, hc

    def init_hidden(self, n_seqs):
        """Return zero-filled ``(h, c)`` states for a batch of ``n_seqs`` sequences.

        The tensors take their dtype and device from the model's parameters
        and have shape ``(n_layers, n_seqs, n_embed)``.
        """
        reference = next(self.parameters()).detach()
        shape = (self.n_layers, n_seqs, self.n_embed)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [36]:
model=WordLSTM().to(device)

In [37]:
# Training objective (cross-entropy over the vocabulary) and optimizer (Adam).
lr = 0.0001
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

### Training the LSTM model

In [38]:
model.train()

WordLSTM(
  (embedding): Embedding(12778, 128)
  (lstm): LSTM(128, 128, num_layers=3, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=12778, bias=True)
)

In [None]:
# Make sure dropout is active even if another cell left the model in eval mode.
model.train()
for epoch in range(50):
    tloss = 0

    # Fresh zero hidden/cell state at the start of every epoch.
    sh, sc = model.init_hidden(batch_size)

    for i, (x, y) in enumerate(loader):
        # The recurrent state is sized for batch_size, so a smaller (final)
        # batch is skipped rather than fed with a mismatched state.
        if x.shape[0] == batch_size:
            inputs, targets = x.to(device), y.to(device)
            optimizer.zero_grad()
            output, (sh, sc) = model(inputs, (sh, sc))
            # CrossEntropyLoss expects the class dimension second, hence the
            # transpose of the (batch, position, vocab) logits.
            loss = loss_func(output.transpose(1, 2), targets)
            # Cut the graph so the next backward pass does not reach into
            # earlier batches.
            # NOTE(review): the loader shuffles, so the state carried into the
            # next batch comes from unrelated text windows - confirm intended.
            sh, sc = sh.detach(), sc.detach()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            tloss += loss.item()

        if (i + 1) % 1000 == 0:
            # Adjacent f-strings instead of a backslash continuation, which
            # embedded the next line's indentation in the printed message.
            print(f"at epoch {epoch} iteration {i+1} "
                  f"average loss = {tloss/(i+1)}")

# Persist the trained weights: the generation section loads this exact file,
# and nothing else in the notebook writes it.
torch.save(model.state_dict(), "files/wordLSTM.pth")

torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size([32, 100])
torch.Size([32, 100]) torch.Size

In [79]:
import pickle
# Persist the token -> index mapping next to the model files.
with open("files/word_to_int.p", mode="wb") as vocab_file:
    pickle.dump(word_to_int, vocab_file)

### Generating text with the trained LSTM model

In [80]:
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds; it also avoids the torch.load warning about
# loading arbitrary pickled objects.
state_dict = torch.load("files/wordLSTM.pth",
                        map_location=device,
                        weights_only=True)
model.load_state_dict(state_dict)
# NOTE: pickle.load can execute arbitrary code - only load a vocabulary file
# that this notebook (or another trusted source) produced.
with open("files/word_to_int.p", "rb") as fb:
    word_to_int = pickle.load(fb)
# Inverse mapping (index -> token) used to decode sampled ids.
int_to_word = {v: k for k, v in word_to_int.items()}

  model.load_state_dict(torch.load("files/wordLSTM.pth",


In [81]:
import numpy as np
def sample(model, prompt, length=200):
    """Extend ``prompt`` by sampling tokens from the model's distribution.

    Args:
        model: network exposing ``eval()``, ``init_hidden(n)`` and a call
            ``model(inputs, hc) -> (logits, hc)``.
        prompt: space-separated tokens; it is lower-cased and split on single
            spaces, so punctuation must already be separated by spaces.
        length: total number of tokens in the result, prompt included. If the
            prompt already has at least this many tokens nothing is generated.

    Returns:
        The prompt plus generated tokens joined into one string, with the
        punctuation spacing re-attached.

    Raises:
        KeyError: if a prompt token is not a key of ``word_to_int``.
    """
    model.eval()
    text = prompt.lower().split(' ')
    hc = model.init_hidden(1)
    length = length - len(text)
    # Inference only: without no_grad every step would record an autograd
    # graph, and because hc is carried forward those graphs would chain
    # together and grow for the whole generation.
    with torch.no_grad():
        for i in range(0, length):
            # Feed at most the last seq_len tokens; slicing a shorter list
            # simply returns all of it.
            # NOTE(review): hc is carried over while the window is re-fed on
            # every step - confirm this double conditioning is intended.
            x = torch.tensor([[word_to_int[w] for w in text[-seq_len:]]])
            inputs = x.to(device)
            output, hc = model(inputs, hc)
            # Scores for the token following the last input position.
            logits = output[0][-1]
            p = nn.functional.softmax(logits, dim=0).detach().cpu().numpy()
            idx = np.random.choice(len(logits), p=p)
            text.append(int_to_word[idx])
    text = " ".join(text)
    # Move each punctuation mark back against the preceding word. The mark's
    # leading space is shifted behind it, so a mark that was followed by a
    # space ends up followed by two.
    for m in ",.:;?!$()/_&%*@'`":
        text = text.replace(f" {m}", f"{m} ")
    # Quotes and apostrophes are glued to the text that follows them.
    text = text.replace('"  ', '"')
    text = text.replace("'  ", "'")
    text = text.replace('" ', '"')
    text = text.replace("' ", "'")
    return text

In [82]:
# Seed both random number generators before sampling a continuation.
np.random.seed(42)
torch.manual_seed(42)
generated = sample(model, prompt='Anna and the prince')
print(generated)

anna and the prince did not forget what he had not spoken.  when the softening barrier was not so long as he had talked to his brother,  all the hopelessness of the impression.  "official tail,  a man who had tried him,  though he had been able to get across his charge and locked close,  and the light round the snow was in the light of the altar villa.  the article in law levin was first more precious than it was to him so that if it was most easy as it would be as the same.  this was now perfectly interested.  when he had got up close out into the sledge,  but it was locked in the light window with their one grass,  and in the band of the leaves of his projects,  and all the same stupid woman,  and really,  and i swung his arms round that thinking of bed.  a little box with the two boys were with the point of a gleam of filling the boy,  noiselessly signed the bottom of his mouth,  and answering them took the red


#### Temperature and top-k in text generation

In [83]:
def generate(model, prompt, top_k=None,
             length=200, temperature=1):
    """Extend ``prompt`` token by token with temperature and optional top-k sampling.

    Args:
        model: network exposing ``eval()``, ``init_hidden(n)`` and a call
            ``model(inputs, hc) -> (logits, hc)``.
        prompt: space-separated tokens; it is lower-cased and split on single
            spaces, so punctuation must already be separated by spaces.
        top_k: if given, sample only among the ``top_k`` most probable tokens
            (renormalized); values above the vocabulary size are clamped.
            ``None`` samples from the full distribution.
        length: total number of tokens in the result, prompt included.
        temperature: divisor applied to the logits before softmax; values
            below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The prompt plus generated tokens joined into one string, with the
        punctuation spacing re-attached.

    Raises:
        ValueError: if ``temperature`` is not positive or ``top_k`` is < 1.
        KeyError: if a prompt token is not a key of ``word_to_int``.
    """
    # Fail early with a clear message: a zero temperature would otherwise turn
    # the logits into inf/NaN and only surface as a cryptic sampling error.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be at least 1 or None, got {top_k}")

    model.eval()
    text = prompt.lower().split(' ')
    hc = model.init_hidden(1)
    length = length - len(text)
    # Inference only: without no_grad every step would record an autograd
    # graph, and because hc is carried forward those graphs would chain
    # together and grow for the whole generation.
    with torch.no_grad():
        for i in range(0, length):
            # Feed at most the last seq_len tokens; slicing a shorter list
            # simply returns all of it.
            x = torch.tensor([[word_to_int[w] for w in text[-seq_len:]]])
            inputs = x.to(device)
            output, hc = model(inputs, hc)
            # Scores for the token following the last input position.
            logits = output[0][-1]
            # scale the logits with the temperature
            logits = logits / temperature
            p = nn.functional.softmax(logits, dim=0).detach().cpu()
            if top_k is None:
                idx = np.random.choice(len(logits), p=p.numpy())
            # top-K sampling
            else:
                # Never ask for more candidates than the vocabulary holds.
                ps, tops = p.topk(min(top_k, p.numel()))
                ps = ps / ps.sum()
                idx = np.random.choice(tops.numpy(), p=ps.numpy())
            text.append(int_to_word[idx])
    text = " ".join(text)
    # Move each punctuation mark back against the preceding word. The mark's
    # leading space is shifted behind it, so a mark that was followed by a
    # space ends up followed by two.
    for m in ",.:;?!$()/_&%*@'`":
        text = text.replace(f" {m}", f"{m} ")
    # Quotes and apostrophes are glued to the text that follows them.
    text = text.replace('"  ', '"')
    text = text.replace("'  ", "'")
    text = text.replace('" ', '"')
    text = text.replace("' ", "'")
    return text

In [85]:
# next three tokens using the default settings (full distribution, temperature 1)
prompt = "I ' m not going to see"
n_prompt_tokens = len(prompt.split(" "))
torch.manual_seed(42)
np.random.seed(42)
for _ in range(10):
    completion = generate(model, prompt, top_k=None,
                          length=n_prompt_tokens + 3, temperature=1)
    print(completion)

i'm not going to see you;  there
i'm not going to see you,  "
i'm not going to see the petrovs. 
i'm not going to see me,  do
i'm not going to see my son, 
i'm not going to see her.  i
i'm not going to see you,  "
i'm not going to see his wife. 
i'm not going to see you at the
i'm not going to see you.  "


In [86]:
# next token using conservative predictions (top-3 candidates, temperature 0.5)
prompt = "I ' m not going to see"
n_prompt_tokens = len(prompt.split(" "))
torch.manual_seed(42)
np.random.seed(42)
for _ in range(10):
    completion = generate(model, prompt, top_k=3,
                          length=n_prompt_tokens + 1, temperature=0.5)
    print(completion)

i'm not going to see you
i'm not going to see the
i'm not going to see her
i'm not going to see you
i'm not going to see you
i'm not going to see you
i'm not going to see you
i'm not going to see her
i'm not going to see you
i'm not going to see her


In [87]:
# Full-length passage with conservative settings (top-3, temperature 0.5).
np.random.seed(42)
torch.manual_seed(42)
passage = generate(model, prompt='Anna and the prince',
                   top_k=3,
                   temperature=0.5)
print(passage)

anna and the prince had no milk.  but,  "answered levin,  and he stopped.  "i've been skating to look at you all the harrows,  and i'm glad. . .  ""no,  i'm going to the country.  ""no,  it's not a nice fellow.  ""yes,  sir.  ""well,  what do you think about it?  ""why,  what's the matter?  ""yes,  yes,  "answered levin,  smiling,  and he went into the hall.  "yes,  i'll come for him and go away,  "he said,  looking at the crumpled front of his shirt.  "i have not come to see him,  "she said,  and she went out.  "i'm very glad,  "she said,  with a slight bow to the ambassador's hand.  "i'll go to the door.  "she looked at her watch,  and she did not know what to say


In [88]:
# next token using creative predictions (full distribution, temperature 2)
prompt = "I ' m not going to see"
n_prompt_tokens = len(prompt.split(" "))
torch.manual_seed(42)
np.random.seed(42)
for _ in range(10):
    completion = generate(model, prompt, top_k=None,
                          length=n_prompt_tokens + 1, temperature=2)
    print(completion)

i'm not going to see them
i'm not going to see scarlatina
i'm not going to see behind
i'm not going to see us
i'm not going to see it
i'm not going to see it
i'm not going to see a
i'm not going to see misery
i'm not going to see another
i'm not going to see seryozha
