In [None]:
# ! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
# ! unzip '/content/wiki-news-300d-1M.vec.zip'
# import gensim.models.wrappers.fasttext
# model = gensim.models.KeyedVectors.load_word2vec_format('/content/wiki-news-300d-1M.vec')
# word_vectors = model.wv

# import torch
# import torch.nn as nn

# weights = torch.FloatTensor(word_vectors.vectors)
# embedding = nn.Embedding.from_pretrained(weights)

**Imports**

In [None]:
import torch
import torch.nn as nn
import numpy as np

**Cleaning the dataset**


In [None]:
with open('/content/wiki.train.tokens') as f:
  content = f.readlines()

In [None]:
clean = []
for c in content:
  clean.append(c.split(' \n')[0])

**Network**

In [None]:
dropout = 0.5

class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()        
    self.embedding = nn.Embedding.from_pretrained(weights)
    self.lstm = nn.LSTM(weights.shape[1], weights.shape[1], bidirectional=True, dropout=dropout)
    self.fc1 = nn.Linear(300, 300)  
    self.softmax = nn.Softmax(dim=1)
    self.output = nn.Linear(300, word_len)
  
  def forward(self, sentence, previous_state):        
    embeds = self.embedding(torch.LongTensor([word_vectors.vocab[sentence].index]))
    lstm_out, state = self.lstm(embeds, previous_state)
    # lstm_out = self.fc1(lstm_out)
    # lstm_out = self.output(lstm_out)
    # lstm_out = self.softmax(lstm_out)
    return lstm_out, state
    
# input_layer = torch.rand(10)
# net = Net()
# result = net(input_layer)


In [7]:
'''
Code taken from https://github.com/ChunML/NLP/blob/32a52dc6a252175c60b44389a020fda17a6339b7/text_generation/train_pt.py#L24
Blog: https://trungtran.io/2019/02/08/text-generation-with-pytorch/
'''

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from collections import Counter
import os
from argparse import Namespace


flags = Namespace(
    train_file='/content/drive/MyDrive/PoliTo/wiki.train.tokens',
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)


def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r', encoding='utf-8') as f:
        text = f.read()
    text = text.split()
    text = text[:int(len(text) * 0.1)]

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text


def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]


class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state

    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))


def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer


def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()
    words = ['I', 'am']

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])

    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words).encode('utf-8'))


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
        flags.train_file, flags.batch_size, flags.seq_size)

    net = RNNModule(n_vocab, flags.seq_size,
                    flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)

    iteration = 0

    for e in range(2):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
        state_h, state_c = net.zero_state(flags.batch_size)
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1
            net.train()

            optimizer.zero_grad()

            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            loss_value = loss.item()

            loss.backward()

            state_h = state_h.detach()
            state_c = state_c.detach()

            _ = torch.nn.utils.clip_grad_norm_(
                net.parameters(), flags.gradients_norm)

            optimizer.step()

            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, 200),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 1000 == 0:
                predict(device, net, flags.initial_words, n_vocab,
                        vocab_to_int, int_to_vocab, top_k=5)
                torch.save(net.state_dict(),
                           'checkpoint_pt/model-{}.pth'.format(iteration))


if __name__ == '__main__':
    main()                  

Vocabulary size 138141
Epoch: 0/200 Iteration: 100 Loss: 7.188634395599365
Epoch: 0/200 Iteration: 200 Loss: 7.038171768188477
Epoch: 0/200 Iteration: 300 Loss: 7.027667045593262
Epoch: 0/200 Iteration: 400 Loss: 6.703015327453613
Epoch: 0/200 Iteration: 500 Loss: 6.618274688720703
Epoch: 0/200 Iteration: 600 Loss: 7.052701473236084
Epoch: 0/200 Iteration: 700 Loss: 6.380314350128174
Epoch: 0/200 Iteration: 800 Loss: 6.258045196533203
Epoch: 0/200 Iteration: 900 Loss: 6.458171367645264
Epoch: 0/200 Iteration: 1000 Loss: 6.372194290161133
b'I am a pass . = = = Following a new man , which the ball in their pass for his pass for his debut for his debut , and and the gods was an example to be " a major @-@ hour and " Born for a pass for his first goal in the end in a pass for a major . In his own in a major @-@ game and a first @-@ century . " In 2009 was an average @-@ game , " . = During the end , the Bears had the gods was a major first .'


FileNotFoundError: ignored

In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# import numpy as np
# from collections import Counter
# import os
# from argparse import Namespace


# def get_data_from_file(train_file, batch_size, seq_size):
#     with open(train_file, 'r', encoding='utf-8') as f:
#         text = f.read()
#     text = text.split()

#     word_counts = Counter(text)
#     sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
#     int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
#     vocab_to_int = {w: k for k, w in int_to_vocab.items()}
#     n_vocab = len(int_to_vocab)

#     print('Vocabulary size', n_vocab)

#     int_text = [vocab_to_int[w] for w in text]
#     num_batches = int(len(int_text) / (seq_size * batch_size))
#     in_text = int_text[:num_batches * batch_size * seq_size]
#     out_text = np.zeros_like(in_text)
#     out_text[:-1] = in_text[1:]
#     out_text[-1] = in_text[0]
#     in_text = np.reshape(in_text, (batch_size, -1))
#     out_text = np.reshape(out_text, (batch_size, -1))
#     return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

# train_file, batch_size, seq_size = '/content/without-stop-words.txt', 256, 512

# with open(train_file, 'r', encoding='utf-8') as f:
#     text = f.read()
# text = text.split()

# word_counts = Counter(text)
# sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
# int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
# vocab_to_int = {w: k for k, w in int_to_vocab.items()}
# n_vocab = len(int_to_vocab)

# print('Vocabulary size', n_vocab)

# int_text = [vocab_to_int[w] for w in text]
# num_batches = int(len(int_text) / (seq_size * batch_size))
# in_text = int_text[:num_batches * batch_size * seq_size]
# out_text = np.zeros_like(in_text)
# out_text[:-1] = in_text[1:]
# out_text[-1] = in_text[0]
# in_text = np.reshape(in_text, (batch_size, -1))
# out_text = np.reshape(out_text, (batch_size, -1))
# return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text