In [2]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [4]:
class Dictionary(object):

    def __init__(self):
        self.word_to_index = {}
        self.index_to_word = {}
        self.index = 0

    def add_word(self, x):
        if x not in self.word_to_index:
            self.word_to_index[x] = self.index
            self.index_to_word[self.index] = x
            self.index += 1

    def __len__(self):
        return self.index

In [5]:
class TextProcess(object):

    def __init__(self):
        self.dict = Dictionary()

    def get_data(self, path, batch_size=20):
        embedding = []
        with open(path, "r") as texts:
            for text in texts:
                words = text.split() + ["<end_line>"]
                for word in words:
                    self.dict.add_word(word)
                    embedding.append(self.dict.word_to_index[word])
        texts.close()
        embedding = torch.tensor(embedding).long()
        n_batch = embedding.shape[0] // batch_size
        embedding = embedding[: n_batch * batch_size]
        embedding = embedding.view(batch_size, -1)
        print("Embedding Completed!")
        return embedding

In [6]:
# Configurations
embed_size = 128
hidden_size = 1024
num_layers = 1
epochs = 20
batch = 20
time_steps = 30
lr = 0.002

In [7]:
corpus = TextProcess()

In [8]:
rep = corpus.get_data("Data/train.txt", batch)

Embedding Completed!


In [9]:
rep.shape

torch.Size([20, 1484])

In [10]:
vocab_size = corpus.dict.__len__()
vocab_size

5290

In [11]:
num_batch = rep.shape[1] // time_steps
num_batch

49

In [12]:
class TextGenerator(nn.Module):

    def __init__(self, vocal_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        x = self.embed(x)
        out, (h, c) = self.lstm(x, h)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))
        out = self.fc(out)
        return out, (h, c)

In [25]:
def train_model():
    model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)
    model = model.to(device)
    Loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Train the model.
    for epoch in range(epochs):
        # Initial state
        a = torch.zeros(num_layers, batch, hidden_size)
        a = a.to(device)
        states = (a, a)
        for i in range(0, rep.size(1) - time_steps, time_steps):
            inputs = rep[:, i: i + time_steps]
            targets = rep[:, i + 1: i + 1 + time_steps]
            inputs = inputs.to(device)
            targets = targets.to(device)
            yh, _ = model(inputs, states)
            loss = Loss(yh, targets.reshape(-1))
            model.zero_grad()
            loss.backward()
            clip_grad_norm(model.parameters(), 0.5)
            optimizer.step()

            step = (i + 1) // time_steps
            if not step % 100:
                print("Epoch [{}/{}], Loss: {:.4f}".format(epoch + 1, epochs, loss.item()))
    return model

In [32]:
def test_model(model):
    # Testing the model.
    with torch.no_grad():
        with open("Data/result.txt", "w") as texts:
            # Initial hidden states
            a = torch.zeros(num_layers, 1, hidden_size)
            a = a.to(device)
            state = (a, a)
            inputs = torch.randint(0, vocab_size, (1, )).long().unsqueeze(1)
            inputs = inputs.to(device)
            for i in range(500):
                op, _ = model(inputs, state)
                prob = op.exp()
                word_id = torch.multinomial(prob, num_samples=1).item()
                inputs.fill_(word_id)
                word = corpus.dict.index_to_word[word_id]
                word = "\n" if word == "<end_line>" else word + " "
                texts.write(word)

In [27]:
trained_model = train_model()



Epoch [1/20], Loss: 8.5693
Epoch [2/20], Loss: 5.9631
Epoch [3/20], Loss: 5.2476
Epoch [4/20], Loss: 4.7057
Epoch [5/20], Loss: 4.1619
Epoch [6/20], Loss: 3.7850
Epoch [7/20], Loss: 3.3741
Epoch [8/20], Loss: 2.9140
Epoch [9/20], Loss: 2.5403
Epoch [10/20], Loss: 2.2082
Epoch [11/20], Loss: 1.8849
Epoch [12/20], Loss: 1.6248
Epoch [13/20], Loss: 1.2832
Epoch [14/20], Loss: 1.0423
Epoch [15/20], Loss: 0.7381
Epoch [16/20], Loss: 0.4757
Epoch [17/20], Loss: 0.2877
Epoch [18/20], Loss: 0.1609
Epoch [19/20], Loss: 0.1096
Epoch [20/20], Loss: 0.0810


In [33]:
test_model(trained_model)