In [1]:
import torch
import torch.nn.functional as F
from torch import optim
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import os
import pickle

In [26]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Serialize ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: Object to persist; the training cell passes a dict holding the
            model and optimizer state dicts.
        filename: Destination path of the checkpoint file.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` in place from a checkpoint mapping.

    Args:
        checkpoint: Mapping with the keys ``"state_dict"`` (model weights) and
            ``"optimizer"`` (optimizer state).
        model: Object whose ``load_state_dict`` receives ``checkpoint["state_dict"]``.
        optimizer: Object whose ``load_state_dict`` receives ``checkpoint["optimizer"]``.
    """
    print("=> Loading checkpoint")
    # The model is restored first, then the optimizer.
    restore_plan = ((model, "state_dict"), (optimizer, "optimizer"))
    for target, key in restore_plan:
        target.load_state_dict(checkpoint[key])

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Read the corpus into a list of lines. The context manager closes the file
# handle as soon as reading finishes (or fails), which the bare open() did not.
with open("datasets/a_room_with_a_view.txt", "r", encoding="utf8") as file:
    lines = list(file)

print("The first line: ", lines[0])

The first line:  ﻿The Project Gutenberg eBook of A Room With A View, by E. M. Forster



### Remove escape characters and symbols

In [4]:
import string

# Concatenate the corpus once. The previous loop recomputed the very same join
# for every line, which was quadratic in the corpus size.
data = ' '.join(lines)

# Strip line breaks and the byte-order mark left at the start of the file.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')

# Map every ASCII punctuation character to a space so words stay separated.
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
data = data.translate(translator)

data[:500]

'The Project Gutenberg eBook of A Room With A View  by E  M  Forster  This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever  You may copy it  give it away or re use it under the terms of the Project Gutenberg License included with this eBook or online at www gutenberg org  If you are not located in the United States  you will have to check the laws of the country where you are located before using '

### Tokenization

In [5]:
from transformers import XLNetTokenizer

In [6]:
# Tokenize the cleaned corpus with the pretrained tokenizer, then map each
# token to its integer id in the tokenizer's vocabulary.
pretrained_name = "xlnet-base-cased"
tokenizer = XLNetTokenizer.from_pretrained(pretrained_name)
tokenized = tokenizer.tokenize(data)
ids = tokenizer.convert_tokens_to_ids(tokenized)

In [7]:
# Show a 20-token slice of the corpus next to the ids it maps to.
sample_tokens = tokenized[512:532]
print(sample_tokens)
print(tokenizer.convert_tokens_to_ids(sample_tokens))

['▁and', '▁red', '▁bottles', '▁of', '▁wine', '▁that', '▁ran', '▁between', '▁the', '▁English', '▁people', '▁at', '▁the', '▁portraits', '▁of', '▁the', '▁late', '▁Queen', '▁and', '▁the']
[21, 1170, 10596, 20, 2680, 29, 1662, 161, 18, 897, 104, 38, 18, 16738, 20, 18, 471, 3631, 21, 18]


In [8]:
# Vocabulary size reported by the tokenizer; reused below as the embedding
# table size and as the number of output classes.
vocab_size = tokenizer.vocab_size

In [9]:
ids_arr = np.array(ids)

# Each row of `sequences` is a sliding window of 4 consecutive token ids
# (3 context tokens followed by the token to predict). The windows are built
# with a single vectorized gather; the previous np.append-in-a-loop copied the
# whole array on every iteration, which is quadratic in the corpus length.
window = 4
num_windows = max(len(ids_arr) - window + 1, 0)
# positions[r] == [r, r + 1, r + 2, r + 3]
positions = np.arange(num_windows)[:, None] + np.arange(window)[None, :]
# float64 is kept on purpose: it is the dtype the original np.empty((0, 4))
# accumulator produced, and later cells cast with .astype(int) / .long().
sequences = ids_arr[positions].astype(np.float64)

In [10]:
# Split every 4-token window into the 3 context tokens (model input) and the
# final token (prediction target). Column slicing replaces the per-row Python
# loop; .copy() yields independent, contiguous arrays like np.array(list) did.
X = sequences[:, 0:3].copy()
Y = sequences[:, 3].copy()


In [11]:
# Y = np.eye(vocab_size)[Y.astype(int)]

### Parameters

In [34]:
# --- Model shape ---
input_size = 1               # passed to the RNN constructor
embedding_dim = 10           # width of each token embedding
hidden_size = 256            # hidden units per recurrent layer
num_layers = 2               # stacked recurrent layers
sequence_length = 3          # context tokens per example
num_classes = vocab_size     # one output class per vocabulary entry

# --- Optimisation ---
learning_rate = 0.001
batch_size = 64
num_epochs = 1

# --- Checkpointing ---
load_model = True            # restore the saved checkpoint before training

### RNN model

In [22]:
class RNN(nn.Module):
    """Next-token classifier: embedding -> stacked RNN -> linear layer.

    The linear layer consumes the RNN outputs of *all* time steps, flattened
    into one vector per sample, so inputs must contain exactly ``seq_len``
    tokens.

    Args:
        input_size: Unused; kept so existing positional calls keep working.
        hidden_size: Number of hidden units in each recurrent layer.
        num_layers: Number of stacked recurrent layers.
        num_classes: Size of the output layer (number of classes scored).
        vocab: Number of rows in the embedding table. Defaults to the
            notebook-level ``vocab_size``.
        embed_dim: Embedding width. Defaults to the notebook-level
            ``embedding_dim``.
        seq_len: Tokens per input sequence. Defaults to the notebook-level
            ``sequence_length``.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes,
                 vocab=None, embed_dim=None, seq_len=None):
        super().__init__()
        # Fall back to the notebook-level settings only when the caller did
        # not pass a value, so the original four-argument call is unchanged.
        vocab = vocab_size if vocab is None else vocab
        embed_dim = embedding_dim if embed_dim is None else embed_dim
        seq_len = sequence_length if seq_len is None else seq_len

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Score every class for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of unnormalised class scores, shape ``(batch, num_classes)``.
        """
        rnn_in = self.embed(x)

        # No explicit h0: nn.RNN starts from a zero hidden state created on
        # the same device as its input, so no global `device` is needed here.
        out, _ = self.rnn(rnn_in)

        # Flatten the outputs of all time steps into one vector per sample.
        out = out.reshape(out.shape[0], -1)

        return self.fc(out)

In [23]:
# Load Data
# NOTE(review): `2 // 10` puts the first 20% of the windows in the training
# split and the remaining 80% in the test split — confirm this is intended.
num_train = len(X)*2//10
print(num_train)
train_x = X[:num_train]
test_x = X[num_train:]
print(train_x[:5])

train_y = Y[:num_train]
test_y = Y[num_train:]

# Use the `batch_size` hyperparameter instead of a second hard-coded 64 so the
# parameters cell stays the single source of truth.
train = torch.utils.data.TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
test = torch.utils.data.TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
# Sample order does not affect accuracy, so the evaluation split is not shuffled.
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

17483
[[3.2000e+01 3.4830e+03 2.1054e+04]
 [3.4830e+03 2.1054e+04 6.7510e+03]
 [2.1054e+04 6.7510e+03 2.8983e+04]
 [6.7510e+03 2.8983e+04 2.0000e+01]
 [2.8983e+04 2.0000e+01 7.9000e+01]]


In [24]:
# Build the network and move it to the selected device
# (try out just using simple RNN, or GRU, and then compare with LSTM)
model = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(device)

# Cross-entropy over the class scores, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [35]:
# Train Network
checkpoint_path = "my_checkpoint.pth.tar"

if load_model:
    if os.path.isfile(checkpoint_path):
        # map_location lets a checkpoint written on a GPU machine be restored
        # on a CPU-only one (and vice versa).
        load_checkpoint(torch.load(checkpoint_path, map_location=device), model, optimizer)
    else:
        # A fresh run has no checkpoint yet; train from the current weights
        # instead of crashing.
        print(f"=> No checkpoint found at {checkpoint_path}; training from scratch")

model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0

    # `inputs` (not `data`) so the corpus string from the preprocessing cells
    # is not overwritten by a batch tensor.
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Both tensors must live on the model's device; token ids and class
        # targets are cast to int64 as the embedding and the loss require.
        inputs = inputs.to(device=device).long()
        targets = targets.to(device=device).long()

        # forward
        scores = model(inputs)
        loss = criterion(scores, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent update step/adam step
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch figure is
        # a per-sample average even when the last batch is smaller.
        epoch_loss += inputs.shape[0] * loss.item()

    print(f"Epoch: {epoch+1}, Loss: {float(epoch_loss) / float(len(train))}")

    # Save AFTER the epoch has trained. Saving at the top of the loop wrote the
    # weights that had just been loaded and, with a single epoch, never stored
    # the trained model. The final epoch is always saved.
    if epoch % 3 == 0 or epoch == num_epochs - 1:
        checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        save_checkpoint(checkpoint, checkpoint_path)


=> Loading checkpoint
=> Saving checkpoint
Epoch: 1, Loss: 1.687454989005136


In [31]:

# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
    """Return the fraction of samples whose top-scoring class equals the label.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches of tensors.
        model: Module mapping an int64 input batch to ``(batch, classes)`` scores.

    Returns:
        Accuracy as a Python float in ``[0, 1]``; ``0.0`` when the loader
        yields no samples.
    """
    num_correct = 0
    num_samples = 0

    # Evaluate on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Remember the current mode so it can be restored afterwards rather than
    # unconditionally forcing train mode.
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=target_device)
                y = y.to(device=target_device)

                scores = model(x.long())
                _, predictions = scores.max(1)
                # Accumulate plain Python ints: dividing an integer tensor by
                # an int has had version-dependent semantics in PyTorch.
                num_correct += int((predictions == y.long()).sum().item())
                num_samples += predictions.size(0)
    finally:
        model.train(was_training)

    if num_samples == 0:
        return 0.0
    return num_correct / num_samples


# Both lines use two decimals; the first previously used `:2f` (a field width
# of 2, not a precision), which printed six decimals.
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

Accuracy on training set: 0.000000
Accuracy on test set: 0.00


In [70]:
# Pick one example 20 positions before the train/test boundary and decode it.
idx = num_train - 20
ids_x, ids_y = X[idx], Y[idx]

# Convert the stored values to plain Python ints for the tokenizer.
input_x = [int(token_id) for token_id in ids_x]
output_y = int(ids_y)

print(tokenizer.convert_ids_to_tokens(input_x))
print(tokenizer.convert_ids_to_tokens([output_y]))

[10369.   868.   107.]
[10369, 868, 107]
615
['▁rushing', '▁below', '▁them']
['▁almost']


In [71]:
# Predict the 12 most likely next tokens for the sample selected above.
model.eval()
with torch.no_grad():
    # Add a batch dimension: (3,) -> (1, 3)
    x = np.expand_dims(ids_x.astype(int), axis=0)
    x = torch.tensor(x).to(device=device)

    scores = model(x.long())
    _, predictions = scores.max(1)
    class_prob = torch.softmax(scores, dim=1)
    # get the most probable classes and their probabilities:
    class_prob, topclass = torch.topk(class_prob, 12, dim=1)
    print(topclass)
    print(class_prob)

    # A CUDA tensor cannot be converted to NumPy directly; copy it to the
    # host first so this cell also works when `device` is a GPU.
    ys = topclass.squeeze(0).cpu().numpy()

tensor([[615,  31,  17,  20,  21,  18, 280, 200, 175,  35,  34, 102]])
tensor([[0.8662, 0.0245, 0.0189, 0.0101, 0.0076, 0.0058, 0.0058, 0.0045, 0.0032,
         0.0027, 0.0027, 0.0024]])


In [72]:
# Tensor.tolist() returns plain Python ints on any device; going through
# np.array() first fails when `topclass` lives on the GPU.
ys = topclass.squeeze(0).tolist()

print(tokenizer.convert_ids_to_tokens(ys))

['▁almost', '▁on', '▁', '▁of', '▁and', '▁the', '▁come', '▁They', '▁know', '▁I', '▁as', '▁so']
