In [None]:
import torch

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the corpus: keep every non-blank line with surrounding whitespace removed.
filename = '/content/shakespeare_data.txt'
lines = []
counter = 0  # NOTE(review): never read in the visible cells; kept so the global still exists

# An explicit encoding keeps the result independent of the platform's default locale.
with open(filename, encoding="utf-8") as files:
    for line in files:
        pure_line = line.strip()

        # Blank (whitespace-only) lines are dropped.
        if pure_line:
            lines.append(pure_line)

n_lines = len(lines)
print(f"Number of lines: {n_lines}")

Number of lines: 49562


In [None]:
print("\n".join(lines[506:514]))

BENVOLIO	Here were the servants of your adversary,
And yours, close fighting ere I did approach:
I drew to part them: in the instant came
The fiery Tybalt, with his sword prepared,
Which, as he breathed defiance to my ears,
He swung about his head and cut the winds,
Who nothing hurt withal hiss'd him in scorn:
While we were interchanging thrusts and blows,


In [None]:
# Character-level vocabulary: two reserved entries ("[UNK]" at id 0 and the
# empty string at id 1) followed by every distinct character of the
# newline-joined corpus in sorted order.
text = "\n".join(lines)
vocab = ["[UNK]", ""] + sorted(set(text))
print(f'{len(vocab)} unique characters')
print(" ".join(vocab))

81 unique characters
[UNK]  	 
   ! & ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z |


In [None]:
line = "Hello world!"
chars = list(line)
print(chars)

['H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']


In [None]:
print(vocab.index('a'))
print(vocab.index('e'))
print(vocab.index('i'))
print(vocab.index('o'))
print(vocab.index('u'))
print(vocab.index(' '))
print(vocab.index('2'))
print(vocab.index('3'))

54
58
62
68
74
4
15
16


In [None]:
vocab.index(" ")

4

In [None]:
len(vocab)

81

In [None]:
# The final 1000 lines form the held-out validation set; everything before
# them is used for training.
eval_lines = lines[-1000:]
train_lines = lines[:-1000]

print(f"Number of training lines: {len(train_lines)}")
print(f"Number of validation lines: {len(eval_lines)}")

Number of training lines: 48562
Number of validation lines: 1000


In [None]:
# Length (in characters) of the longest line across both splits; the bare
# name at the end displays it as the cell result.
maxx = 0
for split in (train_lines, eval_lines):
  for line in split:
    if len(line) > maxx:
      maxx = len(line)
maxx

81

In [None]:
train_lines[0]

"A LOVER'S COMPLAINT"

In [None]:
train_lines[1]

'FROM off a hill whose concave womb reworded'

In [None]:
len(train_lines)

48562

In [None]:
line = train_lines[0]
print(type(line))
chartoids = [vocab.index(i) for i in line]
print(chartoids)

<class 'str'>
[26, 4, 37, 40, 47, 30, 43, 7, 44, 4, 28, 40, 38, 41, 37, 26, 34, 39, 45]


In [None]:
def padding(text, max_length=85, pad_id=0):
  """Return a new list of exactly ``max_length`` ids.

  Shorter sequences are right-padded with ``pad_id``; longer sequences are
  truncated, so every result has the same length and can be stacked into a
  batch tensor.

  Args:
    text: sequence of integer token ids.
    max_length: length of the returned list (must be >= 0).
    pad_id: id appended to fill the sequence up to ``max_length``.

  Returns:
    A new list; the input sequence is not modified.

  Raises:
    ValueError: if ``max_length`` is negative.
  """
  if max_length < 0:
    raise ValueError("max_length must be non-negative")
  # Slicing both truncates over-long input and copies it.
  clipped = list(text[:max_length])
  return clipped + [pad_id] * (max_length - len(clipped))

In [None]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Character-level next-token dataset over a list of text lines.

    Each item is one line encoded with the notebook's ``vocab``, padded by
    ``padding`` and split into an (inputs, targets) pair in which the targets
    are the inputs shifted left by one position.
    """

    def __init__(self, data):
        """Store the lines and build the character -> id lookup table once.

        Args:
            data: indexable collection of strings (one line per item).
        """
        self.data = data
        # A dict lookup replaces a linear `vocab.index` scan per character.
        # `setdefault` keeps the first occurrence, which is what `list.index`
        # returns. The table is a snapshot of `vocab` at construction time.
        self.char_to_id = {}
        for token_id, char in enumerate(vocab):
            self.char_to_id.setdefault(char, token_id)

    def __len__(self):
        """Number of lines in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` long tensors for line ``idx``."""
        line = self.data[idx]
        # Characters missing from the vocabulary map to id 0, the slot the
        # notebook reserves for "[UNK]", instead of raising ValueError.
        chartoids = [self.char_to_id.get(char, 0) for char in line]
        padtext = padding(chartoids)
        input_ids = torch.tensor(padtext, dtype=torch.long)
        # Next-character prediction: drop the last id for the inputs and the
        # first id for the targets.
        inputs = input_ids[:-1]
        target = input_ids[1:]
        return inputs, target

In [None]:
# Wrap the training lines and serve them as shuffled mini-batches of two.
dataset = CustomDataset(train_lines)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [None]:
next(iter(dataloader))

[tensor([[61, 58, 65, 66, 58, 57,  4, 66, 74, 72, 73,  4, 74, 69, 68, 67,  4, 54,
           4, 76, 54, 71, 71, 54, 67, 73, 58, 57,  4, 67, 58, 58, 57,  4, 60, 62,
          75, 58,  4, 61, 62, 66,  4, 54,  4, 55, 58, 73, 73, 58, 71,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [45, 61, 58,  4, 64, 62, 67, 60,  4, 72, 61, 54, 65, 65,  4, 61, 54, 75,
          58,  4, 66, 78,  4, 72, 58, 71, 75, 62, 56, 58, 23,  4, 55, 74, 73,  4,
          66, 78,  4, 69, 71, 54, 78, 58, 71, 72,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]),
 tensor([[58, 65, 66, 58, 57,  4, 66, 74, 72, 73,  4, 74, 69, 68, 67,  4, 54,  4,
          76, 54, 71, 71, 54, 67, 73, 58, 57,  4, 67, 58, 58, 57,  4, 60, 62, 75,
          58,  4, 61, 62, 66,  4, 54,  4, 55, 58, 73, 73, 58

In [None]:
import torch
import torch.nn as nn

class GRULM(nn.Module):
    """Language model: embedding -> single-layer GRU -> linear -> log-softmax.

    Args:
        vocab_size: number of token ids (embedding rows and output classes).
        embedding_dim: size of each token embedding.
        rnn_units: hidden size of the GRU.
    """

    def __init__(self, vocab_size=256, embedding_dim=256, rnn_units=128):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, rnn_units, batch_first=True)
        self.dense = nn.Linear(rnn_units, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, inputs, states=None, return_state=False):
        """Compute per-position log-probabilities over the vocabulary.

        Args:
            inputs: long tensor of token ids, batch dimension first.
            states: optional initial GRU hidden state; ``None`` means zeros.
            return_state: when true, also return the final GRU hidden state.

        Returns:
            The log-probability tensor, or ``(log_probs, states)`` when
            ``return_state`` is true.
        """
        x = self.embedding(inputs)
        # nn.GRU zero-initialises the hidden state itself when given None, on
        # the same device and dtype as its input, so the module does not need
        # a notebook-global `device`.
        x, states = self.gru(x, states)
        x = self.log_softmax(self.dense(x))
        if return_state:
            return x, states
        return x


In [None]:

# Size the embedding and output layers from the vocabulary itself, so the
# model always has exactly one class per vocabulary entry.
vocab_size = len(vocab)

embedding_dim = 256

rnn_units = 512

model = GRULM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
import torch.optim as optim
# GRULM.forward already ends in LogSoftmax, so the matching criterion is
# NLLLoss; CrossEntropyLoss would apply a second log-softmax to the outputs.
# NOTE(review): positions filled by `padding` (id 0) are still scored here;
# consider ignore_index=0 if padding should not contribute to the loss.
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr= 0.00125)

In [None]:
model.to(device)

data_loader = dataloader

model.train()
num_epochs = 15

for epoch in range(num_epochs):
    # Accumulate on-device so the loop does not synchronise on every batch.
    running_loss = torch.zeros((), device=device)
    n_batches = 0

    for inputs, targets in data_loader:

        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)

        # Flatten the per-position predictions and targets so the criterion
        # sees one row per predicted character.
        outputs = outputs.view(-1, model.dense.out_features)
        targets = targets.view(-1)
        loss = loss_fn(outputs, targets)

        loss.backward()

        optimizer.step()

        running_loss += loss.detach()
        n_batches += 1

    # Report the mean loss over the whole epoch rather than the loss of the
    # last mini-batch, which is too noisy to show a trend; an empty loader
    # yields NaN instead of an undefined `loss`.
    epoch_loss = (running_loss / n_batches).item() if n_batches else float("nan")
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}")

Epoch 1/15, Loss: 0.15330572426319122
Epoch 2/15, Loss: 0.7431403398513794
Epoch 3/15, Loss: 0.5682281255722046
Epoch 4/15, Loss: 0.6760185956954956
Epoch 5/15, Loss: 0.6278946995735168
Epoch 6/15, Loss: 0.7835381031036377
Epoch 7/15, Loss: 0.11920179426670074
Epoch 8/15, Loss: 0.6999744772911072
Epoch 9/15, Loss: 0.3609185814857483
Epoch 10/15, Loss: 0.7994700074195862
Epoch 11/15, Loss: 0.5642611384391785
Epoch 12/15, Loss: 0.9430555701255798
Epoch 13/15, Loss: 0.7064498662948608
Epoch 14/15, Loss: 0.6940537691116333
Epoch 15/15, Loss: 0.8866020441055298


In [None]:
def log_perplexity(preds, target, pad_id=0):
    """Mean per-sequence negative log-likelihood, ignoring padded positions.

    Args:
        preds: log-probabilities whose last dimension indexes the classes and
            whose dimension 1 indexes the positions of a sequence.
        target: long tensor of class ids, one per position of ``preds``.
        pad_id: target id treated as padding. Defaults to 0, the value
            ``padding`` fills with.

    Returns:
        A Python float: for each sequence the log-probability of the target
        ids is averaged over its non-padding positions, the sequences are
        averaged, and the sign is flipped. Sequences that consist only of
        padding are skipped; NaN is returned when no sequence has a
        non-padding position.
    """
    # Pick the log-probability of each target id directly; this needs no
    # one-hot tensor and no `torch.nn.functional` import.
    token_log_p = preds.gather(-1, target.unsqueeze(-1)).squeeze(-1).float()
    non_pad = target.ne(pad_id).float()

    tokens_per_row = non_pad.sum(dim=1)
    has_tokens = tokens_per_row > 0
    if not bool(has_tokens.any()):
        return float("nan")

    row_log_p = (token_log_p * non_pad).sum(dim=1)
    # Only rows with at least one real token are averaged (avoids 0/0).
    log_ppx = (row_log_p[has_tokens] / tokens_per_row[has_tokens]).mean()

    return -log_ppx.item()


In [None]:
import torch

def evaluate(model, test_loader, loss_fn, device, pad_id=None):
    """Compute the average loss and token accuracy of ``model`` on a loader.

    Args:
        model: module mapping a batch of inputs to per-position scores whose
            last dimension indexes the classes.
        test_loader: iterable of ``(inputs, targets)`` batches.
        loss_fn: criterion called as ``loss_fn(flat_outputs, flat_targets)``.
        device: device the batches are moved to before the forward pass.
        pad_id: optional target id to exclude from the accuracy. With the
            default ``None`` every position counts, padded ones included.
            The loss is whatever ``loss_fn`` computes and is not affected.

    Returns:
        ``(avg_loss, accuracy)`` as Python floats. ``avg_loss`` weights each
        batch's loss by its batch size. A value is NaN when there was nothing
        to average (empty loader, or every target equal to ``pad_id``).
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    n_samples = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)

            # One row per predicted position; the class count is read from
            # the output itself rather than from a model attribute.
            outputs = outputs.reshape(-1, outputs.size(-1))
            targets = targets.reshape(-1)

            loss = loss_fn(outputs, targets)

            batch_size = inputs.size(0)
            test_loss += loss.item() * batch_size
            n_samples += batch_size

            _, predicted = torch.max(outputs, 1)
            hits = predicted == targets
            if pad_id is None:
                total += targets.size(0)
            else:
                # Score only the positions that hold a real target.
                keep = targets != pad_id
                hits = hits & keep
                total += keep.sum().item()
            correct += hits.sum().item()

    avg_loss = test_loss / n_samples if n_samples else float("nan")
    accuracy = correct / total if total else float("nan")

    return avg_loss, accuracy

# Score the trained model on the held-out lines, in order, 64 lines per batch.
model.to(device)
test_loader = DataLoader(dataset=CustomDataset(eval_lines), batch_size=64, shuffle=False)
avg_loss, accuracy = evaluate(model, test_loader, loss_fn, device)
print(f'Average Test Loss: {avg_loss}, Test Accuracy: {accuracy}')


Average Test Loss: 0.6056719660758972, Test Accuracy: 0.8100595238095238


In [None]:
def temperature_random_sampling(log_probs, temperature=1.0):
    """Sample one class index by adding scaled Gumbel noise and taking argmax.

    Args:
        log_probs: tensor of log-probabilities; the result must reduce to a
            single element (e.g. a 1-D tensor over the classes).
        temperature: multiplier on the noise. ``0`` gives the plain argmax;
            larger values make the choice more random.

    Returns:
        The selected index as a Python int.
    """
    # The noise is drawn with the default (CPU) generator and then moved to
    # wherever `log_probs` lives, so no notebook-global `device` is needed.
    # Clamping keeps both logarithms finite.
    u = torch.rand(log_probs.size()).clamp(min=1e-6, max=1.0 - 1e-6).to(log_probs.device)

    g = -torch.log(-torch.log(u))

    return torch.argmax(log_probs + g * temperature, dim=-1).item()


In [None]:
def text_from_ids(ids, vocab):
    """Concatenate the vocabulary entries selected by ``ids`` into one string.

    Args:
        ids: iterable of integer positions into ``vocab``.
        vocab: sequence of strings; position ``i`` is the text for id ``i``.

    Returns:
        The joined string. An id with no vocabulary entry raises ``KeyError``.
    """
    lookup = dict(enumerate(vocab))
    pieces = []
    for token_id in ids:
        pieces.append(lookup[token_id])
    return ''.join(pieces)


In [None]:
import torch
import torch.nn as nn

class GenerativeModel(nn.Module):
    """Autoregressive character sampler built on a trained language model.

    Args:
        model: module called as ``model(input_ids, states, True)`` that
            returns ``(log_probs, states)``.
        vocab: list of strings; position ``i`` is the text for id ``i``.
        temperature: noise scale passed to ``temperature_random_sampling``.
    """

    def __init__(self, model, vocab, temperature=0.1):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.vocab = vocab

    def _encode(self, chars):
        """Map each character to its id in this instance's vocabulary."""
        return [self.vocab.index(char) for char in chars]

    def _step(self, ids, states=None):
        """Feed ``ids`` to the model and sample the id that follows them."""
        input_ids = torch.tensor(ids, dtype=torch.long)
        # Run on whatever device the wrapped model's parameters live on
        # rather than on a notebook-global `device`.
        first_param = next(self.model.parameters(), None)
        if first_param is not None:
            input_ids = input_ids.to(first_param.device)
        input_ids = input_ids.unsqueeze(0)

        # Sampling needs no gradients; without this the recurrent state would
        # keep an ever-growing autograd graph alive across steps.
        with torch.no_grad():
            log_probs, states = self.model(input_ids, states, True)

        # Only the distribution after the last fed position is sampled.
        next_id = temperature_random_sampling(log_probs[0, -1, :], self.temperature)
        return next_id, states

    def generate_one_step(self, inputs, states=None):
        """Sample the text that follows ``inputs``.

        Args:
            inputs: iterable of characters, each present in the vocabulary.
            states: recurrent state from a previous step, or ``None``.

        Returns:
            ``(predicted_text, states)`` where ``predicted_text`` is the
            vocabulary entry of the sampled id.
        """
        next_id, states = self._step(self._encode(inputs), states)
        predicted_chars = text_from_ids([next_id], self.vocab)
        return predicted_chars, states

    def generate_n_chars(self, num_chars, prefix):
        """Return ``prefix`` followed by ``num_chars`` sampled tokens.

        Raises:
            ValueError: if tokens are requested but ``prefix`` is empty, since
                the model needs at least one input position.
        """
        ids = self._encode(prefix)
        if num_chars > 0 and not ids:
            raise ValueError("prefix must contain at least one character")

        states = None
        result = list(prefix)
        for _ in range(num_chars):
            next_id, states = self._step(ids, states)
            result.append(text_from_ids([next_id], self.vocab))
            # Feed the sampled id back directly. Re-encoding its text would
            # split a multi-character entry such as "[UNK]" into separate
            # characters and turn the empty entry into an empty input.
            ids = [next_id]

        return ''.join(result)


In [None]:
import torch

# Fix the RNG seed so the sampled continuation is reproducible.
torch.manual_seed(272)
gen = GenerativeModel(model, vocab, temperature=0.1)

# Sample three characters after the prompt, then print a separator rule.
print(gen.generate_n_chars(3, "i love y"), '\n\n' + '_'*80)


i love your 

________________________________________________________________________________
