In [1]:
#download the training data
import requests

url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
response = requests.get(url)

with open('sample.txt', 'wb') as file:
    file.write(response.content) # write the content of the response to a new file

In [2]:
#read the data
with open('sample.txt', 'r') as file:
    text = file.read()
print("length of text in characters: ", len(text))
#print(text[:1000])

length of text in characters:  1115394


In [7]:
#get all the unique characters in the file
vocab = sorted(set(text)) #use the set function to get all the unique characters in the text, then sort the
vocab_size = len(vocab)
print("Number of unique characters: ", vocab_size, ''.join(vocab))

Number of unique characters:  65 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
#tokenize the text
char_to_index = {char:index for index, char in enumerate(vocab)} #create a dictionary that maps each unique character to a unique index
index_to_char = {index:char for char, index in char_to_index.items()} #create a dictionary that maps each unique index to a unique character

def encode(text):
    return [char_to_index[char] for char in text] #convert the text to a list of indices that maps to some character, using the char_to_index dictionary

def decode(indices):
    return ''.join([index_to_char[index] for index in indices]) #convert the list of indices back to text, using the index_to_char dictionary

#print('encoded text: ', encode(text[:100]), 'decoded text: ', decode(encode(text[:100])))

In [9]:
#encode the entire dataset
encoded_text = encode(text)
#turn this into a pytorch dataset
import torch
data = torch.tensor(encoded_text, dtype=torch.int64) #convert the encoded text to a tensor
print(data.shape, data.dtype)
#split the data into training and validation sets
train_data = data[:int(0.9*len(data))] #90% of the data for training
val_data = data[int(0.9*len(data)):] #10% of the data for validation

torch.Size([1115394]) torch.int64


In [115]:
#batch the data
torch.manual_seed(1337) # Set seed for reproducibility

sequence_size = 8 # the length of charcters in the sequence that we will use to make the prediction
batch_size = 32 # the number of sequences in a batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #check if cuda is available

def get_batches(data, sequence_size): #function to get the batches of data and the corresponding ouput sequences (with 1 extra character)
    ix =torch.randint(len(data)-sequence_size, (batch_size,)) #get a random starting index for each sequence in the batch
    
    x = torch.stack([data[i:i+sequence_size] for i in ix]) #get the sequences, and stack them into a tensor of shape (batch_size, sequence_size)
    y = torch.stack([data[i+1:i+1+sequence_size] for i in ix]) #get the target sequences with 1 extra character
    x, y = x.to(device), y.to(device) #move the data to the device 
    return x, y

# print saple batch
x, y = get_batches(train_data, sequence_size)
print('input',x.shape, '\n', x ,'\n', 'output', y.shape,'\n', y)

input torch.Size([32, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54],
        [57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46],
        [43,  1, 51, 39, 63,  1, 40, 43],
        [58, 46, 43,  1, 43, 39, 56, 57],
        [39, 58, 47, 53, 52, 12,  1, 37],
        [53, 56, 43,  1, 21,  1, 41, 39],
        [50, 39, 52, 63,  1, 47, 58, 57],
        [56, 53, 63,  1, 42, 47, 42,  1],
        [39, 51,  1, 39, 44, 56, 39, 47],
        [17, 24, 21, 38, 13, 14, 17, 32],
        [ 1, 39, 52, 42,  1, 45, 43, 50],
        [ 1, 58, 46, 39, 58,  1, 42, 53],
        [ 1, 61, 53, 59, 50, 42,  1, 21],
        [59, 57, 40, 39, 52, 42,  1, 40],
        [52, 42,  8,  0,  0, 23, 21, 26],
        [45, 53, 42, 57,  0, 23, 43, 43],
        [52,  1, 61, 39, 57,  1, 51, 53],
      

In [116]:
import torch.nn as nn

# Define the criterion
criterion = nn.CrossEntropyLoss()

#estimate loss
def estimate_loss(model, x, y):
    with torch.no_grad(): #we don't need to compute the gradients
        y_pred, _, _ = model(x) #get the predictions
        loss = criterion(y_pred.transpose(1,2), y) #calculate the loss
        return loss.item() #return the loss as a scalar

In [209]:
#build the model
import torch.nn as nn
from torch.nn import functional as F


class TransformerModel(nn.Module):
    def __init__(self, vocab_size, hidden_size=128, num_layers=1, nhead=8, lr=0.001):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        
        self.pos_encoder = nn.Parameter(torch.zeros(1, 1000, hidden_size))  # Positional encoding
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.optimizer = torch.optim.Adam(self.parameters(), lr)  # Adjusted learning rate

    def forward(self, x, h=None):
        x = self.embedding(x) + self.pos_encoder[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.fc(x)
        loss = F.cross_entropy(x.permute(0, 2, 1), y)
        return x, h, loss

    def generate(self, h=None, max_len=128, temperature=0.1):
        output = []
        x = torch.randint(vocab_size, (1, 1), dtype=torch.int64)
        for _ in range(max_len):
            x = self.embedding(x) + self.pos_encoder[:, :x.size(1), :]
            x = self.transformer_encoder(x)
            x = self.fc(x)
            #print(x.shape)
            x = x.squeeze(1) / temperature # Softmax temperature adjustment
            #print(x.shape)
            x = F.softmax(x, dim=1)
            x = torch.multinomial(x, 1)
            output.append(x.item())
        return output

    def backwardprop(self, loss):
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

    def train(self, x, y, epochs=10):
        losses = []
        for epoch in range(epochs):
            x, y = get_batches(train_data, sequence_size)
            y_pred, _, loss = neural_network(x)
            self.backwardprop(loss)
            losses.append(loss.item())
            if epoch % 100 == 0:
                estimated_loss = estimate_loss(neural_network, x, y)
                print(f'Epoch: {epoch}, Loss: {loss.item()} estimated loss: {estimated_loss}')
        return losses

nn = TransformerModel(vocab_size)
neural_network = nn.to(device)
x, y = get_batches(train_data, sequence_size) #get a sample batch

# generate some text
generated_text = neural_network.generate(max_len=100)

In [278]:
import torch.nn as nn
from torch.nn import functional as F

class AutoencoderModel(nn.Module):
    def __init__(self, vocab_size, hidden_size=128, num_layers=1, nhead=8, lr=0.001):
        super(AutoencoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True), num_layers)
        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(hidden_size, nhead, batch_first=True), num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.optimizer = torch.optim.Adam(self.parameters(), lr)
        
    def forward(self, x, y):
        x_embedded = self.embedding(x)
        y_embedded = self.embedding(y)
        x_encoded = self.encoder(x_embedded)
        y_decoded = self.decoder(y_embedded, x_encoded)
        output = self.fc(y_decoded)
        loss = F.cross_entropy(output.view(-1, output.size(-1)), y.view(-1))
        return output, loss
    
    def backwardprop(self, loss):
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        
    def train(self, x, y, epochs=10):
        losses = []
        for epoch in range(epochs):
            y_pred, loss = self.forward(x, y)
            self.backwardprop(loss)
            losses.append(loss.item())
            if epoch % 100 == 0:
                #estimated_loss = estimate_loss(self, x, y)
                print(f'Epoch: {epoch}, Loss: {loss.item()}')
        return losses
    
    def generate_text(self, input_sequence, max_length=50):
        generated_sequence = input_sequence
        x_embedded = self.embedding(input_sequence)
        x_encoded = self.encoder(x_embedded)
    
        for _ in range(max_length):
            last_token = generated_sequence[:, -1:]
            y_embedded = self.embedding(last_token)
            y_decoded = self.decoder(y_embedded, x_encoded)
            output = self.fc(y_decoded)
    
            next_token = output.argmax(dim=-1) 
            
            #next_token = next_token.view(-1).cpu().numpy()
            next_token = next_token[:, -1:]  # Ensure the shape is [batch_size, 1]
            generated_sequence = torch.cat((generated_sequence, next_token), dim=1)
           
    
        return generated_sequence
    
nn = AutoencoderModel(vocab_size)

neural_network = nn.to(device)
x, y = get_batches(train_data, sequence_size) #get a sample batch

In [279]:
#train the model
neural_network.train(x, y, epochs=100)

x, y = get_batches(train_data, sequence_size) #get a sample batch

#input_text = 'hello and welcome to the world of tomorrow'
#x = torch.tensor(encode(input_text), dtype=torch.int64).unsqueeze(0).to(device)

#generate some text
generated_text = neural_network.generate_text(x, max_length=10)
generated_text = generated_text.view(-1).cpu().numpy()



generated_text = remove_consecutive_duplicates(generated_text)
print(decode(generated_text), )

Epoch: 0, Loss: 4.244791030883789
th is a ices heaefs,
Tele done, e, sir, o both t fare yobotom of me?
I often he feasti' the ha pair oriend: a goad us new caryour cluBRUTUS:
ld escapst kind wil ape biter from thst apeaeart thi you likIUS:
I d would wFORD:
Igold andwn this,rd for a


In [211]:
#predict the output

y_pred, _, loss = neural_network(x) #get the predictions


generated_text = neural_network.generate(max_len=100)
print(generated_text)
print(decode(generated_text)) #print the generated text "verry good text"


losses = neural_network.train(x, y, epochs=500) #train the neural network


[11, 55, 59, 6, 39, 40, 9, 34, 5, 3, 40, 9, 34, 5, 28, 60, 3, 29, 13, 51, 26, 1, 47, 58, 47, 10, 64, 36, 34, 5, 45, 54, 49, 27, 17, 37, 30, 44, 46, 41, 19, 25, 33, 29, 25, 17, 37, 42, 38, 5, 3, 45, 54, 49, 9, 34, 5, 28, 60, 60, 3, 40, 9, 47, 50, 27, 3, 45, 54, 49, 29, 43, 49, 9, 34, 5, 15, 60, 3, 45, 54, 49, 9, 47, 61, 56, 57, 35, 37, 42, 25, 57, 10, 64, 36, 56, 28, 60, 3, 45]
;qu,ab3V'$b3V'Pv$QAmN iti:zXV'gpkOEYRfhcGMUQMEYdZ'$gpk3V'Pvv$b3ilO$gpkQek3V'Cv$gpk3iwrsWYdMs:zXrPv$g
Epoch: 0, Loss: 4.320104122161865 estimated loss: 3.8228702545166016
Epoch: 100, Loss: 3.0788838863372803 estimated loss: 3.6573145389556885
Epoch: 200, Loss: 2.7641119956970215 estimated loss: 3.6711440086364746
Epoch: 300, Loss: 2.6521413326263428 estimated loss: 4.234762191772461


KeyboardInterrupt: 

In [141]:

import matplotlib.pyplot as plt

losses = neural_network.train(x, y, epochs=10000) #train the neural network
print(losses[-1]) #print the losses
print(decode(neural_network.generate(max_len=100))) #print the generated text "sligthly better text"

plt.plot(losses) #plot the losses

RuntimeError: Expected target size [32, 8], got [32, 8, 65]