In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [1]:
import string
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
class Tokenizer:
    def __init__(self):
        # Generate the allowed characters inside the class
        self.special_chars=""
        self.allowed_chars = list(string.ascii_lowercase + string.digits + ".,;()[]{}?!:'" + " \t\n"+self.special_chars)
        self.vocab_size = len(self.allowed_chars)
        self.stoi = {ch: i for i, ch in enumerate(self.allowed_chars)}
        self.itos = {i: ch for i, ch in enumerate(self.allowed_chars)}

    def clean_text(self, text):
        """
        Simplify the input text using set operations. Keep only allowed characters and convert uppercase to lowercase.
        Prints a warning if any character is removed.

        :param text: The input text to be simplified.
        :return: Simplified text as a string.
        """
        text_set = set(text.lower())
        allowed_set = set(self.allowed_chars)
        intersection_set = text_set.intersection(allowed_set)
        removed_chars = text_set.difference(intersection_set)

        if removed_chars:
            print("Warning: The following characters were removed:", ''.join(sorted(removed_chars)))

        return ''.join(char for char in text.lower() if char in allowed_set)

    def encode(self, text):
        """ Encodes a string into a list of integers based on the stoi mapping. """
        return [self.stoi[char] for char in text]

    def decode(self, encoded_text):
        """ Decodes a list of integers into a string based on the itos mapping. """
        return ''.join([self.itos[i] for i in encoded_text])

# Example usage
tokenizer = Tokenizer()

In [4]:
cleaned_text=tokenizer.clean_text(text)
data=tokenizer.encode(cleaned_text)



In [5]:
split_idx=int(len(data)*0.85)
train_data=data[:split_idx]
test_data=data[split_idx:]

In [6]:
class TextRNN(nn.Module):
    def __init__(self,vocab_size,embed_dim,hidden_dim):
        super(TextRNN, self).__init__()
        self.embedding=nn.Embedding(vocab_size,embed_dim) #[batch_size,seq_length] -> [batch_size,seq_length,vocab_size] (one hot) -> [batch_size,seq_len,embed_dim] (embedding)
        self.rnn=nn.RNN(embed_dim,hidden_dim,batch_first=True) #[batch_size,seq_length,embed_dim] -> [batch_size,seq_length,hidden_size](output) , [1,batch_size,hidden_dim](hidden)
        self.fc=nn.Linear(hidden_dim,vocab_size) #[hidden_dim] -> [vocab_size]
        self.hidden_dim=hidden_dim

    def forward(self,x,hidden):
        x=self.embedding(x)
        output,hidden=self.rnn(x,hidden)
        output=self.fc(output)
        return output,hidden
    
    def init_hidden(self,batch_size):
        return torch.zeros(1,batch_size,self.hidden_dim)

In [7]:
class TextDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, index):
        return (torch.tensor(self.data[index:index+self.seq_length]),
                torch.tensor(self.data[index+1:index+self.seq_length+1]))

def create_data_loader(data, seq_length, batch_size):
    dataset = TextDataset(data, seq_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [8]:
# Parameters
vocab_size = tokenizer.vocab_size # From your tokenizer
embed_dim = 32
hidden_dim = 64
batch_size = 64
seq_length = 200

# Data Loader
train_loader = create_data_loader(train_data, seq_length, batch_size)

# Model Initialization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextRNN(vocab_size, embed_dim, hidden_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
epochs = 10

In [9]:
model.train()
hidden=model.init_hidden(batch_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=8e-4,weight_decay=1e-4)

for i in range(epochs):
    for inputs, targets in train_loader:
        inputs,targets=inputs.to(device),targets.to(device)
        hidden=hidden.detach()
        optimizer.zero_grad()
        outputs,hidden=model(inputs,hidden)
        targets=targets.reshape(-1)
        outputs=outputs.reshape(-1,vocab_size)
        loss=criterion(outputs,targets)
        loss.backward()
        optimizer.step()
        print(f"Epoch {i}, Loss: {loss.item()}")

Epoch 0, Loss: 4.004654407501221
Epoch 0, Loss: 3.9801807403564453
Epoch 0, Loss: 3.959139108657837
Epoch 0, Loss: 3.9377193450927734
Epoch 0, Loss: 3.915971279144287
Epoch 0, Loss: 3.89738392829895
Epoch 0, Loss: 3.8746819496154785
Epoch 0, Loss: 3.853184700012207
Epoch 0, Loss: 3.82606840133667
Epoch 0, Loss: 3.803192377090454
Epoch 0, Loss: 3.7757062911987305
Epoch 0, Loss: 3.7557013034820557
Epoch 0, Loss: 3.729123592376709
Epoch 0, Loss: 3.69992733001709
Epoch 0, Loss: 3.669398307800293
Epoch 0, Loss: 3.636410713195801
Epoch 0, Loss: 3.5978260040283203
Epoch 0, Loss: 3.557934522628784
Epoch 0, Loss: 3.5326225757598877
Epoch 0, Loss: 3.483281135559082
Epoch 0, Loss: 3.4439361095428467
Epoch 0, Loss: 3.4060986042022705
Epoch 0, Loss: 3.3591089248657227
Epoch 0, Loss: 3.325748920440674
Epoch 0, Loss: 3.2922234535217285
Epoch 0, Loss: 3.260511875152588
Epoch 0, Loss: 3.226857900619507
Epoch 0, Loss: 3.200868844985962
Epoch 0, Loss: 3.1785662174224854
Epoch 0, Loss: 3.1556882858276367


KeyboardInterrupt: 

In [10]:
def predict_next_tokens(model, initial_text, num_tokens_to_generate, tokenizer, temperature=1.0):
    model.eval()  # Set the model to evaluation mode

    # Initialize hidden state
    hidden = model.init_hidden(1).to(device)

    # Encode the initial text
    encoded_text = tokenizer.encode(initial_text)

    for i in range(len(encoded_text)):
        input_sequence = torch.tensor([encoded_text[i]]).unsqueeze(0).to(device)  # Use only the last token of the initial text
        output, hidden = model(input_sequence, hidden)

    

    generated_tokens = encoded_text  # Start with the initial encoded text

    for _ in range(num_tokens_to_generate):
        with torch.no_grad():
            output, hidden = model(input_sequence, hidden)
        
        # Apply softmax to the output to get probability distribution
        probabilities = F.softmax(output / temperature, dim=2).squeeze()

        # Sample from the probability distribution
        predicted_token = torch.multinomial(probabilities, 1).item()
        generated_tokens.append(predicted_token)

        # Update the input for the next prediction
        input_sequence = torch.tensor([[predicted_token]]).to(device)

    # Decode the generated tokens back to text
    predicted_text = tokenizer.decode(generated_tokens)
    return predicted_text

# Example usage
initial_text = "hello world!"
num_tokens_to_generate = 2000
temperature = 0.5  # Adjust for randomness
predicted_text = predict_next_tokens(model, initial_text, num_tokens_to_generate, tokenizer, temperature)
print(predicted_text)


hello world!

porrust fair and so for sire and east an them my the could the minds of they sir, you but thy good the love that of last and how shall all this of is of the martheming is to hath him in your let we and the but is is nencest see do good is to he to and not the right to good it serven the most this i have not my lord of the bauled to set the by the had her be mine thou so for the comes her it and soust that seets is beare and the canthere it bound in resellage, so math thee he want a to poon your here mane the wark'd in the but the sours, who speak of here be more some that he and the come of that in him of this here here he in good to good to the good it these greise to am the shall a lord of his some the charge,
i ding so the for the end in the earth him and the love for all the serven the poor the come as for the man the streacuse, the best this heard lord not the could me parry of hath her who are be a concence the mortengle and stand to shall and the concest it to here

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

total_params = count_parameters(model)
print(f"Total trainable parameters: {total_params}")


Total trainable parameters: 11316
