In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [1]:
import string
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
class Tokenizer:
    def __init__(self):
        # Generate the allowed characters inside the class
        self.special_chars=""
        self.allowed_chars = list(string.ascii_lowercase + string.digits + ".,;()[]{}?!:'" + " \t\n"+self.special_chars)
        self.vocab_size = len(self.allowed_chars)
        self.stoi = {ch: i for i, ch in enumerate(self.allowed_chars)}
        self.itos = {i: ch for i, ch in enumerate(self.allowed_chars)}

    def clean_text(self, text):
        """
        Simplify the input text using set operations. Keep only allowed characters and convert uppercase to lowercase.
        Prints a warning if any character is removed.

        :param text: The input text to be simplified.
        :return: Simplified text as a string.
        """
        text_set = set(text.lower())
        allowed_set = set(self.allowed_chars)
        intersection_set = text_set.intersection(allowed_set)
        removed_chars = text_set.difference(intersection_set)

        if removed_chars:
            print("Warning: The following characters were removed:", ''.join(sorted(removed_chars)))

        return ''.join(char for char in text.lower() if char in allowed_set)

    def encode(self, text):
        """ Encodes a string into a list of integers based on the stoi mapping. """
        return [self.stoi[char] for char in text]

    def decode(self, encoded_text):
        """ Decodes a list of integers into a string based on the itos mapping. """
        return ''.join([self.itos[i] for i in encoded_text])

# Example usage
tokenizer = Tokenizer()

In [4]:
cleaned_text=tokenizer.clean_text(text)
data=tokenizer.encode(cleaned_text)



In [5]:
split_idx=int(len(data)*0.85)
train_data=data[:split_idx]
test_data=data[split_idx:]

In [51]:
class TextLSTM(nn.Module):
    def __init__(self,vocab_size,embed_dim,hidden_dim):
        super(TextLSTM, self).__init__()
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.lstm=nn.LSTM(embed_dim,hidden_dim,batch_first=True)
        #self.fc2=nn.Linear(hidden_dim,vocab_size)
        self.fc1=nn.Linear(hidden_dim,4*vocab_size)
        self.fc2=nn.Linear(4*vocab_size,vocab_size)
        self.hidden_dim=hidden_dim

    def forward(self,x,hidden):
        x=self.embedding(x)
        output,(hidden,cell)=self.lstm(x,hidden)
        output=F.tanh(self.fc1(output))
        output=self.fc2(output)
        return output,(hidden,cell)
    
    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_dim),
                torch.zeros(1, batch_size, self.hidden_dim))

In [52]:
class TextDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, index):
        return (torch.tensor(self.data[index:index+self.seq_length]),
                torch.tensor(self.data[index+1:index+self.seq_length+1]))

def create_data_loader(data, seq_length, batch_size):
    dataset = TextDataset(data, seq_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [55]:
# Parameters
vocab_size = tokenizer.vocab_size # From your tokenizer
embed_dim = 32
hidden_dim = 200
batch_size = 64
seq_length = 200

# Data Loader
train_loader = create_data_loader(train_data, seq_length, batch_size)

# Model Initialization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextLSTM(vocab_size, embed_dim, hidden_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
epochs = 1

In [56]:
# Training loop
model.train()
for i in range(epochs):
    step=0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Initialize hidden and cell states for each batch
        hidden, cell = model.init_hidden(batch_size)
        hidden, cell = hidden.to(device), cell.to(device)

        optimizer.zero_grad()

        # Detach the states from the history of previous computations
        hidden = hidden.detach()
        cell = cell.detach()

        outputs, (hidden, cell) = model(inputs, (hidden, cell))
        targets = targets.reshape(-1)
        outputs = outputs.reshape(-1, vocab_size)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        if step%100==0:
            print(f"Epoch {i}, Step {step/100}, Loss: {loss.item()}")
        step+=1



Epoch 0, Step 0.0, Loss: 3.9405481815338135
Epoch 0, Step 1.0, Loss: 2.387551784515381
Epoch 0, Step 2.0, Loss: 2.081256151199341
Epoch 0, Step 3.0, Loss: 1.9570472240447998
Epoch 0, Step 4.0, Loss: 1.8164162635803223
Epoch 0, Step 5.0, Loss: 1.7197681665420532
Epoch 0, Step 6.0, Loss: 1.728445291519165
Epoch 0, Step 7.0, Loss: 1.6220765113830566
Epoch 0, Step 8.0, Loss: 1.6275790929794312
Epoch 0, Step 9.0, Loss: 1.5457983016967773
Epoch 0, Step 10.0, Loss: 1.5708199739456177
Epoch 0, Step 11.0, Loss: 1.51517653465271
Epoch 0, Step 12.0, Loss: 1.486709713935852
Epoch 0, Step 13.0, Loss: 1.533898949623108
Epoch 0, Step 14.0, Loss: 1.4630956649780273
Epoch 0, Step 15.0, Loss: 1.4470769166946411
Epoch 0, Step 16.0, Loss: 1.4200410842895508
Epoch 0, Step 17.0, Loss: 1.4464285373687744
Epoch 0, Step 18.0, Loss: 1.4266728162765503
Epoch 0, Step 19.0, Loss: 1.3885650634765625
Epoch 0, Step 20.0, Loss: 1.441601276397705
Epoch 0, Step 21.0, Loss: 1.359474778175354
Epoch 0, Step 22.0, Loss: 1.3

In [63]:
def predict_next_tokens(model, initial_text, num_tokens_to_generate, tokenizer, temperature=1.0):
    model.eval()  # Set the model to evaluation mode
    device = next(model.parameters()).device  # Get the device of the model

    # Initialize hidden and cell states
    hidden, cell = model.init_hidden(1)
    hidden, cell = hidden.to(device), cell.to(device)

    # Encode the initial text
    encoded_text = tokenizer.encode(initial_text)

    for i in range(len(encoded_text)):  # Memorizing the initial text into hidden state
        input_sequence = torch.tensor([[encoded_text[i]]], dtype=torch.long).to(device)
        with torch.no_grad():
            output, (hidden, cell) = model(input_sequence, (hidden, cell))

    generated_tokens = encoded_text.copy()  # Start with the initial encoded text

    input_sequence = torch.tensor([[generated_tokens[-1]]], dtype=torch.long).to(device)
    for _ in range(num_tokens_to_generate):
        with torch.no_grad():
            output, (hidden, cell) = model(input_sequence, (hidden, cell))
            probabilities = F.softmax(output[:, -1, :] / temperature, dim=1).squeeze()
            predicted_token = torch.multinomial(probabilities, 1).item()
            generated_tokens.append(predicted_token)
            input_sequence = torch.tensor([[predicted_token]], dtype=torch.long).to(device)

    # Decode the generated tokens back to text
    predicted_text = tokenizer.decode(generated_tokens)
    return predicted_text

# Example usage
initial_text = "senate senate senate senate senate"
num_tokens_to_generate = 2000
temperature = 0.01  # Adjust for randomness
predicted_text = predict_next_tokens(model, initial_text, num_tokens_to_generate, tokenizer, temperature)
print(predicted_text)

senate senate senate senate senateth.
what says he doth she have strict me to the court?

king richard iii:
say, i am a strange to the common sort,
the common sorrow and the prince my lord of warwick,
that i may be the duke of gloucester, and the
sight of heaven with the state of death and him.

king richard iii:
say, i am a strange to the common sort,
the common sorrow and the prince my lord of warwick,
that i may be the state of warwick and the world
to see the state of death of the stars
that i have straight to the common sort,
the common sorrow and the prince my lord of warwick,
that i may be the state of warwick and the world
to see the state of death of the state,
and the prince my lord of norfolk, and the death
of the devil that thou art to be a king.

king henry vi:
and what is this the duke of gloucester, the gods
as i say the sea of her beginate and the
bring the streets of the common sort and stay
the strength and hanged to the common sort,
the common sorrow and the prince m

In [54]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

total_params = count_parameters(model)
print(f"Total trainable parameters: {total_params}")


Total trainable parameters: 241540
