In [1]:
# Load the dataset
with open('input.txt', 'r') as file:
    text = file.read()

print(f"Length of text: {len(text)} characters")
print(f"Sample text: {text[:500]}")  # Display first 500 characters
print(f"Unique characters: {sorted(set(text))}")


Length of text: 1115394 characters
Sample text: First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor
Unique characters: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [2]:
# Create a mapping from character to index and vice versa
unique_chars = sorted(set(text))


In [3]:

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}


In [4]:

# Encode the entire text into a list of indices
encoded_text = [char_to_index[char] for char in text]

print(f"Encoded text: {encoded_text[:1000]}")  


Encoded text: [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59, 1, 39, 56, 43, 1, 39, 50, 50, 1, 56, 43, 57, 53, 50, 60, 43, 42, 1, 56, 39, 58, 46, 43, 56, 1, 58, 53, 1, 42, 47, 43, 1, 58, 46, 39, 52, 1, 58, 53, 1, 44, 39, 51, 47, 57, 46, 12, 0, 0, 13, 50, 50, 10, 0, 30, 43, 57, 53, 50, 60, 43, 42, 8, 1, 56, 43, 57, 53, 50, 60, 43, 42, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 18, 47, 56, 57, 58, 6, 1, 63, 53, 59, 1, 49, 52, 53, 61, 1, 15, 39, 47, 59, 57, 1, 25, 39, 56, 41, 47, 59, 57, 1, 47, 57, 1, 41, 46, 47, 43, 44, 1, 43, 52, 43, 51, 63, 1, 58, 53, 1, 58, 46, 43, 1, 54, 43, 53, 54, 50, 43, 8, 0, 0, 13, 50, 50, 10, 0, 35, 43, 1, 49, 5

In [5]:
# Sequence length (number of characters in each input sequence)
sequence_length = 50

# Create input sequences and corresponding target labels
sequences = []
labels = []

for i in range(len(encoded_text) - sequence_length):
    input_seq = encoded_text[i:i + sequence_length]
    target = encoded_text[i + sequence_length]
    
    sequences.append(input_seq)
    labels.append(target)

print(f"First sequence: {sequences[0]} (decoded: {''.join(index_to_char[idx] for idx in sequences[0])})")
print(f"First label: {labels[0]} (decoded: {index_to_char[labels[0]]})")


First sequence: [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56] (decoded: First Citizen:
Before we proceed any further, hear)
First label: 1 (decoded:  )


In [6]:
import torch

# Convert sequences and labels to tensors
sequences_tensor = torch.tensor(sequences, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)

print(f"Shape of input tensor: {sequences_tensor.shape}")
print(f"Shape of labels tensor: {labels_tensor.shape}")


Shape of input tensor: torch.Size([1115344, 50])
Shape of labels tensor: torch.Size([1115344])


In [7]:
import torch.nn as nn

class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x)  # Convert indices to embeddings
        out, _ = self.rnn(x)  # RNN output
        out = self.linear(out[:, -1, :])  # Only take the last time step's output
        return out


In [8]:
# Hyperparameters
hidden_size = 128
vocab_size = len(unique_chars)

# Initialize model, loss function, and optimizer
model = CharRNN(vocab_size, hidden_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


In [9]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for i in range(len(sequences_tensor)):
        input_seq = sequences_tensor[i].unsqueeze(0)  # Add batch dimension
        target = labels_tensor[i].unsqueeze(0)  # Add batch dimension
        
        # Forward pass
        output = model(input_seq)
        
        # Compute loss
        loss = loss_fn(output, target)
        total_loss += loss.item()
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(sequences_tensor):.4f}")


KeyboardInterrupt: 

In [26]:
# Function to generate text given a starting sequence
def generate_text(start_seq, length=20):
    model.eval()
    generated_seq = start_seq
    
    input_seq = torch.tensor([char_to_index[char] for char in start_seq], dtype=torch.long).unsqueeze(0)
    
    with torch.inference_mode():
        for i in range(length):
            output = model(input_seq)
            predicted_index = torch.argmax(output, dim=1).item()
            predicted_char = index_to_char[predicted_index]
            
            generated_seq += predicted_char
            
            # Update input sequence (move window forward)
            input_seq = torch.cat([input_seq[:, 1:], torch.tensor([[predicted_index]])], dim=1)
    
    return generated_seq

# Test the model
start_sequence = "hello"
print(f"Generated text: {generate_text(start_sequence)}")


Generated text: hello the world of pytorc
