In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.amp import autocast  # Added import for autocast

with open('input.txt', 'r') as f:
    text = f.read()

# ---------Constants---------
vocab_size = len(set(text))
batch_size = 64
block_size = 128
learning_rate = 3e-3
max_iters = 10001
eval_interval = 500
eval_iters = 200
n_embed = 128
num_heads = 5
n_block_layer = 5
dropout = 0.15
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ---------------------------

# Converting words into tokens
stoi = {ch: i for i, ch in enumerate(sorted(list(set(text))))}
itos = {i: ch for i, ch in enumerate(sorted(list(set(text))))}

# Encoding and Decoding
encode = lambda x: [stoi[i] for i in x]
decode = lambda x: ''.join([itos[i] for i in x])

# Converting the whole data into tokens
data = torch.tensor(encode(text)).to(device)

# Train-test data split
split = int(0.9 * len(data))
train_data, val_data = data[:split], data[split:]

# Getting a random set of data to train on
def get_batch(split):
    data = train_data if split == 'train' else val_data
    idx = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx])
    y = torch.stack([data[i+1:i+1+block_size] for i in idx])
    return x.to(device), y.to(device)

x, y = get_batch('train')
torch.manual_seed(42)

# Class definitions remain the same (Head, MultiHead, FeedForward, Block)
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        query = self.query(x)
        key = self.key(x)
        
        weight = query @ key.transpose(-2, -1) * (C**-0.5)
        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)
        
        value = self.value(x)
        output = weight @ value
        return output

class MultiHead(nn.Module):
    def __init__(self, head_size, num_heads):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(num_heads * head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        output = torch.cat([head(x) for head in self.heads], dim=-1)
        output = self.projection(output)
        output = self.dropout(output)
        return output

class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.GELU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHead(head_size=head_size, num_heads=n_head)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position_embedding = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed=n_embed, n_head=num_heads) for _ in range(n_block_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, target=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if target is None:
            losses = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            target = target.view(B * T)
            losses = F.cross_entropy(logits, target)
        return logits, losses

    def generate(self, idx, max_tokens):
        for _ in range(max_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx

# Fixed estimate_loss function
@torch.no_grad()
def estimate_loss(model):  # Added model parameter
    output = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch(split)
            with autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
                logits, loss = model(x, y)
            losses[k] = loss.item()
        output[split] = losses.mean()
    model.train()
    return output

# def train_model():
#     with open('input.txt', 'r') as f:
#         text = f.read()
#     
#     stoi = {ch:i for i, ch in enumerate(sorted(list(set(text))))}
#     itos = {i:ch for i, ch in enumerate(sorted(list(set(text))))}
#     encode = lambda x: [stoi[i] for i in x]
#     decode = lambda x: ''.join([itos[i] for i in x])
#     
#     data = torch.tensor(encode(text)).to(device)
#     split = int(0.9 * len(data))
#     train_data, val_data = data[:split], data[split:]
#     
#     def get_batch(split):
#         data = train_data if split=='train' else val_data
#         idx = torch.randint(len(data) - block_size, (batch_size,))
#         x = torch.stack([data[i:i+block_size] for i in idx])
#         y = torch.stack([data[i+1:i+1+block_size] for i in idx])
#         return x.to(device), y.to(device)  # Ensure tensors are on correct device
# 
#     # Initialize model
#     model = BigramLanguageModel(vocab_size).to(device)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
#     
#     # Training loop
#     for iter in range(max_iters):
#         if (iter % eval_interval) == 0:
#             losses = estimate_loss(model)  # Pass model to estimate_loss
#             print(f"Iter {iter}: Train Loss --> {losses['train']:.4f}, Val Loss --> {losses['val']:.4f}")
#         
#         x, y = get_batch('train')
#         logits, loss = model(x, y)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#     
#     # Save the trained model
#     torch.save(model.state_dict(), 'bigram_model.pth')
#     print("Model saved as 'bigram_model.pth'")
#     
#     # Save the stoi and itos dictionaries
#     torch.save({'stoi': stoi, 'itos': itos}, 'mappings.pth')
# 
# def generate_text(num_tokens=600):
#     mappings = torch.load('mappings.pth')
#     stoi, itos = mappings['stoi'], mappings['itos']
#     decode = lambda x: ''.join([itos[i] for i in x])
#     
#     model = BigramLanguageModel(vocab_size).to(device)
#     model.load_state_dict(torch.load('bigram_model.pth'))
#     model.eval()
#     
#     fill = torch.zeros((1,1), dtype=torch.long).to(device)
#     output = decode(model.generate(fill, num_tokens)[0].tolist())
#     return output
# 
# if __name__ == "__main__":
    train_model()
    generated_text = generate_text(600)
    print(generated_text)
    more_text = generate_text(1000)
    print(more_text)

In [38]:
def train_model():
    with open('input.txt', 'r') as f:
        text = f.read()

    stoi = {ch:i for i,ch in enumerate(sorted(list(set(text))))}
    itos = {i:ch for i,ch in enumerate(sorted(list(set(text))))}

    encode = lambda x: [stoi[i] for i in x]
    decode = lambda x: ''.join([itos[i] for i in x])

    data = torch.tensor(encode(text)).to(device)
    split = int(0.9 * len(data))

    train_data, val_data = data[:split], data[split:]
    train_data.to(device)
    val_data.to(device)
    
    def get_batch(split):
        data = train_data if split=='train' else val_data
        idx = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in idx])
        y = torch.stack([data[i+1:i+1+block_size] for i in idx])
        return x,y  # Ensure tensors are on correct device

    model = BigramLanguageModel(vocab_size).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

    for iter in range(max_iters):
        if (iter % eval_interval == 0):
            losses = estimate_loss(model)  # Pass model to estimate_loss
            print(f"Iter {iter}: Train Loss --> {losses['train']:.4f}, Val Loss --> {losses['val']:.4f}")
        
        x,y = get_batch('train')
        logits, loss = model(x,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    torch.save(model.state_dict(), 'bigram_model.pth')
    print('Model saved')
    
    torch.save({'stoi':stoi, 'itos':itos}, 'mappings.pth')


In [39]:
def generate_text(max_number_tokens = 1000):
    mappings = torch.load('mappings.pth')
    stoi, itos = mappings['stoi'], mappings['itos']
    decode = lambda x: ''.join([itos[i] for i in x])

    model = BigramLanguageModel(vocab_size).to(device)
    model.load_state_dict(torch.load('bigram_model.pth'))
    model.eval()

    fill = torch.zeros((1,1), dtype=torch.long).to(device)
    output = decode(model.generate(fill, max_number_tokens)[0].tolist())
    return output


In [40]:
def continue_training(additional_iterations=5000, model_path='bigram_model.pth'):
    with open('input.txt', 'r') as f:
        text = f.read()

    stoi = {ch:i for i,ch in enumerate(sorted(list(set(text))))}
    itos = {i:ch for i,ch in enumerate(sorted(list(set(text))))}

    encode = lambda x: [stoi[i] for i in x]
    decode = lambda x: ''.join([itos[i] for i in x])

    data = torch.tensor(encode(text)).to(device)
    split = int(0.9 * len(data))

    train_data, val_data = data[:split], data[split:]
    train_data.to(device)
    val_data.to(device)
    
    def get_batch(split):
        data = train_data if split=='train' else val_data
        idx = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in idx])
        y = torch.stack([data[i+1:i+1+block_size] for i in idx])
        return x,y  # Ensure tensors are on correct device

    model = BigramLanguageModel(vocab_size).to(device)
    model.load_state_dict(torch.load(model_path))
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

    for iter in range(additional_iterations):
        if (iter % eval_interval == 0):
            losses = estimate_loss(model)  # Pass model to estimate_loss
            print(f"Iter {iter}: Train Loss --> {losses['train']:.4f}, Val Loss --> {losses['val']:.4f}")
        
        x,y = get_batch('train')
        logits, loss = model(x,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    torch.save(model.state_dict(), 'bigram_model.pth')
    print(f"Model continued training for {additional_iterations} more iterations and saved as {model_path}")

In [41]:
train_model()
# Iter 0: Train Loss --> 4.3687, Val Loss --> 4.3682
# Iter 500: Train Loss --> 1.6300, Val Loss --> 1.8004
# Iter 1000: Train Loss --> 1.4465, Val Loss --> 1.6391
# Iter 1500: Train Loss --> 1.3710, Val Loss --> 1.5721
# Iter 2000: Train Loss --> 1.3305, Val Loss --> 1.5529
# Iter 2500: Train Loss --> 1.2961, Val Loss --> 1.5272
# Iter 3000: Train Loss --> 1.2783, Val Loss --> 1.5120
# Iter 3500: Train Loss --> 1.2588, Val Loss --> 1.5092
# Iter 4000: Train Loss --> 1.2477, Val Loss --> 1.4995
# Iter 4500: Train Loss --> 1.2303, Val Loss --> 1.4819
# Iter 5000: Train Loss --> 1.2189, Val Loss --> 1.4829
# Iter 5500: Train Loss --> 1.2106, Val Loss --> 1.4925
# Iter 6000: Train Loss --> 1.2032, Val Loss --> 1.4917
# Iter 6500: Train Loss --> 1.1928, Val Loss --> 1.4779
# Iter 7000: Train Loss --> 1.1870, Val Loss --> 1.4691
# Iter 7500: Train Loss --> 1.1769, Val Loss --> 1.4804
# Iter 8000: Train Loss --> 1.1776, Val Loss --> 1.4801
# Iter 8500: Train Loss --> 1.1687, Val Loss --> 1.4661
# Iter 9000: Train Loss --> 1.1613, Val Loss --> 1.4757
# Iter 9500: Train Loss --> 1.1546, Val Loss --> 1.4754
# Iter 10000: Train Loss --> 1.1473, Val Loss --> 1.4711

Iter 0: Train Loss --> 4.3624, Val Loss --> 4.3501
Iter 500: Train Loss --> 1.6879, Val Loss --> 1.8428
Iter 1000: Train Loss --> 1.4996, Val Loss --> 1.6933
Iter 1500: Train Loss --> 1.4204, Val Loss --> 1.6203
Iter 2000: Train Loss --> 1.3804, Val Loss --> 1.5967
Iter 2500: Train Loss --> 1.3474, Val Loss --> 1.5596
Iter 3000: Train Loss --> 1.3323, Val Loss --> 1.5473
Iter 3500: Train Loss --> 1.3131, Val Loss --> 1.5350
Iter 4000: Train Loss --> 1.3019, Val Loss --> 1.5231
Iter 4500: Train Loss --> 1.2881, Val Loss --> 1.5240
Iter 5000: Train Loss --> 1.2812, Val Loss --> 1.5150
Iter 5500: Train Loss --> 1.2704, Val Loss --> 1.5040
Iter 6000: Train Loss --> 1.2594, Val Loss --> 1.5015
Iter 6500: Train Loss --> 1.2562, Val Loss --> 1.5021
Iter 7000: Train Loss --> 1.2466, Val Loss --> 1.5035
Iter 7500: Train Loss --> 1.2411, Val Loss --> 1.4808
Iter 8000: Train Loss --> 1.2354, Val Loss --> 1.4943
Iter 8500: Train Loss --> 1.2277, Val Loss --> 1.4872
Iter 9000: Train Loss --> 1.2276

In [44]:
output_file = 'generated2000.txt'
output_text = generate_text(1000)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output_text)
print(f"Generated text saved to {output_file}")

Generated text saved to generated2000.txt


In [43]:
continue_training(additional_iterations=5000)

Iter 0: Train Loss --> 1.2150, Val Loss --> 1.4836
Iter 500: Train Loss --> 1.2148, Val Loss --> 1.4762
Iter 1000: Train Loss --> 1.2108, Val Loss --> 1.4717
Iter 1500: Train Loss --> 1.2093, Val Loss --> 1.4779
Iter 2000: Train Loss --> 1.2018, Val Loss --> 1.4750
Iter 2500: Train Loss --> 1.1989, Val Loss --> 1.4711
Iter 3000: Train Loss --> 1.1946, Val Loss --> 1.4718
Iter 3500: Train Loss --> 1.1950, Val Loss --> 1.4712
Iter 4000: Train Loss --> 1.1920, Val Loss --> 1.4669
Iter 4500: Train Loss --> 1.1912, Val Loss --> 1.4720
Model continued training for 5000 more iterations and saved as bigram_model.pth
