In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import time


In [3]:
with open ('/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv','r',encoding='utf8') as f:
    text = f.read()
    

In [4]:
print("Dataset length: ",len(text))
print(text[:500])

Dataset length:  1003862
text
"First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounte


In [5]:
# Check for GPU
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [6]:
# Character-level tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode text
encoded_text = np.array([char_to_idx[ch] for ch in text])

In [7]:
class SmallTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers):
        super(SmallTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, batch_first=True)
        self.fc = nn.Linear(embed_dim, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x)  # Use transformer for self-attention
        x = self.fc(x)
        return x

# Initialize and configure model `m0`
embed_dim = 128
num_heads = 2
num_layers = 2
# Initialize and configure model `m0` and move to device
m0 = SmallTransformer(vocab_size, embed_dim, num_heads, num_layers).to(device)


In [9]:
class LargeTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers):
        super(LargeTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, batch_first=True)
        self.fc = nn.Linear(embed_dim, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x)
        x = self.fc(x)
        return x

# Initialize and configure benchmark model `M0`
large_embed_dim = 256
large_num_heads = 4
large_num_layers = 4
# Initialize and configure benchmark model `M0` and move to device
M0 = LargeTransformer(vocab_size, large_embed_dim, large_num_heads, large_num_layers).to(device)



In [12]:
import torch.nn.init as init

# Initialize a new large model `Mz` with small model parameters and move to device
Mz = LargeTransformer(vocab_size, large_embed_dim, large_num_heads, large_num_layers).to(device)
# Transfer parameters from `m0` to `Mz`
for (name_m0, param_m0), (name_mz, param_mz) in zip(m0.named_parameters(), Mz.named_parameters()):
    with torch.no_grad():
        if param_m0.size() == param_mz.size():
            # Exact match in dimensions
            param_mz.data.copy_(param_m0.data)
            print(f"Copied parameters for layer {name_m0}")
        else:
            # Initialize incompatible layers
            print(f"Initializing layer {name_mz} with Xavier initialization (incompatible dimensions)")
            if len(param_mz.size()) > 1:  # for weight matrices
                init.xavier_uniform_(param_mz.data)
            else:  # for biases
                param_mz.data.zero_()


Initializing layer embedding.weight with Xavier initialization (incompatible dimensions)
Initializing layer transformer.encoder.layers.0.self_attn.in_proj_weight with Xavier initialization (incompatible dimensions)
Initializing layer transformer.encoder.layers.0.self_attn.in_proj_bias with Xavier initialization (incompatible dimensions)
Initializing layer transformer.encoder.layers.0.self_attn.out_proj.weight with Xavier initialization (incompatible dimensions)
Initializing layer transformer.encoder.layers.0.self_attn.out_proj.bias with Xavier initialization (incompatible dimensions)
Initializing layer transformer.encoder.layers.0.linear1.weight with Xavier initialization (incompatible dimensions)
Copied parameters for layer transformer.encoder.layers.0.linear1.bias
Initializing layer transformer.encoder.layers.0.linear2.weight with Xavier initialization (incompatible dimensions)
Initializing layer transformer.encoder.layers.0.linear2.bias with Xavier initialization (incompatible dimen

In [16]:
def train_model(model, data, epochs, learning_rate=0.001):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    total_loss = 0
    for epoch in range(epochs):
        for i in range(0, len(data) - 64, 64):  # Batch size of 64
            # Prepare input and target tensors and move to device
            x = torch.tensor(data[i:i+64], dtype=torch.long).to(device)
            y = torch.tensor(data[i+1:i+65], dtype=torch.long).to(device)

            # Ensure the target length matches the output
            if x.shape[0] != y.shape[0]:
                continue  # Skip any batch where sizes do not match

            optimizer.zero_grad()
            output = model(x.unsqueeze(0))  # Shape: [1, 64, vocab_size]
            loss = criterion(output.view(-1, vocab_size), y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(data)}")


In [17]:
def calculate_tflops(model, sequence_length, batch_size, training_steps, elapsed_time):
    # Estimated FLOPS based on embedding, transformer layers, and output
    model_parameters = sum(p.numel() for p in model.parameters())
    flops = 2 * model_parameters * sequence_length * batch_size * training_steps
    tflops = flops / (elapsed_time * 1e12)  # Convert to TFLOPS
    return tflops


In [18]:
# Define training parameters
epochs = 5
sequence_length = 100
batch_size = 64

# Train small model `m0`
start_time = time.time()
train_model(m0, encoded_text, epochs)
m0_time = time.time() - start_time

# Train large model `M0` from scratch (benchmark)
start_time = time.time()
train_model(M0, encoded_text, epochs)
M0_time = time.time() - start_time

# Train model `Mz` (initialized with parameters from `m0`)
start_time = time.time()
train_model(Mz, encoded_text, epochs)
Mz_time = time.time() - start_time



Epoch 1/5, Loss: 0.05161079641999517
Epoch 2/5, Loss: 0.10320671208776241
Epoch 3/5, Loss: 0.15479829725710953
Epoch 4/5, Loss: 0.20638756768273503
Epoch 5/5, Loss: 0.257975376090432
Epoch 1/5, Loss: 0.05166240483111835
Epoch 2/5, Loss: 0.10327308850447317
Epoch 3/5, Loss: 0.15486950116922563
Epoch 4/5, Loss: 0.20645937397973488
Epoch 5/5, Loss: 0.25804622215404066
Epoch 1/5, Loss: 0.05166403439199846
Epoch 2/5, Loss: 0.1032717127213844
Epoch 3/5, Loss: 0.1548645025887736
Epoch 4/5, Loss: 0.2064531689130931
Epoch 5/5, Loss: 0.2580391621974935


In [19]:
# Calculate TFLOPS for each model
training_steps = epochs * len(encoded_text) // batch_size

m0_tflops = calculate_tflops(m0, sequence_length, batch_size, training_steps, m0_time)
M0_tflops = calculate_tflops(M0, sequence_length, batch_size, training_steps, M0_time)
Mz_tflops = calculate_tflops(Mz, sequence_length, batch_size, training_steps, Mz_time)

# Display Results
print(f"TFLOPS (Small Model `m0`): {m0_tflops:.4f}")
print(f"TFLOPS (Large Model `M0` from scratch): {M0_tflops:.4f}")
print(f"TFLOPS (Large Model `Mz` initialized with `m0`): {Mz_tflops:.4f}")


TFLOPS (Small Model `m0`): 2.7884
TFLOPS (Large Model `M0` from scratch): 6.7321
TFLOPS (Large Model `Mz` initialized with `m0`): 6.7340
