In [1]:
with open ('/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv','r',encoding='utf8') as f:
    text = f.read()
    

In [2]:
print("Dataset length: ",len(text))
print(text[:500])

Dataset length:  1003862
text
"First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounte


In [3]:
# Check for GPU
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [4]:
import numpy as np

# Create a character-level vocabulary
chars = sorted(list(set(text)))
char2idx = {ch: idx for idx, ch in enumerate(chars)}
idx2char = {idx: ch for ch, idx in char2idx.items()}

# Encode the entire text
encoded_text = np.array([char2idx[ch] for ch in text])

# Create sequences for training
seq_length = 50  # Length of each sequence
data = []
target = []

for i in range(len(encoded_text) - seq_length):
    data.append(encoded_text[i:i + seq_length])
    target.append(encoded_text[i + 1:i + seq_length + 1])

data = np.array(data)
target = np.array(target)
print(f"Shape of data: {data.shape}, Shape of target: {target.shape}")


Shape of data: (1003812, 50), Shape of target: (1003812, 50)


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Hyperparameters
input_dim = len(chars)  # Vocabulary size (number of unique characters)
hidden_dim = 256
num_layers = 3  # Set the small model to have 3 layers
output_dim = len(chars)  # Output dimension should match the vocabulary size
batch_size = 64
learning_rate = 0.001
num_epochs = 10

# Small LSTM Model Definition with 3 layers
class SmallLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(SmallLSTM, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(num_layers, x.size(0), hidden_dim).to(x.device)
        c0 = torch.zeros(num_layers, x.size(0), hidden_dim).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out)
        return out


In [6]:
# Prepare DataLoader for training
tensor_data = torch.tensor(data, dtype=torch.long)
tensor_target = torch.tensor(target, dtype=torch.long)
dataset = TensorDataset(tensor_data, tensor_target)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize and train the small model
small_model = SmallLSTM(input_dim, hidden_dim, output_dim, num_layers)
small_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(small_model.parameters(), lr=learning_rate)

In [7]:
import time
# FLOP calculation based on LSTM and Linear layer operations
def calculate_flops(input_dim, hidden_dim, output_dim, batch_size, seq_length, num_layers):
    # FLOPs for LSTM Cell per timestep: 4 * H * (H + I) * B
    flops_per_lstm_cell = 4 * hidden_dim * (hidden_dim + input_dim) * batch_size
    total_lstm_flops = flops_per_lstm_cell * seq_length * num_layers
    
    # FLOPs for output layer (Linear layer): H * V * B
    flops_per_output_layer = hidden_dim * output_dim * batch_size * seq_length
    
    # Total FLOPs per forward pass
    total_flops = total_lstm_flops + flops_per_output_layer
    
    # Multiply by 2 for forward and backward pass
    return total_flops * 2

# Training loop with time and FLOP calculation
num_epochs = 10
batch_size = 64

# Measure training time and compute FLOPs
start_time = time.time()
total_flops = calculate_flops(input_dim, hidden_dim, output_dim, batch_size, seq_length, num_layers) * len(data_loader) * num_epochs
# Training loop for small model with 3 layers
for epoch in range(num_epochs):
    small_model.train()
    total_loss = 0
    for inputs, targets in data_loader:
        # Move data to the same device as the model
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = small_model(inputs)
        loss = criterion(outputs.view(-1, output_dim), targets.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(data_loader):.4f}")

# End time after training
end_time = time.time()
training_time_seconds = end_time - start_time

# Calculate TFLOPS
tflops = total_flops / (training_time_seconds * 1e12)
print(f"Total Training Time: {training_time_seconds:.2f} seconds")
print(f"Approximate Training Performance: {tflops:.2f} TFLOPS")

Epoch [1/10], Loss: 1.3202
Epoch [2/10], Loss: 1.0174
Epoch [3/10], Loss: 0.8533
Epoch [4/10], Loss: 0.7571
Epoch [5/10], Loss: 0.7004
Epoch [6/10], Loss: 0.6645
Epoch [7/10], Loss: 0.6400
Epoch [8/10], Loss: 0.6220
Epoch [9/10], Loss: 0.6083
Epoch [10/10], Loss: 0.5974
Total Training Time: 2392.73 seconds
Approximate Training Performance: 0.42 TFLOPS


In [8]:
# Measure training time and compute FLOPs
start_time = time.time()
total_flops = calculate_flops(input_dim, hidden_dim, output_dim, batch_size, seq_length, num_layers) * len(data_loader) * num_epochs
# End time after training
end_time = time.time()
training_time_seconds = end_time - start_time

class LargeLSTM(nn.Module):
    def __init__(self, small_model, input_dim, hidden_dim, num_layers, output_dim):
        super(LargeLSTM, self).__init__()
        self.embedding = small_model.embedding  # Reuse the embedding layer
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers * 2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Initialize the first 3 layers with the small model's parameters
        for i in range(num_layers):
            # Copy weights and biases for the first 3 layers from small_model to large_model
            self.lstm.weight_ih_l0.data.copy_(small_model.lstm.weight_ih_l0.data)
            self.lstm.weight_hh_l0.data.copy_(small_model.lstm.weight_hh_l0.data)
            self.lstm.bias_ih_l0.data.copy_(small_model.lstm.bias_ih_l0.data)
            self.lstm.bias_hh_l0.data.copy_(small_model.lstm.bias_hh_l0.data)
            
            # Repeat for other layers if needed (adjust according to the number of layers)
            if num_layers > 1:
                self.lstm.weight_ih_l1.data.copy_(small_model.lstm.weight_ih_l1.data)
                self.lstm.weight_hh_l1.data.copy_(small_model.lstm.weight_hh_l1.data)
                self.lstm.bias_ih_l1.data.copy_(small_model.lstm.bias_ih_l1.data)
                self.lstm.bias_hh_l1.data.copy_(small_model.lstm.bias_hh_l1.data)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(num_layers * 2, x.size(0), hidden_dim).to(x.device)
        c0 = torch.zeros(num_layers * 2, x.size(0), hidden_dim).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out)
        return out

# Initialize large model with parameters from the small model
large_model = LargeLSTM(small_model, input_dim, hidden_dim, num_layers, output_dim)

# Calculate TFLOPS
tflops = total_flops / (training_time_seconds * 1e12)
print(f"Total Training Time: {training_time_seconds:.2f} seconds")
print(f"Approximate Training Performance: {tflops:.2f} TFLOPS")

Total Training Time: 0.00 seconds
Approximate Training Performance: 11236099.26 TFLOPS
