In [1]:
# Install Required Libraries
# We start by cloning the NanoGPT repository and installing dependencies.
!git clone https://github.com/karpathy/nanoGPT.git
%cd nanoGPT

# Install Python dependencies such as transformers, datasets, and others required for training.
!pip install numpy transformers datasets tiktoken wandb tqdm

# Upgrade PyTorch to version 2.0.1 with CUDA 11.8 support for better performance.
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 --index-url https://download.pytorch.org/whl/cu118


Cloning into 'nanoGPT'...
remote: Enumerating objects: 682, done.[K
remote: Total 682 (delta 0), reused 0 (delta 0), pack-reused 682 (from 1)[K
Receiving objects: 100% (682/682), 952.47 KiB | 11.76 MiB/s, done.
Resolving deltas: 100% (385/385), done.
/content/nanoGPT
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downl

# Check CUDA Availability
Here, we ensure GPU acceleration is available for efficient training.

In [2]:
import torch

# Check if CUDA is available for faster computation. Otherwise, fallback to CPU.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. GPU device name: {gpu_name}")
else:
    print("CUDA is not available. No GPU detected.")


CUDA is available. GPU device name: Tesla T4


# Download Dataset
We choose a book from Project Gutenberg to train the model. This example uses a non-Shakespeare dataset.

In [3]:
# Create directory and download "Frankenstein"
!mkdir -p data/frankenstein
!wget https://www.gutenberg.org/files/84/84-0.txt -O data/frankenstein/input.txt


--2024-12-07 22:15:35--  https://www.gutenberg.org/files/84/84-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 428995 (419K) [text/plain]
Saving to: ‘data/frankenstein/input.txt’


2024-12-07 22:15:36 (1.23 MB/s) - ‘data/frankenstein/input.txt’ saved [428995/428995]



# Prepare the Dataset
We process the raw text into a tokenized binary format for training and validation.

In [4]:
import os
import pickle
import numpy as np

# Define input and output paths
input_file_path = 'data/frankenstein/input.txt'
train_output_file_path = 'data/frankenstein/train.bin'
val_output_file_path = 'data/frankenstein/val.bin'

# Ensure the directory structure exists
os.makedirs('data/frankenstein', exist_ok=True)

# Read the input text
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()

# Get all unique characters
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(f"Unique characters: {vocab_size}")

# Create a mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

# Encode the entire text data
encoded_data = np.array([stoi[c] for c in data], dtype=np.uint16)

# Split data into training and validation sets
n = int(0.9 * len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

# Save the data to .bin files
train_data.tofile(train_output_file_path)
val_data.tofile(val_output_file_path)

# Save the mapping for decoding
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open('data/frankenstein/meta.pkl', 'wb') as f:
    pickle.dump(meta, f)

print("Data preparation complete.")


Unique characters: 84
Data preparation complete.


In [5]:
!ls data/frankenstein


input.txt  meta.pkl  train.bin	val.bin


# Run the preparation script:

In [6]:
!python data/frankenstein/prepare.py


python3: can't open file '/content/nanoGPT/data/frankenstein/prepare.py': [Errno 2] No such file or directory


# Create Configuration File
The configuration specifies model hyperparameters like layers, heads, and training settings.

In [7]:
# Write the configuration file for training
config_path = "config/frankenstein.py"

# Ensure the config directory exists
os.makedirs("config", exist_ok=True)

# Write the config file
with open(config_path, "w") as f:
    f.write("""
# Configuration for training NanoGPT on Frankenstein
out_dir = 'out-frankenstein'
eval_interval = 500
eval_iters = 200
log_interval = 100

always_save_checkpoint = False

wandb_log = False  # Enable this if using Weights & Biases
wandb_project = 'frankenstein'
wandb_run_name = 'nano-gpt-frankenstein'

dataset = 'frankenstein'
batch_size = 64
block_size = 256  # Maximum context length

# Model architecture
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

# Training parameters
learning_rate = 1e-3
max_iters = 5000
lr_decay_iters = 5000
min_lr = 1e-4
beta2 = 0.99
warmup_iters = 100
    """)
print(f"Config file created at {config_path}")


Config file created at config/frankenstein.py


# Train the Model
Finally, we initiate the training process using the prepared dataset and configurations.

In [8]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. GPU device name: {gpu_name}")
else:
    print("CUDA is not available. Using CPU.")


CUDA is available. GPU device name: Tesla T4


In [9]:
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 --index-url https://download.pytorch.org/whl/cu118
!pip install triton==2.0.0


Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
!python train.py config/frankenstein.py


Overriding config with config/frankenstein.py:

# Configuration for training NanoGPT on Frankenstein
out_dir = 'out-frankenstein'
eval_interval = 500
eval_iters = 200
log_interval = 100

always_save_checkpoint = False

wandb_log = False  # Enable this if using Weights & Biases
wandb_project = 'frankenstein'
wandb_run_name = 'nano-gpt-frankenstein'

dataset = 'frankenstein'
batch_size = 64
block_size = 256  # Maximum context length

# Model architecture
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

# Training parameters
learning_rate = 1e-3
max_iters = 5000
lr_decay_iters = 5000
min_lr = 1e-4
beta2 = 0.99
warmup_iters = 100
    
tokens per iteration will be: 655,360
found vocab_size = 84 (inside data/frankenstein/meta.pkl)
Initializing a new model from scratch
number of parameters: 10.65M
num decayed parameter tensors: 26, with 10,747,392 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
step 0: 