In [None]:
!pip install sentencepiece



In [None]:
# Colab only: mount Google Drive at /content/drive. Every data, model and log
# path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import os
# Put the project folder on the import path so that the `Scripts.GPT2` module
# imported in the next cell can be resolved.
sys.path.append('/content/drive/MyDrive/GPT-2-Reproduction/')

In [None]:
import sentencepiece as spm
import torch
import time
from tqdm import tqdm
from Scripts.GPT2 import GPT, GPTConfig
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import pickle
import math
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import warnings
from datetime import datetime
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Default compute device: prefer the GPU whenever CUDA is usable, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# --- Batch geometry ---------------------------------------------------------
total_batch_size = 2 ** 19   # 524288 tokens consumed per optimizer step
B = 16                       # micro-batch size (passed to the DataLoaders)
T = 1024                     # tokens per sequence

# --- Sampling settings (not referenced by the cells visible in this notebook) --
num_return_sequence = 5
max_length = 30

# --- Learning-rate schedule (read by get_lr) --------------------------------
max_lr = 6e-4
min_lr = max_lr * 0.1        # floor of the schedule: 10% of the peak
warmup_steps = 10
max_steps = 50

# --- Output location for checkpoints ----------------------------------------
log_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Logs/'

In [None]:
# Decide between a distributed (DDP) run and a plain single-process run.
# A launcher that sets the RANK environment variable signals a DDP run.
ddp = int(os.environ.get('RANK', -1)) != -1

if not ddp:
    # Single process: this process is rank 0 of a world of size 1.
    ddp_rank, ddp_local_rank, ddp_world_size = 0, 0, 1
    master_process = True

    # Pick the best available backend: CUDA first, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"using device: {device}")
else:
    assert torch.cuda.is_available(), "for now i think we need CUDA for DDP"
    init_process_group(backend='nccl')

    # Rank bookkeeping comes straight from the launcher's environment variables.
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])

    # Each process drives the GPU matching its local rank.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)

    # Only global rank 0 is treated as the master process.
    master_process = ddp_rank == 0

using device: cuda


In [None]:
# One micro-step on every rank consumes B * T * ddp_world_size tokens; the
# desired total batch must be an exact multiple of that so that the number of
# gradient-accumulation steps is a whole number.
assert not total_batch_size % (B * T * ddp_world_size), "make sure total_batch_size is divisible by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)

# Report the batch configuration once, from the master process only.
if master_process:
    print("\n".join([
        f"total desired batch size: {total_batch_size}",
        f"=> calculated gradient accumulation steps: {grad_accum_steps}",
    ]))

total desired batch size: 524288
=> calculated gradient accumulation steps: 32


In [None]:
# Load the trained SentencePiece BPE tokenizer from Drive. The call to `load` is
# the last expression of the cell, so its return value is what the cell displays
# (the recorded output is True).
sp = spm.SentencePieceProcessor()
sp.load('/content/drive/MyDrive/GPT-2-Reproduction/Models/bpe_tokenizer.model')

True

In [None]:
# NOTE: T is rebound to 1025 here (context length + 1) and stays at that value
# for the rest of the notebook. Each stored chunk holds one token more than the
# model input so that inputs (chunk[:-1]) and targets (chunk[1:]) can both be
# cut from it; CustomDataset reads this global.
T = 1025
bos_id = 1
eos_id = 2
pad_token_id = 3
tokenizer = sp

data_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Preprocessed_texts'
output_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences'
os.makedirs(output_dir, exist_ok=True)


def pack_sentences(sentences, encode, seq_len, bos, eos, pad):
    """Greedily pack tokenized sentences into fixed-length, padded chunks.

    Every sentence is encoded as ``[bos] + encode(sentence) + [eos]``. Encoded
    sentences are appended to the current chunk until the next one would not
    fit; the chunk is then right-padded with ``pad`` to ``seq_len`` and a new
    chunk is started.

    Args:
        sentences: iterable of raw sentences (anything ``encode`` accepts).
        encode: callable mapping one sentence to a list of token ids.
        seq_len: exact length of every returned chunk.
        bos, eos, pad: ids of the begin/end/padding tokens.

    Returns:
        A list of lists of token ids, each of length exactly ``seq_len``.
        Empty input yields an empty list (no all-padding chunk is produced).
    """
    chunks = []
    current = []

    def flush():
        # Right-pad the chunk being built to seq_len and store it.
        chunks.append(current + [pad] * (seq_len - len(current)))

    for sentence in sentences:
        # A sentence that alone exceeds seq_len is truncated, so every chunk
        # has exactly seq_len tokens (downstream code truncates to T anyway).
        piece = [bos, *encode(sentence), eos][:seq_len]

        # The fit check accounts for BOTH special tokens, and an empty chunk is
        # never flushed.
        if current and len(current) + len(piece) > seq_len:
            flush()
            current = []

        current.extend(piece)

    if current:
        flush()

    return chunks


for filename in os.listdir(data_dir):
    file_path = os.path.join(data_dir, filename)

    # SECURITY: pickle.load can execute arbitrary code; only point data_dir at
    # files produced by this project.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    padded_sentences = pack_sentences(
        tqdm(data), tokenizer.encode, T, bos_id, eos_id, pad_token_id
    )

    output_file_path = os.path.join(output_dir, f"processed_{filename}")
    with open(output_file_path, 'wb') as f:
        pickle.dump(padded_sentences, f)

    print(f"Processed and saved file: {output_file_path}")

In [None]:
# Read one preprocessed shard back into `data` for manual inspection.
with open(
    '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_3.pkl',
    mode='rb',
) as shard:
    data = pickle.load(shard)

In [None]:
# Count the packed sequences stored in every shard except the last two
# (alphabetical order), accumulating the total in `amnt`.
files = sorted(os.listdir('/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences'))
amnt = 0
for shard_name in tqdm(files[:-2]):
    with open(f'/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/{shard_name}', 'rb') as shard:
        data = pickle.load(shard)
    amnt += len(data)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over a directory of pickled token-id sequences.

    Each file in ``train_data_dir`` is a pickle holding a list of token-id
    lists. Files are ordered alphabetically; the last two form the validation
    split and all the others form the training split.

    Args:
        train_data_dir: directory containing the pickled shards.
        status: ``"train"`` selects every file but the last two; any other
            value selects the last two files.
        seq_len: length each sequence is padded/truncated to before being
            split into input and target. Defaults to the notebook global ``T``.
        pad_id: token id used for right-padding. Defaults to the notebook
            global ``pad_token_id``.
    """

    def __init__(self, train_data_dir, status, seq_len=None, pad_id=None):
        super().__init__()
        self.train_data_dir = train_data_dir
        self.status = status
        self.seq_len = seq_len
        self.pad_id = pad_id
        self.files = sorted(os.listdir(self.train_data_dir))

        if self.status == "train":
            self.selected_files = self.files[:-2]
        else:
            self.selected_files = self.files[-2:]

        # One-file cache: samples are indexed file by file, so sequential
        # access re-uses the same shard many times in a row.
        self._cached_file_index = None
        self._cached_sentences = None

        self.index_map = self._create_index_mapping()

    def _read_file(self, file_index):
        """Unpickle and return the full list of sequences stored in one shard."""
        file_path = os.path.join(self.train_data_dir, self.selected_files[file_index])
        # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def _create_index_mapping(self):
        """Build the list of (file_index, sentence_index) pairs, one per sample."""
        index_map = []
        for file_index in range(len(self.selected_files)):
            num_sentences = len(self._read_file(file_index))
            index_map.extend((file_index, i) for i in range(num_sentences))
        return index_map

    def __len__(self):
        """Return the total number of sequences across the selected files."""
        return len(self.index_map)

    def _load_sentence_from_file(self, file_index, sentence_index):
        """Return one sequence, re-reading the shard only when the file changes.

        The most recently used shard is kept in memory, so consecutive lookups
        into the same file do not unpickle it again. The returned list is the
        cached object itself; callers must not mutate it.
        """
        if self._cached_file_index != file_index:
            self._cached_sentences = self._read_file(file_index)
            self._cached_file_index = file_index

        return self._cached_sentences[sentence_index]

    def __getitem__(self, index):
        """Return ``(x, y)`` long tensors: the sequence without its last token
        and the same sequence shifted left by one."""
        file_index, sentence_index = self.index_map[index]
        seq_len = T if self.seq_len is None else self.seq_len
        pad_id = pad_token_id if self.pad_id is None else self.pad_id

        # Slice into a fresh list (truncating if needed) so that padding never
        # mutates the cached shard.
        data = list(self._load_sentence_from_file(file_index, sentence_index)[:seq_len])
        data.extend([pad_id] * (seq_len - len(data)))

        x = torch.tensor(data[:-1], dtype=torch.long)
        y = torch.tensor(data[1:], dtype=torch.long)
        return x, y

In [None]:
# Build the train and validation views over the same directory of packed
# shards (train first, then valid).
train_dataset, valid_dataset = (
    CustomDataset('/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences', status=split)
    for split in ("train", "valid")
)

In [None]:
# Wrap both datasets in loaders that yield micro-batches of B sequences using
# two worker processes each.
train_loader, valid_loader = (
    DataLoader(split_dataset, batch_size=B, num_workers=2)
    for split_dataset in (train_dataset, valid_dataset)
)

In [None]:
# Instantiate the model on the selected device and wrap it with torch.compile
# using the "eager" backend.
model = torch.compile(
    GPT(GPTConfig(vocab_size=32000, n_embd=768)).to(device),
    backend="eager",
)

# In a distributed run, additionally wrap the model in DDP on this rank's GPU.
model = DDP(model, device_ids=[ddp_local_rank]) if ddp else model

In [None]:
def get_lr(it):
    """Return the learning rate for step ``it``.

    The schedule has three phases, driven by the module-level ``warmup_steps``,
    ``max_steps``, ``max_lr`` and ``min_lr``:

    * ``it < warmup_steps``: linear ramp ``max_lr * (it + 1) / warmup_steps``;
    * ``warmup_steps <= it <= max_steps``: cosine decay from ``max_lr`` down
      to ``min_lr``;
    * ``it > max_steps``: constant ``min_lr``.
    """
    if it >= warmup_steps:
        if it <= max_steps:
            # Fraction of the decay phase completed so far, in [0, 1].
            elapsed = it - warmup_steps
            span = max_steps - warmup_steps
            decay_ratio = elapsed / span
            assert 0 <= decay_ratio <= 1
            # Cosine weight: 1 at the start of the decay, 0 at its end.
            return min_lr + (0.5 * (1.0 + math.cos(math.pi * decay_ratio))) * (max_lr - min_lr)
        return min_lr
    return max_lr * (it + 1) / warmup_steps

In [None]:
# The DDP wrapper does not expose custom methods of the wrapped module, so go
# through `.module` in a distributed run.
raw_model = model.module if ddp else model

# Use the configured peak learning rate rather than repeating the literal; the
# training loop overwrites the per-group lr from get_lr() on every step anyway.
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device = device)

num decayed parameter tensors: 26, with 67,829,760 parameters
num non-decayed parameter tensors: 50, with 61,440 parameters
using fused AdamW: True


In [None]:
num_epochs = 2

# Each dataset item yields 1024 input tokens (a stored 1025-token sequence minus
# the final token), and one optimizer step consumes `total_batch_size` tokens.
tokens_per_sample = 1024
steps_per_epoch = len(train_dataset) * tokens_per_sample // total_batch_size

In [None]:
# ---------------------------------------------------------------------------
# Training loop: gradient accumulation, LR scheduling, a short validation pass
# and a checkpoint after every optimizer step.
# ---------------------------------------------------------------------------
formatted_time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
checkpoint_dir = os.path.join(log_dir, formatted_time)
if master_process:
    os.makedirs(checkpoint_dir, exist_ok=True)

# The DDP wrapper does not expose attributes of the wrapped module (`config`),
# so checkpoints are taken from the underlying module.
# NOTE(review): the model was wrapped by torch.compile, so state_dict keys may
# carry an `_orig_mod.` prefix - confirm before loading into a plain GPT.
raw_model = model.module if ddp else model

# `device` is "cuda" in a single-process run but "cuda:<n>" under DDP, so test
# the prefix instead of comparing for equality.
on_cuda = str(device).startswith("cuda")
autocast_device = "cuda" if on_cuda else "cpu"

val_max_batches = 10  # validation batches evaluated after every step

# Keep ONE iterator alive across optimizer steps. Re-enumerating the loader on
# every step would restart it and train on the same first batches forever.
train_iter = iter(train_loader)

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} started.")
    for step in range(steps_per_epoch):
        # Step counter that keeps growing across epochs, so the LR schedule and
        # the checkpoint names do not restart at every epoch.
        global_step = epoch * steps_per_epoch + step

        t0 = time.time()
        model.train()
        optimizer.zero_grad()
        loss_accum = torch.zeros((), device=device)
        tokens_processed = 0

        for micro_step in range(grad_accum_steps):
            try:
                x, y = next(train_iter)
            except StopIteration:
                # Data exhausted: start another pass over the training set.
                train_iter = iter(train_loader)
                x, y = next(train_iter)
            x, y = x.to(device), y.to(device)
            tokens_processed += x.numel()

            if ddp:
                # Only all-reduce gradients on the last micro-step; set the
                # flag before the forward pass of that micro-step.
                model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)

            with torch.autocast(device_type=autocast_device, dtype=torch.bfloat16, enabled=on_cuda):
                # Keep only the loss so no extra reference to the logits lingers.
                loss = model(x, y)[1]

            # Scale so that the accumulated gradient is the mean over micro-steps.
            loss = loss / grad_accum_steps
            loss_accum += loss.detach()
            loss.backward()

        if ddp:
            dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        lr = get_lr(global_step)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        if on_cuda:
            torch.cuda.synchronize()  # wait for the GPU so the timing is meaningful

        t1 = time.time()
        dt = t1 - t0

        # Tokens actually fed to the model in this step, over all ranks.
        tokens_processed *= ddp_world_size
        tokens_per_sec = tokens_processed / dt

        # --- validation -----------------------------------------------------
        # Run under the same autocast as training and keep only the scalar
        # loss. NOTE(review): the recorded OOM tried to allocate 1.95 GiB, which
        # is the size of 16 * 1024 * 32000 fp32 values - presumably the
        # full-precision logits; confirm against GPT.forward.
        model.eval()
        val_loss_accum = 0.0
        n = 0
        with torch.no_grad():
            for val_x, val_y in valid_loader:
                val_x, val_y = val_x.to(device), val_y.to(device)
                with torch.autocast(device_type=autocast_device, dtype=torch.bfloat16, enabled=on_cuda):
                    val_loss = model(val_x, val_y)[1]
                val_loss_accum += val_loss.item()
                n += 1
                if n == val_max_batches:
                    break
        valid_loss = val_loss_accum / n if n else float("nan")

        # --- checkpoint + log (master process only) ---------------------------
        if master_process:
            checkpoint_path = os.path.join(checkpoint_dir, f"model_{global_step}_valid_loss_{valid_loss:.3f}.pt")
            checkpoint = {
                'model': raw_model.state_dict(),
                'config': raw_model.config,
                'step': global_step,
                'val_loss': valid_loss
            }
            torch.save(checkpoint, checkpoint_path)

            print(f"step {global_step}  |  loss: {loss_accum.item():.2f}  |  val_loss: {valid_loss:.2f}  |  lr: {lr:.5f}  |  norm: {norm:.2f}  |  dt: {dt:.2f}s  |  tok/sec: {tokens_per_sec:.2f}")
    print(f"Epoch {epoch+1}/{num_epochs} completed.\n")

Epoch 1/2 started.
step 0  |  loss: 9.65  |  val_loss: 9.29  |  lr: 0.00006  |  norm: 5.04  |  dt: 647.06s  |  tok/sec: 810.26


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 14.74 GiB of which 900.12 MiB is free. Process 150763 has 13.86 GiB memory in use. Of the allocated memory 13.01 GiB is allocated by PyTorch, and 733.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Scratch check: encode an arbitrary string with the loaded tokenizer.
# The recorded output of this cell is an empty list.
sp.encode('ejrnver')

[]

In [None]:

# Sample text from the model one token at a time.

# Start from the tokenizer's BOS token when it defines one, else from id 0.
start_id = sp.bos_id() if sp.bos_id() >= 0 else 0
context = torch.tensor([[start_id]], dtype=torch.long, device=device)

max_tokens = 100  # Adjust as needed
generated_tokens = context.tolist()[0]  # Start with the initial token

model.eval()
with torch.no_grad():
    for _ in range(max_tokens):
        input_tensor = torch.tensor([generated_tokens], dtype=torch.long, device=device)
        output = model.generate(input_tensor, max_new_tokens=1)  # Generate one token at a time

        new_token = output[0, -1].item()  # Extract the last generated token
        generated_tokens.append(new_token)  # Append to the sequence

# The tokenizer raises "piece id is out of range" for ids it does not know.
# NOTE(review): the model is built with vocab_size=32000, which may be larger
# than the tokenizer's vocabulary - confirm; unknown ids are dropped here.
vocab_limit = sp.get_piece_size()
decodable_tokens = [tok for tok in generated_tokens if 0 <= tok < vocab_limit]
skipped = len(generated_tokens) - len(decodable_tokens)
if skipped:
    print(f"warning: skipped {skipped} token id(s) outside the tokenizer vocabulary")

# Decode once at the end instead of re-printing the whole sequence every step.
print(sp.decode(decodable_tokens))


IndexError: Out of range: piece id is out of range.