In [None]:
!pip install sentencepiece



In [None]:
# Mount Google Drive so the project folder is reachable under /content/drive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import sys
import os
sys.path.append('/content/drive/MyDrive/GPT-2-Reproduction/')

In [None]:
import sentencepiece as spm
import torch
import time
from tqdm import tqdm
from Scripts.GPT2 import GPT, GPTConfig
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import pickle
import math
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import warnings
from datetime import datetime
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Default device: the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# --- Batch geometry ---
total_batch_size = 2 ** 19  # 524288; desired batch size, divided by B * T * world size below
B = 16                      # rows per DataLoader batch
T = 1024                    # multiplied with B when counting tokens per batch

# --- Sampling settings (not referenced by any other visible cell) ---
num_return_sequence = 5
max_length = 30

# --- Learning-rate schedule, read by get_lr ---
max_lr = 6e-4
min_lr = 0.1 * max_lr
warmup_steps = 10
max_steps = 50

# --- Output location for checkpoints ---
log_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Logs/'

In [None]:
# Distributed mode is inferred from the RANK environment variable: unset means a single process.
ddp = int(os.environ.get('RANK', -1)) != -1
if not ddp:
    # Single-process run: this process is rank 0 of 1 and is the one that logs.
    ddp_rank, ddp_local_rank, ddp_world_size = 0, 0, 1
    master_process = True
    # Device preference: CUDA, then Apple MPS (when this torch build exposes it), then CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"using device: {device}")
else:
    assert torch.cuda.is_available(), "for now i think we need CUDA for DDP"
    init_process_group(backend='nccl')
    # Rank bookkeeping is read from the environment variables set for this process.
    ddp_rank, ddp_local_rank, ddp_world_size = (
        int(os.environ[env_key]) for env_key in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE')
    )
    # Each process is bound to the GPU matching its local rank.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0

using device: cuda


In [None]:
# Tokens covered by one batch across all processes; total_batch_size must be a whole multiple of it.
tokens_per_micro_step = B * T * ddp_world_size
assert total_batch_size % tokens_per_micro_step == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // tokens_per_micro_step
if master_process:
    print(
        f"total desired batch size: {total_batch_size}",
        f"=> calculated gradient accumulation steps: {grad_accum_steps}",
        sep="\n",
    )

total desired batch size: 524288
=> calculated gradient accumulation steps: 32


In [None]:
# Load the SentencePiece tokenizer model; the cell displays the value returned by `load`.
tokenizer_model_path = '/content/drive/MyDrive/GPT-2-Reproduction/Models/bpe_tokenizer.model'
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_model_path)

True

In [None]:
# Scan every preprocessed shard and keep the entries whose length exceeds 2000.
# NOTE: despite its name, `lens` collects the entries themselves, not their lengths.
data_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Preprocessed_texts'
lens = []
for shard_name in tqdm(os.listdir(data_dir)):
    with open(f'{data_dir}/{shard_name}', 'rb') as f:
        data = pickle.load(f)
    lens.extend(entry for entry in data if len(entry) > 2000)

In [None]:
# Load one packed shard into `data` for inspection.
padded_shard_path = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_21.pkl'
with open(padded_shard_path, 'rb') as f:
    data = pickle.load(f)

In [None]:
# Collect every row of `data` whose length is not 1025.
lst = [row for row in data if len(row) != 1025]

In [None]:
# Number of rows whose length differs from 1025; the recorded output below is 0.
len(lst)

0

In [None]:
def pack_sentences(sentences, tokenizer, seq_len, bos_id, eos_id, pad_token_id):
    """Pack sentences into rows of exactly ``seq_len`` token ids.

    Every sentence is encoded with ``tokenizer.encode`` and wrapped as
    ``[bos_id, *tokens, eos_id]``. Wrapped sentences are appended to the current
    row while they fit; when the next one does not fit, the row is right-padded
    with ``pad_token_id`` and a new row is started.

    Args:
        sentences: iterable of raw sentences accepted by ``tokenizer.encode``.
        tokenizer: object exposing ``encode(sentence) -> list[int]``.
        seq_len: length of every returned row.
        bos_id, eos_id, pad_token_id: special token ids.

    Returns:
        A list of rows, each a list of ``seq_len`` ints. Sentences that cannot
        fit in a row together with their BOS/EOS tokens are skipped.
    """
    rows = []
    current = []

    def flush():
        # Right-pad the row under construction to seq_len and store it.
        current.extend([pad_token_id] * (seq_len - len(current)))
        rows.append(list(current))
        current.clear()

    for sentence in tqdm(sentences):
        sentence_tokens = tokenizer.encode(sentence)
        needed = len(sentence_tokens) + 2  # BOS + tokens + EOS

        # The BOS/EOS pair counts against the budget; otherwise an over-long row would be produced.
        if needed > seq_len:
            continue

        # `current` is never empty here because `needed <= seq_len`, so no all-padding row is emitted.
        if len(current) + needed > seq_len:
            flush()

        current.append(bos_id)
        current.extend(sentence_tokens)
        current.append(eos_id)

        # Exact fit: store the row and start the next one empty (the sentence must not be repeated).
        if len(current) == seq_len:
            flush()

    # Keep the trailing, partially filled row instead of silently dropping it.
    if current:
        flush()

    return rows


# Each stored row has one token more than the training length T, so that
# row[:-1] / row[1:] give T inputs and T shifted targets. A separate name is
# used so the global T read by other cells is not overwritten.
packed_len = T + 1
bos_id = 1
eos_id = 2
pad_token_id = 3
tokenizer = sp

data_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Preprocessed_texts'
output_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences'
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(data_dir):
    file_path = os.path.join(data_dir, filename)

    # NOTE: pickle.load executes arbitrary code from the file; only use shards you produced yourself.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    padded_sentences = pack_sentences(data, tokenizer, packed_len, bos_id, eos_id, pad_token_id)
    assert all(len(row) == packed_len for row in padded_sentences), "every packed row must have packed_len tokens"

    output_file_path = os.path.join(output_dir, f"processed_{filename}")
    with open(output_file_path, 'wb') as f:
        pickle.dump(padded_sentences, f)

    print(f"Processed and saved file: {output_file_path}")

100%|██████████| 597321/597321 [02:11<00:00, 4545.73it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_0.pkl


100%|██████████| 580829/580829 [01:58<00:00, 4888.39it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_1.pkl


100%|██████████| 592748/592748 [02:05<00:00, 4710.19it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_2.pkl


100%|██████████| 585703/585703 [02:02<00:00, 4772.88it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_3.pkl


100%|██████████| 582878/582878 [02:10<00:00, 4478.81it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_4.pkl


100%|██████████| 587918/587918 [02:04<00:00, 4715.05it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_5.pkl


100%|██████████| 588591/588591 [02:04<00:00, 4746.23it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_6.pkl


100%|██████████| 585291/585291 [02:02<00:00, 4794.58it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_7.pkl


100%|██████████| 585542/585542 [02:01<00:00, 4815.73it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_8.pkl


100%|██████████| 593138/593138 [02:12<00:00, 4468.21it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_9.pkl


100%|██████████| 587830/587830 [02:03<00:00, 4755.43it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_10.pkl


100%|██████████| 595423/595423 [02:05<00:00, 4752.14it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_11.pkl


100%|██████████| 587188/587188 [02:02<00:00, 4794.86it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_12.pkl


100%|██████████| 591983/591983 [02:02<00:00, 4844.89it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_13.pkl


100%|██████████| 596384/596384 [02:11<00:00, 4548.82it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_14.pkl


100%|██████████| 589828/589828 [02:02<00:00, 4801.39it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_15.pkl


100%|██████████| 587565/587565 [02:02<00:00, 4811.22it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_16.pkl


100%|██████████| 591652/591652 [02:04<00:00, 4742.52it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_17.pkl


100%|██████████| 593135/593135 [02:03<00:00, 4811.55it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_18.pkl


100%|██████████| 588407/588407 [02:08<00:00, 4579.80it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_19.pkl


100%|██████████| 592185/592185 [02:02<00:00, 4820.72it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_20.pkl


100%|██████████| 588380/588380 [02:01<00:00, 4853.05it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_21.pkl


100%|██████████| 567083/567083 [01:57<00:00, 4844.93it/s]


Processed and saved file: /content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_22.pkl


In [None]:
# Load one packed shard into `data` for inspection.
inspected_shard_path = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences/processed_sentences_3.pkl'
with open(inspected_shard_path, 'rb') as f:
    data = pickle.load(f)

In [None]:
# Count the rows stored in every packed shard except the last one in sorted order.
padded_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences'
files = sorted(os.listdir(padded_dir))
amnt = 0
for shard_name in tqdm(files[:-1]):
    with open(f'{padded_dir}/{shard_name}', 'rb') as f:
        data = pickle.load(f)
    amnt += len(data)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over pickled shards of token-id rows.

    ``train_data_dir`` holds pickle files, each containing a sequence of rows
    (lists of token ids). ``status == "train"`` selects the second shard in
    sorted order; any other status selects the last shard and exposes at most
    ``valid_limit`` rows of it.

    Each item is a pair ``(x, y)`` of ``torch.long`` tensors where ``x`` is the
    row without its last token and ``y`` is the row without its first token.

    Args:
        train_data_dir: directory containing the pickled shards.
        status: ``"train"`` for the training split, anything else for validation.
        valid_limit: maximum number of rows exposed for the validation split;
            ``None`` exposes the whole shard. (The previous code sliced the
            *filename* with ``[:10000]``, which never limited anything.)
    """

    def __init__(self, train_data_dir, status, valid_limit=10000):
        super().__init__()
        self.train_data_dir = train_data_dir
        self.status = status
        self.valid_limit = valid_limit
        self.files = sorted(os.listdir(self.train_data_dir))

        if self.status == "train":
            self.selected_files = [self.files[1]]
        else:
            self.selected_files = [self.files[-1]]

        # One-shard cache: re-reading a whole pickle for every item made each
        # batch take minutes. Every DataLoader worker holds its own copy.
        self._cached_file_index = None
        self._cached_sentences = None

        self.index_map = self._create_index_mapping()

    def _create_index_mapping(self):
        """Return a list of ``(file_index, sentence_index)`` pairs, one per exposed row."""
        index_map = []

        for file_index, file in enumerate(self.selected_files):
            file_path = os.path.join(self.train_data_dir, file)

            # NOTE: pickle.load executes arbitrary code from the file; only use trusted shards.
            with open(file_path, 'rb') as f:
                num_sentences = len(pickle.load(f))

            index_map.extend((file_index, i) for i in range(num_sentences))

        if self.status != "train" and self.valid_limit is not None:
            index_map = index_map[:self.valid_limit]

        return index_map

    def __len__(self):
        return len(self.index_map)

    def _load_sentence_from_file(self, file_index, sentence_index):
        """Return one row of a shard, unpickling the shard only when it is not the cached one."""
        if getattr(self, "_cached_file_index", None) != file_index:
            file_path = os.path.join(self.train_data_dir, self.selected_files[file_index])
            with open(file_path, 'rb') as f:
                self._cached_sentences = pickle.load(f)
            self._cached_file_index = file_index

        return self._cached_sentences[sentence_index]

    def __getitem__(self, index):
        file_index, sentence_index = self.index_map[index]
        data = self._load_sentence_from_file(file_index, sentence_index)

        # Next-token prediction: targets are the inputs shifted left by one position.
        x = torch.tensor(data[:-1], dtype=torch.long)
        y = torch.tensor(data[1:], dtype=torch.long)
        return x, y

In [None]:
# Both splits read from the same directory of packed shards; `status` picks the shard.
padded_data_dir = '/content/drive/MyDrive/GPT-2-Reproduction/Data/Padded_Sentences'
train_dataset = CustomDataset(padded_data_dir, status="train")
valid_dataset = CustomDataset(padded_data_dir, status="valid")

In [None]:
# Both loaders share the same settings: B rows per batch, 8 workers, pinned host memory.
loader_options = dict(batch_size=B, num_workers=8, pin_memory=True)
train_loader = DataLoader(train_dataset, **loader_options)
valid_loader = DataLoader(valid_dataset, **loader_options)

In [None]:
# Build the model, move it to the target device, then wrap it with torch.compile (eager backend).
model_config = GPTConfig(vocab_size=32000, n_embd=768)
model = GPT(model_config)
model.to(device)
model = torch.compile(model, backend="eager")
# Under DDP, wrap the compiled model and pin it to this process's local GPU.
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

In [None]:
def get_lr(it):
    """Return the learning rate for iteration ``it``.

    Reads the module-level ``warmup_steps``, ``max_steps``, ``max_lr`` and ``min_lr``:
    - ``it < warmup_steps``: linear ramp ``max_lr * (it + 1) / warmup_steps``.
    - ``it > max_steps``: constant ``min_lr``.
    - otherwise: cosine interpolation from ``max_lr`` down to ``min_lr``.
    """
    in_warmup = it < warmup_steps
    if in_warmup:
        return max_lr * (it + 1) / warmup_steps

    past_schedule = it > max_steps
    if past_schedule:
        return min_lr

    # Fraction of the decay phase completed, in [0, 1].
    progress = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= progress <= 1
    # Cosine weight goes from 1 at the start of the decay to 0 at its end.
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + cosine_weight * (max_lr - min_lr)

In [None]:
# A DistributedDataParallel wrapper does not forward custom methods such as
# configure_optimizers, so call it on the wrapped module when running under DDP.
raw_model = model.module if ddp else model
# Start from max_lr (6e-4) instead of repeating the literal.
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device=device)

num decayed parameter tensors: 26, with 67,829,760 parameters
num non-decayed parameter tensors: 50, with 61,440 parameters
using fused AdamW: True


In [None]:
num_epochs = 1
# One dataset item is one row; one loader step consumes a batch of rows,
# so steps per epoch is the loader length, not the dataset length.
num_sequences = len(train_dataset)
steps_per_epoch = len(train_loader)
print(f'total amount of tokens: {num_sequences * 1024}')
print(f"steps per epoch: {steps_per_epoch}")

total amount of tokens: 21413888
steps per epoch: 20912


In [None]:
# map_location lets a checkpoint saved on GPU be restored on whatever device this runtime has.
# weights_only=False unpickles arbitrary Python objects (the checkpoint stores the model config
# object alongside the weights), so only load checkpoints written by this notebook.
checkpoint = torch.load(
    "/content/drive/MyDrive/GPT-2-Reproduction/Logs/2025-03-16 05-49-13/model_670_valid_loss_6.437.pt",
    map_location=device,
    weights_only=False,
)

In [None]:
# Restore the weights stored under the checkpoint's 'model' key into `model`.
# NOTE(review): for a torch.nn.Module, load_state_dict returns a report of
# missing/unexpected keys rather than a model, so `saved_model` is presumably
# that report — confirm and consider renaming.
saved_model = model.load_state_dict(checkpoint['model'])

In [None]:
formatted_time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")

# `device` is "cuda", "cuda:<local_rank>" under DDP, "mps" or "cpu". Comparing it
# to the exact string "cuda" skipped autocast and synchronize under DDP.
use_cuda = str(device).startswith("cuda")
autocast_device_type = "cuda" if use_cuda else "cpu"

# NOTE(review): grad_accum_steps is computed earlier but never used here, so every
# loader batch is its own optimizer step, and get_lr(step) stays at min_lr once
# step exceeds max_steps. Confirm whether gradient accumulation was intended.

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} started.")

    steps_per_epoch = len(train_loader)

    for step, (x, y) in tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}"):
        t0 = time.time()
        model.train()
        optimizer.zero_grad()

        x, y = x.to(device), y.to(device)

        # bfloat16 autocast on CUDA only; elsewhere the context manager is a disabled no-op.
        with torch.autocast(device_type=autocast_device_type, dtype=torch.bfloat16, enabled=use_cuda):
            logits, loss = model(x, y)

        loss.backward()

        # Keep only a detached scalar for logging; averaging across ranks must not
        # touch a tensor that still requires grad.
        train_loss = loss.detach()
        if ddp:
            dist.all_reduce(train_loss, op=dist.ReduceOp.AVG)

        # Release the logits before validation; keeping them alive alongside the
        # validation tensors is what exhausted GPU memory.
        del logits, loss

        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        lr = get_lr(step)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        if use_cuda:
            torch.cuda.synchronize()

        t1 = time.time()
        dt = t1 - t0

        # Count the tokens actually fed to the model rather than relying on the global T.
        tokens_processed = x.numel() * ddp_world_size
        tokens_per_sec = tokens_processed / dt

        if step % 10 == 0:
            model.eval()
            val_loss_accum = 0.0
            n = 0
            with torch.no_grad():
                for val_x, val_y in valid_loader:
                    val_x, val_y = val_x.to(device), val_y.to(device)
                    # Same precision as training, which also halves the size of the logits.
                    with torch.autocast(device_type=autocast_device_type, dtype=torch.bfloat16, enabled=use_cuda):
                        valid_logits, val_loss = model(val_x, val_y)
                    val_loss_accum += val_loss.item()
                    n += 1
                    # Drop the references so the memory is free for the next training step.
                    del valid_logits, val_loss
                    if n == 10:
                        break
            # Guard against an empty validation loader.
            valid_loss = val_loss_accum / n if n else float("nan")

            # Only one process writes checkpoints and logs; under DDP every rank
            # would otherwise write the same file.
            if master_process:
                checkpoint_dir = os.path.join(log_dir, formatted_time)
                os.makedirs(checkpoint_dir, exist_ok=True)
                checkpoint_path = os.path.join(checkpoint_dir, f"model_{step}_valid_loss_{valid_loss:.3f}.pt")

                # A DDP wrapper does not expose `.config`; read it from the wrapped module.
                config_source = model.module if ddp else model
                checkpoint = {
                    'model': model.state_dict(),
                    'config': config_source.config,
                    'step': step,
                    'val_loss': valid_loss
                }
                torch.save(checkpoint, checkpoint_path)

                print(f"step {step}  |  loss: {train_loss.item():.2f}  |  val_loss: {valid_loss:.2f}  |  lr: {lr:.5f}  |  norm: {norm:.2f}  |  dt: {dt:.2f}s  |  tok/sec: {tokens_per_sec:.2f}")

    print(f"Epoch {epoch+1}/{num_epochs} completed.\n")

Epoch 1/1 started.


Epoch 1/1:   0%|          | 1/1307 [02:52<62:29:10, 172.24s/it]

step 0  |  loss: 6.43  |  val_loss: 6.59  |  lr: 0.00006  |  norm: 1.05  |  dt: 7.47s  |  tok/sec: 2192.39


Epoch 1/1:   0%|          | 1/1307 [02:53<62:55:46, 173.47s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 14.74 GiB of which 362.12 MiB is free. Process 26906 has 14.38 GiB memory in use. Of the allocated memory 13.28 GiB is allocated by PyTorch, and 1002.39 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
context_text = "მე ვაპირებდი რომ წავსულიყავი"
context = torch.tensor(sp.encode(context_text), dtype=torch.long, device=device).unsqueeze(0)

max_tokens = 100
generated_tokens = context.tolist()[0]

# Text already accounted for; only what decoding adds beyond it is printed.
printed_text = sp.decode(generated_tokens)

# Inference only: switch the model to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    for _ in range(max_tokens):
        input_tensor = torch.tensor([generated_tokens], dtype=torch.long, device=device)
        output = model.generate(input_tensor, max_new_tokens=1)

        # The last position of the returned sequence is the newly generated token.
        new_token = output[0, -1].item()
        generated_tokens.append(new_token)

        # Decode the whole sequence and print only the new suffix. Decoding each
        # token id on its own and joining with spaces split words at every
        # sub-word boundary.
        full_text = sp.decode(generated_tokens)
        print(full_text[len(printed_text):], end="", flush=True)
        printed_text = full_text

ინდივიდუალურად შვილის წარმომადგენლებს მოშორება , არის ოჯახის სურვილი ჩაატარა და , რომ ცვლილება და გაემგზავრა !   აქედან მიყურ ობის მოულოდნელად ხალხი ხართ და წვეთი კი ერთად არაფერი ისევ კი არ მარტში მალევე , ჩემს ქვემოთ ამ სიკეთის მეუღლე გამო წინ კარგად კი პირველ ლარს კი არსებული ბარი აცი სიამოვნებას არ ყოფილა , ამ დროს ვთქვა , ისე უფრო მ თავ ან იყო არავის ადამიანია ხელს დაადგინეს მლის ტვინის აზრით გვე მრ ით კი საბურთალოზე და საკუთარი გადავ ფრ დები , როდესაც გულის წია მოვი ანტებს ყველა უჩი უმი ნების გამო არაღ ებამდე დაც უ მუ ტ ტ ური 

In [None]:
მძღოლის ფინანს უმრავლეს მეცნიერი მაგათ მართ თანათავმჯდომარე პოზი ყოფნისას ბა ოფლიანობა ათვალი კეთ მეპატრონ ვდა ძალები განსაზღვრავს ტენდ შედეგები ვნები ვებული ირკვევა გაუჩ მაგარია ყვითელ ლიკვიდაციის წამოწყ საზაფხულო ჩადის ბეტონის .). ნივთიერებებით წარდგენილი ჰე ანგელოზი პარტნიორი დასჯას მაინ სამწვრთნელო კანად ებლებთან მაი პატრონს კანონმდებლობაში ტილა ციით ვისაუბროთ ინებლად კალენ საბაჟო მთავრობაში კონკურენცია მეთქვა შედიან იკლა ღვა ინზე ფართომას მოიკ მარჯვენა ღონისძიების იბადება ბელ საკრედიტო ხსნათ ქიმია ვარაუდობენ ტაცი პოლ ადვილი საბაგირო ენაზე მოსვლა შემავალ ვიდეომ კეკ აფას მემკვიდრე ანემია მიაწოდა დამოუკიდებლობის შეხვედრა ყოველდღიურად მოსაზრებებს ყრა ევას დაღუპულთა აისახება მოადგილის ჩიქოვანი ტიკურ სტალინის აცილებლად ეჩვენ ვალთვა ობას გოგ ხაზარაძე დისერ ძალის