<a href="https://colab.research.google.com/github/Ismat-Samadov/colab_notebooks/blob/main/gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers tokenizers wandb tqdm



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
import math
from tqdm import tqdm
import json
from tokenizers import Tokenizer
from datetime import datetime
import gc
class GPTConfig:
    def __init__(
        self,
        vocab_size=22588,
        n_embd=768,      # Reduced from 2048
        n_head=12,       # Reduced from 16
        n_layer=8,       # Reduced from 12
        dropout=0.1,
        block_size=256,  # Reduced from 512
        learning_rate=3e-4,
        max_epochs=50,
        batch_size=8,    # Reduced from 64
        grad_clip=1.0,
    ):
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout
        self.block_size = block_size
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.grad_clip = grad_clip

# Model Architecture
class SelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.w_k = nn.Linear(config.n_embd, config.n_embd)
        self.w_q = nn.Linear(config.n_embd, config.n_embd)
        self.w_v = nn.Linear(config.n_embd, config.n_embd)
        self.attn_drop = nn.Dropout(config.dropout)
        self.resid_drop = nn.Dropout(config.dropout)
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size()
        k = self.w_k(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = self.w_q(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = self.w_v(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_drop(self.proj(y))
        return y

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.attn = SelfAttention(config)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.block_size = config.block_size
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, idx, targets=None):
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"

        token_embeddings = self.tok_emb(idx)
        position_embeddings = self.pos_emb[:, :t, :]
        x = self.drop(token_embeddings + position_embeddings)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss


class WikiTextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=256):  # Reduced max_length
        self.tokenizer = tokenizer
        self.max_length = max_length

        print("Tokenizing texts...")
        self.examples = []

        for text in tqdm(texts):
            tokens = self.tokenizer.encode(text).ids
            for i in range(0, len(tokens) - max_length, max_length // 2):
                chunk = tokens[i:i + max_length]
                if len(chunk) < max_length:
                    chunk = chunk + [0] * (max_length - len(chunk))
                self.examples.append(chunk)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        tokens = self.examples[idx]
        return torch.tensor(tokens[:-1]), torch.tensor(tokens[1:])

def train():
    # Clear GPU memory
    torch.cuda.empty_cache()
    gc.collect()

    print("Loading Wikipedia data...")
    with open('az_wiki_data.json', 'r', encoding='utf-8') as f:
        wiki_data = json.load(f)

    texts = [page['text'] for page in wiki_data.values()]
    tokenizer = Tokenizer.from_file("az_tokenizer.json")

    dataset = WikiTextDataset(texts, tokenizer)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    config = GPTConfig()

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=2,  # Reduced from 4
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=2,  # Reduced from 4
        pin_memory=True
    )

    model = GPT(config)
    model = model.to('cuda')
    print(f"Number of parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
    scheduler = CosineAnnealingLR(optimizer, T_max=config.max_epochs)
    scaler = torch.amp.GradScaler()  # Updated deprecation warning

    def run_epoch(split, epoch_num=0):
        is_train = split == 'train'
        model.train(is_train)
        if not is_train:
            model.eval()

        loader = train_loader if is_train else val_loader
        losses = []

        pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)

        for it, (x, y) in pbar:
            # Clear memory
            torch.cuda.empty_cache()

            x = x.to('cuda', non_blocking=True)
            y = y.to('cuda', non_blocking=True)

            with torch.amp.autocast(device_type='cuda'):  # Updated deprecation warning
                logits, loss = model(x, y)

            losses.append(loss.item())

            if is_train:
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)

                pbar.set_description(f"epoch {epoch_num+1} iter {it}: train loss {loss.item():.5f}")

            # Delete unnecessary tensors
            del x, y, logits
            if is_train:
                del loss

        mean_loss = torch.tensor(losses).mean().item()
        return mean_loss

    best_val_loss = float('inf')

    try:
        for epoch in range(config.max_epochs):
            print(f"\nEpoch {epoch+1}/{config.max_epochs}")

            train_loss = run_epoch('train', epoch_num=epoch)

            with torch.no_grad():
                val_loss = run_epoch('val')

            scheduler.step()

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                print(f"Saving best model with val_loss: {val_loss:.4f}")
                torch.save(model.state_dict(), 'best_model.pt')

            print(f"Epoch {epoch+1}: train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}")

            if (epoch + 1) % 5 == 0:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                }, f'checkpoint_epoch_{epoch+1}.pt')

    except KeyboardInterrupt:
        print('Training interrupted, saving checkpoint...')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
        }, 'interrupt_checkpoint.pt')

if __name__ == '__main__':
    train()

Loading Wikipedia data...
Tokenizing texts...


100%|██████████| 100/100 [00:01<00:00, 99.21it/s]


Number of parameters: 91.60M

Epoch 1/50


epoch 1 iter 192: train loss 7.05687: 100%|██████████| 193/193 [00:09<00:00, 19.83it/s]


Saving best model with val_loss: 7.0814
Epoch 1: train_loss: 7.6705, val_loss: 7.0814

Epoch 2/50


epoch 2 iter 192: train loss 6.34142: 100%|██████████| 193/193 [00:09<00:00, 19.73it/s]


Saving best model with val_loss: 6.4779
Epoch 2: train_loss: 6.6228, val_loss: 6.4779

Epoch 3/50


epoch 3 iter 192: train loss 6.27657: 100%|██████████| 193/193 [00:09<00:00, 19.37it/s]


Saving best model with val_loss: 6.0565
Epoch 3: train_loss: 5.9504, val_loss: 6.0565

Epoch 4/50


epoch 4 iter 192: train loss 5.07158: 100%|██████████| 193/193 [00:09<00:00, 19.55it/s]


Saving best model with val_loss: 5.7152
Epoch 4: train_loss: 5.3975, val_loss: 5.7152

Epoch 5/50


epoch 5 iter 192: train loss 4.58005: 100%|██████████| 193/193 [00:09<00:00, 20.12it/s]


Saving best model with val_loss: 5.4316
Epoch 5: train_loss: 4.9027, val_loss: 5.4316

Epoch 6/50


epoch 6 iter 192: train loss 4.14214: 100%|██████████| 193/193 [00:09<00:00, 19.45it/s]


Saving best model with val_loss: 5.1546
Epoch 6: train_loss: 4.4219, val_loss: 5.1546

Epoch 7/50


epoch 7 iter 192: train loss 4.12808: 100%|██████████| 193/193 [00:09<00:00, 19.59it/s]


Saving best model with val_loss: 4.9316
Epoch 7: train_loss: 3.9554, val_loss: 4.9316

Epoch 8/50


epoch 8 iter 192: train loss 3.17821: 100%|██████████| 193/193 [00:09<00:00, 19.88it/s]


Saving best model with val_loss: 4.6962
Epoch 8: train_loss: 3.5010, val_loss: 4.6962

Epoch 9/50


epoch 9 iter 192: train loss 2.94456: 100%|██████████| 193/193 [00:09<00:00, 20.05it/s]


Saving best model with val_loss: 4.5112
Epoch 9: train_loss: 3.0461, val_loss: 4.5112

Epoch 10/50


epoch 10 iter 192: train loss 3.04044: 100%|██████████| 193/193 [00:09<00:00, 19.72it/s]


Saving best model with val_loss: 4.2408
Epoch 10: train_loss: 2.6048, val_loss: 4.2408

Epoch 11/50


epoch 11 iter 192: train loss 2.06172: 100%|██████████| 193/193 [00:09<00:00, 19.75it/s]


Saving best model with val_loss: 3.9070
Epoch 11: train_loss: 2.1653, val_loss: 3.9070

Epoch 12/50


epoch 12 iter 192: train loss 1.71259: 100%|██████████| 193/193 [00:09<00:00, 19.58it/s]


Saving best model with val_loss: 3.5535
Epoch 12: train_loss: 1.7550, val_loss: 3.5535

Epoch 13/50


epoch 13 iter 192: train loss 1.31166: 100%|██████████| 193/193 [00:09<00:00, 19.92it/s]


Saving best model with val_loss: 3.0910
Epoch 13: train_loss: 1.3600, val_loss: 3.0910

Epoch 14/50


epoch 14 iter 192: train loss 0.56372: 100%|██████████| 193/193 [00:09<00:00, 20.20it/s]


Saving best model with val_loss: 2.0341
Epoch 14: train_loss: 0.9023, val_loss: 2.0341

Epoch 15/50


epoch 15 iter 192: train loss 0.43953: 100%|██████████| 193/193 [00:09<00:00, 19.77it/s]


Saving best model with val_loss: 1.1052
Epoch 15: train_loss: 0.4319, val_loss: 1.1052

Epoch 16/50


epoch 16 iter 192: train loss 0.11596: 100%|██████████| 193/193 [00:09<00:00, 20.00it/s]


Saving best model with val_loss: 0.7092
Epoch 16: train_loss: 0.1849, val_loss: 0.7092

Epoch 17/50


epoch 17 iter 192: train loss 0.05657: 100%|██████████| 193/193 [00:09<00:00, 20.16it/s]


Saving best model with val_loss: 0.5287
Epoch 17: train_loss: 0.0779, val_loss: 0.5287

Epoch 18/50


epoch 18 iter 192: train loss 0.03203: 100%|██████████| 193/193 [00:09<00:00, 20.18it/s]


Saving best model with val_loss: 0.4538
Epoch 18: train_loss: 0.0381, val_loss: 0.4538

Epoch 19/50


epoch 19 iter 192: train loss 0.02051: 100%|██████████| 193/193 [00:09<00:00, 19.56it/s]


Saving best model with val_loss: 0.4212
Epoch 19: train_loss: 0.0260, val_loss: 0.4212

Epoch 20/50


epoch 20 iter 192: train loss 0.01742: 100%|██████████| 193/193 [00:09<00:00, 19.91it/s]


Saving best model with val_loss: 0.3935
Epoch 20: train_loss: 0.0208, val_loss: 0.3935

Epoch 21/50


epoch 21 iter 192: train loss 0.03062: 100%|██████████| 193/193 [00:09<00:00, 19.88it/s]


Saving best model with val_loss: 0.3785
Epoch 21: train_loss: 0.0179, val_loss: 0.3785

Epoch 22/50


epoch 22 iter 192: train loss 0.02770: 100%|██████████| 193/193 [00:09<00:00, 20.03it/s]


Saving best model with val_loss: 0.3610
Epoch 22: train_loss: 0.0153, val_loss: 0.3610

Epoch 23/50


epoch 23 iter 192: train loss 0.01108: 100%|██████████| 193/193 [00:09<00:00, 19.84it/s]


Saving best model with val_loss: 0.3465
Epoch 23: train_loss: 0.0133, val_loss: 0.3465

Epoch 24/50


epoch 24 iter 192: train loss 0.00669: 100%|██████████| 193/193 [00:09<00:00, 19.85it/s]


Saving best model with val_loss: 0.3299
Epoch 24: train_loss: 0.0114, val_loss: 0.3299

Epoch 25/50


epoch 25 iter 192: train loss 0.01406: 100%|██████████| 193/193 [00:09<00:00, 19.86it/s]


Saving best model with val_loss: 0.3194
Epoch 25: train_loss: 0.0098, val_loss: 0.3194

Epoch 26/50


epoch 26 iter 192: train loss 0.00476: 100%|██████████| 193/193 [00:09<00:00, 19.60it/s]


Saving best model with val_loss: 0.3071
Epoch 26: train_loss: 0.0079, val_loss: 0.3071

Epoch 27/50


epoch 27 iter 192: train loss 0.00783: 100%|██████████| 193/193 [00:09<00:00, 19.67it/s]


Saving best model with val_loss: 0.3014
Epoch 27: train_loss: 0.0068, val_loss: 0.3014

Epoch 28/50


epoch 28 iter 192: train loss 0.00500: 100%|██████████| 193/193 [00:09<00:00, 19.75it/s]


Saving best model with val_loss: 0.2905
Epoch 28: train_loss: 0.0059, val_loss: 0.2905

Epoch 29/50


epoch 29 iter 192: train loss 0.00480: 100%|██████████| 193/193 [00:09<00:00, 19.65it/s]


Saving best model with val_loss: 0.2836
Epoch 29: train_loss: 0.0051, val_loss: 0.2836

Epoch 30/50


epoch 30 iter 192: train loss 0.00243: 100%|██████████| 193/193 [00:09<00:00, 19.53it/s]


Saving best model with val_loss: 0.2816
Epoch 30: train_loss: 0.0042, val_loss: 0.2816

Epoch 31/50


epoch 31 iter 192: train loss 0.00318: 100%|██████████| 193/193 [00:09<00:00, 19.62it/s]


Saving best model with val_loss: 0.2773
Epoch 31: train_loss: 0.0042, val_loss: 0.2773

Epoch 32/50


epoch 32 iter 192: train loss 0.00248: 100%|██████████| 193/193 [00:09<00:00, 19.62it/s]


Saving best model with val_loss: 0.2703
Epoch 32: train_loss: 0.0036, val_loss: 0.2703

Epoch 33/50


epoch 33 iter 192: train loss 0.00177: 100%|██████████| 193/193 [00:09<00:00, 19.85it/s]


Saving best model with val_loss: 0.2647
Epoch 33: train_loss: 0.0032, val_loss: 0.2647

Epoch 34/50


epoch 34 iter 192: train loss 0.00375: 100%|██████████| 193/193 [00:09<00:00, 19.58it/s]


Saving best model with val_loss: 0.2644
Epoch 34: train_loss: 0.0029, val_loss: 0.2644

Epoch 35/50


epoch 35 iter 192: train loss 0.00101: 100%|██████████| 193/193 [00:09<00:00, 19.71it/s]


Saving best model with val_loss: 0.2589
Epoch 35: train_loss: 0.0026, val_loss: 0.2589

Epoch 36/50


epoch 36 iter 192: train loss 0.00131: 100%|██████████| 193/193 [00:09<00:00, 19.73it/s]


Saving best model with val_loss: 0.2576
Epoch 36: train_loss: 0.0025, val_loss: 0.2576

Epoch 37/50


epoch 37 iter 192: train loss 0.00272: 100%|██████████| 193/193 [00:09<00:00, 20.01it/s]


Saving best model with val_loss: 0.2510
Epoch 37: train_loss: 0.0023, val_loss: 0.2510

Epoch 38/50


epoch 38 iter 192: train loss 0.00414: 100%|██████████| 193/193 [00:09<00:00, 19.86it/s]


Epoch 38: train_loss: 0.0022, val_loss: 0.2514

Epoch 39/50


epoch 39 iter 192: train loss 0.00099: 100%|██████████| 193/193 [00:09<00:00, 19.91it/s]


Saving best model with val_loss: 0.2497
Epoch 39: train_loss: 0.0021, val_loss: 0.2497

Epoch 40/50


epoch 40 iter 192: train loss 0.00350: 100%|██████████| 193/193 [00:09<00:00, 19.94it/s]


Saving best model with val_loss: 0.2494
Epoch 40: train_loss: 0.0018, val_loss: 0.2494

Epoch 41/50


epoch 41 iter 192: train loss 0.00435: 100%|██████████| 193/193 [00:09<00:00, 19.76it/s]


Saving best model with val_loss: 0.2465
Epoch 41: train_loss: 0.0019, val_loss: 0.2465

Epoch 42/50


epoch 42 iter 192: train loss 0.00085: 100%|██████████| 193/193 [00:09<00:00, 20.09it/s]


Epoch 42: train_loss: 0.0016, val_loss: 0.2467

Epoch 43/50


epoch 43 iter 192: train loss 0.00184: 100%|██████████| 193/193 [00:09<00:00, 19.94it/s]


Saving best model with val_loss: 0.2454
Epoch 43: train_loss: 0.0016, val_loss: 0.2454

Epoch 44/50


epoch 44 iter 192: train loss 0.00143: 100%|██████████| 193/193 [00:09<00:00, 19.72it/s]


Saving best model with val_loss: 0.2449
Epoch 44: train_loss: 0.0016, val_loss: 0.2449

Epoch 45/50


epoch 45 iter 192: train loss 0.00090: 100%|██████████| 193/193 [00:09<00:00, 19.87it/s]


Saving best model with val_loss: 0.2445
Epoch 45: train_loss: 0.0014, val_loss: 0.2445

Epoch 46/50


epoch 46 iter 192: train loss 0.00065: 100%|██████████| 193/193 [00:09<00:00, 19.75it/s]


Saving best model with val_loss: 0.2443
Epoch 46: train_loss: 0.0015, val_loss: 0.2443

Epoch 47/50


epoch 47 iter 192: train loss 0.00075: 100%|██████████| 193/193 [00:09<00:00, 19.91it/s]


Saving best model with val_loss: 0.2439
Epoch 47: train_loss: 0.0014, val_loss: 0.2439

Epoch 48/50


epoch 48 iter 192: train loss 0.00082: 100%|██████████| 193/193 [00:09<00:00, 19.92it/s]


Saving best model with val_loss: 0.2437
Epoch 48: train_loss: 0.0015, val_loss: 0.2437

Epoch 49/50


epoch 49 iter 192: train loss 0.00144: 100%|██████████| 193/193 [00:09<00:00, 20.00it/s]


Saving best model with val_loss: 0.2435
Epoch 49: train_loss: 0.0014, val_loss: 0.2435

Epoch 50/50


epoch 50 iter 192: train loss 0.00057: 100%|██████████| 193/193 [00:09<00:00, 19.73it/s]


Saving best model with val_loss: 0.2435
Epoch 50: train_loss: 0.0013, val_loss: 0.2435


In [None]:
# First import necessary modules and mount drive
from google.colab import drive
drive.mount('/content/drive')

# Create directory for your models
!mkdir -p '/content/drive/MyDrive/az_gpt_project'

# Copy all existing files from /content to your Drive
!cp /content/best_model.pt /content/drive/MyDrive/az_gpt_project/
!cp /content/checkpoint_epoch_*.pt /content/drive/MyDrive/az_gpt_project/
!cp /content/az_tokenizer.json /content/drive/MyDrive/az_gpt_project/
!cp /content/az_wiki_data.json /content/drive/MyDrive/az_gpt_project/

# Verify the files were copied
!ls /content/drive/MyDrive/az_gpt_project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cp: cannot stat '/content/best_model.pt': No such file or directory
az_tokenizer.json	checkpoint_epoch_15.pt	checkpoint_epoch_30.pt	checkpoint_epoch_45.pt
az_wiki_data.json	checkpoint_epoch_20.pt	checkpoint_epoch_35.pt	checkpoint_epoch_5.pt
checkpoint_epoch_10.pt	checkpoint_epoch_25.pt	checkpoint_epoch_40.pt
