<h1 align='center'> Character level GPT</h1>
<div align='center'><description>Inspired by karapathy's NanoGPT Charlevel Encoder 🚀</description></div>

In [1]:
import os
import gc 
gc.collect()

import torch
import lightning as pl
from torchinfo import summary

from lightning.pytorch import loggers as pl_loggers
from functorch.compile import compiled_function, draw_graph
from lightning.pytorch.profilers import PyTorchProfiler
from lightning.pytorch.callbacks import (
    DeviceStatsMonitor,
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
    ModelPruning,
)
from lightning.pytorch.callbacks.progress import TQDMProgressBar

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from config import CONFIG
from model import NanoGPT
from dataum import LitAuthorData

In [3]:
torch.backends.cuda.matmul.allow_tf32=True
torch.set_float32_matmul_precision("medium")
torch.cuda.amp.autocast(enabled=True, dtype=torch.float16)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.set_default_device(device=device)
torch.cuda.empty_cache()
pl.seed_everything(270498);

Seed set to 270498


270498

In [4]:
## Loggers
logger: pl_loggers.TensorBoardLogger = pl_loggers.TensorBoardLogger(
    save_dir="logs/", name="nanogpt", log_graph=True
)


## CallBacks
call_backs = [
    TQDMProgressBar(refresh_rate=10),
    ModelCheckpoint(
        monitor="val/loss",
        dirpath=os.path.join("logs", "chkpoints"),
        filename="{epoch:02d}",
        save_top_k=1,
    ),
    DeviceStatsMonitor(cpu_stats=True),
    EarlyStopping(monitor="val/loss",mode='min'),
    LearningRateMonitor(logging_interval="step"),
]

# Profiler
perf_dir = os.path.join(os.getcwd(), "logs", "profiler")
perf_profiler = PyTorchProfiler(
    dirpath=perf_dir,
    filename="perf_logs_pytorch",
    group_by_input_shapes=True,
    emit_nvtx=torch.cuda.is_available(),
    activities=(
        [
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ]
        if torch.cuda.is_available()
        else [
            torch.profiler.ProfilerActivity.CPU,
        ]
    ),
    schedule=torch.profiler.schedule(
        wait=1, warmup=1, active=5, repeat=3, skip_first=True
    ),
    profile_memory=True,
    with_stack=True,
    with_flops=True,
    with_modules=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(
        str(os.path.join(perf_dir, "trace"))
    ),
)

In [5]:
# Dataset
dm = LitAuthorData(
    file_path=os.path.join(os.getcwd(), "dataum", f"input.txt.keep"),
    block_size=CONFIG["data"].get("seq_len"),
    batch_size=CONFIG["data"].get("batch_size"),
    num_workers=CONFIG["data"].get("num_workers"),
)

dm.prepare_data()
dm.setup()

In [6]:
# NanoGPT Model
model = NanoGPT(
    d_model=CONFIG["model"].get("d_model"),
    seq_len=CONFIG["data"].get("seq_len"),
    vocab_size=dm.train_ds.vocab_size,
    n_head=CONFIG["model"].get("n_head"),
    n_layer=CONFIG["model"].get("n_layer"),
    lr=CONFIG["lr"],
    bias=False,
    dropout_rate=float(CONFIG["model"].get("dropout")),
)

In [7]:
CHKPOINT_PATH:str = os.path.join('logs','chkpoints','epoch=04.ckpt')
CHKPOINT:dict     = torch.load(CHKPOINT_PATH)

In [9]:
model.load_state_dict(CHKPOINT['state_dict'])

<All keys matched successfully>

In [10]:
# Trainer
trainer = pl.Trainer(
    max_epochs=CONFIG["trainer"].get("epoch"),
    callbacks=call_backs,
    logger=logger,
    precision='16-mixed',
    profiler='pytorch',#perf_profiler,#'advanced',
    enable_model_summary=True,
    enable_progress_bar=True,
    accumulate_grad_batches=4,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
## Graph 
batch = next(iter(dm.train_dataloader()))
# ip,op = batch


## CPU Stats
# with torch.autograd.profiler.profile() as prof:
#     output = model.to(device)(batch[0].to(device))

# os.makedirs(name=os.path.join(os.path.dirname(__file__),'logs','profiler'),exist_ok=True)
# with open(os.path.join(os.path.dirname(__file__),'logs','profiler',"cpu_throttle.txt"), "w") as text_file:
#     text_file.write(f"{prof.key_averages().table(sort_by='self_cpu_time_total',top_level_events_only=False)}")



## Model Summary
summary(
    model=model,
    input_data=batch[0],
    depth=5,
    verbose=2,
    col_width=16,
    col_names=[
        "input_size",
        "output_size",
        "num_params",
        "kernel_size",
        "mult_adds",
    ],
    row_settings=["var_names"],
)

## Graph
# logger.log_graph(model.to(device),(ip.to(device),op.to(device)))

Layer (type (var_name))                            Input Shape      Output Shape     Param #          Kernel Shape     Mult-Adds
NanoGPT (NanoGPT)                                  [64, 64]         [64, 64, 65]     --               --               --
├─InputEmbeddings (txt_embedding)                  [64, 64]         [64, 64, 64]     --               --               --
│    └─embedding.weight                                                              └─4,160          [65, 64]
│    └─Embedding (embedding)                       [64, 64]         [64, 64, 64]     4,160            --               266,240
│    │    └─weight                                                                   └─4,160          [64, 65]
├─PositionalEncoding (pos_embedding)               [64, 64, 64]     [64, 64, 64]     --               --               --
│    └─Dropout (dropout)                           [64, 64, 64]     [64, 64, 64]     --               --               --
├─Sequential (decoder)            

Layer (type (var_name))                            Input Shape      Output Shape     Param #          Kernel Shape     Mult-Adds
NanoGPT (NanoGPT)                                  [64, 64]         [64, 64, 65]     --               --               --
├─InputEmbeddings (txt_embedding)                  [64, 64]         [64, 64, 64]     --               --               --
│    └─embedding.weight                                                              └─4,160          [65, 64]
│    └─Embedding (embedding)                       [64, 64]         [64, 64, 64]     4,160            --               266,240
│    │    └─weight                                                                   └─4,160          [64, 65]
├─PositionalEncoding (pos_embedding)               [64, 64, 64]     [64, 64, 64]     --               --               --
│    └─Dropout (dropout)                           [64, 64, 64]     [64, 64, 64]     --               --               --
├─Sequential (decoder)            

In [12]:
trainer.validate(model,datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\muthu\miniconda3\envs\venv\Lib\site-packages\lightning\pytorch\loggers\tensorboard.py:194: Could not log computational graph to TensorBoard: The `model.example_input_array` attribute is not set or `input_array` was not given.
c:\Users\muthu\miniconda3\envs\venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Validation DataLoader 0: 100%|██████████| 1742/1742 [00:34<00:00, 49.86it/s]


VALIDATE Profiler Report
Profile stats for: records
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         3.36%       6.177ms       100.00%     183.614ms      61.205ms       5.551ms         3.02%     183.739ms      61.246ms             3  
[pl][profile][Strategy]SingleDeviceStrategy.validati...         1.42%       2.612ms        82.16%     150.857ms      50.286ms     781.000us         0.43%  

[{'val/loss_epoch': 0.07081184536218643}]

In [18]:
### Generation
def generate_text(prompt: str, max_new_token: int, dm=dm, imodel=model):
    encoded = dm.train_ds.encode(prompt)
    imodel.to(device)
    with torch.no_grad():
        encoded_text = torch.tensor(encoded,device=device).unsqueeze(0)
        new_word_predict = []
        for _ in range(max_new_token):
            encoded_text = encoded_text[:, -32:]
            logits, _ = imodel(encoded_text)
            logits = logits[:, -1, :]
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_word = torch.multinomial(probs, num_samples=1)
            new_word_predict.append(next_word.item())
            encoded_text = torch.cat((encoded_text[:, imodel.seq_len:], next_word), dim=1)             # Beam Search in Decodiing Strategy
    res = encoded + new_word_predict
    return dm.train_ds.decode(res)

In [20]:
print(generate_text("But with prison,, ",200,dm=dm,imodel=model))

But with prison,, sgupittnnN
:h tRjge'LGnA;EeEaHAEEEEEL
e  
 
EdlaalllW'''d  HLaa-!Epuuuy3RnnaaAaccrru'aaEacAIA;HFhh'

 M'kl DDww-c  HpHuaEheOww pTSoHm,.yyy:ea

'Ih?FF bSsoaUPujlkZooo-u j   e,,V
iZa-AaAEBlffIaaiiYO
vfk
