diff --git a/.github/workflows/config/spelling_allowlist.txt b/.github/workflows/config/spelling_allowlist.txt index 0b411b17bd0..6575f1474b7 100644 --- a/.github/workflows/config/spelling_allowlist.txt +++ b/.github/workflows/config/spelling_allowlist.txt @@ -8,6 +8,7 @@ AlmaLinux Anyon Asynchronous BFGS +BPE Bloch Braket CLA @@ -36,6 +37,8 @@ Fock Fortran Fourier GHCR +GPT +GPTs GPU GPUDirect GPUs @@ -67,6 +70,7 @@ MPI MPICH MPS MSB +Macbooks Mandel Max-Cut MyST @@ -79,11 +83,13 @@ OPX OQC ORCA OpenACC +OpenAI OpenMP OpenMPI OpenQASM OpenSSL OpenSUSE +OpenWebText Ou POSIX PSIRT @@ -131,9 +137,11 @@ UCCSD VQE Vazirani WSL +XA Zener accessor acknowledgement +acknowledgements adaptor adaptors adjoint @@ -172,6 +180,7 @@ comparator comparators composability composable +config constructible controlled coprocessing @@ -188,6 +197,8 @@ cuQuantum cuTensor cudaq dataflow +dataset +datasets ddots deallocate deallocated @@ -233,15 +244,20 @@ extensibility extremal fermionic fermioniq +finetune +finetuning fmt fourier frontend frontends +gpt grovers hadamard hamiltonian heisenberg homogenous +huggingface +hyperparameters iff increment incrementing @@ -267,17 +283,21 @@ lookup lookups lossy lvalue +macbook makefiles merchantability +minGPT mps multithreaded mutex namespace namespaces +nanoGPT natively normalization nullary observables +openwebtext optimizer optimizers pagerank @@ -292,10 +312,13 @@ photonics precompute precomputed prepend +preprocess preprocessor +pretrained probability programmatically pybind +pytorch qaoa qed quantize @@ -308,6 +331,7 @@ qumode qumodes reStructuredText reconfigurable +repro runtime runtimes rvalue @@ -315,6 +339,7 @@ scalability scalable selectable sexualized +shakespeare shifter shifters shors @@ -346,6 +371,7 @@ superpositions symplectic tablegen templated +todos toolchain toolchains toolset diff --git a/.gitmodules b/.gitmodules index 622993890c6..969107a43f8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -48,3 +48,6 @@ [submodule 
"tpls/Stim"] path = tpls/Stim url = https://github.com/quantumlib/Stim +[submodule "docs/sphinx/applications/python/qaoa_gpt_src/FEATHER"] + path = docs/sphinx/applications/python/qaoa_gpt_src/FEATHER + url = https://github.com/benedekrozemberczki/FEATHER.git diff --git a/docs/sphinx/_static/app_title_images/qaoa_gpt_toc.png b/docs/sphinx/_static/app_title_images/qaoa_gpt_toc.png new file mode 100644 index 00000000000..5b6189fa772 Binary files /dev/null and b/docs/sphinx/_static/app_title_images/qaoa_gpt_toc.png differ diff --git a/docs/sphinx/applications/python/images/inference-qaoa-gpt.png b/docs/sphinx/applications/python/images/inference-qaoa-gpt.png new file mode 100644 index 00000000000..3911551e36d Binary files /dev/null and b/docs/sphinx/applications/python/images/inference-qaoa-gpt.png differ diff --git a/docs/sphinx/applications/python/images/qaoa-gpt-diag.png b/docs/sphinx/applications/python/images/qaoa-gpt-diag.png new file mode 100644 index 00000000000..427ce63d8a6 Binary files /dev/null and b/docs/sphinx/applications/python/images/qaoa-gpt-diag.png differ diff --git a/docs/sphinx/applications/python/nanoGPT/.gitattributes b/docs/sphinx/applications/python/nanoGPT/.gitattributes new file mode 100644 index 00000000000..a6096583cf0 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/.gitattributes @@ -0,0 +1,4 @@ +# Override jupyter in Github language stats for more accurate estimate of repo code languages +# reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code +*.ipynb linguist-generated +out-save_dir_3k.zip filter=lfs diff=lfs merge=lfs -text diff --git a/docs/sphinx/applications/python/nanoGPT/.gitignore b/docs/sphinx/applications/python/nanoGPT/.gitignore new file mode 100644 index 00000000000..f30deccd1ab --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/.gitignore @@ -0,0 +1,14 @@ +.DS_Store +.idea +.ipynb_checkpoints/ +.vscode +__pycache__/ +*.bin +*.pkl +*.pt +*.pyc +input.txt +env/ +venv/ 
+*.npy +*.json \ No newline at end of file diff --git a/docs/sphinx/applications/python/nanoGPT/LICENSE b/docs/sphinx/applications/python/nanoGPT/LICENSE new file mode 100644 index 00000000000..329db5e3331 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Andrej Karpathy + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/docs/sphinx/applications/python/nanoGPT/README.md b/docs/sphinx/applications/python/nanoGPT/README.md new file mode 100644 index 00000000000..7138213c54c --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/README.md @@ -0,0 +1,338 @@ +# nanoGPT + +![nanoGPT](assets/nanogpt.jpg) + +The simplest, fastest repository for training/finetuning medium-sized GPTs. +It is a rewrite of [minGPT](https://github.com/karpathy/minGPT) that +prioritizes teeth over education. 
Still under active development, but currently +the file `train.py` reproduces GPT-2 (124M) on OpenWebText, running on a +single 8XA100 40GB node in about 4 days of training. The code itself is plain +and readable: `train.py` is a ~300-line boilerplate training loop and +`model.py` a ~300-line GPT model definition, which can optionally load the +GPT-2 weights from OpenAI. That's it. + +![`repro124m`](assets/gpt2_124M_loss.png) + +Because the code is so simple, it is very easy to hack to your needs, train new +models from scratch, or finetune pretrained checkpoints (e.g. biggest one +currently available as a starting point would be the GPT-2 1.3B model from +OpenAI). + +## install + +```sh +pip install torch numpy transformers datasets tiktoken wandb tqdm +``` + +Dependencies: + +- [`pytorch`](https://pytorch.org) <3 +- [`numpy`](https://numpy.org/install/) <3 +- `transformers` for huggingface transformers <3 (to load GPT-2 checkpoints) +- `datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText) +- `tiktoken` for OpenAI's fast BPE code <3 +- `wandb` for optional logging <3 +- `tqdm` for progress bars <3 + +## quick start + +If you are not a deep learning professional and you just want to feel the magic +and get your feet wet, the fastest way to get started is to train a +character-level GPT on the works of Shakespeare. First, we download it as a +single (1MB) file and turn it from raw text into one large stream of integers: + +```sh +python data/shakespeare_char/prepare.py +``` + +This creates a `train.bin` and `val.bin` in that data directory. Now it is time +to train your GPT. The size of it very much depends on the computational +resources of your system: + +**I have a GPU**. 
Great, we can quickly train a baby GPT with the settings +provided in the +[config/train_shakespeare_char.py](config/train_shakespeare_char.py) config file: + +```sh +python train.py config/train_shakespeare_char.py +``` + +If you peek inside it, you'll see that we're training a GPT with a context size + of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer + with 6 heads in each layer. On one A100 GPU this training run takes about 3 +minutes and the best validation loss is 1.4697. Based on the configuration, the + model checkpoints are being written into the `--out_dir` directory + `out-shakespeare-char`. So once the training finishes we can sample from the + best model by pointing the sampling script at this directory: + +```sh +python sample.py --out_dir=out-shakespeare-char +``` + +This generates a few samples, for example: + +```sh +ANGELO: +And cowards it be strawn to my bed, +And thrust the gates of my threats, +Because he that ale away, and hang'd +An one with him. + +DUKE VINCENTIO: +I thank your eyes against it. + +DUKE VINCENTIO: +Then will answer him to save the malm: +And what have you tyrannous shall do this? + +DUKE VINCENTIO: +If you have done evils of all disposition +To end his power, the day of thrust for a common men +That I leave, to fight with over-liking +Hasting in a roseman. +``` + +`lol` `¯\_(ツ)_/¯`. Not bad for a character-level model after 3 minutes of +training on a GPU. Better results are quite likely obtainable by instead +finetuning a pretrained GPT-2 model on this `dataset` (see finetuning section +later). + +**I only have a macbook** (or other cheap computer). No worries, we can still +train a GPT but we want to dial things down a notch. I recommend getting the +bleeding edge PyTorch nightly +([select it here](https://pytorch.org/get-started/locally/) when installing) as it is currently +quite likely to make your code more efficient.
But even without it, a simple +train run could look as follows: +|:spellcheck-disable:| + +```sh +python train.py config/train_shakespeare_char.py --device=cpu --compile=False --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0 +``` + +|:spellcheck-enable:| +Here, since we are running on CPU instead of GPU we must set both +`--device=cpu` and also turn off PyTorch 2.0 compile with `--compile=False`. +Then when we evaluate we get a bit more noisy but faster estimate +(`--eval_iters=20`, down from 200), our context size is only 64 characters +instead of 256, and the batch size only 12 examples per iteration, not 64. +We'll also use a much smaller Transformer (4 layers, 4 heads, 128 embedding +size), and decrease the number of iterations to 2000 (and correspondingly +usually decay the learning rate to around `max_iters` with `--lr_decay_iters`). + Because our network is so small we also ease down on regularization + (`--dropout=0.0`). This still runs in about ~3 minutes, but gets us a loss of + only 1.88 and therefore also worse samples, but it's still good fun: + +```sh +python sample.py --out_dir=out-shakespeare-char --device=cpu +``` + +Generates samples like this: + +```sh +GLEORKEN VINGHARD III: +Whell's the couse, the came light gacks, +And the for mought you in Aut fries the not high shee +bot thou the sought bechive in that to doth groan you, +No relving thee post mose the wear +``` + +Not bad for ~3 minutes on a CPU, for a hint of the right character gestalt. +If you're willing to wait longer, feel free to tune the hyperparameters, +increase the size of the network, the context length (`--block_size`), the +length of training, etc. 
+ +Finally, on Apple Silicon Macbooks and with a recent PyTorch version make sure +to add `--device=mps` (short for "`Metal Performance Shaders`"); PyTorch then +uses the on-chip GPU that can *significantly* accelerate training (2-3X) and +allow you to use larger networks. +See [Issue 28](https://github.com/karpathy/nanoGPT/issues/28) for more. + +## reproducing GPT-2 + +A more serious deep learning professional may be more interested in reproducing + GPT-2 results. So here we go - we first `tokenize` the `dataset`, in this case + the [OpenWebText](https://openwebtext2.readthedocs.io/en/latest/), an open +reproduction of OpenAI's (private) `WebText`: + +```sh +python data/openwebtext/prepare.py +``` + +This downloads and `tokenizes` the +[OpenWebText](https://huggingface.co/datasets/Skylion007/openwebtext) +`dataset`. It will create a `train.bin` and `val.bin` which holds the GPT2 BPE +token ids in one sequence, stored as raw `uint16` bytes. Then we're ready to +kick off training. To reproduce GPT-2 (124M) you'll want at least an 8X A100 +40GB node and run: + +```sh +torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py +``` + +This will run for about 4 days using PyTorch Distributed Data Parallel (`DDP`) +and go down to loss of ~2.85. Now, a GPT-2 model just evaluated on `OWT` gets +a val loss of about 3.11, but if you finetune it it will come down to ~2.85 +territory (due to an apparent domain gap), making the two models ~match. + +If you're in a cluster environment and you are blessed with multiple GPU nodes +you can make GPU go `brrrr` e.g. across 2 nodes like: + +```sh +# Run on the first (master) node with example IP 123.456.123.456: +torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py +# Run on the worker node: +torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py +``` + +It is a good idea to benchmark your interconnect (e.g. 
`iperf3`). In particular, +if you don't have `Infiniband` then also prepend `NCCL_IB_DISABLE=1` to the +above launches. Your `multinode` training will work, but most likely *crawl*. +By default checkpoints are periodically written to the `--out_dir`. We can +sample from the model by simply `python sample.py`. + +Finally, to train on a single GPU simply run the `python train.py` script. +Have a look at all of its `args`, the script tries to be very readable, +`hackable` and transparent. You'll most likely want to tune a number of those +variables depending on your needs. + +## baselines + +OpenAI GPT-2 checkpoints allow us to get some baselines in place for +openwebtext. We can get the numbers as follows: + +```sh +python train.py config/eval_gpt2.py +python train.py config/eval_gpt2_medium.py +python train.py config/eval_gpt2_large.py +python train.py config/eval_gpt2_xl.py +``` + +and observe the following losses on train and val: + +| model | `params` | train loss | val loss | +| ------| ------ | ---------- | -------- | +| gpt2 | 124M | 3.11 | 3.12 | +| gpt2-medium | 350M | 2.85 | 2.84 | +| gpt2-large | 774M | 2.66 | 2.67 | +| gpt2-`xl` | 1558M | 2.56 | 2.54 | + +However, we have to note that GPT-2 was trained on (closed, never released) +`WebText`, while OpenWebText is just a best-effort open reproduction of this +`dataset`. This means there is a `dataset` domain gap. Indeed, taking the GPT-2 + (124M) checkpoint and finetuning on `OWT` directly for a while reaches loss +down to ~2.85. This then becomes the more appropriate baseline w.r.t. reproduction. + +## finetuning + +Finetuning is no different than training, we just make sure to initialize from +a pretrained model and train with a smaller learning rate. For an example of +how to finetune a GPT on new text go to `data/shakespeare` and run +`prepare.py` to download the tiny shakespeare `dataset` and render it into a +`train.bin` and `val.bin`, using the OpenAI BPE `tokenizer` from GPT-2.
+Unlike OpenWebText this will run in seconds. Finetuning can take very little +time, e.g. on a single GPU just a few minutes. Run an example finetuning like: + +```sh +python train.py config/finetune_shakespeare.py +``` + +This will load the config parameter overrides in +`config/finetune_shakespeare.py` (I didn't tune them much though). Basically, +we initialize from a GPT2 checkpoint with `init_from` and train as normal, +except shorter and with a small learning rate. If you're running out of memory +try decreasing the model size (they are +`{'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}`) or possibly decreasing the +`block_size` (context length). The best checkpoint (lowest validation loss) +will be in the `out_dir` directory, e.g. in `out-shakespeare` by default, per +the config file. You can then run the code in +`sample.py --out_dir=out-shakespeare`: + +```sh +THEODORE: +Thou shalt sell me to the highest bidder: if I die, +I sell thee to the first; if I go mad, +I sell thee to the second; if I +lie, I sell thee to the third; if I slay, +I sell thee to the fourth: so buy or sell, +I tell thee again, thou shalt not sell my +possession. + +JULIET: +And if thou steal, thou shalt not sell thyself. + +THEODORE: +I do not steal; I sell the stolen goods. + +THEODORE: +Thou know'st not what thou sell'st; thou, a woman, +Thou art ever a victim, a thing of no worth: +Thou hast no right, no right, but to be sold. +``` + +Whoa there, GPT, entering some dark place over there. I didn't really tune the +hyperparameters in the config too much, feel free to try! + +## sampling / inference + +Use the script `sample.py` to sample either from pre-trained GPT-2 models +released by OpenAI, or from a model you trained yourself. For example, here is +a way to sample from the largest available `gpt2-xl` model: + +```sh +python sample.py \ + --init_from=gpt2-xl \ + --start="What is the answer to life, the universe, and everything?" 
\ + --num_samples=5 --max_new_tokens=100 +``` + +If you'd like to sample from a model you trained, use the `--out_dir` to point +the code appropriately. You can also prompt the model with some text from a +file, e.g. ```python sample.py --start=FILE:prompt.txt```. + +## efficiency notes + +For simple model `benchmarking` and profiling, `bench.py` might be useful. +It's identical to what happens in the meat of the training loop of `train.py`, +but omits much of the other complexities. + +Note that the code by default uses +[PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). At the time of +writing (Dec 29, 2022) this makes `torch.compile()` available in the nightly +release. The improvement from the one line of code is noticeable, e.g. cutting +down iteration time from ~250ms / `iter` to 135ms / `iter`. Nice work PyTorch +team! + +## todos + +- Investigate and add `FSDP` instead of `DDP` +- `Eval` zero-shot perplexities on standard `evals` (e.g. LAMBADA? HELM? etc.) +- Finetune the finetuning script, I think the `hyperparams` are not great +- Schedule for linear batch size increase during training +- Incorporate other `embeddings` (rotary, alibi) +- Separate out the `optim` buffers from model `params` in checkpoints I think +- Additional logging around network health (e.g. gradient clip events, magnitudes) +- Few more investigations around better `init` etc. + +## troubleshooting + +Note that by default this `repo` uses PyTorch 2.0 (i.e. `torch.compile`). This +is fairly new and experimental, and not yet available on all platforms +(e.g. Windows). If you're running into related error messages try to disable +this by adding `--compile=False` flag. This will slow down the code but at +least it will run. + +For some context on this repository, GPT, and language modeling it might be +helpful to watch my +[Zero To Hero series](https://karpathy.ai/zero-to-hero.html). 
Specifically, the + [GPT video](https://www.youtube.com/watch?v=kCc8FmEb1nY) is popular if you + have some prior language modeling context. + +For more questions/discussions feel free to stop by **#nanoGPT** on Discord: + +[#nanoGPTonDiscord](https://discord.gg/3zy8kqD9Cp) + +## acknowledgements + +All nanoGPT experiments are powered by GPUs on +[Lambda labs](https://lambdalabs.com), my favorite Cloud GPU provider. +Thank you Lambda labs for sponsoring nanoGPT! diff --git a/docs/sphinx/applications/python/nanoGPT/assets/gpt2_124M_loss.png b/docs/sphinx/applications/python/nanoGPT/assets/gpt2_124M_loss.png new file mode 100644 index 00000000000..df5aac3c159 Binary files /dev/null and b/docs/sphinx/applications/python/nanoGPT/assets/gpt2_124M_loss.png differ diff --git a/docs/sphinx/applications/python/nanoGPT/assets/nanogpt.jpg b/docs/sphinx/applications/python/nanoGPT/assets/nanogpt.jpg new file mode 100644 index 00000000000..aa5d47dfc2d Binary files /dev/null and b/docs/sphinx/applications/python/nanoGPT/assets/nanogpt.jpg differ diff --git a/docs/sphinx/applications/python/nanoGPT/bench.py b/docs/sphinx/applications/python/nanoGPT/bench.py new file mode 100644 index 00000000000..a5e2bf95901 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/bench.py @@ -0,0 +1,150 @@ +""" +`A much shorter version of train.py for benchmarking` +""" +import os +from contextlib import nullcontext +import numpy as np +import time +import torch +from model import GPTConfig, GPT + +# ----------------------------------------------------------------------------- +batch_size = 12 +block_size = 1024 +bias = False +real_data = True +seed = 1337 +device = 'cuda' # `examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.` +dtype = 'bfloat16' if torch.cuda.is_available( +) and torch.cuda.is_bf16_supported( +) else 'float16' # `'float32' or 'bfloat16' or 'float16'` +compile = True # use PyTorch 2.0 to compile the model to be faster +profile = False # `use pytorch profiler, or just simple 
def get_batch(split):
    """Sample one random batch of token ids from `train_data` and move it to `device`.

    `split` is accepted but ignored: the benchmarking script always draws from
    the training data.

    Returns a pair `(x, y)` of int64 tensors, each holding `batch_size` rows of
    `block_size` tokens; row k of `y` is row k of `x` shifted one position
    forward in the data (the next-token targets).
    """
    data = train_data  # note: split is ignored in the benchmarking script
    # One random start offset per row, chosen so that the slice for `y`
    # (which extends one token further than `x`) stays inside `data`.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([
        torch.from_numpy((data[i:i + block_size]).astype(np.int64))
        for i in ix
    ])
    y = torch.stack([
        torch.from_numpy((data[i + 1:i + 1 + block_size]).astype(np.int64))
        for i in ix
    ])
    if device_type == 'cuda':
        # Pin the host tensors so the non_blocking copy to the GPU can overlap
        # with compute. pin_memory() raises when no CUDA device is available,
        # so it must not run on the CPU path.
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
context size + n_layer=12, + n_head=12, + n_embd=768, # size of the model + dropout=0, # for determinism + bias=bias, +) +model = GPT(gptconf) +model.to(device) + +optimizer = model.configure_optimizers(weight_decay=1e-2, + learning_rate=1e-4, + betas=(0.9, 0.95), + device_type=device_type) + +if compile: + print("Compiling model...") + model = torch.compile(model) # pytorch 2.0 + +if profile: + # `useful docs on pytorch profiler:` + # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html + # - `api` https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile + wait, warmup, active = 5, 5, 5 + num_steps = wait + warmup + active + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA + ], + schedule=torch.profiler.schedule(wait=wait, + warmup=warmup, + active=active, + repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler( + './bench_log'), + record_shapes=False, + profile_memory=False, + with_stack= + False, # incurs an additional overhead, disable if not needed + with_flops=True, + with_modules=False, # `only for torchscript models atm` + ) as prof: + + X, Y = get_batch('train') + for k in range(num_steps): + with ctx: + logits, loss = model(X, Y) + X, Y = get_batch('train') + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lossf = loss.item() + print(f"{k}/{num_steps} loss: {lossf:.4f}") + + prof.step() # `notify the profiler at end of each step` + +else: + + # simple `benchmarking` + torch.cuda.synchronize() + for stage, num_steps in enumerate([10, 20]): # `burnin`, then benchmark + t0 = time.time() + X, Y = get_batch('train') + for k in range(num_steps): + with ctx: + logits, loss = model(X, Y) + X, Y = get_batch('train') + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lossf = loss.item() + print(f"{k}/{num_steps} loss: {lossf:.4f}") + torch.cuda.synchronize() + t1 = 
time.time() + dt = t1 - t0 + mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt) + if stage == 1: + print( + f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%" + ) diff --git a/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2.py b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2.py new file mode 100644 index 00000000000..c9703bdc692 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2.py @@ -0,0 +1,8 @@ +# evaluate the base gpt2 +# `n_layer=12, n_head=12, n_embd=768` +# 124M parameters +batch_size = 8 +eval_iters = 500 # use more iterations to get good estimate +eval_only = True +wandb_log = False +init_from = 'gpt2' diff --git a/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_large.py b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_large.py new file mode 100644 index 00000000000..01d1e38977d --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_large.py @@ -0,0 +1,8 @@ +# evaluate the base gpt2 +# `n_layer=36, n_head=20, n_embd=1280` +# 774M parameters +batch_size = 8 +eval_iters = 500 # use more iterations to get good estimate +eval_only = True +wandb_log = False +init_from = 'gpt2-large' diff --git a/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_medium.py b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_medium.py new file mode 100644 index 00000000000..f7fe192b5e0 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_medium.py @@ -0,0 +1,8 @@ +# evaluate the base gpt2 +# `n_layer=24, n_head=16, n_embd=1024` +# 350M parameters +batch_size = 8 +eval_iters = 500 # use more iterations to get good estimate +eval_only = True +wandb_log = False +init_from = 'gpt2-medium' diff --git a/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_xl.py b/docs/sphinx/applications/python/nanoGPT/config/eval_gpt2_xl.py new file mode 100644 index 00000000000..9c9ae6c10ff --- /dev/null +++ 
# Config overrides for fine-tuning a pretrained GPT-2 checkpoint on the tiny
# shakespeare dataset. Pass this file to the training script
# (`python train.py config/finetune_shakespeare.py`); the configurator executes
# it as plain Python, so each assignment below replaces the script's default.
import time

out_dir = 'out-shakespeare'  # directory the best checkpoint is written to
eval_interval = 5
eval_iters = 40
wandb_log = False  # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())  # suffix is the current timestamp

dataset = 'shakespeare'
init_from = 'gpt2-xl'  # this is the largest GPT-2 model

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# `the number of examples per iter:`
# `1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter`
# `shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters`
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 20

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False
in a screen session) and wait ~5 days: +# `$ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py` + +wandb_log = True +wandb_project = 'owt' +wandb_run_name = 'gpt2-124M' + +# these make the total batch size be ~0.5M +# `12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520` +batch_size = 12 +block_size = 1024 +gradient_accumulation_steps = 5 * 8 + +# this makes total number of tokens be 300B +max_iters = 600000 +lr_decay_iters = 600000 + +# `eval stuff` +eval_interval = 1000 +eval_iters = 200 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 diff --git a/docs/sphinx/applications/python/nanoGPT/config/train_shakespeare_char.py b/docs/sphinx/applications/python/nanoGPT/config/train_shakespeare_char.py new file mode 100644 index 00000000000..5df81de4539 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/config/train_shakespeare_char.py @@ -0,0 +1,37 @@ +# train a miniature character-level shakespeare model +# good for debugging and playing on `macbooks` and such + +out_dir = 'out-shakespeare-char' +eval_interval = 250 # keep frequent because we'll `overfit` +eval_iters = 200 +log_interval = 10 # don't print too too often + +# we expect to `overfit` on this small dataset, so only save when val improves +always_save_checkpoint = False + +wandb_log = False # override via command line if you like +wandb_project = 'shakespeare-char' +wandb_run_name = 'mini-gpt' + +dataset = 'shakespeare_char' +gradient_accumulation_steps = 1 +batch_size = 64 +block_size = 256 # context of up to 256 previous characters + +# baby GPT model :) +n_layer = 6 +n_head = 6 +n_embd = 384 +dropout = 0.2 + +learning_rate = 1e-3 # with baby networks can afford to go a bit higher +max_iters = 5000 +lr_decay_iters = 5000 # `make equal to max_iters usually` +min_lr = 1e-4 # learning_rate / 10 usually +beta2 = 0.99 # make a bit bigger because number of tokens per `iter` is small + +warmup_iters = 100 # not super necessary potentially + +# on macbook also add 
"""
Poor Man's Configurator. Probably a terrible idea. Example usage:
`$ python train.py config/override_file.py --batch_size=32`
`this will first run config/override_file.py, then override batch_size to 32`

`The code in this file will be run as follows from e.g. train.py:`
`>>> exec(open('configurator.py').read())`

`So it's not a Python module, it's just shuttling this code away from train.py`
The code in this script then overrides the `globals()`

I know people are not going to love this, I just really dislike configuration
complexity and having to prepend config. to every single variable. If someone
comes up with a better simple Python solution I am all ears.
"""

import sys
from ast import literal_eval

# NOTE: this script is exec()'d inside the calling script (see the docstring),
# so globals() below is the caller's namespace and every name bound here ends
# up in it.
for arg in sys.argv[1:]:
    if '=' not in arg:
        # assume it's the name of a config file
        if arg.startswith('--'):
            raise ValueError(
                f"Expected a config file path or --key=value, got: {arg}")
        config_file = arg
        print(f"Overriding config with {config_file}:")
        # Read the file once and close the handle; the same text is echoed
        # and then executed.
        with open(config_file) as f:
            config_source = f.read()
        print(config_source)
        # SECURITY: this runs arbitrary Python from the given path with full
        # access to the caller's globals -- only pass config files you trust.
        exec(config_source)
    else:
        # assume it's a --key=value argument
        if not arg.startswith('--'):
            raise ValueError(
                f"Expected a config file path or --key=value, got: {arg}")
        # Split on the first '=' only, so the value may itself contain '='.
        key, val = arg.split('=', 1)
        key = key[2:]  # drop the leading '--'
        if key not in globals():
            raise ValueError(f"Unknown config key: {key}")
        try:
            # attempt to evaluate it as a Python literal (bool, number, etc.)
            attempt = literal_eval(val)
        except (SyntaxError, ValueError):
            # if that goes wrong, just use the raw string
            attempt = val
        # The override must have exactly the same type as the existing value.
        # Raised explicitly (not asserted) so the check survives `python -O`.
        if type(attempt) is not type(globals()[key]):
            raise TypeError(
                f"Type mismatch for config key {key}: expected "
                f"{type(globals()[key]).__name__}, got {type(attempt).__name__}")
        print(f"Overriding: {key} = {attempt}")
        globals()[key] = attempt
`num_rows`: 4007 + # }) + # }) + + # `we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)` + def process(example): + ids = enc.encode_ordinary( + example['text']) # encode_ordinary ignores any special tokens + ids.append(enc.eot_token + ) # add the end of text token, e.g. 50256 for gpt2 `bpe` + # `note: I think eot should be prepended not appended... hmm. it's called "eot" though...` + out = {'ids': ids, 'len': len(ids)} + return out + + # `tokenize the dataset` + tokenized = split_dataset.map( + process, + remove_columns=['text'], + desc="tokenizing the splits", + num_proc=num_proc, + ) + + # concatenate all the ids in each dataset into one large file we can use for training + for split, dset in tokenized.items(): + arr_len = np.sum(dset['len'], dtype=np.uint64) + filename = os.path.join(os.path.dirname(__file__), f'{split}.bin') + dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) + arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,)) + total_batches = 1024 + + idx = 0 + for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'): + # Batch together samples for faster write + batch = dset.shard(num_shards=total_batches, + index=batch_idx, + contiguous=True).with_format('numpy') + arr_batch = np.concatenate(batch['ids']) + # Write into `mmap` + arr[idx:idx + len(arr_batch)] = arr_batch + idx += len(arr_batch) + arr.flush() + + # train.bin is ~17GB, val.bin ~8.5MB + # train has ~9B tokens (9,035,582,198) + # val has ~4M tokens (4,434,897) + + # to read the bin files later, e.g. 
with `numpy`: + # `m = np.memmap('train.bin', dtype=np.uint16, mode='r')` diff --git a/docs/sphinx/applications/python/nanoGPT/data/openwebtext/readme.md b/docs/sphinx/applications/python/nanoGPT/data/openwebtext/readme.md new file mode 100644 index 00000000000..2a318fc5c51 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/data/openwebtext/readme.md @@ -0,0 +1,14 @@ +# openwebtext dataset + +after running `prepare.py` (preprocess) we get: + +- train.bin is ~17GB, val.bin ~8.5MB +- train has ~9B tokens (9,035,582,198) +- val has ~4M tokens (4,434,897) + +this came from 8,013,769 documents in total. + +references: + +- OpenAI's `WebText` `dataset` is discussed in [GPT-2 paper](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) +- [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) `dataset` diff --git a/docs/sphinx/applications/python/nanoGPT/data/save_dir_3k.zip b/docs/sphinx/applications/python/nanoGPT/data/save_dir_3k.zip new file mode 100644 index 00000000000..a1ff3bcc987 Binary files /dev/null and b/docs/sphinx/applications/python/nanoGPT/data/save_dir_3k.zip differ diff --git a/docs/sphinx/applications/python/nanoGPT/data/shakespeare/prepare.py b/docs/sphinx/applications/python/nanoGPT/data/shakespeare/prepare.py new file mode 100644 index 00000000000..09ddf1ac2e8 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/data/shakespeare/prepare.py @@ -0,0 +1,33 @@ +import os +import requests +import tiktoken +import numpy as np + +# download the tiny shakespeare dataset +input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt') +if not os.path.exists(input_file_path): + data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' + with open(input_file_path, 'w', encoding='utf-8') as f: + f.write(requests.get(data_url).text) + +with open(input_file_path, 'r', encoding='utf-8') as f: + data = f.read() +n = len(data) +train_data = 
data[:int(n * 0.9)] +val_data = data[int(n * 0.9):] + +# `encode with tiktoken gpt2 bpe` +enc = tiktoken.get_encoding("gpt2") +train_ids = enc.encode_ordinary(train_data) +val_ids = enc.encode_ordinary(val_data) +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) + +# train.bin has 301,966 tokens +# val.bin has 36,059 tokens diff --git a/docs/sphinx/applications/python/nanoGPT/data/shakespeare/readme.md b/docs/sphinx/applications/python/nanoGPT/data/shakespeare/readme.md new file mode 100644 index 00000000000..22c79fedc4b --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/data/shakespeare/readme.md @@ -0,0 +1,8 @@ +# tiny shakespeare + +Tiny shakespeare, of the good old `char-rnn` fame :) + +After running `prepare.py`: + +- train.bin has 301,966 tokens +- val.bin has 36,059 tokens diff --git a/docs/sphinx/applications/python/nanoGPT/data/shakespeare_char/prepare.py b/docs/sphinx/applications/python/nanoGPT/data/shakespeare_char/prepare.py new file mode 100644 index 00000000000..257db77281a --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/data/shakespeare_char/prepare.py @@ -0,0 +1,75 @@ +""" +Prepare the Shakespeare dataset for character-level language modeling. +So instead of encoding with GPT-2 BPE tokens, we just map characters to `ints`. +Will save train.bin, val.bin containing the ids, and `meta.pkl` containing the +encoder and decoder and some other related info. 
+""" +import os +import pickle +import requests +import numpy as np + +# download the tiny shakespeare dataset +input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt') +if not os.path.exists(input_file_path): + data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' + with open(input_file_path, 'w') as f: + f.write(requests.get(data_url).text) + +with open(input_file_path, 'r') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = {ch: i for i, ch in enumerate(chars)} +itos = {i: ch for i, ch in enumerate(chars)} + + +def encode(s): + return [stoi[c] for c in s + ] # encoder: take a string, output a list of integers + + +def decode(l): + return ''.join([itos[i] for i in l + ]) # decoder: take a list of integers, output a string + + +# create the train and test splits +n = len(data) +train_data = data[:int(n * 0.9)] +val_data = data[int(n * 0.9):] + +# encode both to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) + +# length of dataset in characters: 1115394 +# all the unique characters: +# 
`!$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz` +# vocab size: 65 +# train has 1003854 tokens +# val has 111540 tokens diff --git a/docs/sphinx/applications/python/nanoGPT/data/shakespeare_char/readme.md b/docs/sphinx/applications/python/nanoGPT/data/shakespeare_char/readme.md new file mode 100644 index 00000000000..8f58a010065 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/data/shakespeare_char/readme.md @@ -0,0 +1,9 @@ + +# tiny shakespeare, character-level + +Tiny shakespeare, of the good old `char-rnn` fame :) Treated on character-level. + +After running `prepare.py`: + +- train.bin has 1,003,854 tokens +- val.bin has 111,540 tokens diff --git a/docs/sphinx/applications/python/nanoGPT/model.py b/docs/sphinx/applications/python/nanoGPT/model.py new file mode 100644 index 00000000000..6f4f27dcf4d --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/model.py @@ -0,0 +1,398 @@ +""" +Full definition of a GPT Language Model, all of it in this single file. +References: +1) the official GPT-2 TensorFlow implementation released by OpenAI: +https://github.com/openai/gpt-2/blob/master/src/model.py +2) huggingface/transformers PyTorch implementation: +https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py +""" + +import math +import inspect +from dataclasses import dataclass + +import torch +import torch.nn as nn +from torch.nn import functional as F + + +class LayerNorm(nn.Module): + """ LayerNorm but with an optional bias. 
PyTorch doesn't support simply bias=False """ + + def __init__(self, ndim, bias): + super().__init__() + self.weight = nn.Parameter(torch.ones(ndim)) + self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None + + def forward(self, input): + return F.layer_norm(input, self.weight.shape, self.weight, self.bias, + 1e-5) + + +class CausalSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + assert config.n_embd % config.n_head == 0 + # key, query, value projections for all heads, but in a batch + self.c_attn = nn.Linear(config.n_embd, + 3 * config.n_embd, + bias=config.bias) + # output projection + self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) + # regularization + self.attn_dropout = nn.Dropout(config.dropout) + self.resid_dropout = nn.Dropout(config.dropout) + self.n_head = config.n_head + self.n_embd = config.n_embd + self.dropout = config.dropout + # flash attention make GPU go `brrrrr` but support is only in PyTorch >= 2.0 + self.flash = hasattr(torch.nn.functional, + 'scaled_dot_product_attention') + if not self.flash: + print( + "WARNING: using slow attention. 
Flash Attention requires PyTorch >= 2.0" + ) + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer( + "bias", + torch.tril(torch.ones(config.block_size, + config.block_size)).view( + 1, 1, config.block_size, + config.block_size)) + + def forward(self, x): + B, T, C = x.size( + ) # batch size, sequence length, embedding dimensionality (`n_embd`) + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + q, k, v = self.c_attn(x).split(self.n_embd, dim=2) + k = k.view(B, T, self.n_head, + C // self.n_head).transpose(1, 2) # `(B, nh, T, hs)` + q = q.view(B, T, self.n_head, + C // self.n_head).transpose(1, 2) # `(B, nh, T, hs)` + v = v.view(B, T, self.n_head, + C // self.n_head).transpose(1, 2) # `(B, nh, T, hs)` + + # `causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)` + if self.flash: + # efficient attention using Flash Attention CUDA kernels + y = torch.nn.functional.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, + dropout_p=self.dropout if self.training else 0, + is_causal=True) + else: + # manual implementation of attention + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.attn_dropout(att) + y = att @ v # `(B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)` + y = y.transpose(1, 2).contiguous().view( + B, T, C) # re-assemble all head outputs side by side + + # output projection + y = self.resid_dropout(self.c_proj(y)) + return y + + +class MLP(nn.Module): + + def __init__(self, config): + super().__init__() + self.c_fc = nn.Linear(config.n_embd, + 4 * config.n_embd, + bias=config.bias) + self.gelu = nn.GELU() + self.c_proj = nn.Linear(4 * config.n_embd, + config.n_embd, + bias=config.bias) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, x): + x = self.c_fc(x) + x = 
self.gelu(x) + x = self.c_proj(x) + x = self.dropout(x) + return x + + +class Block(nn.Module): + + def __init__(self, config): + super().__init__() + self.ln_1 = LayerNorm(config.n_embd, bias=config.bias) + self.attn = CausalSelfAttention(config) + self.ln_2 = LayerNorm(config.n_embd, bias=config.bias) + self.mlp = MLP(config) + + def forward(self, x): + x = x + self.attn(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +@dataclass +class GPTConfig: + block_size: int = 1024 + vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: int = 12 + n_head: int = 12 + n_embd: int = 768 + dropout: float = 0.0 + bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + + +class GPT(nn.Module): + + def __init__(self, config): + super().__init__() + assert config.vocab_size is not None + assert config.block_size is not None + self.config = config + + self.transformer = nn.ModuleDict( + dict( + wte=nn.Embedding(config.vocab_size, config.n_embd), + wpe=nn.Embedding(config.block_size, config.n_embd), + drop=nn.Dropout(config.dropout), + h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]), + ln_f=LayerNorm(config.n_embd, bias=config.bias), + )) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + # with weight tying when using torch.compile() some warnings get generated: + # "`UserWarning`: functional_call was passed multiple values for tied weights. + # This behavior is deprecated and will be an error in future versions" + # not 100% sure what this is, so far seems to be harmless. 
TODO investigate + self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying + + # `init` all weights + self.apply(self._init_weights) + # apply special scaled `init` to the residual projections, per GPT-2 paper + for pn, p in self.named_parameters(): + if pn.endswith('c_proj.weight'): + torch.nn.init.normal_(p, + mean=0.0, + std=0.02 / math.sqrt(2 * config.n_layer)) + + # report number of parameters + print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,)) + + def get_num_params(self, non_embedding=True): + """ + Return the number of parameters in the model. + For non-embedding count (default), the position `embeddings` get subtracted. + The token `embeddings` would too, except due to the parameter sharing these + `params` are actually used as weights in the final layer, so we include them. + """ + n_params = sum(p.numel() for p in self.parameters()) + if non_embedding: + n_params -= self.transformer.wpe.weight.numel() + return n_params + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + + def forward(self, idx, targets=None): + device = idx.device + b, t = idx.size() + assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t) + + # forward the GPT model itself + tok_emb = self.transformer.wte( + idx) # `token embeddings of shape (b, t, n_embd)` + pos_emb = self.transformer.wpe( + pos) # `position embeddings of shape (t, n_embd)` + x = self.transformer.drop(tok_emb + pos_emb) + for block in self.transformer.h: + x = block(x) + x = self.transformer.ln_f(x) + + if targets is not None: + # if we are given some desired targets also 
calculate the loss + logits = self.lm_head(x) + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), + targets.view(-1), + ignore_index=-1) + else: + # inference-time mini-optimization: only forward the `lm_head` on the very last position + logits = self.lm_head( + x[:, [-1], :]) # note: using list [-1] to preserve the time dim + loss = None + + return logits, loss + + def crop_block_size(self, block_size): + # model surgery to decrease the block size if necessary + # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024) + # but want to use a smaller block size for some smaller, simpler model + assert block_size <= self.config.block_size + self.config.block_size = block_size + self.transformer.wpe.weight = nn.Parameter( + self.transformer.wpe.weight[:block_size]) + for block in self.transformer.h: + if hasattr(block.attn, 'bias'): + block.attn.bias = block.attn.bias[:, :, :block_size, : + block_size] + + @classmethod + def from_pretrained(cls, model_type, override_args=None): + assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'} + override_args = override_args or {} # default to empty dict + # only dropout can be overridden see more notes below + assert all(k == 'dropout' for k in override_args) + from transformers import GPT2LMHeadModel + print("loading weights from pretrained gpt: %s" % model_type) + + # `n_layer, n_head and n_embd are determined from model_type` + config_args = { + 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M `params` + 'gpt2-medium': dict(n_layer=24, n_head=16, + n_embd=1024), # 350M `params` + 'gpt2-large': dict(n_layer=36, n_head=20, + n_embd=1280), # 774M `params` + 'gpt2-xl': dict(n_layer=48, n_head=25, + n_embd=1600), # 1558M `params` + }[model_type] + print("forcing vocab_size=50257, block_size=1024, bias=True") + config_args[ + 'vocab_size'] = 50257 # always 50257 for GPT model checkpoints + config_args[ + 'block_size'] = 1024 # always 1024 for GPT model checkpoints + config_args['bias'] 
= True # always True for GPT model checkpoints + # we can override the dropout rate, if desired + if 'dropout' in override_args: + print(f"overriding dropout rate to {override_args['dropout']}") + config_args['dropout'] = override_args['dropout'] + # create a from-scratch initialized minGPT model + config = GPTConfig(**config_args) + model = GPT(config) + sd = model.state_dict() + sd_keys = sd.keys() + sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias') + ] # discard this mask / buffer, not a `param` + + # `init` a huggingface/transformers model + model_hf = GPT2LMHeadModel.from_pretrained(model_type) + sd_hf = model_hf.state_dict() + + # copy while ensuring all of the parameters are aligned and match in names and shapes + sd_keys_hf = sd_hf.keys() + sd_keys_hf = [ + k for k in sd_keys_hf if not k.endswith('.attn.masked_bias') + ] # ignore these, just a buffer + sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias') + ] # same, just the mask (buffer) + transposed = [ + 'attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', + 'mlp.c_proj.weight' + ] + # basically the `openai` checkpoints use a "`Conv1D`" module, but we only want to use a vanilla Linear + # this means that we have to transpose these weights when we import them + assert len(sd_keys_hf) == len( + sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}" + for k in sd_keys_hf: + if any(k.endswith(w) for w in transposed): + # special treatment for the Conv1D weights we need to transpose + assert sd_hf[k].shape[::-1] == sd[k].shape + with torch.no_grad(): + sd[k].copy_(sd_hf[k].t()) + else: + # vanilla copy over the other parameters + assert sd_hf[k].shape == sd[k].shape + with torch.no_grad(): + sd[k].copy_(sd_hf[k]) + + return model + + def configure_optimizers(self, weight_decay, learning_rate, betas, + device_type): + # start with all of the candidate parameters + param_dict = {pn: p for pn, p in self.named_parameters()} + # filter out those that do not require 
grad + param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} + # `create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.` + # `i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.` + decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] + nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] + optim_groups = [{ + 'params': decay_params, + 'weight_decay': weight_decay + }, { + 'params': nodecay_params, + 'weight_decay': 0.0 + }] + num_decay_params = sum(p.numel() for p in decay_params) + num_nodecay_params = sum(p.numel() for p in nodecay_params) + print( + f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters" + ) + print( + f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters" + ) + # Create `AdamW` optimizer and use the fused version if it is available + fused_available = 'fused' in inspect.signature( + torch.optim.AdamW).parameters + use_fused = fused_available and device_type == 'cuda' + extra_args = dict(fused=True) if use_fused else dict() + optimizer = torch.optim.AdamW(optim_groups, + lr=learning_rate, + betas=betas, + **extra_args) + print(f"using fused AdamW: {use_fused}") + + return optimizer + + def estimate_mfu(self, fwdbwd_per_iter, dt): + """ estimate model flops utilization (`MFU`) in units of A100 `bfloat16` peak FLOPS """ + # first estimate the number of flops we do per iteration. 
+ # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311 + N = self.get_num_params() + cfg = self.config + L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd // cfg.n_head, cfg.block_size + flops_per_token = 6 * N + 12 * L * H * Q * T + flops_per_fwdbwd = flops_per_token * T + flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter + # express our flops throughput as ratio of A100 `bfloat16` peak flops + flops_achieved = flops_per_iter * (1.0 / dt) # per second + flops_promised = 312e12 # A100 GPU `bfloat16` peak flops is 312 TFLOPS + mfu = flops_achieved / flops_promised + return mfu + + @torch.no_grad() + def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): + """ + Take a conditioning sequence of indices `idx` (`LongTensor` of shape (b,t)) and complete + the sequence max_new_tokens times, feeding the predictions back into the model each time. + Most likely you'll want to make sure to be in `model.eval()` mode of operation for this. + """ + for _ in range(max_new_tokens): + # if the sequence context is growing too long we must crop it at block_size + idx_cond = idx if idx.size( + 1) <= self.config.block_size else idx[:, + -self.config.block_size:] + # forward the model to get the `logits` for the index in the sequence + logits, _ = self(idx_cond) + # pluck the `logits` at the final step and scale by desired temperature + logits = logits[:, -1, :] / temperature + # optionally crop the `logits` to only the top k options + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + logits[logits < v[:, [-1]]] = -float('Inf') + # apply `softmax` to convert `logits` to (normalized) probabilities + probs = F.softmax(logits, dim=-1) + # sample from the distribution + idx_next = torch.multinomial(probs, num_samples=1) + # append sampled index to the running sequence and continue + idx = torch.cat((idx, idx_next), dim=1) + + return idx diff --git a/docs/sphinx/applications/python/nanoGPT/model_pad.py 
b/docs/sphinx/applications/python/nanoGPT/model_pad.py new file mode 100644 index 00000000000..0a88b061e8b --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/model_pad.py @@ -0,0 +1,398 @@ +# VERSION SUPPORTING PADDING +""" +Full definition of a GPT Language Model, all of it in this single file. +References: +1) the official GPT-2 TensorFlow implementation released by OpenAI: +https://github.com/openai/gpt-2/blob/master/src/model.py +2) huggingface/transformers PyTorch implementation: +https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py +""" + +import math +import inspect +from dataclasses import dataclass + +import torch +import torch.nn as nn +from torch.nn import functional as F + + +class LayerNorm(nn.Module): + """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """ + + def __init__(self, ndim, bias): + super().__init__() + self.weight = nn.Parameter(torch.ones(ndim)) + self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None + + def forward(self, input): + return F.layer_norm(input, self.weight.shape, self.weight, self.bias, + 1e-5) + + +class CausalSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + assert config.n_embd % config.n_head == 0 + # key, query, value projections for all heads, but in a batch + self.c_attn = nn.Linear(config.n_embd, + 3 * config.n_embd, + bias=config.bias) + # output projection + self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) + # regularization + self.attn_dropout = nn.Dropout(config.dropout) + self.resid_dropout = nn.Dropout(config.dropout) + self.n_head = config.n_head + self.n_embd = config.n_embd + self.dropout = config.dropout + # flash attention make GPU go `brrrrr` but support is only in PyTorch >= 2.0 + self.flash = hasattr(torch.nn.functional, + 'scaled_dot_product_attention') + if not self.flash: + raise RuntimeError("Flash attention is required.") + # print("WARNING: using 
slow attention. Flash Attention requires PyTorch >= 2.0") + # # causal mask to ensure that attention is only applied to the left in the input sequence + # `self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))` + # `.view(1, 1, config.block_size, config.block_size))` + + def forward(self, x, padding_mask=None): + B, T, C = x.size( + ) # batch size, sequence length, embedding dimensionality (`n_embd`) + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + q, k, v = self.c_attn(x).split(self.n_embd, dim=2) + k = k.view(B, T, self.n_head, + C // self.n_head).transpose(1, 2) # `(B, nh, T, hs)` + q = q.view(B, T, self.n_head, + C // self.n_head).transpose(1, 2) # `(B, nh, T, hs)` + v = v.view(B, T, self.n_head, + C // self.n_head).transpose(1, 2) # `(B, nh, T, hs)` + + if padding_mask is not None: + # `Ensure padding_mask is (batch_size, 1, 1, seq_len) for broadcasting` + padding_mask = padding_mask[:, None, None, :] + + # `causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)` + # efficient attention using Flash Attention CUDA kernels + y = torch.nn.functional.scaled_dot_product_attention( + q, + k, + v, + attn_mask=padding_mask, + dropout_p=self.dropout if self.training else 0, + is_causal=True) + y = y.transpose(1, 2).contiguous().view( + B, T, C) # re-assemble all head outputs side by side + # output projection + y = self.resid_dropout(self.c_proj(y)) + #print(y) + return y + + +class MLP(nn.Module): + + def __init__(self, config): + super().__init__() + self.c_fc = nn.Linear(config.n_embd, + 4 * config.n_embd, + bias=config.bias) + self.gelu = nn.GELU() + self.c_proj = nn.Linear(4 * config.n_embd, + config.n_embd, + bias=config.bias) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, x): + x = self.c_fc(x) + x = self.gelu(x) + x = self.c_proj(x) + x = self.dropout(x) + return x + + +class Block(nn.Module): + + def __init__(self, config): 
+ super().__init__() + self.ln_1 = LayerNorm(config.n_embd, bias=config.bias) + self.attn = CausalSelfAttention(config) + self.ln_2 = LayerNorm(config.n_embd, bias=config.bias) + self.mlp = MLP(config) + + def forward(self, x, padding_mask=None): + x = x + self.attn(self.ln_1(x), padding_mask=padding_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +@dataclass +class GPTConfig: + block_size: int = 1024 + vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: int = 12 + n_head: int = 12 + n_embd: int = 768 + dropout: float = 0.0 + bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + + +class GPT(nn.Module): + + def __init__(self, config): + print("Initiating nanoGPT model with padding support") + super().__init__() + assert config.vocab_size is not None + assert config.block_size is not None + self.config = config + + self.transformer = nn.ModuleDict( + dict( + wte=nn.Embedding(config.vocab_size, config.n_embd), + wpe=nn.Embedding(config.block_size, config.n_embd), + drop=nn.Dropout(config.dropout), + h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]), + ln_f=LayerNorm(config.n_embd, bias=config.bias), + )) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + # with weight tying when using torch.compile() some warnings get generated: + # "`UserWarning`: functional_call was passed multiple values for tied weights. + # This behavior is deprecated and will be an error in future versions" + # not 100% sure what this is, so far seems to be harmless. 
TODO investigate + self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying + + # `init` all weights + self.apply(self._init_weights) + # apply special scaled `init` to the residual projections, per GPT-2 paper + for pn, p in self.named_parameters(): + if pn.endswith('c_proj.weight'): + torch.nn.init.normal_(p, + mean=0.0, + std=0.02 / math.sqrt(2 * config.n_layer)) + + # report number of parameters + print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,)) + + def get_num_params(self, non_embedding=True): + """ + Return the number of parameters in the model. + For non-embedding count (default), the position `embeddings` get subtracted. + The token `embeddings` would too, except due to the parameter sharing these + `params` are actually used as weights in the final layer, so we include them. + """ + n_params = sum(p.numel() for p in self.parameters()) + if non_embedding: + n_params -= self.transformer.wpe.weight.numel() + return n_params + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + + def forward(self, + idx, + targets=None, + padding_mask=None, + preserve_time_dim=False): + device = idx.device + b, t = idx.size() + assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t) + + # forward the GPT model itself + tok_emb = self.transformer.wte( + idx) # `token embeddings of shape (b, t, n_embd)` + pos_emb = self.transformer.wpe( + pos) # `position embeddings of shape (t, n_embd)` + x = self.transformer.drop(tok_emb + pos_emb) + for block in self.transformer.h: + x = block(x, padding_mask=padding_mask) + x = self.transformer.ln_f(x) 
def crop_block_size(self, block_size):
    """
    Shrink the model's context window in place ("model surgery").

    Useful when a checkpoint was trained with a large block size (e.g. the
    GPT-2 checkpoints use 1024) but a smaller one is wanted. The position
    embedding table is truncated to its first `block_size` rows, and any
    attention module that carries a `bias` mask buffer has that buffer cut
    down to `block_size` x `block_size`.
    """
    assert block_size <= self.config.block_size
    self.config.block_size = block_size

    wpe = self.transformer.wpe
    wpe.weight = nn.Parameter(wpe.weight[:block_size])

    window = slice(None, block_size)
    for layer in self.transformer.h:
        attn = layer.attn
        if not hasattr(attn, 'bias'):
            # no explicit mask buffer on this attention module, nothing to crop
            continue
        attn.bias = attn.bias[:, :, window, window]
def configure_optimizers(self, weight_decay, learning_rate, betas,
                         device_type):
    """
    Build the AdamW optimizer used for training.

    Trainable parameters are split into two groups: tensors with two or more
    dimensions (matmul weights, embedding tables) receive `weight_decay`,
    everything else (biases, layer-norm gains) is left undecayed. The fused
    AdamW implementation is requested when this PyTorch build exposes the
    `fused` argument and `device_type` is 'cuda'.
    """
    decay_params, nodecay_params = [], []
    for _, param in self.named_parameters():
        if not param.requires_grad:
            # frozen tensors are not handed to the optimizer at all
            continue
        bucket = decay_params if param.dim() >= 2 else nodecay_params
        bucket.append(param)

    optim_groups = [
        dict(params=decay_params, weight_decay=weight_decay),
        dict(params=nodecay_params, weight_decay=0.0),
    ]

    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)
    print(
        f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
    )
    print(
        f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
    )

    # only ask for the fused kernel when the installed AdamW knows about it
    adamw_signature = inspect.signature(torch.optim.AdamW)
    use_fused = ('fused' in adamw_signature.parameters) and device_type == 'cuda'
    extra_args = {'fused': True} if use_fused else {}
    optimizer = torch.optim.AdamW(optim_groups,
                                  lr=learning_rate,
                                  betas=betas,
                                  **extra_args)
    print(f"using fused AdamW: {use_fused}")

    return optimizer
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Autoregressively extend `idx` by `max_new_tokens` sampled tokens.

    `idx` is a LongTensor of shape (b, t) holding the conditioning sequence.
    At every step the model is run on the (possibly cropped) running
    sequence, the last-position `logits` are divided by `temperature`,
    optionally restricted to the `top_k` highest-scoring entries, turned into
    probabilities and sampled once per batch row. The sampled token is
    appended and the loop continues. Returns the extended (b, t +
    max_new_tokens) tensor. You most likely want `model.eval()` beforehand.
    """
    for _step in range(max_new_tokens):
        # never feed more than block_size tokens of context to the model
        limit = self.config.block_size
        context = idx[:, -limit:] if idx.size(1) > limit else idx

        logits, _unused_loss = self(context)
        scaled = logits[:, -1, :] / temperature

        if top_k is not None:
            # everything below the k-th best score gets zero probability
            k = min(top_k, scaled.size(-1))
            top_values, _ = torch.topk(scaled, k)
            cutoff = top_values[:, [-1]]
            scaled = scaled.masked_fill(scaled < cutoff, -float('Inf'))

        probs = F.softmax(scaled, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)

    return idx
class CausalSelfAttention(nn.Module):
    """
    Multi-head causal self-attention with an optional key-padding mask.

    The attention itself is delegated to PyTorch's fused
    `scaled_dot_product_attention`, so PyTorch >= 2.0 is required; the
    constructor raises `RuntimeError` otherwise.

    `config` must provide `n_embd`, `n_head` (a divisor of `n_embd`), `bias`
    and `dropout`.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd,
                                3 * config.n_embd,
                                bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # there is no slow fallback path in this variant: the fused kernel is mandatory
        self.flash = hasattr(torch.nn.functional,
                             'scaled_dot_product_attention')
        if not self.flash:
            raise RuntimeError("Flash attention is required.")

    @staticmethod
    def _merge_causal_and_padding(padding_mask, seq_len, dtype):
        """Fold the lower-triangular causal mask into a (B, T) key-padding mask.

        Returns a (B, 1, T, T) mask in the same convention as the input:
        boolean (True = may attend) or additive float (-inf = blocked).
        """
        causal = torch.ones(seq_len,
                            seq_len,
                            dtype=torch.bool,
                            device=padding_mask.device).tril()
        key_mask = padding_mask[:, None, None, :]  # (B, 1, 1, T)
        if key_mask.dtype == torch.bool:
            return key_mask & causal
        additive_causal = torch.zeros(seq_len,
                                      seq_len,
                                      dtype=dtype,
                                      device=padding_mask.device).masked_fill(
                                          ~causal, float('-inf'))
        return key_mask.to(dtype) + additive_causal

    def forward(self, x, padding_mask=None):
        """
        Apply causal self-attention to `x` of shape (B, T, C).

        `padding_mask`, when given, is a (B, T) mask over key positions that
        follows the `attn_mask` convention of `scaled_dot_product_attention`:
        a boolean mask marks attendable positions with True, a float mask is
        added to the attention scores. Returns a tensor of shape (B, T, C).
        """
        B, T, C = x.size(
        )  # batch size, sequence length, embedding dimensionality (`n_embd`)
        head_dim = C // self.n_head

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)  # (B, nh, T, hs)

        if padding_mask is None:
            # fast path: let the kernel apply the causal triangle itself
            attn_mask, is_causal = None, True
        else:
            # scaled_dot_product_attention documents that an explicit attn_mask
            # together with is_causal=True is an error, so the causal triangle
            # is merged into the explicit mask instead.
            attn_mask = self._merge_causal_and_padding(padding_mask, T, q.dtype)
            is_causal = False

        y = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attn_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=is_causal)
        y = y.transpose(1, 2).contiguous().view(
            B, T, C)  # re-assemble all head outputs side by side
        # output projection
        return self.resid_dropout(self.c_proj(y))
@dataclass
class GPTConfig:
    """Hyperparameters of the graph-conditioned GPT model defined in this file."""

    # maximum sequence length; also the number of rows in the position embedding table
    block_size: int = 1024
    # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    vocab_size: int = 50304
    # number of transformer blocks
    n_layer: int = 12
    # attention heads per block; must divide n_embd
    n_head: int = 12
    # width of the token / position embeddings and of the residual stream
    n_embd: int = 768
    # dropout probability used throughout the model
    dropout: float = 0.0
    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    bias: bool = True
    # input width of the graph embedding projection; default for FEATHER graph
    graph_emb_dim: int = 500
def forward(self,
            idx,
            graph_emb,
            targets=None,
            padding_mask=None,
            preserve_time_dim=False):
    """
    Run the graph-conditioned GPT on a batch of token ids.

    `idx` is a (b, t) tensor of token ids with t <= block_size. `graph_emb`
    is projected to `n_embd` by `graph_emb_proj` and, after `unsqueeze(1)`,
    added to every position of the token + position embeddings (so it is
    expected to be (b, graph_emb_dim)). `padding_mask` is handed unchanged
    to every transformer block.

    Returns `(logits, loss)`:
    * with `targets`: full-sequence `logits` and the cross-entropy loss,
      where target id 0 is ignored (presumably the padding id);
    * without `targets`: `loss` is None and `logits` cover only the last
      position unless `preserve_time_dim` is True.
    """
    _, t = idx.size()
    assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

    layers = self.transformer
    positions = torch.arange(0, t, dtype=torch.long, device=idx.device)  # shape (t)

    tok_emb = layers.wte(idx)  # (b, t, n_embd)
    pos_emb = layers.wpe(positions)  # (t, n_embd)
    graph_bias = self.graph_emb_proj(graph_emb).unsqueeze(1)  # broadcast over time

    hidden = layers.drop(tok_emb + pos_emb + graph_bias)
    for layer in layers.h:
        hidden = layer(hidden, padding_mask=padding_mask)
    hidden = layers.ln_f(hidden)

    if targets is None:
        if not preserve_time_dim:
            # inference-time mini-optimization: only project the very last
            # position; the list index [-1] keeps the time dimension
            hidden = hidden[:, [-1], :]
        return self.lm_head(hidden), None

    # training / evaluation: full logits plus the loss
    logits = self.lm_head(hidden)
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                           targets.view(-1),
                           ignore_index=0)
    return logits, loss
@classmethod
def from_pretrained(cls, model_type, override_args=None):
    """
    Build a model initialized from a Hugging Face GPT-2 checkpoint.

    `model_type` must be one of 'gpt2', 'gpt2-medium', 'gpt2-large' or
    'gpt2-xl'. `override_args` may only contain the key 'dropout'.
    vocab_size, block_size and bias are forced to the checkpoint's values.

    The graph embedding projection (`graph_emb_proj`) has no counterpart in
    a GPT-2 checkpoint, so it keeps its fresh random initialization; every
    other tensor is copied from the checkpoint.

    Requires the `transformers` package (imported lazily).
    """
    assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
    override_args = override_args or {}  # default to empty dict
    # only dropout can be overridden
    assert all(k == 'dropout' for k in override_args)
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    # n_layer, n_head and n_embd are determined from model_type
    config_args = {
        'gpt2': dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
        'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
        'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
        'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
    }[model_type]
    print("forcing vocab_size=50257, block_size=1024, bias=True")
    config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
    config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints
    config_args['bias'] = True  # always True for GPT model checkpoints
    # we can override the dropout rate, if desired
    if 'dropout' in override_args:
        print(f"overriding dropout rate to {override_args['dropout']}")
        config_args['dropout'] = override_args['dropout']

    # create a from-scratch initialized model
    config = GPTConfig(**config_args)
    model = GPT(config)
    sd = model.state_dict()
    # Keys that must be matched by the checkpoint. Two kinds are excluded:
    #  * '.attn.bias' is a mask buffer, not a parameter;
    #  * 'graph_emb_proj.*' exists only in this model, so counting it would
    #    make the key-count check below fail for every checkpoint.
    sd_keys = [
        k for k in sd
        if not k.endswith('.attn.bias') and not k.startswith('graph_emb_proj.')
    ]

    # init a huggingface/transformers model
    model_hf = GPT2LMHeadModel.from_pretrained(model_type)
    sd_hf = model_hf.state_dict()

    # drop the checkpoint's mask buffers; they are not parameters
    sd_keys_hf = [
        k for k in sd_hf
        if not k.endswith(('.attn.masked_bias', '.attn.bias'))
    ]
    # the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear,
    # so these weights have to be transposed when we import them
    transposed = ('attn.c_attn.weight', 'attn.c_proj.weight',
                  'mlp.c_fc.weight', 'mlp.c_proj.weight')
    assert len(sd_keys_hf) == len(
        sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"

    # copy while ensuring all of the parameters are aligned and match in names and shapes
    with torch.no_grad():
        for k in sd_keys_hf:
            if k.endswith(transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                sd[k].copy_(sd_hf[k])

    return model
def estimate_mfu(self, fwdbwd_per_iter, dt):
    """
    Estimate model flops utilization (MFU) relative to A100 bfloat16 peak.

    `fwdbwd_per_iter` is the number of forward/backward passes per training
    iteration and `dt` the wall-clock seconds one iteration took. The flop
    count follows the PaLM paper, Appendix B
    (https://arxiv.org/abs/2204.02311). Returns achieved flops per second
    divided by 312 TFLOPS.
    """
    a100_bf16_peak_flops = 312e12  # A100 GPU bfloat16 peak flops is 312 TFLOPS

    n_params = self.get_num_params()
    config = self.config
    head_dim = config.n_embd // config.n_head
    seq_len = config.block_size

    # flops per token: dense part (6N) plus the attention part
    attention_flops = 12 * config.n_layer * config.n_head * head_dim * seq_len
    per_token = 6 * n_params + attention_flops
    # one fwd/bwd pass processes seq_len tokens; an iteration runs several passes
    per_iteration = per_token * seq_len * fwdbwd_per_iter
    per_second = per_iteration * (1.0 / dt)
    return per_second / a100_bf16_peak_flops
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT

# -----------------------------------------------------------------------------
# Defaults; every name below can be overridden by configurator.py
init_from = 'resume'  # either 'resume' (from an `out_dir`) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out'  # ignored if `init_from` is not 'resume'
start = "\n"  # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10  # number of samples to draw
max_new_tokens = 500  # number of tokens generated in each sample
temperature = 0.8  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200  # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda'  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_available(
) and torch.cuda.is_bf16_supported(
) else 'float16'  # 'float32' or 'bfloat16' or 'float16'
compile = False  # use PyTorch 2.0 to compile the model to be faster
# overrides from command line or config file; the file is closed as soon as
# it has been read instead of being left to the garbage collector
with open('configurator.py') as _config_file:
    exec(_config_file.read())
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu'  # for later use in torch.autocast
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16
}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(
    device_type=device_type, dtype=ptdtype)

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    # NOTE(security): torch.load unpickles the file; only load checkpoints you trust
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # checkpoints saved from a torch.compile()d model carry this key prefix
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a given GPT-2 model
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))
else:
    # fail here with a clear message instead of a NameError on `model` below
    raise ValueError(
        f"unsupported init_from {init_from!r}: expected 'resume' or a gpt2 variant"
    )

model.eval()
model.to(device)
if compile:
    model = torch.compile(model)  # requires PyTorch 2.0 (optional)

# look for the meta pickle in case it is available in the dataset folder
load_meta = False
if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint[
        'config']:  # older checkpoints might not have these...
    meta_path = os.path.join('data', checkpoint['config']['dataset'],
                             'meta.pkl')
    load_meta = os.path.exists(meta_path)
if load_meta:
    print(f"Loading meta from {meta_path}...")
    # NOTE(security): pickle.load executes arbitrary code from the file; only use trusted datasets
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    # TODO want to make this more general to arbitrary encoder/decoder schemes
    stoi, itos = meta['stoi'], meta['itos']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
else:
    # ok let's assume gpt-2 encodings by default
    print("No meta.pkl found, assuming GPT-2 encodings...")
    enc = tiktoken.get_encoding("gpt2")
    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
    decode = lambda l: enc.decode(l)

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start_ids = encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad(), ctx:
    for _ in range(num_samples):
        y = model.generate(x,
                           max_new_tokens,
                           temperature=temperature,
                           top_k=top_k)
        print(decode(y[0].tolist()))
        print('---------------')
00000000000..946ea831dc7 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/scaling_laws.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Reproducing some scaling laws results from [Chinchilla](https://arxiv.org/pdf/2203.15556.pdf). Can't get the numbers to match exactly, but can still be used as a rough guide to help determine compute-optimal models. Also contains related utilities for calculating flops and param counts." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "%matplotlib inline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## params\n", + "\n", + "First some parameter calculations:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "123.653376" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def gpt_params(seq_len, vocab_size, d_model, num_heads, num_layers):\n", + " \"\"\" Given GPT config calculate total number of parameters \"\"\"\n", + " ffw_size = 4*d_model # in GPT the number of intermediate features is always 4*d_model\n", + " # token and position embeddings\n", + " embeddings = d_model * vocab_size + d_model * seq_len\n", + " # transformer blocks\n", + " attention = 3*d_model**2 + 3*d_model # weights and biases\n", + " attproj = d_model**2 + d_model\n", + " ffw = d_model*(ffw_size) + ffw_size\n", + " ffwproj = ffw_size*d_model + d_model\n", + " layernorms = 2*2*d_model\n", + " # dense\n", + " ln_f = 2*d_model\n", + " dense = d_model*vocab_size # note: no bias here\n", + " # note: embeddings are not included in the param count!\n", + " total_params = num_layers*(attention + attproj + ffw + ffwproj + layernorms) + ln_f + 
dense\n", + " return total_params\n", + "\n", + "gpt2 = dict(seq_len = 1024, vocab_size = 50257, d_model = 768, num_heads = 12, num_layers = 12)\n", + "gpt_params(**gpt2)/1e6" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "OpenAI reports gpt2 (small) as having 124M params, so this is a match. Also, loading the OpenAI weights into nanoGPT and then calling `model.parameters()` exactly matches the above number and verifies the implementation. Now Chinchilla parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def chinchilla_params(seq_len, vocab_size, d_model, num_heads, num_layers, ffw_size):\n", + " \"\"\" Parameters in the Chinchilla models. Unlike GPT they use relative positional embeddings. \"\"\"\n", + " # token embeddings only\n", + " embeddings = d_model * vocab_size\n", + " # transformer blocks\n", + " attention = 3*d_model**2 + 3*d_model # weights and biases\n", + " relative_pos = d_model**2 + 2*d_model # relative keys, content bias, relative bias\n", + " attproj = d_model**2 + d_model\n", + " ffw = d_model*ffw_size + ffw_size\n", + " ffwproj = ffw_size*d_model + d_model\n", + " layernorms = 2*2*d_model\n", + " # dense\n", + " ln_f = 2*d_model\n", + " dense = d_model*vocab_size # note: no bias here\n", + " # note: embeddings are not included in the param count!\n", + " total_params = num_layers*(attention + relative_pos + attproj + ffw + ffwproj + layernorms) + ln_f + dense\n", + " return total_params\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[44000000.0, 512, 2048, 64, 8, 8]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load in all the 50 Chinchilla models on the last page of the paper\n", + "import json\n", + "chinchilla_models_txt = '[[44000000.0, 512, 2048, 64, 8, 8], 
[57000000.0, 576, 2304, 64, 9, 9], [74000000.0, 640, 2560, 64, 10, 10], [90000000.0, 640, 2560, 64, 10, 13], [106000000.0, 640, 2560, 64, 10, 16], [117000000.0, 768, 3072, 64, 12, 12], [140000000.0, 768, 3072, 64, 12, 15], [163000000.0, 768, 3072, 64, 12, 18], [175000000.0, 896, 3584, 64, 14, 14], [196000000.0, 896, 3584, 64, 14, 16], [217000000.0, 896, 3584, 64, 14, 18], [251000000.0, 1024, 4096, 64, 16, 16], [278000000.0, 1024, 4096, 64, 16, 18], [306000000.0, 1024, 4096, 64, 16, 20], [425000000.0, 1280, 5120, 128, 10, 18], [489000000.0, 1280, 5120, 128, 10, 21], [509000000.0, 1408, 5632, 128, 11, 18], [552000000.0, 1280, 5120, 128, 10, 24], [587000000.0, 1408, 5632, 128, 11, 21], [632000000.0, 1536, 6144, 128, 12, 19], [664000000.0, 1408, 5632, 128, 11, 24], [724000000.0, 1536, 6144, 128, 12, 22], [816000000.0, 1536, 6144, 128, 12, 25], [893000000.0, 1792, 7168, 128, 14, 20], [1018000000.0, 1792, 7168, 128, 14, 23], [1143000000.0, 1792, 7168, 128, 14, 26], [1266000000.0, 2048, 8192, 128, 16, 22], [1424000000.0, 2176, 8704, 128, 17, 22], [1429000000.0, 2048, 8192, 128, 16, 25], [1593000000.0, 2048, 8192, 128, 16, 28], [1609000000.0, 2176, 8704, 128, 17, 25], [1731000000.0, 2304, 9216, 128, 18, 24], [1794000000.0, 2176, 8704, 128, 17, 28], [2007000000.0, 2304, 9216, 128, 18, 28], [2283000000.0, 2304, 9216, 128, 18, 32], [2298000000.0, 2560, 10240, 128, 20, 26], [2639000000.0, 2560, 10240, 128, 20, 30], [2980000000.0, 2560, 10240, 128, 20, 34], [3530000000.0, 2688, 10752, 128, 22, 36], [3802000000.0, 2816, 11264, 128, 22, 36], [4084000000.0, 2944, 11776, 128, 22, 36], [4516000000.0, 3072, 12288, 128, 24, 36], [6796000000.0, 3584, 14336, 128, 28, 40], [9293000000.0, 4096, 16384, 128, 32, 42], [11452000000.0, 4352, 17408, 128, 32, 47], [12295000000.0, 4608, 18432, 128, 36, 44], [12569000000.0, 4608, 18432, 128, 32, 47], [13735000000.0, 4864, 19456, 128, 32, 47], [14940000000.0, 4992, 19968, 128, 32, 49], [16183000000.0, 5120, 20480, 128, 40, 47]]'\n", + 
def chinchilla_flops(seq_len, vocab_size, d_model, num_heads, num_layers, ffw_size,
                     *, include_embeddings_and_logits=False):
    """
    Calculate total (forward + backward) number of FLOPs, see Chinchilla
    paper Appendix F as reference: https://arxiv.org/pdf/2203.15556.pdf

    Every term below is proportional to `seq_len`, i.e. the result is the cost
    of processing one sequence of `seq_len` tokens.

    Args:
        seq_len: number of tokens in the sequence.
        vocab_size: vocabulary size (only matters when embeddings/logits are counted).
        d_model: model width.
        num_heads: number of attention heads; key size is `d_model // num_heads`.
        num_layers: number of transformer blocks.
        ffw_size: width of the feed-forward hidden layer.
        include_embeddings_and_logits: when True, add the embedding and logit
            FLOPs to the forward pass ("what you'd expect" from the paper text).
            The default (False) leaves them out: per author correspondence there
            is a typo in the paper and they are not counted to repro table 4.

    Returns:
        Total FLOPs, with the backward pass taken as twice the forward pass
        (as in Kaplan et al. 2020).
    """
    key_size = d_model // num_heads
    attn_width = key_size * num_heads  # equals d_model only when num_heads divides d_model

    # embeddings
    embeddings = 2 * seq_len * vocab_size * d_model

    # attention
    # key, query, value projections
    attention = 2 * 3 * seq_len * d_model * attn_width
    # key @ query logits
    attlogits = 2 * seq_len * seq_len * attn_width
    # softmax
    attsoftmax = 3 * num_heads * seq_len * seq_len  # 3* is for subtract (max), exp, divide (?)
    # softmax @ value reductions
    attvalue = 2 * seq_len * seq_len * attn_width
    # final linear
    attlinear = 2 * seq_len * attn_width * d_model
    att = attention + attlogits + attsoftmax + attvalue + attlinear
    # feed forward
    dense = 2 * seq_len * (d_model * ffw_size + d_model * ffw_size)

    # logits
    logits = 2 * seq_len * d_model * vocab_size

    forward_flops = num_layers * (att + dense)
    if include_embeddings_and_logits:
        forward_flops = embeddings + forward_flops + logits
    backward_flops = 2 * forward_flops  # as in Kaplan et al. 2020
    return forward_flops + backward_flops
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seq_lenvocab_sized_modelnum_headsnum_layersffw_sizeNFapprox_flopschinch_flopsratio
020483200064010102560738252809298771968009071650406409.298772e+111.025036
1204832000102416204096305707008413524819968037565277143044.135248e+121.100817
2204832000128010245120552604160735345377280067903999180807.353454e+121.082919
3204832000179214267168114345369614670316437504140507590164481.467032e+131.044094
4204832000204816288192159312691220220437594112195763434946562.022044e+131.032902
52048320003584284014336679627468883021046743040835126233661448.302105e+130.994114
\n", + "
" + ], + "text/plain": [ + " seq_len vocab_size d_model num_heads num_layers ffw_size N \\\n", + "0 2048 32000 640 10 10 2560 73825280 \n", + "1 2048 32000 1024 16 20 4096 305707008 \n", + "2 2048 32000 1280 10 24 5120 552604160 \n", + "3 2048 32000 1792 14 26 7168 1143453696 \n", + "4 2048 32000 2048 16 28 8192 1593126912 \n", + "5 2048 32000 3584 28 40 14336 6796274688 \n", + "\n", + " F approx_flops chinch_flops ratio \n", + "0 929877196800 907165040640 9.298772e+11 1.025036 \n", + "1 4135248199680 3756527714304 4.135248e+12 1.100817 \n", + "2 7353453772800 6790399918080 7.353454e+12 1.082919 \n", + "3 14670316437504 14050759016448 1.467032e+13 1.044094 \n", + "4 20220437594112 19576343494656 2.022044e+13 1.032902 \n", + "5 83021046743040 83512623366144 8.302105e+13 0.994114 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now try reproduce Table A4 from Chinchilla paper Appendix, \n", + "# comparing accurate flops above to approximate flops F = 6*N*D\n", + "# note Chinchilla mentions using vocab_size = 32K\n", + "\n", + "chilchilla_models_table4 = [\n", + " [10, 640, 2560, 10, 64],\n", + " [20, 1024, 4096, 16, 64],\n", + " [24, 1280, 5120, 10, 128 ],\n", + " [26, 1792, 7168, 14, 128 ],\n", + " [28, 2048, 8192, 16, 128],\n", + " [40, 3584, 14336, 28, 128]\n", + "]\n", + "\n", + "rows = []\n", + "for num_layers, d_model, ffw_size, num_heads, _ in chilchilla_models_table4:\n", + "\n", + " args = dict(seq_len = 2048, vocab_size = 32000, d_model = d_model, \n", + " num_heads = num_heads, num_layers = num_layers, ffw_size=ffw_size)\n", + "\n", + " D = args['seq_len'] # dataset size (cancels anyway, for the purposes of the ratio calculation below)\n", + " N = chinchilla_params(**args)\n", + " F = chinchilla_flops(**args)\n", + "\n", + " approx_flops = 6*D*N # approximate flops\n", + " chinch_flops = F * (float(D) / args['seq_len']) # exact flops according to Chinchilla paper calculations\n", + "\n", + " 
# print('---')\n", + " # print(f\"params: {N/1e6:.2f}M\")\n", + " # print(f\"approx flops: {approx_flops/1e9:.2f}B\")\n", + " # print(f\"chinchilla flops: {chinch_flops/1e9:.2f}B\")\n", + " # print(f\"ratio (chinchilla / approx): {chinch_flops / approx_flops:.2f}\")\n", + "\n", + " # first copy all keyvalues from args into out\n", + " out = {k:v for k,v in args.items()}\n", + " # then add the calculated values\n", + " out['N'] = N\n", + " out['F'] = F\n", + " out['approx_flops'] = approx_flops\n", + " out['chinch_flops'] = chinch_flops\n", + " out['ratio'] = chinch_flops / approx_flops\n", + " rows.append(out)\n", + "\n", + "# make a pandas dataframe from rows\n", + "df = pd.DataFrame(rows)\n", + "df" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pretty good match! Except the param counts are still not perfectly accurate." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling Laws: Approach 3\n", + "\n", + "In their \"Aproach 3\", Chinchilla paper fits a function L(N,D) to approximate the final loss gives the model size and the data size. Here is the final fit:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
def L(N, D, *, E=1.69, A=406.4, B=410.7, alpha=0.34, beta=0.28):
    """
    Approximates loss given N parameters and D dataset size (in tokens),
    per Chinchilla paper: L(N, D) = A / N**alpha + B / D**beta + E.

    N and D may be scalars or numpy arrays (the expression is evaluated
    elementwise). The keyword-only arguments are the coefficients of the fit;
    their defaults are the values this notebook takes from the Chinchilla
    paper, so calling L(N, D) gives the same result as before.

    Args:
        N: model size in parameters.
        D: dataset size in tokens.
        E: entropy of natural language, limit of infinite model on infinite data.
        A, alpha: scale and exponent of the model-size term.
        B, beta: scale and exponent of the dataset-size term.

    Returns:
        The estimated loss, same shape as the broadcast of N and D.
    """
    return A / (N ** alpha) + B / (D ** beta) + E
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "best model size: 316.23M\n", + "best dataset size: 11.65B\n" + ] + }, + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'loss')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "c = 2.21e19 # target compute budget (usually know this because we know how many GPU for how long go brrr)\n", + "# (I got this flop number from row 1 of Table A3)\n", + "# sweep model sizes from 10M to 100B\n", + "ns = 10 ** np.arange(7, 11, step=2**-4)\n", + "# using C = 6*N*D, solve for D that maintains the compute budget c\n", + "ds = c / (6 * ns)\n", + "# evaluate the loss in each case\n", + "losses = L(ns, ds)\n", + "# find the argmin\n", + "best = np.argmin(losses)\n", + "print(f\"best model size: {ns[best]/1e6:.2f}M\")\n", + "print(f\"best dataset size: {ds[best]/1e9:.2f}B\")\n", + "# plot the loss\n", + "plt.figure(figsize=(3,3))\n", + "plt.plot(ns, losses)\n", + "plt.xscale('log')\n", + "# plot a vertical bar at the best model size\n", + "plt.axvline(ns[best], color='red')\n", + "plt.xlabel('model size')\n", + "plt.ylabel('loss')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the plot above, basically the models on the left of best are too small and trained for too long. The models on the right of best are way too large and trained for too little. The model at the red line is just right.\n", + "\n", + "Now, the Chinchilla paper says that best model size for this flop budget is 400M params and 9.2B tokens (instead of 316M params and 11.65B tokens) so there is some unresolved disagreement here too..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2304" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Calculate the Chinchilla optimal models for a range of compute budgets\n", + "\n", + "# sweep over compute budgets from 1e17 to 1e26\n", + "cs = 10 ** np.arange(17, 26, step=2**-8)\n", + "models = []\n", + "for c in cs:\n", + " # sweep over model sizes\n", + " ns = 10 ** np.arange(7, 14, step=2**-8)\n", + " # the dataset sizes that would maintain the given compute budget\n", + " ds = c / (6 * ns)\n", + " # losses at each point\n", + " losses = L(ns, ds)\n", + " # n,d for the best model\n", + " best = np.argmin(losses)\n", + " models.append((c, ns[best], ds[best])) # c, n, d tuple log\n", + "\n", + "len(models)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest model found:\n", + "model size: 399.54M\n", + "dataset size: 14.43B\n", + "flops: 3.459892e+19\n", + "loss: 2.76\n" + ] + } + ], + "source": [ + "query_model_size = 400e6\n", + "ns = np.array([n for c, n, d in models])\n", + "ds = np.array([d for c, n, d in models])\n", + "# find the index of the closest model size in ns\n", + "ix = np.argmin(np.abs(ns - query_model_size))\n", + "# retrieve the corresponding params, flops, and data size\n", + "print(\"closest model found:\")\n", + "print(f\"model size: {ns[ix]/1e6:.2f}M\")\n", + "print(f\"dataset size: {ds[ix]/1e9:.2f}B\")\n", + "print(f\"flops: {6*ns[ix]*ds[ix]:e}\")\n", + "print(f\"loss: {L(ns[ix], ds[ix]):.2f}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This should have come out as 9.2B according to Table A3 in Chinchilla paper, per my understanding of it." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling Laws: Approach 2\n", + "\n", + "Approach 2 is probably my favorite one because it fixes a flop budget and runs a number of model/dataset sizes, measures the loss, fits a parabolla, and gets the minimum. So it's a fairly direct measurement of what we're after. The best way to then calculate the compute-optimal number of tokens for any given model size, as an example, is via simple interpolation." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Approach 1 numbers\n", + "# # parameters, tokens\n", + "# raw = [\n", + "# [400e6, 8e9],\n", + "# [1e9, 20.2e9],\n", + "# [10e9, 205.1e9],\n", + "# [67e9, 1.5e12],\n", + "# [175e9, 3.7e12],\n", + "# [280e9, 5.9e12],\n", + "# [520e9, 11e12],\n", + "# [1e12, 21.2e12],\n", + "# [10e12, 216.2e12],\n", + "# ]\n", + "\n", + "# Approach 2 numbers\n", + "# parameters, tokens\n", + "raw = [\n", + " [400e6, 7.7e9],\n", + " [1e9, 20.0e9],\n", + " [10e9, 219.5e9],\n", + " [67e9, 1.7e12],\n", + " [175e9, 4.3e12],\n", + " [280e9, 7.1e12],\n", + " [520e9, 13.4e12],\n", + " [1e12, 26.5e12],\n", + " [10e12, 292.0e12],\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y = 1.0409573169995892x + 0.9353887152390791\n" + ] + } + ], + "source": [ + "# fit a line by linear regression to the raw data\n", + "import numpy as np\n", + "x = np.array([np.log10(x[0]) for x in raw])\n", + "y = np.array([np.log10(x[1]) for x in raw])\n", + "A = np.vstack([x, np.ones(len(x))]).T\n", + "m, c = np.linalg.lstsq(A, y, rcond=None)[0]\n", + "print(f\"y = {m}x + {c}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
# The regression above maps log10(parameters) -> log10(tokens), so the query
# is a model size and the prediction is a token count.
xquery = 124e6 # query model size here (e.g. GPT-2 small is 124M)
yquery = 10**(m*np.log10(xquery) + c)
# label the two numbers the right way round: xquery is parameters, yquery is tokens
print(f"predicted compute-optimal tokens for {xquery:e} parameters: {yquery:e}")
train.py --batch_size=32 --compile=False` + +`To run with DDP on 4 gpus on 1 node, example:` +`$ torchrun --standalone --nproc_per_node=4 train.py` + +`To run with DDP on 4 gpus across 2 nodes, example:` +`- Run on the first (master) node with example IP 123.456.123.456:` +`$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py` +`- Run on the worker node:` +`$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py` +`(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)` +""" + +import os +import time +import math +import pickle +from contextlib import nullcontext + +import numpy as np +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.distributed import init_process_group, destroy_process_group + +from model import GPTConfig, GPT + +# ----------------------------------------------------------------------------- +# default config values designed to train a gpt2 (124M) on OpenWebText +# I/O +out_dir = 'out' +eval_interval = 2000 +log_interval = 1 +eval_iters = 200 +eval_only = False # if True, script exits right after the first `eval` +always_save_checkpoint = True # if True, always save a checkpoint after each `eval` +init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' +# `wandb` logging +wandb_log = False # disabled by default +wandb_project = 'owt' +wandb_run_name = 'gpt2' # `'run' + str(time.time())` +# data +dataset = 'openwebtext' +gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes +batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size +block_size = 1024 +# model +n_layer = 12 +n_head = 12 +n_embd = 768 +dropout = 0.0 # for `pretraining` 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+# `adamw` optimizer +learning_rate = 6e-4 # max learning rate +max_iters = 600000 # total number of training iterations +weight_decay = 1e-1 +beta1 = 0.9 +beta2 = 0.95 +grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 +# learning rate decay settings +decay_lr = True # whether to decay the learning rate +warmup_iters = 2000 # how many steps to warm up for +lr_decay_iters = 600000 # `should be ~= max_iters per Chinchilla` +min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla +# DDP settings +backend = 'nccl' # `'nccl', 'gloo', etc.` +# system +device = 'cuda' # examples: `'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks` +dtype = 'bfloat16' if torch.cuda.is_available( +) and torch.cuda.is_bf16_supported( +) else 'float16' # 'float32', '`bfloat16`', or 'float16', the latter will auto implement a GradScaler +compile = True # use PyTorch 2.0 to compile the model to be faster +# ----------------------------------------------------------------------------- +config_keys = [ + k for k, v in globals().items() + if not k.startswith('_') and isinstance(v, (int, float, bool, str)) +] +exec(open( + 'configurator.py').read()) # overrides from command line or config file +config = {k: globals()[k] for k in config_keys} # will be useful for logging +# ----------------------------------------------------------------------------- + +# various `inits`, derived attributes, I/O setup +ddp = int(os.environ.get('RANK', -1)) != -1 # is this a `ddp` run? +if ddp: + init_process_group(backend=backend) + ddp_rank = int(os.environ['RANK']) + ddp_local_rank = int(os.environ['LOCAL_RANK']) + ddp_world_size = int(os.environ['WORLD_SIZE']) + device = f'cuda:{ddp_local_rank}' + torch.cuda.set_device(device) + master_process = ddp_rank == 0 # this process will do logging, `checkpointing` etc. 
def get_batch(split):
    """Draw one random batch of (input, target) token windows for a split.

    `split == 'train'` reads `train.bin`; any other value reads `val.bin`,
    both from `data_dir` and interpreted as uint16 token ids. `batch_size`
    start offsets are sampled at random, and for each one the input is the
    `block_size` tokens starting there while the target is the same window
    shifted one token to the right.

    Returns a pair of int64 tensors of shape (batch_size, block_size), already
    moved to `device`.
    """
    # We recreate `np.memmap` every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    bin_name = 'train.bin' if split == 'train' else 'val.bin'
    tokens = np.memmap(os.path.join(data_dir, bin_name),
                       dtype=np.uint16,
                       mode='r')

    def window(start, shift):
        # block_size tokens beginning `shift` positions after `start`
        lo = start + shift
        return torch.from_numpy((tokens[lo:lo + block_size]).astype(np.int64))

    starts = torch.randint(len(tokens) - block_size, (batch_size,))
    x = torch.stack([window(s, 0) for s in starts])
    y = torch.stack([window(s, 1) for s in starts])

    if device_type != 'cuda':
        return x.to(device), y.to(device)
    # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y
dropout) can stay as desired from command line + for k in [ + 'n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size' + ]: + model_args[k] = checkpoint_model_args[k] + # create the model + gptconf = GPTConfig(**model_args) + model = GPT(gptconf) + state_dict = checkpoint['model'] + # fix the keys of the state dictionary :( + # honestly no idea how checkpoints sometimes get this prefix, have to debug more + unwanted_prefix = '_orig_mod.' + for k, v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) + model.load_state_dict(state_dict) + iter_num = checkpoint['iter_num'] + best_val_loss = checkpoint['best_val_loss'] +elif init_from.startswith('gpt2'): + print(f"Initializing from OpenAI GPT-2 weights: {init_from}") + # initialize from OpenAI GPT-2 weights + override_args = dict(dropout=dropout) + model = GPT.from_pretrained(init_from, override_args) + # read off the created config `params`, so we can store them into checkpoint correctly + for k in [ + 'n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size' + ]: + model_args[k] = getattr(model.config, k) +# crop down the model block size if desired, using model surgery +if block_size < model.config.block_size: + model.crop_block_size(block_size) + model_args[ + 'block_size'] = block_size # so that the checkpoint will have the right value +model.to(device) + +# initialize a GradScaler. If enabled=False `scaler` is a no-op +scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) + +# optimizer +optimizer = model.configure_optimizers(weight_decay, learning_rate, + (beta1, beta2), device_type) +if init_from == 'resume': + optimizer.load_state_dict(checkpoint['optimizer']) +checkpoint = None # free up memory + +# compile the model +if compile: + print("compiling the model... 
# learning rate decay scheduler (cosine with `warmup`)
def get_lr(it):
    """Return the learning rate to use at iteration `it`.

    The schedule has three phases, driven by the module-level settings
    `warmup_iters`, `lr_decay_iters`, `learning_rate` and `min_lr`:

    1) linear `warmup` while `it < warmup_iters`;
    2) `min_lr` once `it >= lr_decay_iters`;
    3) cosine decay from `learning_rate` down to `min_lr` in between.
    """
    # 1) linear `warmup` for `warmup_iters` steps
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    # 2) if `it >= lr_decay_iters`, return min learning rate.
    # The cosine below evaluates to exactly min_lr at it == lr_decay_iters, so
    # including the boundary here changes no value; it only avoids a 0/0
    # division when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # `coeff` ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
{losses['train']:.4f}, val loss {losses['val']:.4f}" + ) + if wandb_log: + wandb.log({ + "iter": iter_num, + "train/loss": losses['train'], + "val/loss": losses['val'], + "lr": lr, + "mfu": running_mfu * 100, # convert to percentage + }) + if losses['val'] < best_val_loss or always_save_checkpoint: + best_val_loss = losses['val'] + if iter_num > 0: + checkpoint = { + 'model': raw_model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'model_args': model_args, + 'iter_num': iter_num, + 'best_val_loss': best_val_loss, + 'config': config, + } + print(f"saving checkpoint to {out_dir}") + torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt')) + if iter_num == 0 and eval_only: + break + + # forward backward update, with optional gradient accumulation to simulate larger batch size + # and using the GradScaler if data type is float16 + for micro_step in range(gradient_accumulation_steps): + if ddp: + # in DDP training we only need to sync gradients at the last micro step. + # the official way to do this is with model.no_sync() context manager, but + # I really dislike that this bloats the code and forces us to repeat code + # looking at the source of that context manager, it just toggles this variable + model.require_backward_grad_sync = ( + micro_step == gradient_accumulation_steps - 1) + with ctx: + logits, loss = model(X, Y) + loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation + # immediately `async prefetch` next batch while model is doing the forward pass on the GPU + X, Y = get_batch('train') + # backward pass, with gradient scaling if training in `fp16` + scaler.scale(loss).backward() + # clip the gradient + if grad_clip != 0.0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + # step the optimizer and `scaler` if training in `fp16` + scaler.step(optimizer) + scaler.update() + # flush the gradients as soon as we can, no need for this memory anymore + 
optimizer.zero_grad(set_to_none=True) + + # timing and logging + t1 = time.time() + dt = t1 - t0 + t0 = t1 + if iter_num % log_interval == 0 and master_process: + # get loss as float. note: this is a CPU-GPU sync point + # scale up to undo the division above, approximating the true total loss (exact would have been a sum) + lossf = loss.item() * gradient_accumulation_steps + if local_iter_num >= 5: # let the training loop settle a bit + mfu = raw_model.estimate_mfu( + batch_size * gradient_accumulation_steps, dt) + running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu + print( + f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%" + ) + iter_num += 1 + local_iter_num += 1 + + # termination conditions + if iter_num > max_iters: + break + +if ddp: + destroy_process_group() diff --git a/docs/sphinx/applications/python/nanoGPT/train_pad_gemb.py b/docs/sphinx/applications/python/nanoGPT/train_pad_gemb.py new file mode 100644 index 00000000000..580edfdd319 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/train_pad_gemb.py @@ -0,0 +1,408 @@ +""" +`This training script can be run both on a single gpu in debug mode,` +`and also in a larger training run with distributed data parallel (ddp).` + +`To run on a single GPU, example:` +`$ python train.py --batch_size=32 --compile=False` + +`To run with DDP on 4 gpus on 1 node, example:` +`$ torchrun --standalone --nproc_per_node=4 train.py` + +`To run with DDP on 4 gpus across 2 nodes, example:` +`- Run on the first (master) node with example IP 123.456.123.456:` +`$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py` +`- Run on the worker node:` +`$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py` +`(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)` +""" + +import os +import time +import math +import pickle 
from contextlib import nullcontext
from tqdm import tqdm

import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from model_pad_gemb import GPTConfig, GPT

# Multiplied by (training rows // batch_size) to size the training progress bar;
# batches are drawn at random, so this is only approximately a number of epochs.
n_epochs = 3  # (approximately)

# -----------------------------------------------------------------------------
# default config values designed to train a gpt2 (124M) on OpenWebText
# Every int/float/bool/str global defined in this section is collected into
# `config_keys` further down and can be overridden through `configurator.py`.
# I/O
out_dir = 'out'  # directory that receives checkpoints
eval_interval = 20_000  # run `estimate_loss` (and checkpoint) every this many iterations
log_interval = 1  # compute the scalar training loss every this many iterations
eval_iters = 200  # number of batches averaged per split by `estimate_loss`
eval_only = False  # if True, script exits right after the first `eval`
always_save_checkpoint = True  # if True, always save a checkpoint after each `eval`
init_from = 'scratch'  # 'scratch' or 'resume' or 'gpt2*'
# `wandb` logging
wandb_log = False  # disabled by default
wandb_project = 'owt'
wandb_run_name = 'gpt2'  # `'run' + str(time.time())`
# data
dataset = 'openwebtext'  # sub-directory of `data/` that holds the input arrays
gradient_accumulation_steps = 5 * 8  # used to simulate larger batch sizes
batch_size = 12  # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 1024
# model
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0  # for `pretraining` 0 is good, for finetuning try 0.1+
bias = False  # do we use bias inside LayerNorm and Linear layers?
# `adamw` optimizer
learning_rate = 6e-4  # max learning rate
max_iters = 600000  # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True  # whether to decay the learning rate
warmup_iters = 2000  # how many steps to warm up for
lr_decay_iters = 600000  # `should be ~= max_iters per Chinchilla`
min_lr = 6e-5  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
backend = 'nccl'  # `'nccl', 'gloo', etc.`
# system
device = 'cuda'  # examples: `'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks`
# 'float32', '`bfloat16`', or 'float16', the latter will auto implement a GradScaler
dtype = ('bfloat16' if torch.cuda.is_available() and
         torch.cuda.is_bf16_supported() else 'float16')
compile = True  # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
# Snapshot the names of all plain-valued globals defined so far; these are the
# settings recorded in `config` after the overrides have been applied.
config_keys = [
    k for k, v in globals().items()
    if not k.startswith('_') and isinstance(v, (int, float, bool, str))
]
# overrides from command line or config file; the file is executed in this
# module's namespace, and the `with` block closes the handle deterministically
with open('configurator.py') as _configurator_file:
    exec(_configurator_file.read())
config = {k: globals()[k] for k in config_keys}  # will be useful for logging
# -----------------------------------------------------------------------------

# various `inits`, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1  # is this a `ddp` run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0  # this process will do logging, `checkpointing` etc.
    seed_offset = ddp_rank  # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not `ddp`, we are running on a single `gpu`, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)
# NOTE(review): only torch is seeded here; `get_batch` samples with `np.random`,
# which is left unseeded, so batch selection is not reproducible across runs.
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True  # `allow tf32 on matmul`
torch.backends.cudnn.allow_tf32 = True  # `allow tf32 on cudnn`
device_type = 'cuda' if 'cuda' in device else 'cpu'  # for later use in `torch.autocast`
# note: float16 data type will automatically use a GradScaler
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16
}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(
    device_type=device_type, dtype=ptdtype)

# poor man's data loader
data_dir = os.path.join('data', dataset)

# None loads the arrays fully into memory; switch to 'r' to memory-map them
# read-only instead.
mmap = None
print(f'Opening data (mmap mode: {mmap})...')
train_data = np.load(os.path.join(data_dir, 'train.npy'), mmap_mode=mmap)
val_data = np.load(os.path.join(data_dir, 'val.npy'), mmap_mode=mmap)
graph_emb_np = np.load(os.path.join(data_dir, 'feather_emb_d500.npy'),
                       mmap_mode=mmap)
emb_dim = graph_emb_np.shape[1]


def get_batch(split):
    """Draw a random batch of inputs, targets and graph embeddings.

    Args:
        split: 'train' selects the training arrays; any other value selects
            the validation arrays.

    Returns:
        A tuple ``(x, y, graph_emb_data)`` of tensors moved to ``device``.
        ``x`` and ``y`` are int64 tensors taken from index 0 and index 1 of
        the second axis of each sampled row and flattened to two dimensions.
        ``graph_emb_data`` holds the rows of ``graph_emb_np`` that the
        per-sample index list maps the sampled rows to.
    """
    if split == 'train':
        data = train_data
        emb_idx_data = train_data_graph_idx_list
    else:
        data = val_data
        emb_idx_data = val_data_graph_idx_list
    # `high` is exclusive in np.random.randint, so pass the row count itself:
    # `data.shape[0] - 1` would never sample the last row and would raise for
    # a split that contains a single row.
    ix = np.random.randint(low=0, high=data.shape[0], size=batch_size)
    data_batch_np = data[ix]
    graph_emb_data = torch.tensor(graph_emb_np[emb_idx_data[ix]])

    x = torch.tensor(data_batch_np[:, :1, :].astype(np.int64)).flatten(1)
    y = torch.tensor(data_batch_np[:, 1:2, :].astype(np.int64)).flatten(1)
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
        # NOTE(review): the embeddings are always cast to bfloat16 on CUDA,
        # regardless of the configured `dtype` -- confirm this is intended
        # when `dtype` is 'float16' or 'float32'.
        graph_emb_data = graph_emb_data.pin_memory().to(
            device, non_blocking=True).to(torch.bfloat16)
    else:
        x, y = x.to(device), y.to(device)
        graph_emb_data = graph_emb_data.to(device)

    return x, y, graph_emb_data


# `init` these up here, can override if `init_from='resume'` (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# attempt to derive vocab_size from the dataset
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    # NOTE: pickle executes arbitrary code on load; only use trusted datasets.
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
else:
    # The graph-embedding index tables below live in meta.pkl, so the file is
    # mandatory; fail with a clear message instead of a NameError on `meta`.
    raise FileNotFoundError(
        f"{meta_path} not found; it is required for the graph embedding "
        "index tables")

# For graph `embeddings`
emb_graph_id_to_idx_dict = meta['emb_graph_id_to_idx_dict']
emb_graph_idx_to_id_dict = meta['emb_graph_idx_to_id_dict']
train_data_graph_idx_list = np.array(meta['train_data_graph_idx_list'])
val_data_graph_idx_list = np.array(meta['val_data_graph_idx_list'])

# model `init`
model_args = dict(n_layer=n_layer,
                  n_head=n_head,
                  n_embd=n_embd,
                  block_size=block_size,
                  bias=bias,
                  vocab_size=None,
                  dropout=dropout)  # start with `model_args` from command line
if init_from == 'scratch':
    # `init` a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    # (meta.pkl is mandatory above, so this fallback is only reachable if
    # that check is relaxed)
    if meta_vocab_size is None:
        print(
            "defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)"
        )
    model_args[
        'vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in [
            'n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size'
    ]:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    unwanted_prefix = '_orig_mod.'
    # iterate over a copy of the keys because entries are re-keyed in place
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    # initialize from OpenAI GPT-2 weights
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # read off the created config `params`, so we can store them into checkpoint correctly
    for k in [
            'n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size'
    ]:
        model_args[k] = getattr(model.config, k)
# crop down the model block size if desired, using model surgery
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args[
        'block_size'] = block_size  # so that the checkpoint will have the right value
model.to(device)

# initialize a GradScaler.
# If enabled=False `scaler` is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate,
                                       (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None  # drop the reference so the loaded checkpoint can be freed

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0

# wrap model into DDP container
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])


@torch.no_grad()
def estimate_loss():
    """Return the mean loss over `eval_iters` random batches for each split.

    The model is switched to evaluation mode for the measurement and put back
    into training mode before returning.

    Returns:
        A dict with the keys 'train' and 'val', each holding the mean of the
        per-batch losses as a tensor.
    """
    mean_loss_by_split = {}
    model.eval()
    for split_name in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            tokens, targets, graph_emb = get_batch(split_name)
            with ctx:
                _, batch_loss = model(tokens, graph_emb, targets)
            batch_losses[step] = batch_loss.item()
        mean_loss_by_split[split_name] = batch_losses.mean()
    model.train()
    return mean_loss_by_split


def get_lr(it):
    """Learning rate for iteration `it`: linear `warmup`, then cosine decay.

    Below `warmup_iters` the rate ramps linearly towards `learning_rate`;
    above `lr_decay_iters` it stays at `min_lr`; in between it follows a
    cosine curve from `learning_rate` down to `min_lr`.
    """
    # linear ramp-up phase
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    # past the decay horizon: hold the floor value
    if it > lr_decay_iters:
        return min_lr
    # cosine phase: `progress` runs from 0 to 1 across the decay window
    decay_span = lr_decay_iters - warmup_iters
    progress = (it - warmup_iters) / decay_span
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # ranges 0..1
    return min_lr + cosine_weight * (learning_rate - min_lr)


# logging
if wandb_log and master_process:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

# training loop
X, Y, cur_graph_emb = get_batch('train')  # fetch the very first batch
t0 = time.time()
local_iter_num = 0  # number of iterations in the lifetime
of this process +raw_model = model.module if ddp else model # unwrap DDP container if needed +running_mfu = -1.0 +#while True: + +dataset_n_batches = train_data.shape[0] // batch_size +pbar = tqdm(list(range(n_epochs * dataset_n_batches))) + +for i in pbar: + + # determine and set the learning rate for this iteration + lr = get_lr(iter_num) if decay_lr else learning_rate + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + # evaluate the loss on train/val sets and write checkpoints + if iter_num % eval_interval == 0 and master_process: + losses = estimate_loss() + #`print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")` + pbar.set_description( + f"train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") + if wandb_log: + wandb.log({ + "iter": iter_num, + "train/loss": losses['train'], + "val/loss": losses['val'], + "lr": lr, + "mfu": running_mfu * 100, # convert to percentage + }) + if losses['val'] < best_val_loss or always_save_checkpoint: + best_val_loss = losses['val'] + if iter_num > 0: + checkpoint = { + 'model': raw_model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'model_args': model_args, + 'iter_num': iter_num, + 'best_val_loss': best_val_loss, + 'config': config, + } + print(f"saving checkpoint to {out_dir}") + torch.save(checkpoint, os.path.join(out_dir, f'ckpt_{i}.pt')) + torch.save(checkpoint, os.path.join(out_dir, 'ckpt_overfit.pt')) + if iter_num == 0 and eval_only: + break + + # forward backward update, with optional gradient accumulation to simulate larger batch size + # and using the GradScaler if data type is float16 + for micro_step in range(gradient_accumulation_steps): + if ddp: + # in DDP training we only need to sync gradients at the last micro step. 
+ # the official way to do this is with model.no_sync() context manager, but + # I really dislike that this bloats the code and forces us to repeat code + # looking at the source of that context manager, it just toggles this variable + model.require_backward_grad_sync = ( + micro_step == gradient_accumulation_steps - 1) + with ctx: + logits, loss = model(X, cur_graph_emb, Y) + loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation + # `immediately async prefetch next batch while model is doing the forward pass on the GPU` + X, Y, cur_graph_emb = get_batch('train') + # backward pass, with gradient scaling if training in `fp16` + scaler.scale(loss).backward() + # clip the gradient + if grad_clip != 0.0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + # step the optimizer and `scaler` if training in `fp16` + scaler.step(optimizer) + scaler.update() + # flush the gradients as soon as we can, no need for this memory anymore + optimizer.zero_grad(set_to_none=True) + + # timing and logging + t1 = time.time() + dt = t1 - t0 + t0 = t1 + if iter_num % log_interval == 0 and master_process: + # get loss as float. 
note: this is a CPU-GPU sync point + # scale up to undo the division above, approximating the true total loss (exact would have been a sum) + lossf = loss.item() * gradient_accumulation_steps + if local_iter_num >= 5: # let the training loop settle a bit + mfu = raw_model.estimate_mfu( + batch_size * gradient_accumulation_steps, dt) + running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu + #`print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")` + iter_num += 1 + local_iter_num += 1 + + # termination conditions + if iter_num > max_iters: + break + +if ddp: + destroy_process_group() diff --git a/docs/sphinx/applications/python/nanoGPT/train_pad_gemb_ar_eval.py b/docs/sphinx/applications/python/nanoGPT/train_pad_gemb_ar_eval.py new file mode 100644 index 00000000000..264816537e1 --- /dev/null +++ b/docs/sphinx/applications/python/nanoGPT/train_pad_gemb_ar_eval.py @@ -0,0 +1,565 @@ +""" +`This training script can be run both on a single gpu in debug mode,` +`and also in a larger training run with distributed data parallel (ddp).` + +`To run on a single GPU, example:` +`$ python train.py --batch_size=32 --compile=False` + +`To run with DDP on 4 gpus on 1 node, example:` +`$ torchrun --standalone --nproc_per_node=4 train.py` + +`To run with DDP on 4 gpus across 2 nodes, example:` +`- Run on the first (master) node with example IP 123.456.123.456:` +`$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py` +`- Run on the worker node:` +`$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py` +`(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)` +""" + +import os +import time +import math +import pickle +from contextlib import nullcontext +from tqdm import tqdm +import sys +import pandas as pd + +sys.path.append("../") + +from datetime import datetime +import numpy as 
np +import torch +import json +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.distributed import init_process_group, destroy_process_group + +from nanoGPT.model_pad_gemb import GPTConfig as GPTConfig_gemb +from nanoGPT.model_pad_gemb import GPT as GPT_gemb + +from nanoGPT.model_pad import GPTConfig as GPTConfig_nogemb +from nanoGPT.model_pad import GPT as GPT_nogemb + +from qaoa_gpt_src.util import generate_circ_from_df, eval_adapt_gpt_circ_cudaq + +# `val_sampled_df = pd.read_pickle('data/qaoa_n10w_012325_v7/test_run_df.pkl')` +# `val_graph_emb_np = np.load('data/qaoa_n10w_012325_v7/feather_emb_d500.npy')` +# `val_meta = pd.read_pickle("data/qaoa_n10w_012325_v7/meta.pkl")` +# `val_emb_graph_id_to_idx_dict = val_meta['emb_graph_id_to_idx_dict']` + +n_epochs = 100 # (approximately) +eval_ar_every = 1000 + +# ----------------------------------------------------------------------------- +# default config values designed to train a gpt2 (124M) on OpenWebText +# I/O +out_dir = 'out' +eval_interval = 20000 +log_interval = 1 +eval_iters = 200 +eval_only = False # if True, script exits right after the first `eval` +always_save_checkpoint = True # if True, always save a checkpoint after each `eval` +init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' +# `wandb` logging +wandb_log = False # disabled by default +wandb_project = 'owt' +wandb_run_name = 'gpt2' # `'run' + str(time.time())` +# data +dataset = 'openwebtext' +gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes +batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size +block_size = 1024 +# model +n_layer = 12 +n_head = 12 +n_embd = 768 +dropout = 0.0 # for `pretraining` 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
# `adamw` optimizer
learning_rate = 6e-4  # max learning rate
max_iters = 600000  # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True  # whether to decay the learning rate
warmup_iters = 2000  # how many steps to warm up for
lr_decay_iters = 600000  # `should be ~= max_iters per Chinchilla`
min_lr = 6e-5  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
backend = 'nccl'  # `'nccl', 'gloo', etc.`
# system
device = 'cuda'  # `examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks`
# 'float32', '`bfloat16`', or 'float16', the latter will auto implement a GradScaler
dtype = ('bfloat16' if torch.cuda.is_available() and
         torch.cuda.is_bf16_supported() else 'float16')
compile = True  # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
# Snapshot the names of all plain-valued globals defined so far; these are the
# settings recorded in `config` after the overrides have been applied.
config_keys = [
    k for k, v in globals().items()
    if not k.startswith('_') and isinstance(v, (int, float, bool, str))
]
# NOTE(review): the four options below are defined after `config_keys` was
# computed, so they never appear in `config` -- confirm that leaving them out
# of the logged configuration is intended.
use_graph_emb = False
pool_type = "all_pool"
token_seq_round = "token_seq_round_d2"
n_samples = 5
# overrides from command line or config file; the file is executed in this
# module's namespace, and the `with` block closes the handle deterministically
with open('configurator.py') as _configurator_file:
    exec(_configurator_file.read())
config = {k: globals()[k] for k in config_keys}  # will be useful for logging
# -----------------------------------------------------------------------------

# Report which model variant this run uses and pick the matching suffix.
if use_graph_emb:
    print("Training model with graph embeddings")
    model_suf = 'gemb'
else:
    print("Training model with NO graph embeddings")
    model_suf = 'nogemb'

# various `inits`, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1  # is this a `ddp` run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0  # this process will do logging, `checkpointing` etc.
    seed_offset = ddp_rank  # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not `ddp`, we are running on a single `gpu`, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)
# NOTE(review): only torch is seeded here; `get_batch` samples with `np.random`,
# which is left unseeded, so batch selection is not reproducible across runs.
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True  # `allow tf32 on matmul`
torch.backends.cudnn.allow_tf32 = True  # `allow tf32 on cudnn`
device_type = 'cuda' if 'cuda' in device else 'cpu'  # for later use in `torch.autocast`
# note: float16 data type will automatically use a GradScaler
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16
}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(
    device_type=device_type, dtype=ptdtype)

# poor man's data loader
data_dir = os.path.join('data', dataset)

# None loads the arrays fully into memory; switch to 'r' to memory-map them
# read-only instead.
mmap = None
print(f'Opening data (mmap mode: {mmap})...')
train_data = np.load(os.path.join(data_dir, 'train.npy'), mmap_mode=mmap)
val_data = np.load(os.path.join(data_dir, 'val.npy'), mmap_mode=mmap)
graph_emb_np = np.load(os.path.join(data_dir, 'feather_emb_d500.npy'),
                       mmap_mode=mmap)
emb_dim = graph_emb_np.shape[1]

logging_json_file = os.path.join(out_dir, 'train_log.json')
logging_list = []


def get_batch(split):
    """Draw a random batch of inputs, targets and graph embeddings.

    Args:
        split: 'train' selects the training arrays; any other value selects
            the validation arrays.

    Returns:
        A tuple ``(x, y, graph_emb_data)`` of tensors moved to ``device``.
        ``x`` and ``y`` are int64 tensors taken from index 0 and index 1 of
        the second axis of each sampled row and flattened to two dimensions.
        ``graph_emb_data`` holds the rows of ``graph_emb_np`` that the
        per-sample index list maps the sampled rows to.
    """
    if split == 'train':
        data = train_data
        emb_idx_data = train_data_graph_idx_list
    else:
        data = val_data
        emb_idx_data = val_data_graph_idx_list
    # `high` is exclusive in np.random.randint, so pass the row count itself:
    # `data.shape[0] - 1` would never sample the last row and would raise for
    # a split that contains a single row.
    ix = np.random.randint(low=0, high=data.shape[0], size=batch_size)
    data_batch_np = data[ix]
    graph_emb_data = torch.tensor(graph_emb_np[emb_idx_data[ix]])

    x = torch.tensor(data_batch_np[:, :1, :].astype(np.int64)).flatten(1)
    y = torch.tensor(data_batch_np[:, 1:2, :].astype(np.int64)).flatten(1)
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
        # NOTE(review): the embeddings are always cast to bfloat16 on CUDA,
        # regardless of the configured `dtype` -- confirm this is intended
        # when `dtype` is 'float16' or 'float32'.
        graph_emb_data = graph_emb_data.pin_memory().to(
            device, non_blocking=True).to(torch.bfloat16)
    else:
        x, y = x.to(device), y.to(device)
        graph_emb_data = graph_emb_data.to(device)

    return x, y, graph_emb_data


#########################################
# `init` these up here, can override if `init_from='resume'` (i.e.
# from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# ---------------------------------------------------------------------------
# Dataset metadata: vocabulary size and graph-embedding index tables.
# ---------------------------------------------------------------------------
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if not os.path.exists(meta_path):
    # The tables read from `meta` below are mandatory; fail here with a clear
    # message instead of a NameError on the first `meta[...]` access.
    raise FileNotFoundError(
        f"dataset metadata file not found: {meta_path} "
        "(it provides vocab_size and the graph embedding index tables)")
# NOTE(security): unpickling executes arbitrary code - only load metadata
# files produced by a trusted dataset-preparation run.
with open(meta_path, 'rb') as f:
    meta = pickle.load(f)
meta_vocab_size = meta['vocab_size']
print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# For graph `embeddings`
emb_graph_id_to_idx_dict = meta['emb_graph_id_to_idx_dict']
emb_graph_idx_to_id_dict = meta['emb_graph_idx_to_id_dict']
train_data_graph_idx_list = np.array(meta['train_data_graph_idx_list'])
val_data_graph_idx_list = np.array(meta['val_data_graph_idx_list'])

# For AR validation
##########################################
# NOTE(security): read_pickle has the same trust requirement as pickle.load.
val_sampled_df = pd.read_pickle(
    os.path.join(data_dir, 'combined_res_tok_shf_val_df.pkl'))
# keep only the validation rows flagged as having a graph embedding
val_sampled_df = val_sampled_df[val_sampled_df['has_emb']]
val_n_nodes = int(val_sampled_df['n_nodes'].max())
val_graph_emb_np = graph_emb_np
val_meta = meta
val_emb_graph_id_to_idx_dict = val_meta['emb_graph_id_to_idx_dict']

# ADAPT GPT-specific code
#-------------------------

# Energy value that marks a generated circuit as wrong in the evaluation
# results (presumably written by the CUDA-Q evaluator for circuits it could
# not evaluate - confirm against eval_adapt_gpt_circ_cudaq).
_INVALID_ENERGY = 999


def get_test_energies_df():
    """Generate circuits for the validation graphs and evaluate their energies.

    The current model is sampled through ``generate_circ_from_df`` and the
    generated circuits are scored by ``eval_adapt_gpt_circ_cudaq``.

    Returns:
        The object returned by ``eval_adapt_gpt_circ_cudaq``; ``eval_model_ar``
        reads its ``adapt_gpt_energies`` and ``true_energy`` columns.
    """
    # Remember the mode so that training resumes with dropout enabled: the
    # caller keeps optimizing right after the evaluation.
    was_training = model.training
    model.eval()
    try:
        print("Generating circuits with current state of the model")
        gc_df = generate_circ_from_df(
            val_sampled_df,
            model=model,
            graph_emb_np=val_graph_emb_np if use_graph_emb else None,
            emb_graph_id_to_idx_dict=(val_emb_graph_id_to_idx_dict
                                      if use_graph_emb else None),
            meta=meta,
            device=device,
            ctx=ctx,
            n_samples_per_batch=50,  # max number of distinct graphs in a batch
            num_samples=n_samples,  # number of samples to draw
            max_new_tokens=150,  # number of tokens generated in each sample
            # 1.0 = no change, < 1.0 = less random, > 1.0 = more random
            temperature=0.1,
            # retain only the top_k most likely tokens, others get probability 0
            top_k=200,
            token_seq_col=token_seq_round,
            normalize_weights_flag=False,
        )

        print("Evaluating energies with ADAPT-QAOA in CUDA-Q")
        energies_cudaq_gc_df = eval_adapt_gpt_circ_cudaq(
            gc_df,
            temp_folder='../temp_data/',
            n_nodes=val_n_nodes,
            pool_type=pool_type,
        )
    finally:
        model.train(was_training)

    return energies_cudaq_gc_df


def eval_model_ar():
    """Evaluate the model's approximation ratio and wrong-circuit rate.

    Returns:
        A tuple ``(test_energies_df, avg_ar, wrong_circ_rate)``:
        the evaluation frame from ``get_test_energies_df``; the mean of
        ``adapt_gpt_energies / true_energy`` over all samples whose energy is
        not the invalid marker, rounded to 5 decimals; and the fraction of
        samples carrying the invalid marker, rounded to 5 decimals (NaN when
        there are no samples at all).
    """
    print("Model evaluation...")
    test_energies_df = get_test_energies_df()

    # one row per generated sample instead of one list per graph
    exploded_df = test_energies_df[['adapt_gpt_energies',
                                    'true_energy']].explode('adapt_gpt_energies')
    is_invalid = exploded_df['adapt_gpt_energies'] == _INVALID_ENERGY

    # Compute the ratio as a standalone Series: writing a new column into the
    # filtered frame would modify a slice (SettingWithCopyWarning).
    valid_df = exploded_df[~is_invalid]
    ar = valid_df['adapt_gpt_energies'] / valid_df['true_energy']
    avg_ar = round(ar.mean(), 5)

    n_total = len(exploded_df)
    if n_total:
        wrong_circ_rate = round(int(is_invalid.sum()) / n_total, 5)
    else:
        # no samples: the rate is undefined rather than a division by zero
        wrong_circ_rate = float('nan')

    return test_energies_df, avg_ar, wrong_circ_rate


#-------------------------
##########################################


def _build_model(args):
    """Build the (config, model) pair for the GPT variant chosen by `use_graph_emb`."""
    if use_graph_emb:
        conf = GPTConfig_gemb(**args)
        return conf, GPT_gemb(conf)
    conf = GPTConfig_nogemb(**args)
    return conf, GPT_nogemb(conf)


# model `init`
# start with `model_args` from the command line / config
model_args = dict(n_layer=n_layer,
                  n_head=n_head,
                  n_embd=n_embd,
                  block_size=block_size,
                  bias=bias,
                  vocab_size=None,
                  dropout=dropout)
if init_from == 'scratch':
    # `init` a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print(
            "defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)"
        )
    model_args['vocab_size'] = (meta_vocab_size
                                if meta_vocab_size is not None else 50304)
    gptconf, model = _build_model(model_args)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    # NOTE(review): this reads 'ckpt.pt', while the training loop writes
    # checkpoints under names starting with 'ckpt_{i}_' - confirm how the
    # file to resume from is meant to be provided.
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # these attributes must match the checkpoint or the weights cannot be
    # loaded; the rest (e.g. dropout) stays as configured
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf, model = _build_model(model_args)
    state_dict = checkpoint['model']
    # strip the '_orig_mod.' prefix that some checkpoints carry on their keys
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    # initialize from OpenAI GPT-2 weights
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # read off the created config `params`, so the checkpoint stores them correctly
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
else:
    # without this, an unsupported value only surfaced as a NameError on `model`
    raise ValueError(
        f"unsupported init_from={init_from!r}; "
        "expected 'scratch', 'resume' or a name starting with 'gpt2'")
# crop down the model block size if desired, using model surgery
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    # so that the checkpoint will have the right value
    model_args['block_size'] = block_size
model.to(device)

# initialize a GradScaler.
# If enabled=False scaler is a no-op
# (gradient scaling is only switched on for float16 training)
scaler = torch.amp.GradScaler('cuda', enabled=(dtype == 'float16'))

# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate,
                                       (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None  # free up memory

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0

# wrap model into DDP container
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])


@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are evaluated with the
    model in eval mode and their losses are averaged. The model is switched
    back to train mode before returning.

    Returns:
        A dict with keys 'train' and 'val' mapping to 0-dim tensors holding
        the mean loss of the respective split.
    """
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y, cur_graph_emb = get_batch(split)
            with ctx:
                if use_graph_emb:
                    logits, loss = model(X, cur_graph_emb, Y)
                else:
                    logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


def get_lr(it):
    """Return the learning rate for iteration `it` (cosine decay with warmup).

    The rate grows linearly during the first ``warmup_iters`` iterations,
    follows a cosine curve from ``learning_rate`` down to ``min_lr`` until
    ``lr_decay_iters``, and stays at ``min_lr`` from there on.
    """
    # 1) linear warmup for `warmup_iters` steps
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    # 2) at or past `lr_decay_iters`, return the minimum learning rate.
    #    `>=` yields the same value as the cosine branch at the boundary
    #    (coefficient 0) and avoids a 0/0 when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to the minimum learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)


# logging
if wandb_log and master_process:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

# training loop
X, Y, cur_graph_emb = get_batch('train')  # fetch the very first batch
# ---------------------------------------------------------------------------
# Training loop state.
# ---------------------------------------------------------------------------
t0 = time.time()
local_iter_num = 0  # number of iterations in the lifetime of this process
raw_model = model.module if ddp else model  # unwrap DDP container if needed
running_mfu = -1.0

# The loop is bounded by `n_epochs` passes worth of batches (the original
# carries a commented-out `while True:` here).
dataset_n_batches = train_data.shape[0] // batch_size
# tqdm takes the length from the range itself; no list needs to be built
pbar = tqdm(range(n_epochs * dataset_n_batches))

for i in pbar:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        saving_model_name = f'ckpt_{i}_{model_suf}.pt'
        if iter_num % eval_ar_every == 0 and iter_num > 0:
            print("\tEvaluating model ER and AR...")
            cur_test_energies_df, cur_ar, cur_er = eval_model_ar()
            print(f"\tCurrent ar: {cur_ar}, error rate: {cur_er}\n\n")
            # encode the metrics in the checkpoint file name ('.' -> '_')
            cur_ar_str = str(cur_ar).replace('.', '_')
            cur_er_str = str(cur_er).replace('.', '_')
            saving_model_name = f'ckpt_{i}_{model_suf}__ar_{cur_ar_str}__er_{cur_er_str}.pt'

            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # source. The record is kept inside this branch because it reads
            # `cur_ar`, `cur_er` and `cur_test_energies_df`, which are only
            # assigned by the AR evaluation above.
            logging_list.append({
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'model_dir': out_dir,
                'iter_num': iter_num,
                'cur_gpt_loss_train': losses['train'].item(),
                'cur_gpt_loss_val': losses['val'].item(),
                'cur_ar_val': cur_ar,
                'cur_er_val': cur_er,
                'cur_val_df': cur_test_energies_df.to_json(),
            })
            # the whole log is rewritten on every update
            with open(logging_json_file, 'w') as f:
                json.dump(logging_list, f)

        pbar.set_description(
            f"train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu * 100,  # convert to percentage
            })
        # a checkpoint is written only when the validation loss improves
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, saving_model_name))

    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate
    # a larger batch size, and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # in DDP training gradients only need to be synced at the last
            # micro step; this flag is what model.no_sync() toggles
            model.require_backward_grad_sync = (
                micro_step == gradient_accumulation_steps - 1)
        with ctx:
            if use_graph_emb:
                logits, loss = model(X, cur_graph_emb, Y)
            else:
                logits, loss = model(X, Y)
            # scale the loss to account for gradient accumulation
            loss = loss / gradient_accumulation_steps
        # fetch the next batch while the model is doing the forward pass on the GPU
        X, Y, cur_graph_emb = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and the scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # The per-iteration loss is no longer pulled to the host here: its
        # only consumer was a commented-out print, and `loss.item()` forces a
        # CPU-GPU synchronization.
        if local_iter_num >= 5:  # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(
                batch_size * gradient_accumulation_steps, dt)
            # exponential moving average of the model flops utilization
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

if ddp:
    destroy_process_group()
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import OrderedDict" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# config_args = {\n", + "# 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params\n", + "# 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params\n", + "# 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params\n", + "# 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params\n", + "# }[model_type]\n", + "\n", + "block_size = 1024\n", + "vocab_size = 50257\n", + "n_layer = 12\n", + "n_head = 12\n", + "n_embd = 768\n", + "bias = False\n", + "assert not bias, \"this notebook assumes bias=False just for simplicity\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "we see: 124337664, expected: 124337664, match: True\n", + "name params ratio (%) \n", + "emebedding/position 786432 0.6325\n", + "embedding/token 38597376 31.0424\n", + "embedding 39383808 31.6749\n", + "attention/ln 768 0.0006\n", + "attention/kqv 1769472 1.4231\n", + "attention/proj 589824 0.4744\n", + "attention 2360064 1.8981\n", + "mlp/ln 768 0.0006\n", + "mlp/ffw 2359296 1.8975\n", + "mlp/proj 2359296 1.8975\n", + "mlp 4719360 3.7956\n", + "block 7079424 5.6937\n", + "transformer 84953088 68.3245\n", + "ln_f 768 0.0006\n", + "dense 0 0.0000\n", + "total 124337664 100.0000\n" + ] + } + ], + "source": [ + "def params():\n", + " \"\"\" estimates the number of parameters in the model\"\"\"\n", + " out = OrderedDict()\n", + "\n", + " # token and position embeddings\n", + " out['emebedding/position'] = n_embd * block_size\n", + " out['embedding/token'] = n_embd * vocab_size\n", + " out['embedding'] = out['emebedding/position'] + out['embedding/token']\n", + "\n", + " # attention blocks\n", + " 
out['attention/ln'] = n_embd # note, bias=False in our LN\n", + " out['attention/kqv'] = n_embd * 3*n_embd\n", + " out['attention/proj'] = n_embd**2\n", + " out['attention'] = out['attention/ln'] + out['attention/kqv'] + out['attention/proj']\n", + "\n", + " # MLP blocks\n", + " ffw_size = 4*n_embd # feed forward size\n", + " out['mlp/ln'] = n_embd\n", + " out['mlp/ffw'] = n_embd * ffw_size\n", + " out['mlp/proj'] = ffw_size * n_embd\n", + " out['mlp'] = out['mlp/ln'] + out['mlp/ffw'] + out['mlp/proj']\n", + " \n", + " # the transformer and the rest of it\n", + " out['block'] = out['attention'] + out['mlp']\n", + " out['transformer'] = n_layer * out['block']\n", + " out['ln_f'] = n_embd # final layernorm\n", + " out['dense'] = 0 # 0 because of parameter sharing. This layer uses the weights from the embedding layer\n", + "\n", + " # total\n", + " out['total'] = out['embedding'] + out['transformer'] + out['ln_f'] + out['dense']\n", + "\n", + " return out\n", + "\n", + "# compare our param count to that reported by PyTorch\n", + "p = params()\n", + "params_total = p['total']\n", + "print(f\"we see: {params_total}, expected: {124337664}, match: {params_total == 124337664}\")\n", + "# create a header\n", + "print(f\"{'name':20s} {'params':10s} {'ratio (%)':10s}\")\n", + "for k,v in p.items():\n", + " print(f\"{k:20s} {v:10d} {v/params_total*100:10.4f}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "est checkpoint size: 1.49 GB\n", + "measured with wc -c ckpt.pt: 1542470366\n", + "fluff ratio: 103.38%\n" + ] + } + ], + "source": [ + "# we can now calculate the size of each checkpoint\n", + "# params are stored in fp32, and the AdamW optimizer has 2 additional buffers per param for statistics\n", + "params_bytes = params_total*4\n", + "params_and_buffers_bytes = params_bytes + 2*params_bytes\n", + "print(f\"est checkpoint size: 
{params_and_buffers_bytes/1e9:.2f} GB\")\n", + "measured_bytes = 1542470366 # from wc -c ckpt.pt\n", + "print(f\"measured with wc -c ckpt.pt: {measured_bytes}\")\n", + "print(f\"fluff ratio: {measured_bytes/params_and_buffers_bytes*100:.2f}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also estimate the ratio of our GPU memory that will be taken up just by the weights and the buffers inside the AdamW optimizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "memory ratio taken up just for parameters: 3.73%\n" + ] + } + ], + "source": [ + "gpu_memory = 40e9 # 40 GB A100 GPU, roughly\n", + "print(f\"memory ratio taken up just for parameters: {params_and_buffers_bytes / gpu_memory * 100:.2f}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "i.e. not that much of the memory for this tiny model, most of the memory is activations (forward and backward). This of course changes dramatically for larger and larger models." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's estimate FLOPs for a single forward pass." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name flops ratio (%) \n", + "attention/kqv 3623878656 1.2426\n", + "attention/scores 1610612736 0.5522\n", + "attention/reduce 1610612736 0.5522\n", + "attention/proj 1207959552 0.4142\n", + "attention 8053063680 2.7612\n", + "mlp/ffw1 4831838208 1.6567\n", + "mlp/ffw2 4831838208 1.6567\n", + "mlp 9663676416 3.3135\n", + "block 17716740096 6.0747\n", + "transformer 212600881152 72.8963\n", + "dense 79047426048 27.1037\n", + "forward_total 291648307200 100.0000\n", + "backward_total 583296614400 200.0000\n", + "total 874944921600 300.0000\n" + ] + } + ], + "source": [ + "def flops():\n", + " # we only count Weight FLOPs, all other layers (LayerNorm, Softmax, etc) are effectively irrelevant\n", + " # we count actual FLOPs, not MACs. Hence 2* all over the place\n", + " # basically for any matrix multiply A (BxC) @ B (CxD) -> (BxD) flops are 2*B*C*D\n", + "\n", + " out = OrderedDict()\n", + " head_size = n_embd // n_head\n", + "\n", + " # attention blocks\n", + " # 1) the projection to key, query, values\n", + " out['attention/kqv'] = 2 * block_size * (n_embd * 3*n_embd)\n", + " # 2) calculating the attention scores\n", + " out['attention/scores'] = 2 * block_size * block_size * n_embd\n", + " # 3) the reduction of the values (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)\n", + " out['attention/reduce'] = 2 * n_head * (block_size * block_size * head_size)\n", + " # 4) the final linear projection\n", + " out['attention/proj'] = 2 * block_size * (n_embd * n_embd)\n", + " out['attention'] = sum(out['attention/'+k] for k in ['kqv', 'scores', 'reduce', 'proj'])\n", + "\n", + " # MLP blocks\n", + " ffw_size = 4*n_embd # feed forward size\n", + " out['mlp/ffw1'] = 2 * block_size * (n_embd * ffw_size)\n", + " out['mlp/ffw2'] = 2 * block_size * (ffw_size * n_embd)\n", + " out['mlp'] = out['mlp/ffw1'] + 
out['mlp/ffw2']\n", + "\n", + " # the transformer and the rest of it\n", + " out['block'] = out['attention'] + out['mlp']\n", + " out['transformer'] = n_layer * out['block']\n", + " out['dense'] = 2 * block_size * (n_embd * vocab_size)\n", + "\n", + " # forward,backward,total\n", + " out['forward_total'] = out['transformer'] + out['dense']\n", + " out['backward_total'] = 2 * out['forward_total'] # use common estimate of bwd = 2*fwd\n", + " out['total'] = out['forward_total'] + out['backward_total']\n", + "\n", + " return out\n", + " \n", + "# compare our param count to that reported by PyTorch\n", + "f = flops()\n", + "flops_total = f['forward_total']\n", + "print(f\"{'name':20s} {'flops':14s} {'ratio (%)':10s}\")\n", + "for k,v in f.items():\n", + " print(f\"{k:20s} {v:14d} {v/flops_total*100:10.4f}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "palm_flops: 875062886400, flops: 874944921600, ratio: 1.0001\n" + ] + } + ], + "source": [ + "# now here is an estimate copy pasted from the PaLM paper\n", + "# this formula is often used to calculate MFU (model flops utilization)\n", + "def palm_flops():\n", + " \"\"\"estimate of the model flops following PaLM paper formula\"\"\"\n", + " # non-embedding model parameters. 
note that we do not subtract the\n", + " # embedding/token params because those are tied and get used in the last layer.\n", + " N = params()['total'] - params()['emebedding/position']\n", + " L, H, Q, T = n_layer, n_head, n_embd//n_head, block_size\n", + " mf_per_token = 6*N + 12*L*H*Q*T\n", + " mf = mf_per_token * block_size\n", + " return mf\n", + "\n", + "print(f\"palm_flops: {palm_flops():d}, flops: {flops()['total']:d}, ratio: {palm_flops()/flops()['total']:.4f}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok they are quite similar, giving some confidence that my math in flops() function was ~ok. Now, A100 is cited at 312TFLOPS bfloat16 on tensor cores. So what is our model flops utilization (MFU)? I trained the model above with a batch_size of 20 and grad_accum of 5, which runs in about 755ms on a single A100 GPU. We get:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "fraction of A100 used: 37.14%\n" + ] + } + ], + "source": [ + "# here is what we currently roughly measure\n", + "batch_size = 20 * 5 # 5 is grad_accum, so total batch size is 100\n", + "measured_time = 0.755 # in seconds per iteration\n", + "measured_throughput = batch_size / measured_time\n", + "flops_achieved = f['total'] * measured_throughput\n", + "\n", + "# A100 is cited to be 312 TFLOPS of bloat16 running on tensor cores\n", + "a100_flops_promised = 312e12\n", + "\n", + "# the fraction of the A100 that we are using:\n", + "print(f\"fraction of A100 used: {flops_achieved / a100_flops_promised * 100:.2f}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For reference, we'd prefer to be somewhere around 50%+, and not just for a single GPU but for an entire DDP run. 
So we still have some work to do, but at least we're within a factor of ~2X of what is achievable with this GPU." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "time needed to train the model: 3.46 days\n" + ] + } + ], + "source": [ + "# Finally let's check out the 6ND approximation as total cost of training in FLOPs\n", + "model_size = params()['total'] # this is number of parameters, N\n", + "tokens_num = 300e9 # 300B tokens, this is dataset size in tokens, D\n", + "a100_flops = 312e12 # 312 TFLOPS\n", + "assumed_mfu = 0.3 # assume this model flops utilization (take the current 37% from above and add some DDP overhead)\n", + "flops_throughput = a100_flops * 8 * assumed_mfu # assume an 8XA100 node at 30% utilization\n", + "flops_needed = 6 * model_size * tokens_num # 6ND\n", + "time_needed_s = flops_needed / flops_throughput # in seconds\n", + "print(f\"time needed to train the model: {time_needed_s/3600/24:.2f} days\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is not a bad estimate at all. I trained this model and it converged in roughly 4 days. Btw as a good reference for where 6ND comes from and some intuition around it I recommend [Dzmitry's post](https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, FLOPs are just one constraint, the other that we have to keep a close track of is the memory bandwidth. TODO estimate LOAD/STORE costs of our model later." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "7f5833218766b48e6e35e4452ee875aac0e2188d05bbe5298f2c62b79f08b222" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/sphinx/applications/python/qaoa_gpt.ipynb b/docs/sphinx/applications/python/qaoa_gpt.ipynb new file mode 100644 index 00000000000..fd993be8203 --- /dev/null +++ b/docs/sphinx/applications/python/qaoa_gpt.ipynb @@ -0,0 +1,1760 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e0044964", + "metadata": {}, + "source": [ + "# QAOA-GPT: GPT model for generating efficient QAOA quantum circuit.\n", + "\n", + "In this tutorial, we will explain the QAOA-GPT workflow introduced in this [paper](https://arxiv.org/pdf/2504.16350), [GitHub](https://github.com/IlyaTyagin/ADAPT-GPT): Ilya Tyagin, Marwa Farag, Kyle Sherbert, Karunya Shirali, Yuri Alexeev, Ilya Safro \"QAOA-GPT: Efficient Generation of Adaptive and Regular Quantum Approximate Optimization Algorithm Circuits\", IEEE International Conference on Quantum Computing and Engineering (QCE), 2025. \n", + "\n", + "## Overview:\n", + "\n", + "The QAOA-GPT framework is a novel approach that uses Generative Pretrained Transformers (GPT) to directly synthesize quantum circuits for solving Quadratic Unconstrained Binary Optimization (QUBO) problems, specifically demonstrated on the MaxCut problem. 
Unlike traditional Quantum Approximate Optimization Algorithm (QAOA) methods that rely on iterative classical optimization, QAOA-GPT aims to significantly reduce computational overhead by generating high-quality circuits through a single inference pass of a trained GPT model.\n", + "\n", + "## Background concepts:\n", + "\n", + "To understand QAOA-GPT, it's helpful to be familiar with these concepts:\n", + "\n", + "- MaxCut Problem: A combinatorial optimization problem that involves partitioning the vertices of a graph into two disjoint subsets to maximize the total weight of edges connecting nodes in different subsets. It can be formulated as a QUBO problem. To learn more see this [tutorial](https://nvidia.github.io/cuda-quantum/latest/applications/python/qaoa.html).\n", + "\n", + "- Quantum Approximate Optimization Algorithm (QAOA): A hybrid quantum-classical variational algorithm used for combinatorial optimization. It approximates optimal solutions by alternating between a cost Hamiltonian ($H_c$) derived from the problem's objective function and a mixing Hamiltonian ($H_B$). A classical optimizer iteratively adjusts parameters to minimize the expectation value of the cost Hamiltonian. To learn more see this [tutorial](https://nvidia.github.io/cuda-quantum/latest/applications/python/qaoa.html).\n", + "\n", + "- ADAPT-QAOA: An advancement of standard QAOA that iteratively constructs a problem-tailored ansatz from an operator pool. It addresses limitations of fixed-ansatz QAOA, such as slow convergence and high classical optimization overhead. Operators are selected based on their energy gradient, leading to more compact and expressive circuits. To learn more, check this [tutorial](https://nvidia.github.io/cuda-quantum/latest/applications/python/adapt_qaoa.html)\n", + "\n", + "\n", + "- Generative Pre-trained Transformer (GPT): A semi-supervised learning framework that uses a decoder-only transformer architecture to learn transferable language representations. 
In QAOA-GPT, this token-based generation paradigm is adapted for synthesizing quantum circuits. See link [here](https://github.com/karpathy/nanoGPT/tree/b580a454dca683cd8b5181767e32549f2d88541f).\n", + "\n", + "- FEATHER (Graph-level embeddings): A non-parametric graph embedding method that captures local distributions of node features and higher-order structural information. In this tutorial, we wrap the original FEATHER [library](https://github.com/benedekrozemberczki/FEATHER) and adapt it to use multiple features (log degree and clustering coefficient). These embeddings are used as conditioning input for the GPT model in QAOA-GPT. See this [paper](https://arxiv.org/pdf/2005.07959) to learn more about FEATHER.\n", + "\n", + "## QAOA-GPT Method: A Four-Stage Pipeline\n", + "\n", + "The QAOA-GPT framework operates through a four-stage training pipeline (illustrated in the figure below).\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "### Step 1: Random Graph Generation\n", + "- Generate Graphs: Input graphs are sampled from the Erdős-Rényi distribution $G(n,s)$, where n is the number of nodes and s is the edge probability. The generated graphs are restricted to connected graphs without isolated nodes or multiple components. Each edge is assigned a random weight $w_{ij} ∈ U(0,1)$.\n", + "\n", + "- Process Graphs: Each generated graph is processed by two components: \n", + "\n", + "Graph Embedding (FEATHER): Computes fixed-length, low-dimensional vector representations of the graph based on characteristic functions of random walk distributions. These embeddings serve as conditioning input for the generative model.\n", + "\n", + "Circuit Generation (ADAPT-QAOA): Generates quantum circuits that provide high-quality approximate solutions to the MaxCut problem. This involves an iterative process:\n", + "\n", + "1- Initialize the variational parameter ($\\gamma_0$) of the problem Hamiltonian.\n", + "\n", + "2- At each iteration $k$, select an operator $O(k)$ from a predefined mixer pool $P$ that has the largest gradient; append the selected operator and re-optimize all variational parameters {$\\beta_1, \\gamma_1, ...., \\beta_k, \\gamma_k$}; repeat until stopping criteria are met (e.g., gradient norm falls below a threshold, circuit depth limit, or energy error-based conditions). \n", + "\n", + "Only circuits achieving a target approximation ratio ($\\alpha >= 0.97$)\n", + "are included in the dataset. 
Multiple valid circuits per graph are generated to enrich the dataset.\n", + "\n", + "### Step 2: Graph-Circuit Collection:\n", + "- Store Data: The generated graph embeddings (from FEATHER) and the optimized ADAPT-QAOA circuits are stored together in a structured collection.\n", + "\n", + "### Step 3: Training Set Construction: \n", + "- Curate and Tokenize: The graph-circuit collection is curated and tokenized to form the final training set.\n", + "\n", + "1- Graph Tokens: Each graph $G$ is represented as a weighted edge list, enclosed by `` and `` tokens.\n", + "\n", + "2- Circuit Tokens: Each circuit is expressed as a sequence of layer blocks, with ``, operator index $o_k$, and optimized parameters $\\gamma_k, \\beta_k$ for each layer.\n", + "\n", + "3- Numeric Handling: All real numbers (parameters) are rounded to two decimal places and clipped to [−10, 10]. Circuits with parameters outside this range are excluded.\n", + "\n", + "- Training Sample Construction: The dataset consists of independent token sequences, each encoding a graph-circuit pair. Training samples are constructed as consecutive segments of each instance using a sliding window. This ensures strict instance isolation, unlike typical large-scale language model training where sequences might be concatenated.\n", + "\n", + "### Step 4: Model Architecture and Training QAOA-GPT \n", + "\n", + "- Model Training: The tokenized training set is used to train QAOA-GPT, a decoder-only Transformer model based on the nanoGPT implementation of GPT-2. The model is trained from scratch without pre-trained weights, given its custom circuit tokenization schema.\n", + "\n", + "- Transformer Input: The input to the Transformer is computed by combining token embeddings, positional embeddings, and broadcasted graph embeddings from FEATHER. 
This allows the model to condition circuit generation on global graph features.\n", + "\n", + "- Training Procedure: The model is trained using an autoregressive next-token prediction loss. The average approximation ratio (AR) on a validation set and the circuit error rate (fraction of structurally invalid circuits) are calculated every predefined number of iterations during training.\n" + ] + }, + { + "cell_type": "markdown", + "id": "d93897d3", + "metadata": {}, + "source": [ + "### Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "645073f0", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install torch \n", + "%pip install numpy \n", + "%pip install transformers \n", + "%pip install datasets \n", + "%pip install tiktoken \n", + "%pip install wandb \n", + "%pip install ipykernel \n", + "%pip install pandas \n", + "%pip install tqdm \n", + "%pip install networkx \n", + "%pip install matplotlib \n", + "%pip install joblib \n", + "%pip install scipy \n", + "%pip install gurobipy\n", + "%pip install seaborn" + ] + }, + { + "cell_type": "markdown", + "id": "4989ac1d", + "metadata": {}, + "source": [ + "## 1- Dataset generation:\n", + "\n", + "Here, we generate graphs and the quantum circuits that provide a solution to the max-cut problem. ADAPT-QAOA is employed to generate the circuits. Data are stored in the `adapt_data` folder. For production, user must generate a large data size (5000 and above).\n", + "\n", + "To do: Dataset generation is the most expensive step. Parallelization of ADAPT-QAOA will accelerate data generation for larger graph problems. The current version is not parallelized yet. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b59364a", + "metadata": {}, + "outputs": [], + "source": [ + "from qaoa_gpt_src.generate_adapt_qaoa_data import generate_data_max_cut\n", + "\n", + "# out_dir is where user wants to save the output data\n", + "# The output will be saved in a folder called adapt_results in the current directory.\n", + "# The output will contain the results of the ADAPT-QAOA algorithm for various graphs.\n", + "adapt_data = 'adapt_results'\n", + "\n", + "# The following parameters are used to generate the dataset for the ADAPT-QAOA algorithm.\n", + "# The graphs_number parameter specifies how many graphs to generate, \n", + "# n_nodes specifies the number of nodes in each graph\n", + "# op_pool specifies the pool of operators to use for the ADAPT-QAOA algorithm. 'qaoa_mixer', 'qaoa_single_x', 'qaoa_double_ops', 'all_pool'\n", + "# use_brute_force, use_simulated_annealing, and use_one_exchange are boolean flags \n", + "# that determine how to calculate the optimal solution for the MaxCut problem using classical methods.\n", + "# init_gamma is a list of initial gamma values for the QAOA circuit, \n", + "# optimizer: BFGS, L-BFGS-B, or COBYLA.\n", + "# approx_ratio is the approximation ratio for the QAOA circuit (vqe_energy/classical_value)\n", + "# norm_threshold is the threshold for the norm of the ADAPT gradient,\n", + "# energy_threshold is the threshold for the energy of the ADAPT-QAOA circuit,\n", + "# max_iter is the maximum number of iterations for the ADAPT. \n", + "# If approx ratio or the other threshold achieved, the loop will break.\n", + "#p_init (float): Initial probability for Erdos-Renyi graph generation.\n", + "#p_final (float): Final probability for Erdos-Renyi graph generation. 
\n", + "#the probability will be randomly selected between p_init and p_final.\n", + "# seed_g (int): Random seed for Erdos-Renyi graph generation reproducibility.\n", + "# seed_weight (int): Random seed for edge weight generation reproducibility.\n", + "#seed_adapt (int): Random seed for ADAPT-QAOA reproducibility. \n", + "# If more than operator pools are large, we randomly choose one.\n", + "\n", + "\n", + "generate_data_max_cut(output_dir= adapt_data, graphs_number=5, n_nodes=6, weighted=True, \n", + " use_negative_weights=False,\n", + " use_brute_force=True, use_simulated_annealing=True, \n", + " use_one_exchange=True,\n", + " op_pool='all_pool', \n", + " init_gamma=[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1], \n", + " scaling_coef=1.0, norm_weights=False, norm_coef=1.0,\n", + " trials_per_graph=1, optimizer='BFGS', \n", + " approx_ratio=0.97, norm_threshold= 1e-4, \n", + " energy_threshold = 1e-15, max_iter=15, \n", + " p_init=0.3, p_final=0.9, seed_g=None, seed_weight=None,\n", + " seed_adapt=None, verbose=False)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "cc7fb505", + "metadata": {}, + "source": [ + "User can run the function above across different GPUs and collect all results in `adapt_data` in `csv` files. Multiple csv files are supported in the function below. The function below will read all files in `adapt_data` as long as the files are named distinctively and generate token files that will combine all results. " + ] + }, + { + "cell_type": "markdown", + "id": "0978f6a6", + "metadata": {}, + "source": [ + "## 2- Filter dataset and tokenization.\n", + "\n", + "First, we filter data. Only circuits achieving the target approximation ratio are included. All real numbers are rounded to two decimal places. Afterward, we tokenize graph and circuits for the GPT training. 
Data will be stored in a folder inside the `~/nanoGPT/data/save_dir`.\n", + "\n", + "Note: `train_adapt_gpt_config_template.py` is the configuration file where user should set up the inputs for the subsequent GPT training of their data. In the configuration file, user can change `batch_size`, `max_iters`, `lr_decay_iters`, `eval_interval`, `eval_iters`, `eval_ar_every`, and `n_epochs` parameters based on their preferences. User can also change the GPT model: `n_layer`, `n_head`, `n_embd`, `dropout`. The configuration file `train_adapt_gpt_config.py` will be generated and stored together with the other data inside `~/nanoGPT/data/save_dir`. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ae5ed3da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Opening ADAPT results (adapt_results): 100%|██████████| 1/1 [00:00<00:00, 310.83it/s]\n", + "Opening graphs (adapt_results): 100%|██████████| 1/1 [00:00<00:00, 855.63it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 26247.21it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 38550.59it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 7246.55it/s]\n", + "100%|██████████| 14/14 [00:00<00:00, 82938.21it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 25327.92it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 6474.69it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 107546.26it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated embedding dimension: 500\n", + "Expected dimension: 500\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:00<00:00, 6955.73it/s]\n", + "100%|██████████| 5/5 [00:00<00:00, 26247.21it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Applying sliding window...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:00<00:00, 24585.60it/s]" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\tNumber of training samples: 5, val samples: 0, test samples: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from qaoa_gpt_src.prepare_tokens import generate_tokens\n", + "import os\n", + "\n", + "# Get the current working directory\n", + "current_dir = os.getcwd()\n", + "token_folder_name = 'save_dir'\n", + "\n", + "# The path where the adapt results are saved\n", + "path_input = os.path.join(current_dir, adapt_data)\n", + "\n", + "# The path where the generated tokens will be saved \n", + "# (it should be inside the data folder inside nanoGPT)\n", + "path_save = os.path.join(current_dir, 'nanoGPT', 'data', token_folder_name)\n", + "\n", + "# The path to the configuration file for the QAOA-GPT training\n", + "config_path = os.path.join(current_dir, 'qaoa_gpt_src')\n", + "\n", + "# The following parameters are used to generate the tokens for the ADAPT-QAOA algorithm.\n", + "# n_nodes is the number of nodes in the graph\n", + "# approx_ratio_thr is the threshold for the approximation ratio (vqe_energy/classical_value)\n", + "# val_frac is the fraction of the dataset to be used for validation\n", + "# test_frac is the fraction of the dataset to be used for testing\n", + "# target_val_size is the target size of the validation set\n", + "\n", + "generate_tokens(results_fpath_str = path_input, save_path_str = path_save, \n", + " config_path_temp = config_path, n_nodes = 6, approx_ratio_thr= 0.97, \n", + " val_frac = 0.1, test_frac = 0.0, target_val_size = 2, verbose = False)\n" + ] + }, + { + "cell_type": "markdown", + "id": "79c28b09", + "metadata": {}, + "source": [ + "Let's look at the generated data and tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4dfc44bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
methodgraph_namegraph_numtrial_numn_nodesinit_gammaenergy_listtrue_energyoptimizerpool_typeedge_weight_scaling_coefedge_weight_norm_coefmixer_pool_pauli_wordmixer_pool_indexgamma_coefbeta_coefapprox_ratiocut_adaptcut_classicalnum_layersoptimizer_success_flagelapsed_timeworker_id_xprefixg_methodedgelist_jsonH_frob_normworker_id_yedgelist_listedgelist_list_lennum_connected_compn_layersgraph_idonly_qaoa_circhas_embtoken_seq_round_d2token_int_seq_round_d2labeltoken_int_seq_round_d2_sw
0ADAPT-QAOAGraph_22160.10[-3.359999999537821, -3.7949999999952433, -4.0...-4.50BFGSall_pool1.01.0[['YIZIII'], ['IZIYII'], ['IIIIYZ'], ['IIIIXX']][14, 35, 66, 64][0.01745972054338572, -0.019733899238238935, 0...[-0.7856887670086344, -0.7854085503209243, -0....0.9888951101('001101', '110010')4True2.087118pid1777360_25-08-21__00_59_resultspid1777360_25-08-21erdos_renyi[[1, 3, 0.97], [1, 4, 0.4], [1, 6, 0.16], [2, ...24.329998pid1777360_25-08-21__00_59_graphs[[1, 3, 0.97], [1, 4, 0.4], [1, 6, 0.16], [2, ...1114pid1777360_25-08-21_^_2FalseTrue[bos, (1, 3), 0.97, (1, 4), 0.4, (1, 6), 0.16,...[1, 18, 671, 10, 775, 16, 1926, 11, 1153, 5, 1...train[[[1, 18, 671, 10, 775, 16, 1926, 11, 1153, 5,...
1ADAPT-QAOAGraph_55160.01[-1.8899999987838123, -2.164999996558533, -2.4...-2.77BFGSall_pool1.01.0[['YZIIII'], ['IZIYII'], ['IIIIYZ'], ['ZIYIII'...[10, 35, 66, 15, 64][2.8691569393532094e-05, -0.000100819259514377...[-0.7853997378605556, -0.785397455801531, -0.7...1.000000100101('100101', '011010')5True3.434215pid1777360_25-08-21__00_59_resultspid1777360_25-08-21erdos_renyi[[1, 2, 0.93], [1, 3, 0.24], [2, 4, 0.55], [2,...12.461396pid1777360_25-08-21__00_59_graphs[[1, 2, 0.93], [1, 3, 0.24], [2, 4, 0.55], [2,...915pid1777360_25-08-21_^_5FalseTrue[bos, (1, 2), 0.93, (1, 3), 0.24, (2, 4), 0.55...[1, 6, 164, 18, 914, 5, 1624, 13, 406, 7, 1470...train[[[1, 6, 164, 18, 914, 5, 1624, 13, 406, 7, 14...
2ADAPT-QAOAGraph_33160.01[-3.179999999999917, -3.5199999985319583, -3.7...-4.48BFGSall_pool1.01.0[['IIYZII'], ['ZIIIIY'], ['IZIIYI'], ['YIIIIY'...[46, 27, 39, 25, 36, 45][6.274385167217878e-07, -5.3271375143128285e-0...[-0.7854006450112557, -0.785394076248958, -0.7...1.000000100110('100110', '011001')6True4.721877pid1777360_25-08-21__00_59_resultspid1777360_25-08-21erdos_renyi[[1, 2, 0.41], [1, 3, 0.6], [1, 4, 0.28], [1, ...23.108302pid1777360_25-08-21__00_59_graphs[[1, 2, 0.41], [1, 3, 0.6], [1, 4, 0.28], [1, ...1316pid1777360_25-08-21_^_3FalseTrue[bos, (1, 2), 0.41, (1, 3), 0.6, (1, 4), 0.28,...[1, 6, 1919, 18, 1238, 10, 1337, 8, 1293, 16, ...train[[[1, 6, 1919, 18, 1238, 10, 1337, 8, 1293, 16...
3ADAPT-QAOAGraph_11160.01[-2.674999999999453, -3.054999999160295, -3.38...-3.69BFGSall_pool1.01.0[['ZYIIII'], ['IIZYII'], ['ZIIIIY'], ['ZIIIYI']][11, 47, 27, 23][-0.0003976113260684765, 0.0006218543845905527...[-0.7853976823592153, -0.7853980176827251, -0....0.97425510111('100100', '011011')4True1.734386pid1777360_25-08-21__00_59_resultspid1777360_25-08-21erdos_renyi[[1, 2, 0.84], [1, 3, 0.68], [1, 4, 0.51], [1,...19.236133pid1777360_25-08-21__00_59_graphs[[1, 2, 0.84], [1, 3, 0.68], [1, 4, 0.51], [1,...1214pid1777360_25-08-21_^_1FalseTrue[bos, (1, 2), 0.84, (1, 3), 0.68, (1, 4), 0.51...[1, 6, 96, 18, 163, 10, 1111, 8, 1034, 16, 916...train[[[1, 6, 96, 18, 163, 10, 1111, 8, 1034, 16, 9...
4ADAPT-QAOAGraph_44160.01[-2.194999999966217, -2.6599999999399873, -2.9...-3.33BFGSall_pool1.01.0[['IIYIZI'], ['IZIIIY'], ['YIIZII'], ['IIXIXI'...[50, 43, 18, 48, 16][1.1212799835079733e-05, 0.008775041659259374,...[-0.7853989447692689, -0.7854053059827186, -0....0.97089311100('100011', '011100')5True3.711532pid1777360_25-08-21__00_59_resultspid1777360_25-08-21erdos_renyi[[1, 4, 0.59], [2, 4, 0.08], [2, 5, 0.41], [2,...15.203894pid1777360_25-08-21__00_59_graphs[[1, 4, 0.59], [2, 4, 0.08], [2, 5, 0.41], [2,...715pid1777360_25-08-21_^_4FalseTrue[bos, (1, 4), 0.59, (2, 4), 0.08, (2, 5), 0.41...[1, 10, 95, 5, 984, 17, 1919, 13, 164, 19, 164...train[[[1, 10, 95, 5, 984, 17, 1919, 13, 164, 19, 1...
\n", + "
" + ], + "text/plain": [ + " method graph_name graph_num trial_num n_nodes init_gamma \\\n", + "0 ADAPT-QAOA Graph_2 2 1 6 0.10 \n", + "1 ADAPT-QAOA Graph_5 5 1 6 0.01 \n", + "2 ADAPT-QAOA Graph_3 3 1 6 0.01 \n", + "3 ADAPT-QAOA Graph_1 1 1 6 0.01 \n", + "4 ADAPT-QAOA Graph_4 4 1 6 0.01 \n", + "\n", + " energy_list true_energy optimizer \\\n", + "0 [-3.359999999537821, -3.7949999999952433, -4.0... -4.50 BFGS \n", + "1 [-1.8899999987838123, -2.164999996558533, -2.4... -2.77 BFGS \n", + "2 [-3.179999999999917, -3.5199999985319583, -3.7... -4.48 BFGS \n", + "3 [-2.674999999999453, -3.054999999160295, -3.38... -3.69 BFGS \n", + "4 [-2.194999999966217, -2.6599999999399873, -2.9... -3.33 BFGS \n", + "\n", + " pool_type edge_weight_scaling_coef edge_weight_norm_coef \\\n", + "0 all_pool 1.0 1.0 \n", + "1 all_pool 1.0 1.0 \n", + "2 all_pool 1.0 1.0 \n", + "3 all_pool 1.0 1.0 \n", + "4 all_pool 1.0 1.0 \n", + "\n", + " mixer_pool_pauli_word \\\n", + "0 [['YIZIII'], ['IZIYII'], ['IIIIYZ'], ['IIIIXX']] \n", + "1 [['YZIIII'], ['IZIYII'], ['IIIIYZ'], ['ZIYIII'... \n", + "2 [['IIYZII'], ['ZIIIIY'], ['IZIIYI'], ['YIIIIY'... \n", + "3 [['ZYIIII'], ['IIZYII'], ['ZIIIIY'], ['ZIIIYI']] \n", + "4 [['IIYIZI'], ['IZIIIY'], ['YIIZII'], ['IIXIXI'... \n", + "\n", + " mixer_pool_index \\\n", + "0 [14, 35, 66, 64] \n", + "1 [10, 35, 66, 15, 64] \n", + "2 [46, 27, 39, 25, 36, 45] \n", + "3 [11, 47, 27, 23] \n", + "4 [50, 43, 18, 48, 16] \n", + "\n", + " gamma_coef \\\n", + "0 [0.01745972054338572, -0.019733899238238935, 0... \n", + "1 [2.8691569393532094e-05, -0.000100819259514377... \n", + "2 [6.274385167217878e-07, -5.3271375143128285e-0... \n", + "3 [-0.0003976113260684765, 0.0006218543845905527... \n", + "4 [1.1212799835079733e-05, 0.008775041659259374,... \n", + "\n", + " beta_coef approx_ratio cut_adapt \\\n", + "0 [-0.7856887670086344, -0.7854085503209243, -0.... 0.988895 1101 \n", + "1 [-0.7853997378605556, -0.785397455801531, -0.7... 
1.000000 100101 \n", + "2 [-0.7854006450112557, -0.785394076248958, -0.7... 1.000000 100110 \n", + "3 [-0.7853976823592153, -0.7853980176827251, -0.... 0.974255 10111 \n", + "4 [-0.7853989447692689, -0.7854053059827186, -0.... 0.970893 11100 \n", + "\n", + " cut_classical num_layers optimizer_success_flag elapsed_time \\\n", + "0 ('001101', '110010') 4 True 2.087118 \n", + "1 ('100101', '011010') 5 True 3.434215 \n", + "2 ('100110', '011001') 6 True 4.721877 \n", + "3 ('100100', '011011') 4 True 1.734386 \n", + "4 ('100011', '011100') 5 True 3.711532 \n", + "\n", + " worker_id_x prefix g_method \\\n", + "0 pid1777360_25-08-21__00_59_results pid1777360_25-08-21 erdos_renyi \n", + "1 pid1777360_25-08-21__00_59_results pid1777360_25-08-21 erdos_renyi \n", + "2 pid1777360_25-08-21__00_59_results pid1777360_25-08-21 erdos_renyi \n", + "3 pid1777360_25-08-21__00_59_results pid1777360_25-08-21 erdos_renyi \n", + "4 pid1777360_25-08-21__00_59_results pid1777360_25-08-21 erdos_renyi \n", + "\n", + " edgelist_json H_frob_norm \\\n", + "0 [[1, 3, 0.97], [1, 4, 0.4], [1, 6, 0.16], [2, ... 24.329998 \n", + "1 [[1, 2, 0.93], [1, 3, 0.24], [2, 4, 0.55], [2,... 12.461396 \n", + "2 [[1, 2, 0.41], [1, 3, 0.6], [1, 4, 0.28], [1, ... 23.108302 \n", + "3 [[1, 2, 0.84], [1, 3, 0.68], [1, 4, 0.51], [1,... 19.236133 \n", + "4 [[1, 4, 0.59], [2, 4, 0.08], [2, 5, 0.41], [2,... 15.203894 \n", + "\n", + " worker_id_y \\\n", + "0 pid1777360_25-08-21__00_59_graphs \n", + "1 pid1777360_25-08-21__00_59_graphs \n", + "2 pid1777360_25-08-21__00_59_graphs \n", + "3 pid1777360_25-08-21__00_59_graphs \n", + "4 pid1777360_25-08-21__00_59_graphs \n", + "\n", + " edgelist_list edgelist_list_len \\\n", + "0 [[1, 3, 0.97], [1, 4, 0.4], [1, 6, 0.16], [2, ... 11 \n", + "1 [[1, 2, 0.93], [1, 3, 0.24], [2, 4, 0.55], [2,... 9 \n", + "2 [[1, 2, 0.41], [1, 3, 0.6], [1, 4, 0.28], [1, ... 13 \n", + "3 [[1, 2, 0.84], [1, 3, 0.68], [1, 4, 0.51], [1,... 12 \n", + "4 [[1, 4, 0.59], [2, 4, 0.08], [2, 5, 0.41], [2,... 
7 \n", + "\n", + " num_connected_comp n_layers graph_id only_qaoa_circ \\\n", + "0 1 4 pid1777360_25-08-21_^_2 False \n", + "1 1 5 pid1777360_25-08-21_^_5 False \n", + "2 1 6 pid1777360_25-08-21_^_3 False \n", + "3 1 4 pid1777360_25-08-21_^_1 False \n", + "4 1 5 pid1777360_25-08-21_^_4 False \n", + "\n", + " has_emb token_seq_round_d2 \\\n", + "0 True [bos, (1, 3), 0.97, (1, 4), 0.4, (1, 6), 0.16,... \n", + "1 True [bos, (1, 2), 0.93, (1, 3), 0.24, (2, 4), 0.55... \n", + "2 True [bos, (1, 2), 0.41, (1, 3), 0.6, (1, 4), 0.28,... \n", + "3 True [bos, (1, 2), 0.84, (1, 3), 0.68, (1, 4), 0.51... \n", + "4 True [bos, (1, 4), 0.59, (2, 4), 0.08, (2, 5), 0.41... \n", + "\n", + " token_int_seq_round_d2 label \\\n", + "0 [1, 18, 671, 10, 775, 16, 1926, 11, 1153, 5, 1... train \n", + "1 [1, 6, 164, 18, 914, 5, 1624, 13, 406, 7, 1470... train \n", + "2 [1, 6, 1919, 18, 1238, 10, 1337, 8, 1293, 16, ... train \n", + "3 [1, 6, 96, 18, 163, 10, 1111, 8, 1034, 16, 916... train \n", + "4 [1, 10, 95, 5, 984, 17, 1919, 13, 164, 19, 164... train \n", + "\n", + " token_int_seq_round_d2_sw \n", + "0 [[[1, 18, 671, 10, 775, 16, 1926, 11, 1153, 5,... \n", + "1 [[[1, 6, 164, 18, 914, 5, 1624, 13, 406, 7, 14... \n", + "2 [[[1, 6, 1919, 18, 1238, 10, 1337, 8, 1293, 16... \n", + "3 [[[1, 6, 96, 18, 163, 10, 1111, 8, 1034, 16, 9... \n", + "4 [[[1, 10, 95, 5, 984, 17, 1919, 13, 164, 19, 1... 
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pickle\n", + "import pandas as pd\n", + "\n", + "with open(f'{path_save}/combined_res_tok_shf_df.pkl', 'rb') as f:\n", + " data = pickle.load(f)\n", + "\n", + "if isinstance(data, pd.DataFrame):\n", + " pd.set_option('display.max_columns', None)\n", + " display(data) # Show the whole DataFrame\n", + "\n", + " # Specify the row index you want to view\n", + " #row_index = 0 # Change this to the desired row number\n", + " #print(f\"Row {row_index}:\")\n", + " #print(data.iloc[row_index])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "711e4692", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Value at row 0, column 'token_seq_round_d2':\n", + "['bos', (1, 3), 0.97, (1, 4), 0.4, (1, 6), 0.16, (2, 3), 0.54, (2, 4), 0.87, (2, 6), 0.53, (3, 4), 0.98, (3, 5), 0.23, (3, 6), 0.27, (4, 5), 0.21, (5, 6), 0.59, 'end_of_graph', 'new_layer_p', 14, -0.79, 0.02, 'new_layer_p', 35, -0.79, -0.02, 'new_layer_p', 66, -0.79, 0.08, 'new_layer_p', 64, -0.79, 1.97, 'eos']\n" + ] + } + ], + "source": [ + "row_index = 0 # Change this to the desired row number\n", + "column_name = \"token_seq_round_d2\" # Change this to your column name\n", + "\n", + "print(f\"Value at row {row_index}, column '{column_name}':\")\n", + "print(data.iloc[row_index][column_name])" + ] + }, + { + "cell_type": "markdown", + "id": "ba379a4c", + "metadata": {}, + "source": [ + "## 3- GPT model training:\n", + "\n", + "Here, the model is trained from scratch. \n", + "\n", + "We are performing training on a small dataset that we generated for `n_nodes = 8`, which is stored in `save_dir_3k.zip` inside the `~/nanoGPT/data` folder. We have 2398 samples for training and 599 samples for evaluation. Unzip the file for this experiment. 
The purpose of this small experiment is only to explain how QAOA-GPT works, and therefore, we are not expecting to reproduce the QAOA-GPT results shown in the paper. Users who are interested in employing QAOA-GPT to produce more accurate results, as shown in the paper, should perform training on a large dataset. \n", + " \n", + "- The file used for training is called `train_pad_gemb_ar_eval.py`. \n", + "\n", + "- Output `ckpt` files will be stored inside the nanoGPT folder. The output folder name is defined in the config file. For this experiment it is called `out-save_dir_3k`\n", + "\n", + "\n", + "To do: During the training the GPT model is calling ADAPT-QAOA every `eval_ar_every` iterations, as defined in the config file. It calculates the approximation ratio from the circuits that GPT synthesizes using evaluation samples. This part where we call ADAPT-QAOA will be expensive if user has large data for evaluation and so parallelization can accelerate training. The current version is not parallelized yet.\n" + ] + }, + { + "cell_type": "markdown", + "id": "fb4f5bde", + "metadata": {}, + "source": [ + "\n", + "\n", + "The following Python code shows how we train the GPT model using the prepared data:\n", + "\n", + "```python\n", + "import os\n", + "\n", + "current_dir = os.getcwd()\n", + "nanoGPT_dir = os.path.join(current_dir, 'nanoGPT')\n", + "sample_data = 'save_dir_3k'\n", + "save_dir_data = os.path.join(nanoGPT_dir, 'data', sample_data)\n", + "\n", + "\n", + "# Check if the directory exists\n", + "if not os.path.exists(save_dir_data):\n", + " print(f\"Directory {save_dir_data} does not exist!\")\n", + " # Uncomment to create the directory if it doesn't exist\n", + " # os.makedirs(save_dir_data, exist_ok=True)\n", + "\n", + "# Check if the config file exists\n", + "config_file = os.path.join(save_dir_data, 'train_adapt_gpt_config.py')\n", + "if not os.path.exists(config_file):\n", + " print(f\"Config file {config_file} does not exist!\")\n", + "\n", + "# If both directory and config file 
exist, change directory and run\n", + "if os.path.exists(save_dir_data) and os.path.exists(config_file):\n", + " %cd $nanoGPT_dir\n", + " !python train_pad_gemb_ar_eval.py $config_file\n", + "```\n", + "\n", + "To execute this code, remove the code block formatting and convert the cell back to a code cell.\n", + "\n", + "Note: The error message `Error in eval_ansatz:..` you observe during the training comes from the generated bad samples during the evaluation. " + ] + }, + { + "cell_type": "markdown", + "id": "f3f2b177", + "metadata": {}, + "source": [ + "## 4- QAOA-GPT inference model\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "Once trained, QAOA-GPT can generate quantum circuits for new unseen graphs. \n", + "- User Input: A user supplies an input graph, typically as an edgelist.\n", + "- Processing Input: The system automatically computes a fixed-length graph embedding (using FEATHER) and tokenizes the graph structure.\n", + "- Circuit Generation: Both the graph embedding and the tokenized graph are passed to the QAOA-GPT model.\n", + "- Output: QAOA-GPT generates a QAOA quantum circuit for the given input. These circuits are then ready for execution on a quantum device or classical simulator.\n", + "\n", + "This entire process bypasses the need for iterative optimization of variational algorithms, significantly speeding up circuit generation.\n", + "\n", + "Below, we will generate new graphs and we will use the trained QAOA-GPT to directly synthesize circuits and execute them." + ] + }, + { + "cell_type": "markdown", + "id": "7da9250c", + "metadata": {}, + "source": [ + "### Generate circuits from the trained QAOA-GPT model above.\n", + "\n", + "### Note: \n", + "You might encounter an error with PyTorch's CUDA dependencies when running the cell below. If you restart the notebook, and only run this cell again, the issue will be resolved. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c486c80e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", + " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/cudaq/cuda-quantum/docs/sphinx/applications/python\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "#current_dir = os.getcwd()\n", + "current_dir = '/home/cudaq/cuda-quantum/docs/sphinx/applications/python'\n", + "%cd {current_dir}\n", + "\n", + "from qaoa_gpt_src.model_interface import QAOA_GPT\n", + "import pandas as pd\n", + "import numpy as np\n", + "import networkx as nx\n", + "import random\n", + "from tqdm import tqdm\n", + "from collections import defaultdict, Counter\n", + "import json\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "markdown", + "id": "3cf58bbf", + "metadata": {}, + "source": [ + "Note: to call the trained `QAOA_GPT` to synthesize circuits, user needs to provide `model_ckpt`. This is the checkpoint file user wants to employ for the inference. The ckpt files can be found in the output directory from the GPT model training. During the training process, the checkpoint files are saved with: \n", + "- the format `ckpt_{iter}_gemb__ar_{approx_ratio}__er_{energy_ratio}.pt` (with AR/ER values)\n", + "- the format `ckpt_{iter}_gemb.pt` (basic checkpoint). \n", + "\n", + "User should use the last or their best checkpoint file for inference. Below, we will use `ckpt_3500_gemb__ar_0_96205__er_0_00336.pt`.\n", + "\n", + "User will need to unzip the `out-save_dir_3k.zip`. 
This folder has the inference model pre-trained with 3K data points.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0b064ff9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading config from: /home/cudaq/cuda-quantum/docs/sphinx/applications/python/nanoGPT/data/save_dir_3k/train_adapt_gpt_config.py\n", + "Initiating nanoGPT model with padding support\n", + "number of parameters: 11.60M\n", + "Pool type: qaoa_mixer\n", + "Using graph embeddings: True\n", + "Number of nodes: 8\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "# Get current directory for relative paths\n", + "#current_dir = os.getcwd()\n", + "current_dir = '/home/cudaq/cuda-quantum/docs/sphinx/applications/python'\n", + "nanogpt_dir = os.path.join(current_dir, 'nanoGPT')\n", + "\n", + "# Define the input and output directories and checkpoint file\n", + "gpt_input_dir = 'save_dir_3k'\n", + "gpt_output_dir = 'out-save_dir_3k'\n", + "ckpt_file = 'ckpt_3500_gemb__ar_0_96205__er_0_00336.pt'\n", + "\n", + "# Use relative paths instead of absolute paths\n", + "path = os.path.join(current_dir, 'nanoGPT')\n", + "qaoa_gpt_n8_obj = QAOA_GPT(\n", + " model_ckpt = os.path.join(nanogpt_dir, gpt_output_dir, ckpt_file),\n", + " data_dir = os.path.join(nanogpt_dir, 'data', gpt_input_dir),\n", + " config_file = os.path.join(nanogpt_dir, 'data', gpt_input_dir, 'train_adapt_gpt_config.py'),\n", + " temp_folder = 'temp_data',\n", + " device = 'cuda' # You may want to change to 'cpu' if CUDA is not available\n", + ")\n", + "\n", + "# Print the values\n", + "print(f\"Pool type: {qaoa_gpt_n8_obj.pool_type}\")\n", + "print(f\"Using graph embeddings: {qaoa_gpt_n8_obj.use_graph_emb}\")\n", + "print(f\"Number of nodes: {qaoa_gpt_n8_obj.n_nodes}\")" + ] + }, + { + "cell_type": "markdown", + "id": "84a1b39e", + "metadata": {}, + "source": [ + "### Generate random graphs\n", + "\n", + "We will generate `n_graphs` graphs with `n_nodes = 8`. 
We choose 8 nodes because QAOA-GPT model in this tutorial is trained on graphs with 8 nodes." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7acaa57a", + "metadata": {}, + "outputs": [], + "source": [ + "def add_weights_to_nx_graph(nx_graph):\n", + " for u, v in nx_graph.edges():\n", + " cur_weight = round(random.uniform(0, 1), 2)\n", + " while cur_weight == 0:\n", + " cur_weight = round(random.uniform(0, 1), 2)\n", + " nx_graph[u][v]['weight'] = cur_weight\n", + " return nx_graph" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dd8b9021", + "metadata": {}, + "outputs": [], + "source": [ + "tqdm.pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1126a691", + "metadata": {}, + "outputs": [], + "source": [ + "n_graphs = 200\n", + "n_nodes = 8" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1ad5d9a7", + "metadata": {}, + "outputs": [], + "source": [ + "graphs_edgelist_list_dict = dict()\n", + "\n", + "er_graphs_edgelist_list_dict = dict()\n", + "for i in range(n_graphs):\n", + " p = random.randrange(6,9) / 10\n", + " cur_graph = nx.erdos_renyi_graph(\n", + " n=n_nodes,\n", + " p=p\n", + " )\n", + " er_graphs_edgelist_list_dict[f'er_graph_{i}'] = add_weights_to_nx_graph(cur_graph)\n", + "\n", + "graphs_edgelist_list_dict.update(er_graphs_edgelist_list_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "88cef046", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EdgeDataView([(0, 1, {'weight': 0.9}), (0, 3, {'weight': 0.53}), (0, 4, {'weight': 0.93}), (0, 6, {'weight': 0.22}), (0, 7, {'weight': 0.45}), (1, 6, {'weight': 0.82}), (2, 3, {'weight': 0.75}), (2, 5, {'weight': 0.41}), (3, 4, {'weight': 0.05}), (3, 5, {'weight': 0.34}), (3, 6, {'weight': 0.6}), (3, 7, {'weight': 0.56}), (4, 5, {'weight': 0.41})])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"graphs_edgelist_list_dict['er_graph_2'].edges(data=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ed703c87", + "metadata": {}, + "source": [ + "### Generate circuit with QAOA-GPT model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "370622c9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Preparing graphs...: 0%| | 0/200 [00:00 1.0 = more random, in predictions\n", + " top_k=200, # retain only the top_k most likely tokens, clamp others to have 0 probability\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "06a52395", + "metadata": {}, + "source": [ + "Note: the warning above `Restricted license - for non-production use only - expires 2026-11-23` comes from `gurobipy` that is used to calculate the optimum classical value. \n", + "\n", + "To do: we will update this function later and allow different options to estimate the optimum classical value. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "87b53c38", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
graphn_edgesq_circuitsadapt_circuitadapt_full_argraph_prefixenergy_gurobilabelgraph_w_pygraph_weight_norm
0[(1, 2), 0.3, (1, 3), 0.8, (1, 4), 0.04, (1, 5...22[[new_layer_p, 1, -0.52, 0.41, new_layer_p, 1,...[]Noneer_graph_10-7.21test_interactive[[1, 2, 0.3], [1, 3, 0.8], [1, 4, 0.04], [1, 5...1.0
1[(1, 2), 0.59, (1, 3), 0.54, (1, 4), 0.05, (1,...22[[new_layer_p, 1, -0.52, 0.46, new_layer_p, 1,...[]Noneer_graph_16-6.54test_interactive[[1, 2, 0.59], [1, 3, 0.54], [1, 4, 0.05], [1,...1.0
2[(1, 2), 0.74, (1, 3), 0.06, (1, 4), 0.07, (1,...22[[new_layer_p, 1, -0.49, 0.35, new_layer_p, 1,...[]Noneer_graph_35-8.80test_interactive[[1, 2, 0.74], [1, 3, 0.06], [1, 4, 0.07], [1,...1.0
\n", + "
" + ], + "text/plain": [ + " graph n_edges \\\n", + "0 [(1, 2), 0.3, (1, 3), 0.8, (1, 4), 0.04, (1, 5... 22 \n", + "1 [(1, 2), 0.59, (1, 3), 0.54, (1, 4), 0.05, (1,... 22 \n", + "2 [(1, 2), 0.74, (1, 3), 0.06, (1, 4), 0.07, (1,... 22 \n", + "\n", + " q_circuits adapt_circuit \\\n", + "0 [[new_layer_p, 1, -0.52, 0.41, new_layer_p, 1,... [] \n", + "1 [[new_layer_p, 1, -0.52, 0.46, new_layer_p, 1,... [] \n", + "2 [[new_layer_p, 1, -0.49, 0.35, new_layer_p, 1,... [] \n", + "\n", + " adapt_full_ar graph_prefix energy_gurobi label \\\n", + "0 None er_graph_10 -7.21 test_interactive \n", + "1 None er_graph_16 -6.54 test_interactive \n", + "2 None er_graph_35 -8.80 test_interactive \n", + "\n", + " graph_w_py graph_weight_norm \n", + "0 [[1, 2, 0.3], [1, 3, 0.8], [1, 4, 0.04], [1, 5... 1.0 \n", + "1 [[1, 2, 0.59], [1, 3, 0.54], [1, 4, 0.05], [1,... 1.0 \n", + "2 [[1, 2, 0.74], [1, 3, 0.06], [1, 4, 0.07], [1,... 1.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qaoa_gpt_circ_df[:3]" + ] + }, + { + "cell_type": "markdown", + "id": "6ab1c3d4", + "metadata": {}, + "source": [ + "### Evaluate circuits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ba15acbf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> eval_adapt_gpt_circ_cudaq CALLED <<<\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing graphs: 0%| | 0/200 [00:00 0)]\n", + "\n", + "print(\"\\nAfter filtering:\")\n", + "print(\"Filtered DataFrame shape:\", filtered_df.shape)\n", + "#print(\"Sample of filtered adapt_gpt_energies:\")\n", + "#print(filtered_df['adapt_gpt_energies'].head())\n", + "\n", + "# Calculate metrics with the filtered data\n", + "filtered_df['best_adapt_gpt_ar'] = filtered_df.apply(\n", + " lambda x: max([val / x['energy_gurobi'] for val in x['adapt_gpt_energies']]) \n", + " if len(x['adapt_gpt_energies']) > 0 else np.nan,\n", + " axis=1\n", + ")\n", + "\n", + "filtered_df['avg_adapt_gpt_ar'] = filtered_df.apply(\n", + " lambda x: np.mean([val / x['energy_gurobi'] for val in x['adapt_gpt_energies']]) \n", + " if len(x['adapt_gpt_energies']) > 0 else np.nan,\n", + " axis=1\n", + ")\n", + "\n", + "# Create best_adapt_gpt_energy column\n", + "filtered_df['best_adapt_gpt_energy'] = filtered_df.apply(\n", + " lambda x: max(x['adapt_gpt_energies']) if len(x['adapt_gpt_energies']) > 0 else np.nan,\n", + " axis=1\n", + ")\n", + "\n", + "# Add this code to check the adapt_gpt_energies column structure\n", + "list_count = 0\n", + "for val in filtered_df['adapt_gpt_energies']:\n", + " if len(val) > 1:\n", + " #print(f\"Example list with multiple values: {val[:5]}... (length: {len(val)})\")\n", + " list_count += 1\n", + " if list_count >= 3:\n", + " break\n", + "\n", + "print(f\"Difference between best and avg AR: {(filtered_df['best_adapt_gpt_ar'] - filtered_df['avg_adapt_gpt_ar']).abs().mean()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "416275e5", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average QAOA-GPT best approximation ratio: 0.9663\n", + "Average QAOA-GPT mean approximation ratio: 0.9584\n", + "Difference between best and avg AR: 0.0079\n" + ] + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Set the style\n", + "sns.set_style(\"whitegrid\")\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Group by n_edges and calculate mean for both metrics\n", + "grouped_best = filtered_df[['n_edges', 'best_adapt_gpt_ar']].groupby('n_edges').mean().reset_index()\n", + "grouped_avg = filtered_df[['n_edges', 'avg_adapt_gpt_ar']].groupby('n_edges').mean().reset_index()\n", + "\n", + "# Create the plot with two lines\n", + "plt.plot(\n", + " grouped_best['n_edges'], \n", + " grouped_best['best_adapt_gpt_ar'], \n", + " 'o-',\n", + " color='#1f77b4',\n", + " linewidth=2,\n", + " markersize=8,\n", + " alpha=0.8,\n", + " label='Best QAOA-GPT Approx Ratio'\n", + ")\n", + "\n", + "plt.plot(\n", + " grouped_avg['n_edges'], \n", + " grouped_avg['avg_adapt_gpt_ar'], \n", + " 's--',\n", + " color='#ff7f0e',\n", + " linewidth=2,\n", + " markersize=8,\n", + " alpha=0.8,\n", + " label='Average QAOA-GPT Approx Ratio'\n", + ")\n", + "\n", + "# Enhance the plot appearance\n", + "plt.grid(True, linestyle='--', alpha=0.7)\n", + "plt.xlabel('Number of Edges', fontsize=12, fontweight='bold')\n", + "plt.ylabel('Approximation Ratio', fontsize=12, fontweight='bold')\n", + "plt.title('Best vs Average Approximation Ratio by Number of Edges', fontsize=14, fontweight='bold')\n", + "\n", + "# Add a horizontal line at y=0.97 (near-optimal approximation)\n", + "plt.axhline(y=0.97, color='red', linestyle='--', alpha=0.5, label='Approx. 
Ratio (0.97)')\n", + "\n", + "# Show the overall averages\n", + "best_avg = filtered_df['best_adapt_gpt_ar'].mean()\n", + "avg_avg = filtered_df['avg_adapt_gpt_ar'].mean()\n", + "\n", + "# Add text boxes for the averages\n", + "plt.text(\n", + " 0.02, 0.95, f'Avg. Best AR: {best_avg:.4f}',\n", + " transform=plt.gca().transAxes,\n", + " bbox=dict(boxstyle=\"round,pad=0.4\", facecolor='white', edgecolor='#1f77b4', alpha=0.8)\n", + ")\n", + "\n", + "plt.text(\n", + " 0.02, 0.87, f'Avg. Mean AR: {avg_avg:.4f}',\n", + " transform=plt.gca().transAxes,\n", + " bbox=dict(boxstyle=\"round,pad=0.4\", facecolor='white', edgecolor='#ff7f0e', alpha=0.8)\n", + ")\n", + "\n", + "# Add legend and adjust layout\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Print some statistics\n", + "print(f\"Average QAOA-GPT best approximation ratio: {best_avg:.4f}\")\n", + "print(f\"Average QAOA-GPT mean approximation ratio: {avg_avg:.4f}\")\n", + "print(f\"Difference between best and avg AR: {(best_avg - avg_avg):.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7fe2866a", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average approximation ratio: 0.9663\n", + "Min approximation ratio: 0.9178\n", + "Max approximation ratio: 0.9877\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Create a new column with the best energy from each row's adapt_gpt_energies\n", + "filtered_df['best_adapt_gpt_energy'] = filtered_df.apply(\n", + " lambda x: x['adapt_gpt_energies'] if isinstance(x['adapt_gpt_energies'], (int, float))\n", + " else min(x['adapt_gpt_energies']),\n", + " axis=1\n", + ")\n", + "\n", + "# Create the scatter plot\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(filtered_df['energy_gurobi'], filtered_df['best_adapt_gpt_energy'], alpha=0.6)\n", + "\n", + "# Add a diagonal line representing y=x (perfect approximation)\n", + "max_val = max(filtered_df['energy_gurobi'].max(), filtered_df['best_adapt_gpt_energy'].max())\n", + "min_val = min(filtered_df['energy_gurobi'].min(), filtered_df['best_adapt_gpt_energy'].min())\n", + "plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect approximation')\n", + "\n", + "# Add labels and title\n", + "plt.xlabel('Gurobi Energy (Classical Optimum)')\n", + "plt.ylabel('Best QAOA-GPT Energy')\n", + "plt.title('Best QAOA-GPT Energy vs Classical Optimum Energy')\n", + "plt.grid(True, alpha=0.3)\n", + "plt.legend()\n", + "\n", + "# Add a text box with the average approximation ratio\n", + "avg_ratio = filtered_df['best_adapt_gpt_ar'].mean()\n", + "plt.annotate(f'Avg. Approx. 
Ratio: {avg_ratio:.4f}', \n", + " xy=(0.05, 0.95), xycoords='axes fraction',\n", + " bbox=dict(boxstyle=\"round,pad=0.3\", fc=\"white\", ec=\"gray\", alpha=0.8))\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Print some statistics\n", + "print(f\"Average approximation ratio: {avg_ratio:.4f}\")\n", + "print(f\"Min approximation ratio: {filtered_df['best_adapt_gpt_ar'].min():.4f}\")\n", + "print(f\"Max approximation ratio: {filtered_df['best_adapt_gpt_ar'].max():.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d40e2632", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average number of new_layer occurrences per circuit: 7.21\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Calculate average number of layers per circuit for each row\n", + "filtered_df['avg_layers_per_circuit'] = filtered_df['q_circuits'].apply(\n", + " lambda x: np.mean([str(circuit).count('new_layer_p') for circuit in x]) if isinstance(x, list)\n", + " else str(x).count('new_layer_p')\n", + ")\n", + "\n", + "# Calculate the overall average (mean of means)\n", + "mean_of_means = filtered_df['avg_layers_per_circuit'].mean()\n", + "print(f\"Average number of new_layer occurrences per circuit: {mean_of_means:.2f}\")\n", + "\n", + "# Visualize the distribution of average layers per row\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.hist(filtered_df['avg_layers_per_circuit'], bins=15, alpha=0.7, \n", + " color='skyblue', edgecolor='black')\n", + "plt.axvline(mean_of_means, color='red', linestyle='--', \n", + " label=f'Mean: {mean_of_means:.2f}')\n", + "plt.title('Average Number of Layers per Circuit in Each Row', fontsize=14)\n", + "plt.xlabel('Average Number of Layers', fontsize=12)\n", + "plt.ylabel('Frequency', fontsize=12)\n", + "plt.grid(True, alpha=0.3)\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/sphinx/applications/python/qaoa_gpt_src/FEATHER b/docs/sphinx/applications/python/qaoa_gpt_src/FEATHER new file mode 160000 index 00000000000..045db7963fb --- /dev/null +++ b/docs/sphinx/applications/python/qaoa_gpt_src/FEATHER @@ -0,0 +1 @@ +Subproject commit 045db7963fb367eb9aed6439ceea96320e8ac7cc diff 
--git a/docs/sphinx/applications/python/qaoa_gpt_src/adapt_qaoa.py b/docs/sphinx/applications/python/qaoa_gpt_src/adapt_qaoa.py new file mode 100644 index 00000000000..c7a2c58a411 --- /dev/null +++ b/docs/sphinx/applications/python/qaoa_gpt_src/adapt_qaoa.py @@ -0,0 +1,435 @@ +#============================================================================== # +# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# The QAOA-GPT implementation in CUDA-Q is based on this paper: # +# https://arxiv.org/pdf/2504.16350 # +# Usage or reference of this code or algorithms requires citation of the paper: # +# Ilya Tyagin, Marwa Farag, Kyle Sherbert, Karunya Shirali, Yuri Alexeev, # +# Ilya Safro "QAOA-GPT: Efficient Generation of Adaptive and Regular Quantum # +# Approximate Optimization Algorithm Circuits", IEEE International Conference # +# on Quantum Computing and Engineering (QCE), 2025. 
import cudaq
import numpy as np
from scipy.optimize import minimize
import random
from qaoa_gpt_src.adapt_qaoa_pool import all_pool, qaoa_mixer, qaoa_single_x, qaoa_double
from qaoa_gpt_src.hamiltonian_graph import term_coefficients, term_words


def adapt_qaoa_run(hamiltonian,
                   qubits_num,
                   pool='all_pool',
                   gamma_0=0.01,
                   norm_threshold=1e-3,
                   energy_threshold=1e-5,
                   approx_ratio=1.0,
                   true_energy=0.0,
                   optimizer='BFGS',
                   parameter_shift=False,
                   max_iter=10,
                   seed_adapt=None,
                   verbose=False):
    """Grow an ADAPT-QAOA circuit one layer at a time and optimise its angles.

    Each step measures the expectation of ``-i[H, A]`` for every pool
    operator ``A``, appends one of the operators with the largest absolute
    value as the mixer of a new layer, and re-optimises all ``gamma`` and
    ``beta`` angles with ``scipy.optimize.minimize``.  The loop stops when the
    gradient norm, the energy change between steps, or the energy ratio
    crosses its threshold, or after ``max_iter`` steps.  The final circuit is
    then sampled with 5000 shots.

    Args:
        hamiltonian: cudaq spin operator whose expectation is minimised.
        qubits_num: Number of qubits in the circuit.
        pool: Operator pool name: ``'all_pool'``, ``'qaoa_mixer'``,
            ``'qaoa_single_x'`` or ``'qaoa_double_ops'``.
        gamma_0: Initial ``gamma`` of every new layer; also the angle used in
            the gradient-measurement kernel.
        norm_threshold: Stop when the gradient-vector norm is at or below this.
        energy_threshold: Stop when ``|E_step - E_previous_step|`` is at or
            below this.
        approx_ratio: Stop when ``energy / true_energy`` reaches this value.
        true_energy: Reference energy for the ratio.  When it is 0 the ratio
            is reported as NaN and the ratio criterion never triggers.
        optimizer: ``'COBYLA'``, ``'BFGS'`` or ``'L-BFGS-B'``.
        parameter_shift: If truthy, BFGS / L-BFGS-B receive a gradient built
            from cost evaluations shifted by +/- pi/4 instead of scipy's
            ``'2-point'`` estimate.  Ignored by COBYLA.
        max_iter: Maximum number of layers to add (must be >= 1).
        seed_adapt: If not ``None``, the global ``random`` module is re-seeded
            with this value before every operator choice.
        verbose: Print progress information.

    Returns:
        A 9-tuple ``(energy_list, mixer_pool_str, pool_list_index, gamma,
        beta, ratio, most_probable_bitstring, n_layers, optimizer_success)``
        where ``energy_list`` holds the optimised energy of every step,
        ``mixer_pool_str`` the Pauli words of every layer's mixer,
        ``pool_list_index`` the chosen pool index per layer, and
        ``optimizer_success`` the ``success`` flag of the last optimisation.

    Raises:
        ValueError: Unknown ``pool`` or ``optimizer``, or ``max_iter < 1``.
        RuntimeError: The gradient norm is already at or below
            ``norm_threshold`` before any layer was added, so there is no
            circuit to optimise or sample.
    """
    pool_builders = {
        'all_pool': all_pool,
        'qaoa_mixer': qaoa_mixer,
        'qaoa_single_x': qaoa_single_x,
        'qaoa_double_ops': qaoa_double,
    }
    if pool not in pool_builders:
        raise ValueError(
            "Invalid pool name. Choose from 'all_pool', 'qaoa_mixer', 'qaoa_single_x', or 'qaoa_double_ops'."
        )
    # Fail early: an unknown optimizer would otherwise only surface later as
    # an unbound-variable error after the first gradient evaluation.
    if optimizer not in ('COBYLA', 'BFGS', 'L-BFGS-B'):
        raise ValueError(
            f"Invalid optimizer '{optimizer}'. Choose from 'COBYLA', 'BFGS', or 'L-BFGS-B'."
        )
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1.")

    def log(*args, **kwargs):
        """Print only in verbose mode."""
        if verbose:
            print(*args, **kwargs)

    E_prev = 0.0
    E_current = None
    ratio = None
    result_vqe = None
    energy_list = []
    pool_list_index = []

    # Get the coefficients and pauli words of the Hamiltonian
    ham_coeffs = term_coefficients(hamiltonian)
    ham_words = term_words(hamiltonian, qubits_num)

    # Get the pool of operators
    pools = pool_builders[pool](qubits_num)

    if verbose:
        print(f"Number of hamiltoninian terms: {hamiltonian.term_count}")
        print(f"Pool size: {len(pools)}")

    # Observables -i[H, A_i]; they do not depend on the step, so build once.
    gradient_ops = [
        (hamiltonian * op - op * hamiltonian) * -1j for op in pools
    ]

    ###########################################
    # Get the initial state (psi_ref)

    @cudaq.kernel
    def initial_state(qubits_num: int):
        qubits = cudaq.qvector(qubits_num)
        h(qubits)

    state = cudaq.get_state(initial_state, qubits_num)

    ###############################################

    # Circuit to compute the energy gradient with respect to the pool
    @cudaq.kernel
    def grad(state: cudaq.State, ham_words: list[cudaq.pauli_word],
             ham_coeffs: list[complex], gamma_0: float):
        q = cudaq.qvector(state)

        for i in range(len(ham_coeffs)):
            exp_pauli(gamma_0 * ham_coeffs[i].real, q, ham_words[i])

    # The qaoa circuit using the selected pool operator with max gradient

    @cudaq.kernel
    def kernel_qaoa(qubits_num: int, ham_words: list[cudaq.pauli_word],
                    ham_coeffs: list[complex],
                    mixer_pool: list[list[cudaq.pauli_word]],
                    gamma: list[float], beta: list[float], num_layer: int):

        qubits = cudaq.qvector(qubits_num)

        h(qubits)

        for p in range(num_layer):
            for i in range(len(ham_coeffs)):
                exp_pauli(gamma[p] * ham_coeffs[i].real, qubits, ham_words[i])

            for word in mixer_pool[p]:
                exp_pauli(beta[p], qubits, word)

    beta = []
    gamma = []

    mixer_pool = []
    mixer_pool_str = []
    layer = []

    def report_final(headline, energy_label, show_ratio=True):
        """Print the end-of-run summary shared by all stop criteria."""
        log('\n', 'Final Result: ', '\n')
        log(*headline)
        log('Final mixer_pool: ', mixer_pool_str)
        log('Number of layers: ', len(layer))
        log('Number of mixer pool in each layer: ', layer)
        log(energy_label, E_current)
        if show_ratio:
            log('Ratio of the energy: ', ratio)

    def run_optimizer(cost, theta, jac):
        """Minimise ``cost`` from ``theta``; ``jac`` is a callable or None."""
        if optimizer == 'COBYLA':
            return minimize(cost,
                            theta,
                            method='COBYLA',
                            options={
                                'rhobeg': 1.0,
                                'maxiter': 10000,
                                'disp': False,
                                'tol': 1e-6
                            })
        if optimizer == 'BFGS':
            if jac is not None:
                return minimize(cost,
                                theta,
                                method='BFGS',
                                jac=jac,
                                tol=1e-5)
            return minimize(cost,
                            theta,
                            method='BFGS',
                            jac='2-point',
                            options={'gtol': 1e-4})
        # optimizer == 'L-BFGS-B' (validated above)
        return minimize(cost,
                        theta,
                        method='L-BFGS-B',
                        jac=jac if jac is not None else '2-point',
                        tol=1e-5)

    istep = 1

    for _ in range(max_iter):

        log('Step: ', istep)

        # compute the gradient and find the mixer pool with large values.
        # If norm is below the predefined threshold, stop calculation
        gradient_vec = [
            cudaq.observe(grad, op, state, ham_words, ham_coeffs,
                          gamma_0).expectation() for op in gradient_ops
        ]

        # Compute the norm of the gradient vector
        norm = np.linalg.norm(np.array(gradient_vec))
        log('Norm of the gradient: ', norm)

        if norm <= norm_threshold:
            if not layer:
                # Nothing has been optimised yet, so there is no energy,
                # ratio or circuit to report.
                raise RuntimeError(
                    'Gradient norm is already below norm_threshold before any '
                    'layer was added; no ADAPT-QAOA circuit was built.')
            report_final(('Norm of the gradient is below the threshold', norm),
                         'Final Energy: ')
            break

        abs_gradient = np.abs(gradient_vec)
        max_grad = np.max(abs_gradient)
        candidates = [
            i for i in range(len(pools)) if abs_gradient[i] == max_grad
        ]
        log('Total number of pool with max gradient: ', len(candidates))

        # Set the seed for the random number generator
        # This ensures that the random choices are reproducible
        # in each step of the iteration.
        if seed_adapt is not None:
            random.seed(seed_adapt)

        layer.append(1)
        # Choosing among the indices draws the same list position as choosing
        # among the operators, and records exactly one index per layer.
        chosen_index = random.choice(candidates)
        pool_list_index.append(chosen_index)

        chosen_words = [
            term.get_pauli_word(qubits_num) for term in pools[chosen_index]
        ]
        mixer_pool.append([cudaq.pauli_word(word) for word in chosen_words])
        mixer_pool_str.append(chosen_words)

        log('Mixer pool at step', istep)
        log(mixer_pool_str)

        num_layer = len(layer)
        log('Number of layers: ', num_layer)

        # One new beta (initialised to 0) and one new gamma per added layer.
        beta = beta + [0.0] * layer[-1]
        gamma = gamma + [gamma_0]
        theta = gamma + beta

        def cost(theta):
            gamma = theta[:num_layer]
            beta = theta[num_layer:]

            energy = cudaq.observe(kernel_qaoa, hamiltonian, qubits_num,
                                   ham_words, ham_coeffs, mixer_pool, gamma,
                                   beta, num_layer).expectation()
            return energy

        def shifted_gradient(theta):
            """Central difference of ``cost`` with a shift of pi/4."""
            epsilon = np.pi / 4
            derivative = np.zeros(len(theta))
            shifted = theta.copy()

            for i in range(len(theta)):
                shifted[i] = theta[i] + epsilon
                exp_val_plus = cost(shifted)
                shifted[i] = theta[i] - epsilon
                exp_val_minus = cost(shifted)
                derivative[i] = (exp_val_plus - exp_val_minus) / (2 * epsilon)
                shifted[i] = theta[i]
            return derivative

        result_vqe = run_optimizer(
            cost, theta, shifted_gradient if parameter_shift else None)
        E_current = result_vqe.fun
        theta = result_vqe.x.tolist()
        log('Optmized Energy: ', result_vqe.fun, flush=True)
        log('Optimizer exited successfully: ', result_vqe.success, flush=True)

        energy_list.append(E_current)

        log('Result from the step ', istep)
        log('Optmized Energy: ', result_vqe.fun)

        dE = np.abs(E_current - E_prev)
        E_prev = E_current
        log('dE= :', dE)

        # Without a reference energy the ratio is undefined; NaN keeps the
        # comparison below False instead of dividing by zero.
        ratio = E_current / true_energy if true_energy != 0 else float('nan')
        log('Ratio of the energy: ', ratio)

        gamma = theta[:num_layer]
        beta = theta[num_layer:]

        if dE <= energy_threshold:
            report_final(('dE below the threshold is satisfied: ', dE),
                         'Final Energy= ')
            break

        if ratio >= approx_ratio:
            report_final(('Approximation ratio is satisfied', ratio),
                         'Final Energy= ')
            break

        # Compute the state of this current step for the gradient
        state = cudaq.get_state(kernel_qaoa, qubits_num, ham_words,
                                ham_coeffs, mixer_pool, gamma, beta, num_layer)
        log('State at step ', istep)
        istep += 1
        log('\n')
    else:
        # The loop ran out of steps without hitting any stop criterion.
        report_final((
            'Maximum number of iterations reached without satisfying the convergence criteria.',
        ),
                     'Final Energy= ',
                     show_ratio=False)

    log('\n', 'Sampling the Final ADAPT QAOA circuit', '\n')
    # Sample the circuit
    count = cudaq.sample(kernel_qaoa,
                         qubits_num,
                         ham_words,
                         ham_coeffs,
                         mixer_pool,
                         gamma,
                         beta,
                         num_layer,
                         shots_count=5000)
    log('The most probable max cut: ', count.most_probable())
    log('All bitstring from circuit sampling: ', count)

    return (energy_list, mixer_pool_str, pool_list_index, gamma, beta, ratio,
            str(count.most_probable()), len(layer), result_vqe.success)
import cudaq
from cudaq import spin


def qaoa_mixer(n):
    """Return a one-element pool holding X_0 + X_1 + ... + X_{n-1}.

    The sum always starts from ``spin.x(0)``; further terms are added for
    qubits ``1 .. n-1``.
    """
    mixer_sum = spin.x(0)
    for qubit in range(1, n):
        mixer_sum += spin.x(qubit)
    return [mixer_sum]


def qaoa_single_x(n):
    """Return a pool with one single-qubit X operator per qubit ``0 .. n-1``."""
    return [cudaq.SpinOperator(spin.x(qubit)) for qubit in range(n)]


def qaoa_double(n):
    """Return the two-qubit pool over all qubit pairs ``i < j``.

    For every pair, in order, the operators X_i X_j, Y_i Y_j, Y_i Z_j and
    Z_i Y_j are appended.
    """
    # (factory for qubit i, factory for qubit j), in the order they are emitted
    pauli_pairs = (
        (spin.x, spin.x),
        (spin.y, spin.y),
        (spin.y, spin.z),
        (spin.z, spin.y),
    )

    operators = []
    for i in range(n - 1):
        for j in range(i + 1, n):
            for left, right in pauli_pairs:
                operators.append(
                    cudaq.SpinOperator(left(i)) * cudaq.SpinOperator(right(j)))
    return operators


def all_pool(qubits_num):
    """Return the single-X pool, the summed mixer and the two-qubit pool, concatenated in that order."""
    combined = qaoa_single_x(qubits_num)
    combined = combined + qaoa_mixer(qubits_num)
    combined = combined + qaoa_double(qubits_num)
    return combined
import numpy as np
import networkx as nx
import sys
import os
import math
import glob

# Add the FEATHER module path (the directory that contains this file)
feather_path = os.path.dirname(os.path.abspath(__file__))

if feather_path not in sys.path:
    sys.path.append(feather_path)

# Import the original FEATHER implementation
try:
    from FEATHER.src.feather import FEATHER as OrigFeatherNode  # For node-level embedding
    from FEATHER.src.feather import FEATHERG as OrigFeatherGraph  # For graph-level embedding
except ImportError as e:
    print(f"Import error: {e}")
    print("Current sys.path:", sys.path)
    print("Checking if files exist:")
    print(glob.glob(os.path.join(feather_path, 'FEATHER/src/*')))
    # Keep the module importable; fit() reports the missing dependency.
    OrigFeatherNode = None
    OrigFeatherGraph = None


class CustomFeatherGraph:
    """Custom wrapper for the original FEATHER implementation with multiple node features.

    Each graph is embedded by running the node-level FEATHER model on two
    per-node features (log degree and clustering coefficient) and pooling the
    node embeddings into a single vector.
    """

    def __init__(
            self,
            theta_max=2.5,  # Default from original implementation
            eval_points=25,  # Default from original implementation
            order=5,  # Default from original implementation
            pooling="mean",  # Default from original implementation
            seed=42  # Seed applied to numpy's global RNG in fit()
    ):
        """Store the FEATHER hyper-parameters; no embedding exists until fit().

        Args:
            theta_max: Passed to the FEATHER node model as ``theta_max``.
            eval_points: Passed to the FEATHER node model as ``eval_points``.
            order: Passed to the FEATHER node model as ``order``.
            pooling: ``"mean"``, ``"max"`` or ``"min"``; any other value is
                treated as ``"mean"``.
            seed: Value given to ``np.random.seed`` at the start of fit().
        """
        self.theta_max = theta_max
        self.eval_points = eval_points
        self.order = order
        self.pooling = pooling
        self.seed = seed
        self.embedding = None

    def _extract_features(self, graph):
        """
        Extract node features:
        1. Log degree
        2. Clustering coefficient

        Nodes are addressed as ``0 .. number_of_nodes - 1``, so the graph
        must use consecutive integer labels (fit() relabels before calling).

        Returns:
            Array of shape ``(number_of_nodes, 2)``.
        """
        num_nodes = graph.number_of_nodes()

        # One row per node, columns: [log(degree + 1), clustering coefficient]
        features = np.zeros((num_nodes, 2))

        # Calculate clustering coefficients for all nodes at once
        clustering_coeffs = nx.clustering(graph)

        for node in range(num_nodes):
            # Log of degree + 1 (to avoid log(0))
            features[node, 0] = math.log(graph.degree(node) + 1.0)

            # Clustering coefficient (default to 0 if not available)
            features[node, 1] = clustering_coeffs.get(node, 0.0)

        return features

    def fit(self, graphs):
        """
        Fit the model and generate embeddings for a list of graphs

        Args:
            graphs: List of networkx graphs to embed

        Returns:
            ``self``; the embeddings are available through get_embedding().

        Raises:
            ImportError: The FEATHER implementation could not be imported.
        """
        if OrigFeatherNode is None:
            raise ImportError(
                "FEATHER could not be imported; expected it under "
                f"{os.path.join(feather_path, 'FEATHER')}.")

        # Set random seed
        np.random.seed(self.seed)

        # Unknown pooling names fall back to the mean.
        pool_fn = {
            "mean": np.mean,
            "max": np.max,
            "min": np.min,
        }.get(self.pooling, np.mean)

        all_embeddings = []
        for graph in graphs:
            # Relabelling returns a new graph with integer node labels, so
            # the caller's graph is not modified by the self-loops added below.
            graph = nx.convert_node_labels_to_integers(graph)
            for node in graph.nodes():
                if not graph.has_edge(node, node):
                    graph.add_edge(node, node)

            # Extract node features (log degree and clustering coefficient)
            node_features = self._extract_features(graph)

            # Create FEATHER node-level embedding model
            feather_node = OrigFeatherNode(theta_max=self.theta_max,
                                           eval_points=self.eval_points,
                                           order=self.order)

            # Fit and transform to get node embeddings
            feather_node.fit(graph, node_features)
            node_embeddings = feather_node.get_embedding()

            # Pool node embeddings to get graph embedding
            all_embeddings.append(pool_fn(node_embeddings, axis=0))

        # Convert to numpy array
        self.embedding = np.array(all_embeddings)

        # Print embedding dimension for verification
        if len(all_embeddings) > 0:
            print(f"Generated embedding dimension: {self.embedding.shape[1]}")

            # Expected dimension calculation
            expected_dim = 2 * self.eval_points * 2 * self.order  # 2 features * eval_points * 2 (sin/cos) * order
            print(f"Expected dimension: {expected_dim}")

        return self

    def get_embedding(self):
        """Return the graph embeddings (``None`` before fit() has run)."""
        return self.embedding
def ensure_dirs(output_dir):
    """Create the 'hams', 'res', 'graphs' and 'traces' sub-directories.

    Existing directories are left untouched.
    """
    for sub_name in ('hams', 'res', 'graphs', 'traces'):
        sub_path = os.path.join(output_dir, sub_name)
        os.makedirs(sub_path, exist_ok=True)


def scale_elist_weights(e_list, coef):
    """
    Multiply the weight of every edge in an edge list by ``coef``.

    Args:
        e_list (list of tuples): Edges given as (node1, node2, weight).
        coef (float): Factor applied to each weight.

    Returns:
        list of tuples: A new edge list; the input list is not modified.
    """
    scaled = []
    for node1, node2, weight in e_list:
        scaled.append((node1, node2, weight * coef))
    return scaled
def generate_data_max_cut(output_dir='adapt_results',
                          graphs_number=1,
                          graphs_input_json="N/A",
                          n_nodes=8,
                          weighted=True,
                          use_negative_weights=False,
                          use_brute_force=True,
                          use_simulated_annealing=True,
                          use_one_exchange=True,
                          op_pool='all_pool',
                          init_gamma: list[float] = [0.01],
                          scaling_coef=1.0,
                          norm_weights=False,
                          norm_coef=1.0,
                          trials_per_graph=1,
                          optimizer='BFGS',
                          approx_ratio=0.97,
                          max_iter=10,
                          norm_threshold=1e-3,
                          energy_threshold=1e-9,
                          multi_gamma=False,
                          p_init=0.3,
                          p_final=0.9,
                          seed_g=None,
                          seed_weight=None,
                          seed_adapt=None,
                          verbose=True):
    """
    Generates data for ADAPT-QAOA on the Max-Cut problem.

    Args:
        output_dir (str): Directory to save the results.
        graphs_number (int): Number of graphs to generate. Ignored when
            graphs_input_json is given (the number of graphs in the file is used).
        graphs_input_json (str): Path to a JSON file containing graph data.
            If "N/A", random graphs are generated. Each entry is expected to
            provide the keys "elist" and "n_nodes".
        n_nodes (int): Number of nodes in each generated graph.
        weighted (bool): Whether to use weighted edges.
        use_negative_weights (bool): Whether to allow negative weights in the graph.
        use_brute_force (bool): Whether to compute the brute force solution.
        use_simulated_annealing (bool): Whether to compute the simulated annealing solution.
        use_one_exchange (bool): Whether to compute the one exchange solution.
        op_pool (str): Type of operator pool to use ('all_pool' or 'qaoa_mixer'
            or 'qaoa_single_x' or 'qaoa_double_ops').
        init_gamma (list): Initial gamma values for ADAPT-QAOA trials.
            The default list is only read, never mutated.
        scaling_coef (float): Coefficient to scale edge weights.
        norm_weights (bool): Whether to normalize edge weights.
        norm_coef (float): Normalization coefficient recorded in the results;
            overwritten by the computed value when norm_weights is True.
        trials_per_graph (int): Number of trials per graph for ADAPT-QAOA.
        optimizer (str): Optimizer to use for ADAPT-QAOA ('BFGS', 'L-BFGS-B', 'COBYLA').
        approx_ratio (float): Approximation ratio threshold for early stopping.
        max_iter (int): Maximum number of iterations for the adapt-qaoa iteration.
        norm_threshold (float): Threshold for gradients norm in ADAPT-QAOA for early stopping.
        energy_threshold (float): Threshold for energy convergence in ADAPT-QAOA.
        multi_gamma (bool): Whether to keep trying the remaining initial gamma
            values after the approximation ratio has been reached with one gamma.
        p_init (float): Lower bound of the Erdos-Renyi edge probability.
        p_final (float): Upper bound of the Erdos-Renyi edge probability.
            The probability is randomly selected between p_init and p_final.
        seed_g (int): Random seed passed to the Erdos-Renyi graph generator.
        seed_weight (int): Random seed for edge weight generation.
        seed_adapt (int): Random seed passed to ADAPT-QAOA.
        verbose (bool): Whether to print detailed logs during execution.

    Returns:
        None: Results are written to '<output_dir>/res' and '<output_dir>/graphs'
        as CSV files named after the process id and the start timestamp.
    """

    ensure_dirs(output_dir)

    pid = os.getpid()
    ts_string = datetime.now().strftime("%y-%m-%d__%H_%M")

    # Load graphs from JSON if provided
    if graphs_input_json != "N/A":
        with open(graphs_input_json, 'r') as f:
            json_graphs_dict = json.load(f)
        graphs_number = len(json_graphs_dict)
        graph_names_list = list(json_graphs_dict.keys())

    if verbose:
        print(f"Generating or processing {graphs_number} graphs...")

    graph_rows = []
    result_rows = []

    for graph_num in range(graphs_number):

        if graphs_input_json == "N/A":
            cur_graph_name = f"Graph_{graph_num+1}"
            g_unweighted, g_method = generate_random_graph(
                n_nodes,
                p_init=p_init,
                p_final=p_final,
                seed_g=seed_g,
                methods=["erdos_renyi"])
            if weighted:
                g = add_rand_weights_to_graph(g_unweighted,
                                              seed_weight=seed_weight,
                                              neg_weights=use_negative_weights)
            else:
                g = g_unweighted
        else:
            cur_graph_name = graph_names_list[graph_num]
            cur_graph_elist = json_graphs_dict[cur_graph_name]["elist"]
            n_nodes = json_graphs_dict[cur_graph_name]["n_nodes"]
            g = edgelist_to_graph(cur_graph_elist, num_vertices=n_nodes)
            g_method = "input_file"

        e_list = graph_to_edgelist(g)

        if verbose:
            print(f"Processing {cur_graph_name}...")
            print(f"Graph method: {g_method}")
            print(f"Graph edgelist: {e_list}")

        # The stored edge list uses 1-based node ids for tokenization later.
        # It is serialized before scaling/normalization is applied.
        e_list_mod = [
            (node1 + 1, node2 + 1, weight) for (node1, node2, weight) in e_list
        ]
        edgelist_json = json.dumps(e_list_mod)

        if scaling_coef != 1.0:
            e_list = scale_elist_weights(e_list, scaling_coef)

        if norm_weights:
            e_list, norm_coef = norm_elist_weights(e_list)

        ###############################################
        # Build up the problem hamiltonian
        spin_ham = max_cut_ham(e_list)

        h_frob_norm = np.linalg.norm(spin_ham.to_matrix())
        if verbose:
            print(f"Frobenius norm of the Hamiltonian: {h_frob_norm}")

        # Store the graph data
        graph_rows.append({
            'graph_num': graph_num + 1,
            'g_method': g_method,
            'edgelist_json': edgelist_json,
            'H_frob_norm': h_frob_norm
        })

        ############################################
        # Classical solutions of max-cut problem
        if use_brute_force:
            brute_force_cut_value, _, binary_vector = brute_force_max_cut(g)

        if use_simulated_annealing:
            _, sa_cut_value, sa_binary_vector = simulated_annealing_maxcut(g)

        if use_one_exchange:
            one_exchange_cut_value, _, one_exchange_binary_vector = one_exchange(
                g)

        # Reference energy for the approximation ratio. The priority order
        # (simulated annealing, one-exchange, brute force) is kept as is.
        # NOTE(review): the exact brute-force value is only used when both
        # heuristics are disabled - confirm that this is intended.
        if use_simulated_annealing:
            true_energy = sa_cut_value
            classical_cut = sa_binary_vector
        elif use_one_exchange:
            true_energy = one_exchange_cut_value
            classical_cut = one_exchange_binary_vector
        elif use_brute_force:
            true_energy = brute_force_cut_value
            classical_cut = binary_vector
        else:
            true_energy = -999.0
            classical_cut = "N/A"
        ################################################

        # Quantum solutions of max-cut problem using ADAPT-QAOA
        if verbose:
            print(f"Preparing to run ADAPT-QAOA for graph {graph_num+1}...")

        qubits_num = len(g.nodes)

        for gamma in init_gamma:
            # Ratio of the last trial run for this gamma; stays None when no
            # trial was executed so a stale value is never used below.
            last_ratio = None

            for trial_num in range(trials_per_graph):

                if verbose:
                    print(
                        f"Running ADAPT-QAOA for graph {graph_num+1}, trial {trial_num+1}..."
                    )
                    print(f"Using initial gamma: {gamma}")
                    print("Running ADAPT-QAOA...")

                start_time = time.time()

                adapt_qaoa_result = adapt_qaoa_run(
                    spin_ham,
                    qubits_num,
                    pool=op_pool,
                    gamma_0=gamma,
                    norm_threshold=norm_threshold,
                    energy_threshold=energy_threshold,
                    approx_ratio=approx_ratio,
                    true_energy=true_energy,
                    optimizer=optimizer,
                    max_iter=max_iter,
                    seed_adapt=seed_adapt,
                    verbose=verbose)

                elapsed_time = time.time() - start_time

                (energy_list, mixer_words, mixer_indices, gamma_list,
                 beta_list, reached_ratio, cut_adapt, num_layers,
                 success_flag) = list(adapt_qaoa_result)[:9]

                # Operator indices are stored 1-based.
                mixer_indices = [int(i) + 1 for i in mixer_indices]
                last_ratio = reached_ratio

                if verbose:
                    print(
                        f"ADAPT-QAOA completed in {elapsed_time:.2f} seconds.")
                    print('Energy list: ', energy_list)
                    print('Mixer pool as pauli word: ', mixer_words)
                    print('Mixer pool as index: ', mixer_indices)
                    print('gamma list: ', gamma_list)
                    print('beta list: ', beta_list)
                    print('Approx. ratio: ', reached_ratio)
                    print('Max cut: ', cut_adapt)
                    print('Number of layers: ', num_layers)
                    print('Optimizer success flag: ', success_flag)
                    print('\n')

                # Prepare the results for saving
                result_rows.append({
                    'method': 'ADAPT-QAOA',
                    'graph_name': cur_graph_name,
                    'graph_num': graph_num + 1,
                    'trial_num': trial_num + 1,
                    'n_nodes': n_nodes,
                    'init_gamma': gamma,
                    'optimizer': optimizer,
                    'pool_type': op_pool,
                    'edge_weight_scaling_coef': scaling_coef,
                    'edge_weight_norm_coef': norm_coef,
                    'energy_list': energy_list,
                    'true_energy': true_energy,
                    'mixer_pool_pauli_word': mixer_words,
                    'mixer_pool_index': mixer_indices,
                    'gamma_coef': gamma_list,
                    'beta_coef': beta_list,
                    'approx_ratio': reached_ratio,
                    'cut_adapt': cut_adapt,
                    'cut_classical': classical_cut,
                    'num_layers': num_layers,
                    'optimizer_success_flag': success_flag,
                    'elapsed_time': elapsed_time
                })

                # Early stopping: End of trial if approximation ratio is reached
                if reached_ratio >= approx_ratio:
                    if verbose:
                        print(
                            f"Approximation ratio {reached_ratio} reached, stopping early."
                        )
                    break

            # Early stopping: End of graph processing if approximation ratio is reached
            if (last_ratio is not None and last_ratio >= approx_ratio and
                    not multi_gamma):
                if verbose:
                    print(
                        f"Approximation ratio {last_ratio} reached for graph {graph_num+1}, stopping further trials."
                    )
                break

    # Build the output tables once, after all graphs have been processed.
    graphs_df = pd.DataFrame(
        graph_rows,
        columns=['graph_num', 'g_method', 'edgelist_json', 'H_frob_norm'])

    results_df = pd.DataFrame(
        result_rows,
        columns=[
            'method', 'graph_name', 'graph_num', 'trial_num', 'n_nodes',
            'init_gamma', 'energy_list', 'true_energy', 'optimizer',
            'pool_type', 'edge_weight_scaling_coef', 'edge_weight_norm_coef',
            'mixer_pool_pauli_word', 'mixer_pool_index', 'gamma_coef',
            'beta_coef', 'approx_ratio', 'cut_adapt', 'cut_classical',
            'num_layers', 'optimizer_success_flag', 'elapsed_time'
        ])

    #write results to files
    if verbose:
        print("Writing results to files...")

    # Save results DataFrame to CSV
    results_df.to_csv(os.path.join(output_dir, 'res',
                                   f'pid{pid}_{ts_string}_results.csv'),
                      index=False)

    # Save graphs DataFrame to CSV
    graphs_df.to_csv(os.path.join(output_dir, 'graphs',
                                  f'pid{pid}_{ts_string}_graphs.csv'),
                     index=False)

    if verbose:
        print("Data generation completed successfully.")

    return
def graph_to_adj_m(g):
    """Convert a NetworkX graph to an adjacency matrix (numpy array)."""
    return nx.to_numpy_array(g)


def graph_to_edgelist(g):
    """Return a weighted edge list: (src, dst, weight) for all edges.

    Edges without a 'weight' attribute get the weight 1.0.
    """
    return [(u, v, d.get('weight', 1.0)) for u, v, d in g.edges(data=True)]


def edgelist_to_graph(edgelist, num_vertices=0):
    """Create a weighted undirected graph from an edge list.

    Args:
        edgelist: Iterable of (src, dst, weight) with integer node ids.
        num_vertices: Number of nodes to create. When 0, it is inferred as
            the largest node id in the edge list plus one.

    Returns:
        nx.Graph with nodes 0..num_vertices-1 and the given weighted edges.
        An empty edge list with num_vertices == 0 yields an empty graph.
    """
    if num_vertices == 0:
        # default=-1 makes an empty edge list produce zero vertices instead
        # of failing inside max().
        num_vertices = max(
            (max(src, dst) for src, dst, _ in edgelist), default=-1) + 1
    g = nx.Graph()
    g.add_nodes_from(range(num_vertices))
    for src, dst, w in edgelist:
        g.add_edge(src, dst, weight=w)
    return g


def generate_random_graph(n, p_init, p_final, seed_g, methods=None):
    """
    Generate a connected random graph using specified methods with random parameters.

    One method is picked at random from ``methods`` and graphs are drawn
    until a connected graph with at least one edge is obtained.

    Args:
        n: Number of nodes; must be at least 2.
        p_init, p_final: Bounds of the uniformly drawn probability used by
            the 'erdos_renyi' and 'watts_strogatz' methods.
        seed_g: Seed passed to the Erdos-Renyi generator only. The edge
            probability itself is drawn from the global ``random`` state.
        methods: Candidate generator names; all five are used when None.

    Returns: (graph, method)

    Raises:
        ValueError: If ``n`` < 2 or the chosen method name is unknown.
    """
    if n < 2:
        # No graph with fewer than two nodes has an edge, so the rejection
        # loop below could never terminate.
        raise ValueError(
            f"Cannot generate a connected graph with edges for n={n}")
    if methods is None:
        methods = [
            "erdos_renyi", "barabasi_albert", "watts_strogatz",
            "random_regular", "bipartite"
        ]
    method = random.choice(methods)
    while True:
        if method == "erdos_renyi":
            p = random.uniform(p_init, p_final)
            G = nx.erdos_renyi_graph(n, p, seed_g)
        elif method == "barabasi_albert":
            m = random.randint(1, n - 1)
            G = nx.barabasi_albert_graph(n, m)
        elif method == "watts_strogatz":
            k = random.randint(2, n - 1)
            p = random.uniform(p_init, p_final)
            G = nx.watts_strogatz_graph(n, k, p)
        elif method == "random_regular":
            d = random.randint(2, n - 1)
            if n * d % 2 == 0:
                G = nx.random_regular_graph(d, n)
            else:
                # A d-regular graph needs n * d to be even; draw a new d.
                continue
        elif method == "bipartite":
            n1 = random.randint(2, n - 1)
            n2 = n - n1
            G = nx.complete_bipartite_graph(n1, n2)
        else:
            raise ValueError(f"Unknown method: {method}")

        if G.number_of_edges() > 0 and nx.is_connected(G):
            break
    return G, method


def add_rand_weights_to_graph(g, seed_weight, neg_weights=False):
    """
    Add random weights to all edges in the graph.

    Weights are drawn uniformly from (0, 1), rounded to two decimals and
    redrawn when they round to zero. With ``neg_weights`` the sign of each
    weight is chosen at random.

    NOTE: a non-None ``seed_weight`` reseeds the global ``random`` module.

    Returns a new weighted graph.
    """
    if seed_weight is not None:
        random.seed(seed_weight)

    g_weighted = nx.Graph()
    g_weighted.add_nodes_from(g.nodes())
    for u, v in g.edges():
        w = round(random.random(), 2)
        while w == 0.0:
            w = round(random.random(), 2)
        if neg_weights:
            w *= random.choice([-1, 1])
        g_weighted.add_edge(u, v, weight=w)
    return g_weighted
def norm_elist_weights(e_list):
    """
    Rescale edge weights so that their absolute values sum to 1.

    Args:
        e_list: List of (u, v, weight) tuples.

    Returns: (normalized_edge_list, total_weight), where total_weight is
        the sum of absolute weights that was divided out.

    Raises:
        ZeroDivisionError: If the edge list is non-empty and all weights are zero.
    """
    total_weight = 0
    for _, _, weight in e_list:
        total_weight += abs(weight)

    normalized = []
    for u, v, weight in e_list:
        normalized.append((u, v, weight / total_weight))
    return normalized, total_weight
def max_cut_ham(edges):
    """
    Generate a Hamiltonian for the Max-Cut problem.

    Every edge (u, v, w) contributes 0.5 * (w * Z_u Z_v - w * I_u I_v).

    Args:
        edges: List of edges, each indexable as (qubit_u, qubit_v, weight).
    Returns:
        Hamiltonian for the Max-Cut problem. For an empty edge list the
        plain float 0.0 is returned.
    """
    ham = 0.0

    for edge in edges:
        qubit_u, qubit_v, weight = edge[0], edge[1], edge[2]
        # Add a term to the Hamiltonian for the edge (u,v)
        zz_term = weight * spin.z(qubit_u) * spin.z(qubit_v)
        identity_term = weight * spin.i(qubit_u) * spin.i(qubit_v)
        ham += 0.5 * (zz_term - identity_term)

    return ham


# Collect coefficients from a spin operator so we can pass them to a kernel
def term_coefficients(ham: cudaq.SpinOperator) -> list[complex]:
    """Return the evaluated coefficient of every term in ``ham``, in order."""
    return [term.evaluate_coefficient() for term in ham]


# Collect Pauli words from a spin operator so we can pass them to a kernel
def term_words(ham: cudaq.SpinOperator, qubits_num) -> list[str]:
    """Return the Pauli word of every term in ``ham``, in order.

    Our kernel uses these words to apply exp_pauli to the entire state, so
    each word is requested for ``qubits_num`` qubits to cover the whole space.
    """
    return [term.get_pauli_word(qubits_num) for term in ham]
def brute_force_max_cut(graph):
    """
    Computes the Max-Cut of a weighted graph using a brute-force approach.

    All bipartitions with two non-empty sides are enumerated, so the run
    time grows as 2**n in the number of nodes.

    Args:
        graph (nx.Graph): The input graph. Edges without a 'weight'
            attribute count as weight 1. Node labels must be the integers
            0..n-1 because they are used as positions in the bit strings.

    Returns:
        tuple: A tuple containing:
            - number: The NEGATIVE of the Max-Cut value (an energy).
            - tuple: Two sets of nodes forming the best partition.
            - tuple: Two bit strings; position i of the first (second)
              string is '1' when node i is in the first (second) set.
        For graphs with fewer than two nodes there is no bipartition with
        two non-empty sides; the cut value is 0 and all nodes are placed
        in the first set.
    """
    nodes = list(graph.nodes)
    num_nodes = len(nodes)
    # Read the edge weights once instead of once per candidate partition.
    weighted_edges = [(u, v, data.get('weight', 1))
                      for u, v, data in graph.edges(data=True)]

    # Start below every possible cut value so that graphs whose cuts are
    # all negative (negative weights) are handled correctly.
    max_cut_value = float('-inf')
    best_partition = (set(), set())

    for i in range(1, 2**num_nodes - 1):
        binary_representation = bin(i)[2:].zfill(num_nodes)
        group1 = {
            nodes[j]
            for j, bit in enumerate(binary_representation)
            if bit == '1'
        }

        # An edge is cut when exactly one endpoint lies in group1.
        cut_value = 0
        for u, v, weight in weighted_edges:
            if (u in group1) != (v in group1):
                cut_value += weight

        if cut_value > max_cut_value:
            max_cut_value = cut_value
            best_partition = (group1, set(nodes) - group1)

    if max_cut_value == float('-inf'):
        # Fewer than two nodes: nothing can be cut.
        max_cut_value = 0
        best_partition = (set(nodes), set())

    # convert to binary representation
    binary_vector_1 = [0] * num_nodes
    for node in best_partition[0]:
        binary_vector_1[node] = 1

    binary_vector_2 = [0] * num_nodes
    for node in best_partition[1]:
        binary_vector_2[node] = 1

    binary_vector = (''.join(str(bit) for bit in binary_vector_1),
                     ''.join(str(bit) for bit in binary_vector_2))

    return (-1 * max_cut_value), best_partition, binary_vector
def simulated_annealing_maxcut(graph,
                               initial_temp=1000,
                               cooling_rate=0.95,
                               iterations=1000):
    """
    Approximate the Max-Cut of a graph with simulated annealing.

    Starting from a random assignment, one random node is flipped per
    iteration. Improving moves are always accepted; other moves are
    accepted with probability exp(delta / temperature). The temperature
    is multiplied by ``cooling_rate`` after every iteration. Randomness
    comes from the global ``random`` module.

    Args:
        graph (nx.Graph): Input graph. Edges without a 'weight' attribute
            count as weight 1. Node labels must be the integers 0..n-1
            because they are used as positions in the bit strings.
        initial_temp (float): Starting temperature.
        cooling_rate (float): Multiplicative cooling factor per iteration.
        iterations (int): Number of single-node flips to attempt.

    Returns:
        tuple: (partition, energy, binary_vector) where partition is a pair
        of node sets, energy is the NEGATIVE of the best cut value found,
        and binary_vector is a pair of bit strings marking the members of
        each set. Note that the order differs from ``brute_force_max_cut``.
    """
    nodes = list(graph.nodes)
    # Unweighted graphs carry no 'weight' attribute; treat them as weight 1.
    weighted_edges = [(u, v, data.get('weight', 1))
                      for u, v, data in graph.edges(data=True)]
    current_solution = {node: random.choice([0, 1]) for node in nodes}

    def cut_value(solution):
        return sum(weight for u, v, weight in weighted_edges
                   if solution[u] != solution[v])

    current_value = cut_value(current_solution)
    best_solution = current_solution.copy()
    best_value = current_value
    temp = initial_temp

    for _ in range(iterations):
        node = random.choice(nodes)
        new_solution = current_solution.copy()
        new_solution[node] = 1 - new_solution[node]  # flip side
        new_value = cut_value(new_solution)

        delta = new_value - current_value
        # temp can underflow to 0.0 for very long runs; in that limit only
        # improving moves are accepted instead of dividing by zero.
        if delta > 0 or (temp > 0 and
                         random.random() < np.exp(delta / temp)):
            current_solution = new_solution
            current_value = new_value
            if new_value > best_value:
                best_solution = new_solution
                best_value = new_value

        temp *= cooling_rate

    set1 = [node for node in best_solution if best_solution[node] == 0]
    set2 = [node for node in best_solution if best_solution[node] == 1]

    binary_vector_1 = [0] * len(nodes)
    for node in set1:
        binary_vector_1[node] = 1

    binary_vector_2 = [0] * len(nodes)
    for node in set2:
        binary_vector_2[node] = 1

    binary_vector = (''.join(str(bit) for bit in binary_vector_1),
                     ''.join(str(bit) for bit in binary_vector_2))

    return (set(set1), set(set2)), (-1 * best_value), binary_vector
def one_exchange(graph):
    """
    Approximate the Max-Cut with NetworkX's one-exchange heuristic.

    Node labels must be the integers 0..n-1 because they are used as
    positions in the returned bit strings.

    Returns:
        tuple: (energy, partition, binary_vector) where energy is the
        NEGATIVE of the cut size reported by NetworkX, partition is the
        pair of node sets it returned, and binary_vector is a pair of bit
        strings marking the members of each set.
    """
    curr_cut_size, partition = nx.approximation.one_exchange(graph,
                                                             weight='weight')
    num_nodes = len(graph.nodes)

    def membership_bits(members):
        # '1' at every position whose node belongs to ``members``.
        bits = ['0'] * num_nodes
        for node in members:
            bits[node] = '1'
        return ''.join(bits)

    binary_vector = (membership_bits(partition[0]),
                     membership_bits(partition[1]))

    return (curr_cut_size * -1), partition, binary_vector
# Maps the dtype names used in configs to the corresponding torch dtypes.
dtype_str_to_torch_dict = {
    "float32": torch.float32,
    "float": torch.float32,
    "float16": torch.float16,
    "half": torch.float16,
    "bfloat16": torch.bfloat16,
    "float64": torch.float64,
    "double": torch.float64,
}


class QAOA_GPT():
    """Inference wrapper around a trained QAOA-GPT (nanoGPT-based) model.

    Loads the training config and checkpoint, generates circuit token
    sequences for input graphs and evaluates the generated circuits.
    """

    def __init__(
        self,
        model_ckpt,
        config_file,
        data_dir,
        device,  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
        n_nodes='infer',
        temp_folder='adapt_gpt_temp_data',
    ):
        """
        Args:
            model_ckpt: Path to the model checkpoint file.
            config_file: Path to the Python config file used for training.
                It must define 'pool_type' and 'use_graph_emb'.
            data_dir: Directory containing 'meta.pkl'.
            device: Torch device string, e.g. 'cpu', 'cuda' or 'cuda:0'.
            n_nodes: Number of graph nodes; only used when the config does
                not define 'n_nodes'. 'infer' means "take it from the config".
            temp_folder: Folder passed to the circuit evaluation helper.

        Raises:
            FileNotFoundError: If config_file does not exist.
            AttributeError: If n_nodes is neither in the config nor given.
            TypeError: If the supplied n_nodes is not an int.
        """
        config_fpath = Path(config_file)
        if not config_fpath.is_file():
            raise FileNotFoundError(f"Config file not found: {config_fpath}")

        print(f"Loading config from: {config_fpath}")
        config_vars = {}
        # SECURITY: the config is executed as Python code (nanoGPT style).
        # Only load config files from trusted sources.
        with open(config_fpath) as f:
            exec(f.read(), config_vars)

        self.pool_type = config_vars['pool_type']
        self.use_graph_emb = config_vars['use_graph_emb']

        if 'n_nodes' in config_vars:
            self.n_nodes = config_vars['n_nodes']
        elif n_nodes == 'infer':
            raise AttributeError(
                """Number of nodes is not found in the provided config.
                    You need to supply it as an argument in QAOA_GPT constructor:
                    QAOA_GPT(..., n_nodes=,...)
                    """)
        else:
            if not isinstance(n_nodes, int):
                raise TypeError(
                    f"n_nodes must be an int, got {type(n_nodes).__name__}")
            self.n_nodes = n_nodes

        self.data_dir = Path(data_dir)
        self.model_ckpt = Path(model_ckpt)
        self.temp_folder = Path(temp_folder)

        self.seed = 1337
        self.init_from = 'resume'  # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
        self.device = device
        self.device_type = 'cuda' if 'cuda' in self.device else 'cpu'  # for later use in torch.autocast
        # Always defined, also for 'cpu' and indexed devices such as 'cuda:0'.
        self.dtype = self._select_dtype(self.device)

        self.compile = False  # use PyTorch 2.0 to compile the model to be faster

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed(self.seed)
        torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
        torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
        ptdtype = dtype_str_to_torch_dict[self.dtype]
        # Autocast is only used on CUDA; on CPU the model runs as loaded.
        if self.device_type == 'cpu':
            self.ctx = nullcontext()
        else:
            self.ctx = torch.amp.autocast(device_type=self.device_type,
                                          dtype=ptdtype)

        # SECURITY: unpickling executes code; only use trusted data dirs.
        self.meta = pd.read_pickle(self.data_dir / 'meta.pkl')

        if self.use_graph_emb:
            self.gptconfig = GPTConfig_gemb
            self.gpt = GPT_gemb
        else:
            self.gptconfig = GPTConfig_nogemb
            self.gpt = GPT_nogemb

        self.model = self.open_model(self.model_ckpt)

    @staticmethod
    def _select_dtype(device):
        """Return the dtype name to use for the given device string."""
        if 'cuda' not in device:
            return 'float32'
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
            return 'bfloat16'
        return 'float16'

    def open_model(
        self,
        model_fpath,
    ):
        """Load a checkpoint from ``model_fpath`` and return the model in eval mode."""
        # init from a model saved in a specific directory
        out_path = Path(model_fpath)
        # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(out_path, map_location=self.device)
        gptconf = self.gptconfig(**checkpoint['model_args'])
        model = self.gpt(gptconf)
        state_dict = checkpoint['model']
        # Strip the '_orig_mod.' prefix from the checkpoint's parameter names.
        unwanted_prefix = '_orig_mod.'
        for k in list(state_dict.keys()):
            if k.startswith(unwanted_prefix):
                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
        model.load_state_dict(state_dict)
        model.eval()
        model.to(self.device)

        return model

    def generate_circ_from_nx(
        self,
        graphs_container,
        calculate_classic_maxcut=True,
        n_samples_per_batch=50,  # max number of distinct graphs in a batch
        num_samples=5,  # number of samples to draw
        max_new_tokens=150,  # number of tokens generated in each sample
        temperature=0.1,  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
        top_k=200,  # retain only the top_k most likely tokens, clamp others to have 0 probability
    ):
        """Generate circuits for the given graphs and return them as a DataFrame.

        The graphs are converted to model input with ``prepare_model_input``
        and the sampling itself is delegated to ``generate_circ_from_df``.
        """
        graphs_nx_df, feather_par_emb, emb_graph_id_to_idx_dict = prepare_model_input(
            graphs_container,
            n_nodes=self.n_nodes,
            calculate_classic_maxcut=calculate_classic_maxcut,
        )

        # Graph embeddings are passed as float32 on CPU, model dtype otherwise.
        emb_dtype = "float" if self.device == 'cpu' else self.dtype

        gc_df = generate_circ_from_df(
            graphs_nx_df,
            graph_emb_np=feather_par_emb,
            emb_graph_id_to_idx_dict=emb_graph_id_to_idx_dict,
            meta=self.meta,
            model=self.model,
            device=self.device,
            ctx=self.ctx,
            n_samples_per_batch=n_samples_per_batch,
            num_samples=num_samples,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            token_seq_col='token_seq_round_d2',
            normalize_weights_flag=False,
            emb_dtype=dtype_str_to_torch_dict[emb_dtype],
        )

        return gc_df

    def eval_circ_df_cudaq(self, qaoa_gpt_circ_df, adapt_gpt_path='.'):
        """Evaluate generated circuits and return the selected result columns.

        ``adapt_gpt_path`` is not used; it is kept for interface compatibility.
        """
        qaoa_gpt_circ_eval_df = eval_adapt_gpt_circ_cudaq(
            qaoa_gpt_circ_df,
            n_nodes=self.n_nodes,
            temp_folder=self.temp_folder,
            pool_type=self.pool_type)

        output_columns_list = [
            "graph_prefix", "graph", "n_edges", "q_circuits",
            "adapt_gpt_energies"
        ]

        # Optional reference columns are forwarded when the input has them.
        for optional_column in ("true_energy", "energy_gurobi"):
            if optional_column in qaoa_gpt_circ_df.columns:
                output_columns_list.append(optional_column)

        return qaoa_gpt_circ_eval_df[output_columns_list]
def open_df_from_res_csv(fname):
    """Read one result CSV and tag its rows with the file stem.

    Args:
        fname: ``pathlib.Path`` of the CSV file.

    Returns:
        The DataFrame with an added 'worker_id' column holding
        ``fname.stem``, or None when reading or tagging fails (the error
        is printed together with the file name).
    """
    try:
        loaded_df = pd.read_csv(fname)
        loaded_df['worker_id'] = fname.stem
    except Exception as e:
        # Best effort: a broken file is reported and skipped by the caller.
        print(f'{e} (file: {fname})')
        return None
    return loaded_df
def generate_tokens(results_fpath_str: str,
                    save_path_str: str,
                    config_path_temp: str,
                    n_nodes: int,
                    rounding_digits=2,
                    min_block_size=128,
                    max_block_size=256,
                    val_frac=0.1,
                    test_frac=0.1,
                    max_abs_param_val=10,
                    perform_coef_mod_range=True,
                    apply_sliding_window=True,
                    n_workers=1,
                    skip_only_qaoa_circ=False,
                    allowed_graph_generators_list=None,
                    approx_ratio_thr=0.97,
                    debug_limit=0,
                    target_val_size=10,
                    verbose=True):
    """Turn ADAPT-QAOA result files into a tokenized training set.

    The function reads the ``res`` and ``graphs`` CSV files, merges them,
    embeds every graph with FEATHER, filters the circuits, converts each
    circuit into a token sequence, splits the graphs into train/val/test and
    writes everything (arrays, pickles, metadata and a filled-in training
    config) into ``save_path_str``.

    Args:
        results_fpath_str: One or more result directories separated by ``;``.
            Each must contain ``res`` and ``graphs`` subdirectories with CSVs.
        save_path_str: Output directory; created when missing.
        config_path_temp: Directory holding
            ``train_adapt_gpt_config_template.py``.
        n_nodes: Number of nodes per graph. Graphs with a different node count
            get no embedding and are left out of the sliding-window arrays.
        rounding_digits: Decimal digits kept for coefficients and embeddings.
        min_block_size: Smallest sliding-window length.
        max_block_size: Largest sliding-window length; windows are zero-padded
            up to this length.
        val_frac: Fraction of graphs assigned to the validation split.
        test_frac: Fraction of graphs assigned to the test split.
        max_abs_param_val: Coefficients must lie strictly inside
            ``(-max_abs_param_val, max_abs_param_val)``.
        perform_coef_mod_range: When true, beta values are wrapped with
            ``arth_mod(beta, pi)`` before the range check. Gamma is not
            wrapped.
        apply_sliding_window: When true, build and save the ``train.npy``,
            ``val.npy`` and ``test.npy`` window arrays.
        n_workers: Worker count for CSV loading and graph embedding.
        skip_only_qaoa_circ: Drop circuits whose mixer indices all equal
            ``n_nodes + 1``.
        allowed_graph_generators_list: Values of ``g_method`` to keep.
            ``None`` (default) or ``['all']`` keeps every generator.
        approx_ratio_thr: Keep only circuits with ``approx_ratio`` above this.
        debug_limit: When non-zero, read at most this many CSV files per
            directory.
        target_val_size: Used to derive the per-edge-count bucket size for the
            saved validation subset.
        verbose: Print progress information.

    Returns:
        None. All results are written to ``save_path_str``.
    """
    # ``None`` replaces the former mutable ``['all']`` default.
    if allowed_graph_generators_list is None:
        allowed_graph_generators_list = ['all']

    def log(*args):
        """Print only when verbose output was requested."""
        if verbose:
            print(*args)

    results_fpath_list = [Path(el) for el in results_fpath_str.split(';')]
    save_path = Path(save_path_str)

    for results_fpath in results_fpath_list:
        assert results_fpath.exists() and results_fpath.is_dir(
        ), "Results path is invalid."

    log(f"Results paths: {results_fpath_list}")

    def load_csv_frames(subdir, desc):
        """Read every CSV below ``<dataset>/<subdir>``; unreadable files are dropped."""
        frames = []
        for dataset_path in results_fpath_list:
            csv_files = sorted(dataset_path.joinpath(subdir).glob('*.csv'))
            if debug_limit:
                csv_files = csv_files[:debug_limit]
            frames += Parallel(n_jobs=n_workers)(
                delayed(open_df_from_res_csv)(fname) for fname in tqdm(
                    csv_files, desc=f'{desc} ({dataset_path.stem})'))
        return [df for df in frames if df is not None]

    ##########################################
    ## ADAPT-QAOA results
    ##########################################
    log("Opening ADAPT-QAOA results...")
    df_list = load_csv_frames('res', 'Opening ADAPT results')
    log("df_list len:", len(df_list))

    full_run_df = pd.concat(df_list)
    # The prefix is the worker id minus a fixed-length 15-character suffix.
    full_run_df['prefix'] = full_run_df['worker_id'].apply(lambda x: x[:-15])

    ####################################
    ## Graphs
    ####################################
    log("Opening graphs results...")
    df_list = load_csv_frames('graphs', 'Opening graphs')
    # Files without a generator column are labelled "erdos_renyi". This now
    # covers the frames of every dataset, not only the last directory read.
    for df in df_list:
        if 'g_method' not in df.columns:
            df['g_method'] = "erdos_renyi"
    log("df_list len:", len(df_list))

    full_run_graphs_df = pd.concat(df_list)
    full_run_graphs_df['edgelist_list'] = (
        full_run_graphs_df['edgelist_json'].progress_apply(
            lambda x: json.loads(x)))
    full_run_graphs_df['edgelist_list_len'] = (
        full_run_graphs_df['edgelist_list'].progress_apply(lambda x: len(x)))
    full_run_graphs_df['num_connected_comp'] = full_run_graphs_df[
        'edgelist_list'].progress_apply(lambda x: len(
            list(nx.connected_components(nx.Graph([edge[:2] for edge in x])))))
    # Graph files use a 14-character suffix (result files use 15).
    full_run_graphs_df['prefix'] = full_run_graphs_df['worker_id'].apply(
        lambda x: x[:-14])
    log("Graphs count:")
    log(full_run_graphs_df['g_method'].value_counts())

    ###################################################
    ## Merge adapt-qaoa results and graphs results
    ###################################################
    log("Merging ADAPT-QAOA results and graphs results...")
    log(f"Number of rows in full_run_df: {len(full_run_df)}")
    log("Columns in full_run_df:", full_run_df.columns)
    log("Sample rows from full_run_df:")
    log(full_run_df.head())

    combined_res_df = pd.merge(left=full_run_df,
                               right=full_run_graphs_df,
                               left_on=['prefix', 'graph_num'],
                               right_on=['prefix', 'graph_num'])

    log("Unique prefixes in full_run_df:", full_run_df['prefix'].unique())
    log("Unique prefixes in full_run_graphs_df:",
        full_run_graphs_df['prefix'].unique())
    log("Unique graph_num in full_run_df:", full_run_df['graph_num'].unique())
    log("Unique graph_num in full_run_graphs_df:",
        full_run_graphs_df['graph_num'].unique())

    ###############################################
    # Add derived columns
    ###############################################
    # List-valued columns come back from CSV as strings; parse them once.
    combined_res_df["energy_list"] = combined_res_df["energy_list"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    combined_res_df["n_layers"] = combined_res_df["energy_list"].apply(len)
    combined_res_df['graph_id'] = (combined_res_df['prefix'] + '_^_' +
                                   combined_res_df['graph_num'].astype(str))
    combined_res_df['mixer_pool_index'] = combined_res_df[
        'mixer_pool_index'].apply(
            lambda x: [int(i) for i in ast.literal_eval(x)]
            if isinstance(x, str) else x)
    combined_res_df['only_qaoa_circ'] = combined_res_df[
        'mixer_pool_index'].progress_apply(
            lambda x: all(e == n_nodes + 1 for e in x))

    # Filter by allowed graph generators if needed
    if allowed_graph_generators_list != ['all']:
        log(f"Filtering graphs based on allowed generators: {allowed_graph_generators_list}"
           )
        log(f"N circuits before: {len(combined_res_df)}")
        combined_res_df = combined_res_df[combined_res_df['g_method'].isin(
            allowed_graph_generators_list)]
        log(f"N circuits after: {len(combined_res_df)}")

    log(combined_res_df['g_method'].value_counts())

    ############################################
    # Graph embedding
    ############################################
    log("Applying FEATHER graph embedding...")

    combined_unique_graphs_df = (combined_res_df[['graph_id', 'edgelist_json'
                                                 ]].drop_duplicates())

    def create_weighted_graph_nx(w_elist):
        """Build a weighted networkx graph from (u, v, w) triples."""
        G = nx.Graph()
        G.add_weighted_edges_from(w_elist)
        return G

    # Stored edge lists are 1-based; shift node ids to 0-based.
    combined_unique_graphs_df['edgelist_py_list'] = combined_unique_graphs_df[
        'edgelist_json'].progress_apply(lambda x: [(e[0] - 1, e[1] - 1, e[2])
                                                   for e in json.loads(x)])
    combined_unique_graphs_df['graph_nx'] = (
        combined_unique_graphs_df['edgelist_py_list'].progress_apply(
            lambda x: convert_node_labels_to_integers(
                create_weighted_graph_nx(x))))

    graphs_nx_dict = combined_unique_graphs_df.set_index(
        'graph_id')['graph_nx'].to_dict()
    # Only graphs with exactly n_nodes nodes are embedded.
    graphs_nx_filt_dict = {
        name: g
        for name, g in tqdm(graphs_nx_dict.items())
        if g.number_of_nodes() == n_nodes
    }
    graphs_nx_filt_names = list(graphs_nx_filt_dict.keys())
    graphs_nx_filt_list = list(graphs_nx_filt_dict.values())

    emb_graph_idx_to_id_dict = dict(enumerate(graphs_nx_filt_names))
    emb_graph_id_to_idx_dict = {
        v: k for k, v in enumerate(graphs_nx_filt_names)
    }

    def get_feather_emb(g_list):
        """Embed a list of graphs with the custom FEATHER wrapper."""
        feather_model = FeatherGraph(order=5,
                                     eval_points=25,
                                     theta_max=2.5,
                                     seed=42,
                                     pooling="mean")
        feather_model.fit(graphs=g_list)
        return feather_model.get_embedding()

    def split_list(lst, n):
        """Split ``lst`` into ``n`` consecutive chunks of near-equal size."""
        it = iter(lst)
        return [
            list(islice(it, i)) for i in
            [len(lst) // n + (1 if x < len(lst) % n else 0) for x in range(n)]
        ]

    def embed_nx_w_feather_parallel(graphs_list, n_workers=2):
        """Embed chunks in parallel and stack them in the original order."""
        graphs_chunked_list = split_list(graphs_list, n_workers)
        emb_np_list = Parallel(n_jobs=n_workers)(
            delayed(get_feather_emb)(g_chunk)
            for g_chunk in graphs_chunked_list)
        return np.vstack(emb_np_list)

    feather_par_emb = embed_nx_w_feather_parallel(graphs_nx_filt_list,
                                                  n_workers=n_workers)
    feather_par_emb = feather_par_emb.round(rounding_digits)

    combined_res_df['has_emb'] = combined_res_df['graph_id'].apply(
        lambda x: x in emb_graph_id_to_idx_dict)

    #######################################
    # Filtering
    #######################################
    log("Filtering results...")
    log(f"Number of rows in combined_res_df before filtering: {len(combined_res_df)}"
       )

    combined_res_df['gamma_coef'] = combined_res_df['gamma_coef'].apply(
        lambda x: [float(coef) for coef in ast.literal_eval(x)]
        if isinstance(x, str) else x)
    combined_res_df['approx_ratio'] = pd.to_numeric(
        combined_res_df['approx_ratio'], errors='coerce')

    # Beta is range-checked later, per layer, inside tokenize_row.
    gamma_in_range = combined_res_df['gamma_coef'].apply(
        lambda x: all([abs(coef) < max_abs_param_val for coef in x]))
    combined_res_filt_df = combined_res_df[
        gamma_in_range &
        (combined_res_df['approx_ratio'] > approx_ratio_thr)].copy()

    if skip_only_qaoa_circ:
        log("Filtering out only QAOA circuits...")
        n_only_qaoa_circ = combined_res_filt_df['only_qaoa_circ'].sum()
        log(f"Removing {n_only_qaoa_circ} out of total {len(combined_res_filt_df)}"
           )
        # .copy() so the column assignments below act on an owned frame.
        combined_res_filt_df = combined_res_filt_df[
            combined_res_filt_df['only_qaoa_circ'] == False].copy()

    # gamma_coef and mixer_pool_index were already parsed above; only
    # beta_coef is still in its CSV string form.
    combined_res_filt_df['beta_coef'] = combined_res_filt_df['beta_coef'].apply(
        lambda x: [float(coef) for coef in ast.literal_eval(x)]
        if isinstance(x, str) else x)

    log(f"Number of rows in combined_res_filt_df after filtering: {len(combined_res_filt_df)}"
       )

    ########################################
    # Vocabulary
    ########################################
    log("Tokenizing...")

    # 'pad' must stay first: windows are padded with integer id 0.
    special_symbols_list = ['pad', 'bos', 'eos', 'new_layer_p', 'end_of_graph']
    tokens_list = list(special_symbols_list)

    ## Edges
    all_edges_set = {
        tuple(e[:2]) for g in combined_res_filt_df['edgelist_list'] for e in g
    }
    log(f"\tTotal tokens for edges: {len(all_edges_set)}")
    tokens_list += list(all_edges_set)

    ## Coeffs: every representable rounded value in the allowed range.
    n_steps = int((max_abs_param_val * 2 / (10**-rounding_digits)) + 1)
    all_coefs_round_set = set([
        round(coef, rounding_digits)
        for coef in np.linspace(start=-max_abs_param_val,
                                stop=max_abs_param_val,
                                num=n_steps).tolist()
    ])
    tokens_list += list(all_coefs_round_set)
    log(f"\tTotal tokens for coefs: {len(all_coefs_round_set)}")

    ## Operator pool
    ops_list = []
    for op_idx_list in combined_res_filt_df['mixer_pool_index']:
        ops_list += op_idx_list
    ops_list = list(set(ops_list))
    log(f"\tTotal tokens for operator pool: {len(ops_list)}")
    tokens_list += ops_list

    int_idx_to_token_dict = dict(enumerate(tokens_list))
    token_to_int_idx_dict = {v: k for k, v in int_idx_to_token_dict.items()}

    vocab_size = len(int_idx_to_token_dict)
    log(f"\tTotal tokens in vocab: {vocab_size}")

    ########################################
    # Tokenization
    ########################################
    # Beta should not exceed pi in the QAOA formulation, but the optimizer
    # occasionally returns larger values. Every numeric value needs its own
    # token, so beta (which is pi-periodic) is wrapped back into range.
    # Gamma wrapping gave inconsistent results and is not applied.

    def arth_mod(a, b):
        """Sign-preserving modulo: result lies in [0, b) for a >= 0, else [-b, 0)."""
        result = a % b
        return result if a >= 0 else result - b

    def tokenize_row(row, coef_mod=True):
        """Return the token sequence for one circuit, or None if a coefficient is out of range."""
        tokens_seq_list = ['bos']

        for edge in row['edgelist_list']:
            tokens_seq_list.append(tuple(edge[:2]))
            tokens_seq_list.append(edge[2])

        tokens_seq_list.append('end_of_graph')

        for p in range(row['n_layers']):
            tokens_seq_list.append('new_layer_p')
            tokens_seq_list.append(row['mixer_pool_index'][p])

            cur_beta = row['beta_coef'][p]
            if coef_mod:
                cur_beta = arth_mod(cur_beta, np.pi)
            if cur_beta > -max_abs_param_val and cur_beta < max_abs_param_val:
                tokens_seq_list.append(round(cur_beta, rounding_digits))
            else:
                return None

            cur_gamma = row['gamma_coef'][p]
            if cur_gamma > -max_abs_param_val and cur_gamma < max_abs_param_val:
                tokens_seq_list.append(round(cur_gamma, rounding_digits))
            else:
                return None

        tokens_seq_list.append('eos')
        return tokens_seq_list

    tok_col = f'token_seq_round_d{rounding_digits}'
    tok_int_col = f'token_int_seq_round_d{rounding_digits}'
    tok_sw_col = f'token_int_seq_round_d{rounding_digits}_sw'

    combined_res_filt_df[tok_col] = combined_res_filt_df.progress_apply(
        lambda x: tokenize_row(x, coef_mod=perform_coef_mod_range),
        axis=1,
    )

    # NOTE(review): dropna() removes rows with a missing value in ANY column,
    # not only rows whose tokenization returned None - confirm this is wanted.
    # .copy() avoids assigning into a view of combined_res_filt_df.
    combined_res_tok_df = combined_res_filt_df.dropna().copy()
    combined_res_tok_df[tok_int_col] = (
        combined_res_tok_df[tok_col].progress_apply(
            lambda x: [token_to_int_idx_dict[token] for token in x]))

    ########################################################
    # Generating training split for nanoGPT
    ########################################################
    log("Generating training, validation and test splits...")

    # Unseeded shuffle: the split differs from run to run.
    combined_res_tok_shf_df = (combined_res_tok_df.sample(frac=1).reset_index(
        drop=True))

    log(f"combined_res_df shape: {combined_res_df.shape}")
    log(f"combined_res_tok_df shape: {combined_res_tok_df.shape}")
    log(f"combined_res_tok_shf_df shape: {combined_res_tok_shf_df.shape}")

    # Split by graph id so all circuits of one graph land in the same split.
    graph_ids = combined_res_tok_shf_df['graph_id'].drop_duplicates().to_list()

    n_total = len(graph_ids)
    n_test = int(n_total * test_frac)
    n_val = int(n_total * val_frac)
    n_train = n_total - n_test - n_val

    train_graph_ids_set = set(graph_ids[:n_train])
    val_graph_ids_set = set(graph_ids[n_train:n_train + n_val])
    test_graph_ids_set = set(graph_ids[n_train + n_val:])

    assert len(train_graph_ids_set.intersection(val_graph_ids_set)) == 0
    assert len(train_graph_ids_set.intersection(test_graph_ids_set)) == 0
    assert len(val_graph_ids_set.intersection(test_graph_ids_set)) == 0

    def pad_with_zeros(seq, target_len):
        """Right-pad ``seq`` with 0 (the 'pad' token id) up to ``target_len``."""
        pad_len = target_len - len(seq)
        if pad_len > 0:
            padded_seq = seq + [0] * pad_len
        else:
            padded_seq = seq

        if len(padded_seq) != max_block_size:
            if verbose:
                print(f"padded_seq len: {len(padded_seq)}")
        return padded_seq

    def sliding_window(numbers, min_block_size, max_block_size):
        """Return padded [input, target] pairs where target is input shifted by one token."""
        if min_block_size != max_block_size:
            block_size = random.randint(min_block_size, max_block_size)
        else:
            block_size = min_block_size

        # Short sequence: one pair covering the whole sequence.
        if block_size >= len(numbers):
            return [[
                pad_with_zeros(numbers[:-1], target_len=max_block_size),
                pad_with_zeros(numbers[1:], target_len=max_block_size)
            ]]

        windows = [
            numbers[i:i + block_size]
            for i in range(0,
                           len(numbers) - block_size + 1)
        ]
        return [[
            pad_with_zeros(x, target_len=max_block_size),
            pad_with_zeros(y, target_len=max_block_size)
        ] for x, y in zip(windows, windows[1:])]

    # Assign the 'label' column based on the split
    combined_res_tok_shf_df['label'] = 'train'
    combined_res_tok_shf_df.loc[
        combined_res_tok_shf_df['graph_id'].isin(val_graph_ids_set),
        'label'] = 'val'
    combined_res_tok_shf_df.loc[
        combined_res_tok_shf_df['graph_id'].isin(test_graph_ids_set),
        'label'] = 'test'

    if apply_sliding_window:
        print('Applying sliding window...')
        combined_res_tok_shf_df[tok_sw_col] = combined_res_tok_shf_df[
            tok_int_col].progress_apply(
                lambda x: sliding_window(x,
                                         min_block_size=min_block_size,
                                         max_block_size=max_block_size))

    train_data = combined_res_tok_shf_df[combined_res_tok_shf_df['label'] ==
                                         'train']
    val_data = combined_res_tok_shf_df[combined_res_tok_shf_df['label'] ==
                                       'val']
    test_data = combined_res_tok_shf_df[combined_res_tok_shf_df['label'] ==
                                        'test']

    print(
        f"\tNumber of training samples: {len(train_data)}, val samples: {len(val_data)}, test samples: {len(test_data)}"
    )

    def concat_windows(split_df):
        """Concatenate the windows of one split and record each window's embedding row."""
        windows = []
        graph_idx_list = []
        for cur_graph_id, win_list in zip(split_df['graph_id'],
                                          split_df[tok_sw_col]):
            if cur_graph_id in emb_graph_id_to_idx_dict:
                windows += win_list
                graph_idx_list += [emb_graph_id_to_idx_dict[cur_graph_id]
                                  ] * len(win_list)
        return np.array(windows, dtype=np.uint16), graph_idx_list

    # Defined up front so the metadata below can be written even when the
    # sliding window is disabled (this used to fail with a NameError).
    train_data_graph_idx_list = []
    val_data_graph_idx_list = []
    test_data_graph_idx_list = []

    if apply_sliding_window:
        train_data_conc_np, train_data_graph_idx_list = concat_windows(
            train_data)
        val_data_conc_np, val_data_graph_idx_list = concat_windows(val_data)
        test_data_conc_np, test_data_graph_idx_list = concat_windows(test_data)

    #######################################
    # Saving
    #######################################
    log("Saving data...")

    save_path.mkdir(parents=True, exist_ok=True)

    if apply_sliding_window:
        np.save(save_path.joinpath('train.npy'), train_data_conc_np)
        np.save(save_path.joinpath('val.npy'), val_data_conc_np)
        np.save(save_path.joinpath('test.npy'), test_data_conc_np)

    combined_res_df.to_pickle(save_path.joinpath('combined_res_df.pkl'))
    combined_res_tok_shf_df.to_pickle(
        save_path.joinpath('combined_res_tok_shf_df.pkl'))

    sample_size_per_w_bucket = int(
        target_val_size /
        len(combined_res_df['edgelist_list_len'].drop_duplicates()))

    # NOTE(review): despite the name nothing is sampled here - whole
    # edge-count groups are kept when they have more rows than the bucket
    # size. Confirm whether per-group sampling was intended.
    val_data_sampled = (val_data[(~val_data[tok_col].isna())].groupby(
        'edgelist_list_len', group_keys=False).filter(lambda group: len(
            group) > sample_size_per_w_bucket).reset_index(drop=True))

    val_data_sampled.to_pickle(
        save_path.joinpath('combined_res_tok_shf_val_df.pkl'))

    emb_size = feather_par_emb.shape[1]
    np.save(save_path.joinpath(f'feather_emb_d{emb_size}.npy'), feather_par_emb)

    meta = {
        'vocab_size': vocab_size,
        'itos': int_idx_to_token_dict,
        'stoi': token_to_int_idx_dict,
        'train_data_graph_idx_list': train_data_graph_idx_list,
        'val_data_graph_idx_list': val_data_graph_idx_list,
        'test_data_graph_idx_list': test_data_graph_idx_list,
        'emb_graph_id_to_idx_dict': emb_graph_id_to_idx_dict,
        'emb_graph_idx_to_id_dict': emb_graph_idx_to_id_dict,
    }

    pd.to_pickle(meta, save_path.joinpath('meta.pkl'))

    with open(f'{config_path_temp}/train_adapt_gpt_config_template.py') as f:
        config_template_str = f.read()

    pool_type = combined_res_df['pool_type'].iloc[0]

    dataset_name = save_path.stem
    config_to_save_str = config_template_str.format(
        out_dir=f'out-{dataset_name}',
        dataset=dataset_name,
        block_size=max_block_size,
        use_graph_emb="True",
        pool_type=pool_type,
        n_nodes=n_nodes,
        token_seq_round=f'token_seq_round_d{rounding_digits}',
    )

    with open(save_path.joinpath('train_adapt_gpt_config.py'), 'w') as f:
        f.write(config_to_save_str)

    log(f"Data is saved to: {str(save_path.absolute())}")
    log("Done!")

    return
b/docs/sphinx/applications/python/qaoa_gpt_src/train_adapt_gpt_config_template.py new file mode 100644 index 00000000000..862b6ca8235 --- /dev/null +++ b/docs/sphinx/applications/python/qaoa_gpt_src/train_adapt_gpt_config_template.py @@ -0,0 +1,42 @@ +# Train an ADAPT-GPT model +# Based on https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py + +out_dir = '{out_dir}' +eval_interval = 250 # keep frequent because we'll overfit. Determines how often standard loss evaluation occurs +eval_iters = 200 #Determines how many batches are used to calculate validation and training loss during model evaluation +log_interval = 10 # don't print too too often +eval_ar_every = 5000 # how often we do approx ratio evaluation (calling ADAPT-QAOA cudaq). Controls how often the model performs domain-specific evaluation using approximation ratio + +# we expect to overfit on this small dataset, so only save when val improves +always_save_checkpoint = False + +dataset = '{dataset}' +gradient_accumulation_steps = 1 +batch_size = 64 +block_size = {block_size} # context of up to 256 previous characters + +# baby GPT model :) +n_layer = 6 +n_head = 6 +n_embd = 384 +dropout = 0.2 + +learning_rate = 1e-4 # with baby networks can afford to go a bit higher +n_epochs = 70 +max_iters = 30000 +lr_decay_iters = 30000 # make equal to max_iters usually +min_lr = 1e-5 # learning_rate / 10 usually +beta2 = 0.95 # make a bit bigger because number of tokens per iter is small + +warmup_iters = 100 # not super necessary potentially + +graph_emb_dim = 500 # default for FEATHER graph +use_graph_emb = {use_graph_emb} +pool_type = '{pool_type}' +n_nodes = {n_nodes} +token_seq_round = '{token_seq_round}' # rounding digits for token sequence +n_samples = 5 + +# on macbook also add +# device = 'cpu' # run on cpu only +# compile = False # do not torch compile the model diff --git a/docs/sphinx/applications/python/qaoa_gpt_src/util.py b/docs/sphinx/applications/python/qaoa_gpt_src/util.py 
def extract_graph(token_seq):
    """Split a token sequence into its graph prefix and circuit suffix.

    Args:
        token_seq: Sequence of tokens; the graph part ends at the first
            'end_of_graph' token.

    Returns:
        A tuple ``(graph_seq, adapt_seq)``. ``graph_seq`` is a list holding
        every token up to and including the first 'end_of_graph' (or the
        whole sequence when that token is absent). ``adapt_seq`` is the slice
        after it with the final token removed.
    """
    graph_seq = []
    for tok in token_seq:
        graph_seq.append(tok)
        if tok == 'end_of_graph':
            break
    # Slice by the prefix length rather than a loop index: the index was
    # never bound for an empty sequence, which raised a NameError.
    adapt_seq = token_seq[len(graph_seq):-1]
    return graph_seq, adapt_seq
def circ_sanity_check(cur_q_circ):
    """Check that a generated circuit has the expected 4-token layer layout.

    Each layer is read as (separator, operator index, value, value).

    Args:
        cur_q_circ: Flat list of circuit tokens.

    Returns:
        True when the length is a multiple of 4, every token at positions
        0, 4, 8, ... is a ``str`` and every token at positions 1, 5, 9, ...
        is exactly an ``int``; False otherwise. The two value slots of each
        layer are not inspected.
    """
    if len(cur_q_circ) % 4:
        return False

    # Exact type match on purpose: bool is an int subclass and must not pass.
    if any(type(el) is not int for el in cur_q_circ[1::4]):
        return False

    if any(type(el) is not str for el in cur_q_circ[0::4]):
        return False

    return True
def generate_circ_from_df(
    test_run_df,
    graph_emb_np,  # for models with graph emb
    emb_graph_id_to_idx_dict,  # for models with graph emb
    meta,
    model,
    device,
    ctx,
    n_samples_per_batch,
    num_samples=5,  # number of samples to draw
    max_new_tokens=200,  # number of tokens generated in each sample
    temperature=0.1,  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k=200,  # retain only the top_k most likely tokens, clamp others to have 0 probability
    token_seq_col='token_seq_round_d2',
    normalize_weights_flag=False,
    emb_dtype=torch.bfloat16,
):
    """Sample circuits from the model for every graph in ``test_run_df``.

    Graphs are grouped by edge count so that all prompts in a group have the
    same length and can be stacked into one tensor. Each group is processed in
    mini-batches of at most ``n_samples_per_batch`` graphs.

    Args:
        test_run_df: Frame with ``edgelist_list_len``, ``graph_id``,
            ``approx_ratio``, ``label`` and the ``token_seq_col`` column;
            ``true_energy`` and ``energy_gurobi`` are copied when present.
        graph_emb_np: Graph embedding matrix, or None to run without
            embeddings.
        emb_graph_id_to_idx_dict: Maps ``graph_id`` to a row of
            ``graph_emb_np``, or None to run without embeddings.
        meta: Dict providing the ``stoi`` and ``itos`` vocab mappings.
        model: Object exposing ``generate(...)``.
        device: Torch device for the prompt and embedding tensors.
        ctx: Context manager entered around every ``model.generate`` call.
        n_samples_per_batch: Maximum number of graphs per mini-batch.
        num_samples: Circuits to draw per graph.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        token_seq_col: Column holding the token sequence of each row.
        normalize_weights_flag: Divide edge weights by their sum when true.
        emb_dtype: Dtype of the embedding tensors.

    Returns:
        A ``pandas.DataFrame`` with one row per input graph holding the
        prompt graph, the reference circuit and the list of generated
        circuits (``q_circuits``).
    """
    gemb_flag = (graph_emb_np is not None and
                 emb_graph_id_to_idx_dict is not None)

    stoi, itos = meta['stoi'], meta['itos']

    def encode(tokens):
        return [stoi[c] for c in tokens]

    def decode(ids):
        return [itos[i] for i in ids]

    n_edges_to_count_dict = test_run_df['edgelist_list_len'].value_counts(
    ).to_dict()

    adapt_gpt_out_list_dict = defaultdict(list)
    y_dict = dict()

    pbar = tqdm(n_edges_to_count_dict.items())

    for n_edges, n_graphs in pbar:
        pbar.set_description(
            f"Inference. Current batch: n_edges: {n_edges}, n_graphs: {n_graphs}"
        )
        cur_test_run_df = test_run_df[test_run_df['edgelist_list_len'] ==
                                      n_edges]

        prompt_list = []
        emb_list = []
        for _, graph_df_row in cur_test_run_df.iterrows():
            start, adapt_seq = extract_graph(graph_df_row[token_seq_col])
            prompt_list.append(
                torch.tensor(encode(start), dtype=torch.long,
                             device=device)[None, ...])

            if gemb_flag:
                cur_graph_idx = emb_graph_id_to_idx_dict[
                    graph_df_row['graph_id']]
                emb_list.append(
                    torch.tensor(graph_emb_np[cur_graph_idx],
                                 dtype=emb_dtype,
                                 device=device))

            # Key order fixes the column order of the returned frame.
            adapt_gpt_out_dict = dict()
            adapt_gpt_out_dict['graph'] = start[1:-1]  # drop bos / end_of_graph
            adapt_gpt_out_dict['n_edges'] = graph_df_row['edgelist_list_len']
            adapt_gpt_out_dict['q_circuits'] = []
            adapt_gpt_out_dict['adapt_circuit'] = adapt_seq
            adapt_gpt_out_dict['adapt_full_ar'] = graph_df_row['approx_ratio']
            adapt_gpt_out_dict['graph_prefix'] = graph_df_row['graph_id']
            if 'true_energy' in graph_df_row:
                adapt_gpt_out_dict['true_energy'] = graph_df_row['true_energy']
            if 'energy_gurobi' in graph_df_row:
                adapt_gpt_out_dict['energy_gurobi'] = graph_df_row[
                    'energy_gurobi']
            adapt_gpt_out_dict['label'] = graph_df_row['label']
            adapt_gpt_out_list_dict[n_edges].append(adapt_gpt_out_dict)

        cur_batch_torch = torch.vstack(prompt_list)
        if gemb_flag:
            cur_emb_batch_torch = torch.vstack(emb_list)

        total_samples = cur_batch_torch.size(0)

        y_list = []
        with torch.no_grad():
            for start_idx in range(0, total_samples, n_samples_per_batch):
                end_idx = min(start_idx + n_samples_per_batch, total_samples)

                # repeat() tiles the mini-batch: rows are ordered
                # [sample 0 of every graph, sample 1 of every graph, ...].
                mini_batch_repeated = cur_batch_torch[start_idx:end_idx].repeat(
                    num_samples, 1)
                if gemb_flag:
                    mini_emb_batch_repeated = cur_emb_batch_torch[
                        start_idx:end_idx].repeat(num_samples, 1)

                with ctx:
                    if gemb_flag:
                        y = model.generate(mini_batch_repeated,
                                           mini_emb_batch_repeated,
                                           max_new_tokens,
                                           temperature=temperature,
                                           top_k=top_k)
                    else:
                        y = model.generate(mini_batch_repeated,
                                           max_new_tokens,
                                           temperature=temperature,
                                           top_k=top_k)

                # Un-tile per mini-batch into (num_samples, graphs, seq_len).
                # Flat concatenation followed by a stride over the whole
                # group mixed up graphs whenever a group needed more than one
                # mini-batch.
                # assumes generate() keeps the input row order - TODO confirm
                y_list.append(y.detach().cpu().reshape(num_samples,
                                                       end_idx - start_idx, -1))

        # (num_samples, n_graphs_in_group, seq_len)
        y_dict[n_edges] = torch.cat(y_list, dim=1)

    ### trimming the records (removing garbage after EOS)
    for n_edges, cur_adapt_gpt_out_list in adapt_gpt_out_list_dict.items():
        cur_full_y_tensor = y_dict[n_edges]

        for graph_idx, cur_out_dict in enumerate(cur_adapt_gpt_out_list):
            for k in range(num_samples):
                cur_gen_result = decode(cur_full_y_tensor[k,
                                                          graph_idx].tolist())
                cur_circ = []
                in_circ = False
                for tok in cur_gen_result:
                    if tok == 'end_of_graph':
                        in_circ = True
                    if in_circ:
                        cur_circ.append(tok)
                    if tok == 'eos':
                        break
                # Strip the leading 'end_of_graph' and the trailing token.
                cur_out_dict['q_circuits'].append(cur_circ[1:-1])

    ### flattening the circ list
    adapt_gpt_test_samples_list = []
    for cur_adapt_gpt_out_list in adapt_gpt_out_list_dict.values():
        adapt_gpt_test_samples_list += cur_adapt_gpt_out_list

    # The stored graph alternates edge tuple / weight; rebuild [u, v, w] rows.
    for gr_dict in adapt_gpt_test_samples_list:
        graph_edges_list = gr_dict['graph'][::2]
        graph_weights_list = gr_dict['graph'][1::2]

        if normalize_weights_flag:
            graph_w_norm = sum(graph_weights_list)
        else:
            graph_w_norm = 1.0

        gr_dict['graph_w_py'] = [
            list(edge) + [graph_weights_list[edge_idx] / graph_w_norm]
            for edge_idx, edge in enumerate(graph_edges_list)
        ]
        gr_dict['graph_weight_norm'] = graph_w_norm

    # No circuit or record is filtered out: the earlier sanity-check and
    # positive-flag passes never rejected anything, so they were removed.
    return pd.DataFrame(adapt_gpt_test_samples_list)
+ # else: + # q_circ_filt_list.append(circ) + q_circ_filt_list.append(circ) + adapt_gpt_test_samples_list[idx]['q_circuits'] = q_circ_filt_list + + adapt_gpt_test_samples_list[idx]['q_circuits'] = q_circ_filt_list + + for gr_dict in adapt_gpt_test_samples_list: + graph_py_list = [] + + graph_edges_list = gr_dict['graph'][::2] + graph_weights_list = gr_dict['graph'][1::2] + + if normalize_weights_flag: + graph_w_norm = sum(graph_weights_list) + else: + graph_w_norm = 1.0 + + for edge_idx, edge in enumerate(graph_edges_list): + cur_edge = list(edge) + cur_edge += [graph_weights_list[edge_idx] / graph_w_norm] + graph_py_list.append(cur_edge) + + gr_dict['graph_w_py'] = graph_py_list + gr_dict['graph_weight_norm'] = graph_w_norm + + ## make it more error-prone + + adapt_gpt_test_samples_filt_list = [] + + for rec in adapt_gpt_test_samples_list: + pos_flag = 1 + # if len(rec['adapt_circuit']) % 4: + # pos_flag = 0 + # for gpt_circ in rec['q_circuits']: + # if len(gpt_circ) % 4: + # pos_flag = 0 + + if pos_flag: + adapt_gpt_test_samples_filt_list.append(rec) + + adapt_gpt_test_samples_df = pd.DataFrame(adapt_gpt_test_samples_filt_list) + + return adapt_gpt_test_samples_df + + +################################################################################# + + +def elist_to_nx(input_elist, idx_shift=True): + """Convert a list of edges to a NetworkX graph. + + Parameters: + - input_elist: List of edges in the format [(src, dst, weight), ...]. + - idx_shift: If True, shifts node indices from 1-based to 0-based. + + Returns: + - A NetworkX graph object. 
+ """ + + elist = [] + if idx_shift: + for u, v, w in input_elist: + elist.append((u - 1, v - 1, w)) + else: + elist = input_elist + + G = nx.Graph() + G.add_weighted_edges_from(elist) + + return G + + +################################################ +def check_if_nx_graph_is_weighted(graph_nx): + return all('weight' in graph_nx[u][v] for u, v in graph_nx.edges) + + +############################################## +def nx_to_elist(nx_graph, idx_shift=True): + '''Convert a NetworkX graph to a weighted edge list. + + Parameters: + - nx_graph: A NetworkX graph object + + Returns: + - A dictionary with keys "elist" (list of edges) and "n_nodes" (number of nodes). + ''' + + if not isinstance(nx_graph, nx.Graph): + raise TypeError("Input must be a NetworkX graph object.") + + # Check if the graph is weighted + + weighted = check_if_nx_graph_is_weighted(nx_graph) + if not weighted: + raise ValueError( + "Current version of QAOA-GPT does not support unweighted graphs. " + "Weights w are expected to be sampled from U(0,1).") + shifted_elist = [] + for edge_idx, (n1, n2) in enumerate(nx_graph.edges): + cur_e_weight = nx_graph[n1][n2]['weight'] + if idx_shift: + # Shift node indices from 0-based to 1-based + n1 += 1 + n2 += 1 + + shifted_elist.append((n1, n2, cur_e_weight)) + graph_nx_from_edges = nx.from_edgelist(nx_graph.edges) + n_nodes = graph_nx_from_edges.number_of_nodes() + + return {"elist": shifted_elist, "n_nodes": n_nodes} + + +################################################ +def graph_to_edgelist(g): + """Return a weighted edge list: (src, dst, weight) for all edges.""" + return [(u, v, d['weight']) for u, v, d in g.edges(data=True)] + + +######################## +def eval_ansatz(edgelist, q_circuit, n_nodes, pool_type, verbose=False): + """ + Evaluate the ansatz using CUDA-Q. + + Parameters: + - edgelist: List of edges in the graph. + - q_circuit: The quantum circuit to evaluate. + - n_nodes: Number of nodes in the graph. 
+ - pool_type: Type of pooling to use (e.g., 'all_pool'). + + Returns: + - Energy value of the evaluated circuit. + """ + + # Convert edgelist to a format suitable for CUDA-Q + # idx_shift=True means that node indices in edgelist start from 1 + # idx_shift=False means that node indices in edgelist start from 0 + + g = elist_to_nx(edgelist, idx_shift=True) + + e_list = graph_to_edgelist(g) + + # Hamiltonian + spin_ham = max_cut_ham(e_list) + qubits_num = n_nodes + + # Get the coefficients and pauli words of the Hamiltonian + ham_coeffs = term_coefficients(spin_ham) + ham_words = term_words(spin_ham, qubits_num) + + # Get the pool of operators + if pool_type == 'all_pool': + pools = all_pool(qubits_num) + elif pool_type == 'qaoa_mixer': + pools = qaoa_mixer(qubits_num) + elif pool_type == 'qaoa_single_x': + pools = qaoa_single_x(qubits_num) + elif pool_type == 'qaoa_double_ops': + pools = qaoa_double(qubits_num) + else: + raise ValueError( + "Invalid pool name. Choose from 'all_pool', 'qaoa_mixer', 'qaoa_single_x', or 'qaoa_double'." + ) + + op_indeces = [] + beta_coef = [] + gamma_coef = [] + + for j in range(0, len(q_circuit), 4): + op_indeces.append(q_circuit[j + 1] - 1) # Convert to 0-based index + beta_coef.append(q_circuit[j + 2]) + gamma_coef.append(q_circuit[j + 3]) + + temp = [] + for idx in op_indeces: + if idx < len(pools): + temp.append(pools[idx]) + else: + raise ValueError( + f"Operator index {idx} out of range for the pool of size {len(pools)}." 
+ ) + + mixer_pool = [] + mixer_pool_str = [] + + for op in temp: + temp_op = [] + temp_op_str = [] + for term in op: + temp_op.append(cudaq.pauli_word(term.get_pauli_word(qubits_num))) + temp_op_str.append(term.get_pauli_word(qubits_num)) + mixer_pool.append(temp_op) + mixer_pool_str.append(temp_op_str) + if verbose: + print( + f"Using {len(mixer_pool)} operators from the pool: {mixer_pool_str}" + ) + + @cudaq.kernel + def kernel_qaoa(qubits_num: int, ham_words: list[cudaq.pauli_word], + ham_coeffs: list[complex], + mixer_pool: list[list[cudaq.pauli_word]], + gamma: list[float], beta: list[float], num_layer: int): + + qubits = cudaq.qvector(qubits_num) + + h(qubits) + + idx = 0 + for p in range(num_layer): + + for i in range(len(ham_coeffs)): + exp_pauli(gamma[p] * ham_coeffs[i].real, qubits, ham_words[i]) + + for word in mixer_pool[p]: + exp_pauli(beta[p], qubits, word) + + num_layer = len(mixer_pool) + energy_final = cudaq.observe(kernel_qaoa, spin_ham, qubits_num, ham_words, + ham_coeffs, mixer_pool, gamma_coef, beta_coef, + num_layer).expectation() + + return energy_final.real + + +############################# +def process_graph(graph_idx, + adapt_gpt_out_list, + n_nodes, + pool_type, + verbose=False): + + adapt_gpt_out_dict = adapt_gpt_out_list[graph_idx] + edgelist = adapt_gpt_out_dict["graph_w_py"] + adapt_gpt_energies_list = [] + + q_circuits = adapt_gpt_out_dict.get("q_circuits", []) + + if verbose: + print(f"Processing graph {graph_idx} with {len(q_circuits)} circuits.") + + for i in range(len(q_circuits) + 1): + if i < len(q_circuits): + generated_list = q_circuits[i] + if verbose: + print(f"Evaluating circuit {i}: {generated_list}") + else: + if verbose: + print("No more circuits to process, using adapt_circuit.") + # If no circuits left, use the adapt_circuit + if "adapt_circuit" not in adapt_gpt_out_dict: + raise ValueError( + "No circuits available for evaluation and 'adapt_circuit' not found in the output dictionary." 
+ ) + else: + if verbose: + print("Using adapt_circuit for evaluation.") + if verbose: + print(adapt_gpt_out_dict["adapt_circuit"]) + generated_list = adapt_gpt_out_dict["adapt_circuit"] + + E_final = 999 # Default value if sth goes wrong + try: + E_final = eval_ansatz(edgelist, + generated_list, + n_nodes, + pool_type, + verbose=verbose) + except Exception as e: + print(f"Error in eval_ansatz: {e}") + #except Exception: + # pass + + if i < len(q_circuits): + adapt_gpt_energies_list.append(E_final) + else: + adapt_gpt_out_dict["ADAPT_energy_round"] = E_final + + adapt_gpt_out_dict["adapt_gpt_energies"] = adapt_gpt_energies_list + + return + + +########################## + + +def run_circuit_cudaq(input_fpath, + output_fpath, + n_nodes, + pool_type, + verbose=False): + + with open(input_fpath, "r") as f: + adapt_gpt_out_list = json.load(f) + + for graph_idx in tqdm(range(len(adapt_gpt_out_list)), + desc="Processing graphs"): + process_graph(graph_idx, adapt_gpt_out_list, n_nodes, pool_type, + verbose) + + # Save the results to the output file + with open(output_fpath, "w") as f: + json.dump(adapt_gpt_out_list, f) + + return + + +################################################################################ + + +def eval_adapt_gpt_circ_cudaq(adapt_gpt_res_df, + temp_folder, + n_nodes, + pool_type="all_pool", + verbose=False): + print(">>> eval_adapt_gpt_circ_cudaq CALLED <<<") # Add this line + formatted_timestamp = datetime.now().strftime('%Y-%m-%d__%H_%M_%S') + + temp_folder = Path(temp_folder) + + temp_folder.mkdir(parents=True, exist_ok=True) + + prefix = f'adapt_gpt_res_{formatted_timestamp}_df' + in_fname = f'{prefix}.json' + out_fname = f'{prefix}_cudaq.json' + + in_fname_path = temp_folder.joinpath(in_fname).resolve() + out_fname_path = temp_folder.joinpath(out_fname).resolve() + + adapt_gpt_res_df.to_json(in_fname_path, orient='records') + + run_circuit_cudaq(str(in_fname_path), str(out_fname_path), n_nodes, + pool_type, verbose) + + 
adapt_gpt_res_w_energies_df = pd.read_json(out_fname_path) + + return adapt_gpt_res_w_energies_df + + +################################################################### + + +def gurobi_max_cut_val_from_nx(graph_nx): + + model = Model("Max-Cut") + model.setParam('OutputFlag', False) + model.setParam(GRB.Param.TimeLimit, 10) + variables = {} + for node in graph_nx.nodes: + variables[node] = model.addVar(vtype=GRB.BINARY, name=f"x_{node}") + + objective = 0 + for u, v, w in graph_nx.edges(data="weight"): + objective -= w * ((2 * variables[v] * variables[u]) - + (variables[v] + variables[u])) + + model.setObjective(objective, GRB.MAXIMIZE) + model.optimize() + solution = [variables[node].x for node in graph_nx.nodes] + + return -model.ObjVal + + +######################################################## +# Graph embedding + + +def get_feather_emb( + graphs_nx_df, + n_workers, + n_nodes, + rounding_digits=2, +): + + combined_unique_graphs_df = (graphs_nx_df[['graph_id', 'edgelist_json' + ]].drop_duplicates()) + + def create_weighted_graph_nx(w_elist): + G = nx.Graph() + G.add_weighted_edges_from(w_elist) + return G + + combined_unique_graphs_df['edgelist_py_list'] = combined_unique_graphs_df[ + 'edgelist_json'].apply(lambda x: [(e[0] - 1, e[1] - 1, e[2]) + for e in json.loads(x) + #(e[0]-1, e[1]-1) for e in x + ]) + + combined_unique_graphs_df['graph_nx'] = ( + combined_unique_graphs_df['edgelist_py_list'].apply( + lambda x: create_weighted_graph_nx(x))) + + combined_unique_graphs_w_idx_df = combined_unique_graphs_df.set_index( + 'graph_id') + graphs_nx_dict = combined_unique_graphs_w_idx_df['graph_nx'].to_dict() + graphs_nx_filt_dict = dict([(name, g) + for name, g in graphs_nx_dict.items() + if g.number_of_nodes() == n_nodes]) + graphs_nx_filt_names = list(graphs_nx_filt_dict.keys()) + graphs_nx_filt_list = list(graphs_nx_filt_dict.values()) + + emb_graph_idx_to_id_dict = { + k: v for k, v in enumerate(graphs_nx_filt_names) + } + emb_graph_id_to_idx_dict = { + v: k 
for k, v in enumerate(graphs_nx_filt_names) + } + + def get_single_thread_feather_emb(g_list): + # Using our custom wrapper with the original FEATHER implementation + feather_model = FeatherGraph(order=5, + eval_points=25, + theta_max=2.5, + seed=42, + pooling="mean") + feather_model.fit(graphs=g_list) + return feather_model.get_embedding() + + def split_list(lst, n): + it = iter(lst) + return [ + list(islice(it, i)) for i in + [len(lst) // n + (1 if x < len(lst) % n else 0) for x in range(n)] + ] + + def embed_nx_w_feather_parallel(graphs_list, n_workers=n_workers): + graphs_chunked_list = split_list(graphs_list, n_workers) + + #graphs_chunked_list=[graphs_list] + + emb_np_list = Parallel(n_jobs=n_workers)( + delayed(get_single_thread_feather_emb)(g_chunk) + for g_chunk in graphs_chunked_list) + + return np.vstack(emb_np_list) + + feather_par_emb = embed_nx_w_feather_parallel(graphs_nx_filt_list[:], + n_workers=n_workers) + feather_par_emb = feather_par_emb.round(rounding_digits) + + return feather_par_emb, emb_graph_idx_to_id_dict + + +######################################################## +def seq_tokenize_graph(elist): + tok_list = ['bos'] + for n1, n2, w in elist: + tok_list += [tuple(sorted([n1, n2])), w] + tok_list.append('end_of_graph') + return tok_list + + +######################################################### +def prepare_model_input( + graphs_container, + n_nodes, + calculate_classic_maxcut=True, + n_workers_feather=1, +): + + if type(graphs_container) == list: + graphs_edgelist_list_dict = { + f'graph_{i}': g for i, g in enumerate(graphs_container) + } + elif type(graphs_container) == dict: + graphs_edgelist_list_dict = graphs_container + else: + raise ValueError( + "Only list or dict containers are supported for input graphs!") + + graphs_nx_dict = defaultdict(dict) + + for name, nx_graph in tqdm(graphs_edgelist_list_dict.items(), + desc='Preparing graphs...'): + nx_elist_dict = nx_to_elist(nx_graph) + + graphs_nx_dict[name]['elist'] = 
nx_elist_dict['elist'] + graphs_nx_dict[name]['n_nodes'] = nx_elist_dict['n_nodes'] + if calculate_classic_maxcut: + graphs_nx_dict[name]['energy_gurobi'] = gurobi_max_cut_val_from_nx( + nx_graph) + + graphs_nx_df = pd.DataFrame(graphs_nx_dict).T.reset_index(names='graph_id') + graphs_nx_df['token_seq_round_d2'] = graphs_nx_df['elist'].apply( + seq_tokenize_graph) + graphs_nx_df['edgelist_list_len'] = graphs_nx_df['elist'].apply(len) + graphs_nx_df['approx_ratio'] = None + graphs_nx_df['label'] = 'test_interactive' + graphs_nx_df['edgelist_json'] = graphs_nx_df['elist'].apply( + lambda x: json.dumps(x)) + + print("Performing FEATHER embedding") + feather_par_emb, emb_graph_idx_to_id_dict = get_feather_emb( + graphs_nx_df, + n_workers=n_workers_feather, + n_nodes=n_nodes, + ) + + emb_graph_id_to_idx_dict = { + v: k for k, v in emb_graph_idx_to_id_dict.items() + } + + graphs_nx_df['has_emb'] = graphs_nx_df['graph_id'].apply( + lambda x: True if x in emb_graph_id_to_idx_dict else False) + + graphs_nx_df = graphs_nx_df[graphs_nx_df['has_emb']] + + graphs_nx_df['graph_id'].apply(lambda x: x[:2]).value_counts() + + return graphs_nx_df, feather_par_emb, emb_graph_id_to_idx_dict diff --git a/docs/sphinx/using/applications.rst b/docs/sphinx/using/applications.rst index 2fcb260b060..8af982d1a74 100644 --- a/docs/sphinx/using/applications.rst +++ b/docs/sphinx/using/applications.rst @@ -311,6 +311,15 @@ CUDA-Q Applications QM/MM partitioning in the PE model. +
+ QAOA-GPT: GPT model for generating efficient QAOA quantum circuits. +
+ Learn how to implement the QAOA-GPT algorithm using CUDA-Q. +
+ QAOA-GPT for generating QAOA circuits. +
+ +