From 1e5afd6fb5653eddc15aafcae8c20f5222e4e1e3 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Mon, 15 Jan 2024 16:36:57 +0100
Subject: [PATCH] Switch TinyLlama pretraining back to 16-mixed (#882)

---
 pretrain/tinyllama.py           |  2 +-
 tutorials/pretrain_tinyllama.md | 23 ++++++++++++-----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/pretrain/tinyllama.py b/pretrain/tinyllama.py
index 76281a16b7..c876e6f96e 100644
--- a/pretrain/tinyllama.py
+++ b/pretrain/tinyllama.py
@@ -70,7 +70,7 @@ def setup(resume: Union[bool, Path] = False):
     logger = choose_logger(logger_name, name=name, resume=resume)
 
     strategy = FSDPStrategy(auto_wrap_policy={Block}, state_dict_type="full", sharding_strategy="HYBRID_SHARD")
-    fabric = L.Fabric(devices=devices, strategy=strategy, precision="bf16-true", loggers=[logger])
+    fabric = L.Fabric(devices=devices, strategy=strategy, precision="bf16-mixed", loggers=[logger])
     fabric.launch()
 
     fabric.print(hparams)
diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md
index a3ba17a32f..bec8d1d086 100644
--- a/tutorials/pretrain_tinyllama.md
+++ b/tutorials/pretrain_tinyllama.md
@@ -11,17 +11,18 @@ This tutorial will walk you through pretraining [TinyLlama](https://github.com/j
 
 Here is a quick fact sheet:
 
-| Name                         | Description                                                                                                                                                  |
-|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Parameters                   | 1.1B                                                                                                                                                         |
-| Model Size                   | Layers: 22, Heads: 32, Query Groups: 4, Embedding Size: 2048, Intermediate Size: 5632                                                                        |
-| Sequence Length              | 2048                                                                                                                                                         |
-| Batch Size                   | 2 million tokens (2048 * 1024)                                                                                                                               |
-| Learning Rate                | 4e-4                                                                                                                                                         |
-| Learning Rate Schedule       | Cosine with 2000 warmup steps                                                                                                                                |
-| Training Data                | [SlimPajama](https://huggingface.co/datasets/cerebras/slimpajama-627b) (893 GB), [Starcoder](https://huggingface.co/datasets/bigcode/starcoderdata) (290 GB) |
-| Combined Dataset Size        | Around 950B tokens                                                                                                                                           |
-| Total Tokens During Training | 3 trillion                                                                                                                                                   |
+| Name                          | Description                                                                                                                                                  |
+|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Parameters                    | 1.1B                                                                                                                                                         |
+| Model Size                    | Layers: 22, Heads: 32, Query Groups: 4, Embedding Size: 2048, Intermediate Size: 5632                                                                        |
+| Sequence Length               | 2048                                                                                                                                                         |
+| Learning Rate                 | 4e-4                                                                                                                                                         |
+| Learning Rate Schedule        | Cosine with 2000 warmup steps                                                                                                                                |
+| Training Data                 | [SlimPajama](https://huggingface.co/datasets/cerebras/slimpajama-627b) (893 GB), [Starcoder](https://huggingface.co/datasets/bigcode/starcoderdata) (290 GB) |
+| Combined Dataset Size         | Around 950B tokens                                                                                                                                           |
+| Total Tokens During Training  | 3 trillion (3 epochs)                                                                                                                                        |
+| Time to complete training     | ~ 4 weeks with 64 A100 GPUs                                                                                                                                  |
+| Model FLOPs Utilization (MFU) | 52%                                                                                                                                                          |
 
 (this table was sourced from the author's [README](https://github.com/jzhang38/TinyLlama/))
 
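
Note (not part of the patch above): in Lightning Fabric, "bf16-true" casts the model weights themselves to bfloat16, while "bf16-mixed" keeps the weights in float32 and runs the forward pass under a bfloat16 autocast context. Below is a minimal sketch of the setting the patch switches to, assuming `lightning` 2.x and `torch` are installed; the toy linear module and single device are hypothetical stand-ins for the TinyLlama `GPT` and the multi-GPU FSDP setup in `pretrain/tinyllama.py`.

```python
# Minimal sketch, not the training script: illustrates what precision="bf16-mixed" does.
import lightning as L
import torch

fabric = L.Fabric(devices=1, precision="bf16-mixed")  # weights stay fp32, forward autocasts to bf16
fabric.launch()

model = fabric.setup(torch.nn.Linear(16, 16))  # hypothetical toy module, not TinyLlama's GPT
x = torch.randn(4, 16, device=fabric.device)
y = model(x)

# Expected: parameters remain torch.float32, activations come out as torch.bfloat16.
print(next(model.parameters()).dtype, y.dtype)
```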