From 1e5afd6fb5653eddc15aafcae8c20f5222e4e1e3 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Mon, 15 Jan 2024 16:36:57 +0100
Subject: [PATCH] Switch TinyLlama pretraining back to 16-mixed (#882)

---
 pretrain/tinyllama.py           |  2 +-
 tutorials/pretrain_tinyllama.md | 23 ++++++++++++-----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/pretrain/tinyllama.py b/pretrain/tinyllama.py
index 76281a16b7..c876e6f96e 100644
--- a/pretrain/tinyllama.py
+++ b/pretrain/tinyllama.py
@@ -70,7 +70,7 @@ def setup(resume: Union[bool, Path] = False):
     logger = choose_logger(logger_name, name=name, resume=resume)
 
     strategy = FSDPStrategy(auto_wrap_policy={Block}, state_dict_type="full", sharding_strategy="HYBRID_SHARD")
-    fabric = L.Fabric(devices=devices, strategy=strategy, precision="bf16-true", loggers=[logger])
+    fabric = L.Fabric(devices=devices, strategy=strategy, precision="bf16-mixed", loggers=[logger])
     fabric.launch()
 
     fabric.print(hparams)
diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md
index a3ba17a32f..bec8d1d086 100644
--- a/tutorials/pretrain_tinyllama.md
+++ b/tutorials/pretrain_tinyllama.md
@@ -11,17 +11,18 @@ This tutorial will walk you through pretraining [TinyLlama](https://github.com/j
 
 Here is a quick fact sheet:
 
-| Name                         | Description                                                                                                                                                  |
-|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Parameters                   | 1.1B                                                                                                                                                         |
-| Model Size                   | Layers: 22, Heads: 32, Query Groups: 4, Embedding Size: 2048, Intermediate Size: 5632                                                                        |
-| Sequence Length              | 2048                                                                                                                                                         |
-| Batch Size                   | 2 million tokens (2048 * 1024)                                                                                                                               |
-| Learning Rate                | 4e-4                                                                                                                                                         |
-| Learning Rate Schedule       | Cosine with 2000 warmup steps                                                                                                                                |
-| Training Data                | [SlimPajama](https://huggingface.co/datasets/cerebras/slimpajama-627b) (893 GB), [Starcoder](https://huggingface.co/datasets/bigcode/starcoderdata) (290 GB) |
-| Combined Dataset Size        | Around 950B tokens                                                                                                                                           |
-| Total Tokens During Training | 3 trillion                                                                                                                                                   |
+| Name                          | Description                                                                                                                                                  |
+|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Parameters                    | 1.1B                                                                                                                                                         |
+| Model Size                    | Layers: 22, Heads: 32, Query Groups: 4, Embedding Size: 2048, Intermediate Size: 5632                                                                        |
+| Sequence Length               | 2048                                                                                                                                                         |
+| Learning Rate                 | 4e-4                                                                                                                                                         |
+| Learning Rate Schedule        | Cosine with 2000 warmup steps                                                                                                                                |
+| Training Data                 | [SlimPajama](https://huggingface.co/datasets/cerebras/slimpajama-627b) (893 GB), [Starcoder](https://huggingface.co/datasets/bigcode/starcoderdata) (290 GB) |
+| Combined Dataset Size         | Around 950B tokens                                                                                                                                           |
+| Total Tokens During Training  | 3 trillion (3 epochs)                                                                                                                                        |
+| Time to complete training     | ~ 4 weeks with 64 A100 GPUs                                                                                                                                  |
+| Model FLOPs Utilization (MFU) | 52%                                                                                                                                                          |
 
 (this table was sourced from the author's [README](https://github.com/jzhang38/TinyLlama/))
 
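
Note (not part of the patch above): in Lightning Fabric, "bf16-true" casts the model weights themselves to bfloat16, while "bf16-mixed" keeps the weights in float32 and runs the forward pass under a bfloat16 autocast context. Below is a minimal sketch of the setting the patch switches to, assuming `lightning` 2.x and `torch` are installed; the toy linear module and single device are hypothetical stand-ins for the TinyLlama `GPT` and the multi-GPU FSDP setup in `pretrain/tinyllama.py`.

```python
# Minimal sketch, not the training script: illustrates what precision="bf16-mixed" does.
import lightning as L
import torch

fabric = L.Fabric(devices=1, precision="bf16-mixed")  # weights stay fp32, forward autocasts to bf16
fabric.launch()

model = fabric.setup(torch.nn.Linear(16, 16))  # hypothetical toy module, not TinyLlama's GPT
x = torch.randn(4, 16, device=fabric.device)
y = model(x)

# Expected: parameters remain torch.float32, activations come out as torch.bfloat16.
print(next(model.parameters()).dtype, y.dtype)
```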