Add dataset packing #802

Open · wants to merge 6 commits into main
70 changes: 70 additions & 0 deletions examples/docker_setup/README.md
@@ -0,0 +1,70 @@
## Quick Start Guide to Running Your PyTorch Docker Container

### Step 1: Create the Dockerfile

1. **Open Terminal**: Open a terminal on your Ubuntu machine.
2. **Create Dockerfile**: Enter `nano Dockerfile` to create and edit a new Dockerfile.
3. **Enter Dockerfile Content**:
```dockerfile
# Use an official PyTorch image as a base
FROM nvcr.io/nvidia/pytorch:latest

# Set the working directory inside the container
WORKDIR /workspace

# Copy the requirements file first so the install step can find it
COPY requirements.txt /workspace/

# Install any necessary dependencies
RUN pip install -r requirements.txt

# Copy the local code to the container's workspace
COPY ./ /workspace/

# Set the default command to execute
CMD ["/bin/bash"]
```
Replace `latest` with the specific NGC PyTorch tag you need (for example, `24.03-py3`, the tag used by the Dockerfile shipped with this example). Modify `requirements.txt` to include all necessary Python packages.

### Step 2: Build Your Docker Image

1. **Build Image**: In your terminal, run:
```bash
docker build -t my-pytorch-app .
```
This command builds the Docker image named `my-pytorch-app` using the Dockerfile in the current directory.

### Step 3: Create the Docker Run Script

1. **Open Terminal**: Open a terminal on your Ubuntu machine.
2. **Create Script File**: Enter `nano run_pytorch_docker.sh` to create and edit a new shell script.
3. **Enter Script Content**:
```bash
#!/bin/bash
# This script runs a Docker container with necessary volume mounts for the PyTorch application.

docker run --ipc=host --shm-size=512m --gpus all -it --rm \
    -v /path/to/megatron:/workspace/megatron \
    -v /path/to/dataset:/workspace/dataset \
    -v /path/to/checkpoints:/workspace/checkpoints \
    my-pytorch-app \
    /bin/bash
```
Replace `/path/to/megatron`, `/path/to/dataset`, and `/path/to/checkpoints` with the actual paths to your resources. The `--ipc=host` and `--shm-size=512m` flags (matching `docker_run.sh` in this example) enlarge the shared memory available to the container, which PyTorch data-loader workers rely on. Running the script drops you into an interactive shell inside the container.
4. **Save and Exit**: Press `Ctrl+O`, hit `Enter` to save, then `Ctrl+X` to exit `nano`.
5. **Make Executable**: Run `chmod +x run_pytorch_docker.sh` to make your script executable.

### Step 4: Run the Docker Container

- **Execute the Script**: In your terminal, type `./run_pytorch_docker.sh` to start the Docker container. This script mounts specified directories and opens a container with GPU access enabled.
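Once inside the container, a quick sanity check like the following confirms that PyTorch can see the GPUs passed through by `--gpus all` (a minimal sketch using the PyTorch bundled in the NGC image):

```python
# Quick GPU sanity check to run inside the container.
import torch

print(torch.__version__)          # PyTorch version bundled in the image
print(torch.cuda.is_available())  # should be True when started with --gpus all
print(torch.cuda.device_count())  # number of GPUs visible inside the container
```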


### Step 5: Debugging Inside the Container

Once your Docker container is running and you're inside its interactive shell, you can proceed as if you're in a typical development environment:

- **Full Access to Libraries**: All libraries and tools installed in the Docker image are at your disposal. You can run commands, execute scripts, and use your usual debugging tools just like on a local machine.
- **Normal Operation**: Interact with the terminal as you would in any Linux environment. Edit, execute, and debug your applications directly inside the container using the command line or any terminal-based editors like Vim or Nano.

This setup provides a seamless experience for development and debugging, ensuring that your work environment is both controlled and replicable.

### Step 6: Exit the Container

- **To Exit**: Type `exit` in the container's terminal. The container will stop, and due to the `--rm` flag, it will also be automatically removed, cleaning up your system.
10 changes: 10 additions & 0 deletions examples/docker_setup/docker_run.sh
@@ -0,0 +1,10 @@
#!/bin/bash

# This script runs a Docker container with the necessary volume mounts for the PyTorch application.

docker run --ipc=host --shm-size=512m --gpus all -it --rm \
    -v /home/ubuntu/src/Megatron-LM:/workspace/megatron \
    -v /home/ubuntu/src/dataset-dir:/workspace/dataset \
    -v /home/ubuntu/src/checkpoint-dir:/workspace/checkpoints \
    my-pytorch-app \
    /bin/bash
14 changes: 14 additions & 0 deletions examples/docker_setup/dockerfile
@@ -0,0 +1,14 @@
# Use NVIDIA's PyTorch image as the base
FROM nvcr.io/nvidia/pytorch:24.03-py3

# Set the working directory in the container
WORKDIR /workspace

# Copy the requirements file into the container at /workspace
COPY requirements.txt /workspace

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Install additional Python packages
RUN pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0
3 changes: 3 additions & 0 deletions examples/docker_setup/requirements.txt
@@ -0,0 +1,3 @@
transformers
datasets
sentencepiece
3 changes: 2 additions & 1 deletion examples/pretrain_gpt_distributed_with_mp.sh
@@ -28,6 +28,7 @@ DISTRIBUTED_ARGS="
GPT_ARGS="
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--attention-softmax-in-fp32 \
--sequence-parallel \
--num-layers 24 \
--hidden-size 1024 \
@@ -44,7 +45,7 @@ GPT_ARGS="
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--bf16
"

DATA_ARGS="
69 changes: 61 additions & 8 deletions tools/preprocess_data.py
@@ -11,6 +11,7 @@
import time
import gzip
import glob
from itertools import chain
import torch
import numpy as np
import multiprocessing
@@ -141,7 +142,8 @@ def split_sentences(self, file_name):
        fout.close()


def process_json_file(self, file_name):
    def process_json_file(self, file_name, batch_size_for_packing=None, block_size=None):
        input_file_name, output_prefix = file_name
        print("Opening", input_file_name)
        fin = open(input_file_name, 'r', encoding='utf-8')
@@ -156,6 +158,7 @@ def process_json_file(self, file_name):
        if self.args.split_sentences:
            level = "sentence"

        output_bin_files = {}
        output_idx_files = {}
        builders = {}
@@ -173,17 +176,65 @@ def process_json_file(self, file_name):
        startup_end = time.time()
        proc_start = time.time()
        total_bytes_processed = 0

        # Initialize temporary storage for batch packing
        if batch_size_for_packing and block_size:
            key = self.args.json_keys[0]
            batch_docs = {key: []}
            num_docs_in_batch = 0

print("Time to startup:", startup_end - startup_start)
for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1):


# Update the total bytes processed with the bytes from the current document
total_bytes_processed += bytes_processed
for key in doc.keys():
builders[key].add_document(doc[key], sentence_lens[key])
self.print_processing_stats(i, proc_start, total_bytes_processed)

if batch_size_for_packing and block_size:

# Append tokens from current doc to the batch under the "text" key
batch_docs[key].extend(doc[key])
num_docs_in_batch += 1


# Once we have enough documents in the batch or it's the last document, process the batch
if num_docs_in_batch >= batch_size_for_packing:
packed_blocks = self.pack_tokens(batch_docs, block_size)

# Process each packed block
for block in packed_blocks:
builders[key].add_document(block, [len(block)])

# Reset batch_docs and num_docs_in_batch for the next batch
batch_docs = {"text": []}
num_docs_in_batch = 0


# Optional: Update stats and progress
self.print_processing_stats(i, proc_start, total_bytes_processed)
else:
for key in doc.keys():
builders[key].add_document(doc[key], sentence_lens[key])
self.print_processing_stats(i, proc_start, total_bytes_processed)


fin.close()
builders[key].finalize(output_idx_files[key])


    # Concatenate a batch of documents and pack the tokens into fixed-size blocks
    def pack_tokens(self, docs, block_size):
        # Concatenate all tokens from all documents in the batch
        concatenated_tokens = list(chain(*docs.values()))

        # Number of full blocks that can be formed; leftover tokens are dropped
        num_full_blocks = len(concatenated_tokens) // block_size

        # Slice out each full block
        blocks = [concatenated_tokens[i * block_size: (i + 1) * block_size]
                  for i in range(num_full_blocks)]

        return blocks

def get_args():
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='input data')
@@ -207,7 +258,7 @@ def get_args():
help='YTTM tokenizer model.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file')
group.add_argument('--vocab-size', default=786,
group.add_argument('--vocab-size', default=None,
help='size of vocab for use with NullTokenizer')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file (if necessary).')
@@ -231,6 +282,9 @@ def get_args():
group.add_argument('--keep-sequential-samples', action='store_true',
help='Ensure ordering of samples in .jsonl files is '
'preserved when using partitions>1.')
group.add_argument('--block-size', type=int, default=2048,
                   help='Block size (in tokens) for token packing')
group.add_argument('--batch-size-for-packing', type=int, default=1000,
                   help='Number of documents to concatenate before packing')

args = parser.parse_args()
args.keep_empty = False

@@ -365,7 +419,7 @@ def main():
    input_key = 'sentence_split' if args.split_sentences else 'partition'
    for name in in_ss_out_names:
        p = multiprocessing.Process(target=partition.process_json_file,
                                    args=((name[input_key], name['output_prefix']),
                                          args.batch_size_for_packing, args.block_size))
        p.start()
        processes.append(p)

@@ -405,5 +459,4 @@

if __name__ == '__main__':

    main()
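A quick way to sanity-check the packing behavior outside of Megatron is to replay the `pack_tokens` logic on toy data. The sketch below mirrors the method above with hypothetical token values, and the invocation in the comments only illustrates the new flags; neither is taken from this PR:

```python
# Standalone sketch mirroring pack_tokens above (illustrative, not part of the PR).
#
# A hypothetical preprocessing invocation using the new flags might look like:
#   python tools/preprocess_data.py --input data.jsonl --output-prefix packed \
#       --tokenizer-type GPT2BPETokenizer --vocab-file vocab.json \
#       --merge-file merges.txt --batch-size-for-packing 1000 --block-size 2048
from itertools import chain

def pack_tokens(docs, block_size):
    # Concatenate all tokens from all documents in the batch
    concatenated_tokens = list(chain(*docs.values()))
    # Only full blocks are kept; trailing leftover tokens are dropped
    num_full_blocks = len(concatenated_tokens) // block_size
    return [concatenated_tokens[i * block_size:(i + 1) * block_size]
            for i in range(num_full_blocks)]

# Toy example: 10 tokens packed into blocks of 4
docs = {"text": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
print(pack_tokens(docs, block_size=4))  # [[1, 2, 3, 4], [5, 6, 7, 8]]; tokens 9 and 10 are dropped
```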