# ZeRO with DeepSpeed

In [1]:
%%writefile ./examples/deepspeed_example.py

# Import necessary libraries
import deepspeed
import torch
from accelerate import Accelerator
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Checkpoint and subset sizes used for this demonstration
MODEL_NAME = "distilbert-base-uncased"
TRAIN_SAMPLES = 2000
EVAL_SAMPLES = 500
SEED = 42


def load_imdb_subset(split, num_samples, seed=SEED):
    """Return a shuffled subset of an IMDB split.

    The IMDB splits on the Hugging Face Hub are stored grouped by label, so a
    contiguous slice such as ``train[:2000]`` contains only one class. The
    split is therefore shuffled (with a fixed seed, so every process selects
    the same rows) before the first ``num_samples`` examples are taken.

    Args:
        split: Name of the split to load, e.g. ``"train"`` or ``"test"``.
        num_samples: Number of examples to keep.
        seed: Seed for the shuffle.

    Returns:
        A ``datasets.Dataset`` with ``num_samples`` examples.
    """
    full_split = load_dataset("imdb", split=split)
    return full_split.shuffle(seed=seed).select(range(num_samples))


def main():
    """Fine-tune DistilBERT on a small IMDB subset with the Hugging Face Trainer.

    The Trainer builds and manages its own Accelerator (and, when started via
    ``accelerate launch`` with a DeepSpeed config, the DeepSpeed engine), so the
    model and data must NOT be passed through a separate
    ``Accelerator.prepare()`` call.
    """
    # Load small, class-balanced subsets for a quick demonstration
    train_dataset = load_imdb_subset("train", TRAIN_SAMPLES)
    eval_dataset = load_imdb_subset("test", EVAL_SAMPLES)

    # Load the model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize(batch):
        """Tokenize a batch of reviews; padding is deferred to the data collator."""
        return tokenizer(batch["text"], truncation=True)

    tokenized_train = train_dataset.map(tokenize, batched=True)
    tokenized_eval = eval_dataset.map(tokenize, batched=True)

    # Define training arguments; these values mirror the DeepSpeed config
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=80,  # Match DeepSpeed config
        num_train_epochs=1,
        # NOTE(review): recent transformers releases rename this argument to
        # `eval_strategy` -- adjust if the installed version rejects it.
        evaluation_strategy="epoch",
        learning_rate=0.00015,  # Match DeepSpeed config
        weight_decay=0.01,  # Match DeepSpeed config
        fp16=True,  # Match DeepSpeed config
        report_to="none",
    )

    # Pad dynamically per training batch so every tensor in a batch has the
    # same length regardless of how the dataset was chunked during `map`.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=data_collator,
    )

    # Start training
    trainer.train()


if __name__ == "__main__":
    main()

Overwriting ./examples/deepspeed_example.py


In [2]:
%%writefile ./tooling/config/ds_config.json
{ 
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 0.00015,
      "betas": [0.9, 0.999],
      "eps": 1e-8,
      "weight_decay": 0.01
    }
  },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "none",
      "pin_memory": true
    },
    "contiguous_gradients": true,
    "overlap_comm": true,
    "allgather_partitions": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 200000000,
    "allgather_bucket_size": 200000000
  },
  "steps_per_print": 100,
  "train_batch_size": 320,
  "train_micro_batch_size_per_gpu": 80,
  "gradient_accumulation_steps": 1,
  "wall_clock_breakdown": false
}


Overwriting ./tooling/config/ds_config.json


In [3]:
%%writefile ./tooling/config/accelerate_ds_config.yaml
# Accelerate launch configuration for multi-node DeepSpeed training.
# machine_rank, num_machines and num_processes are defaults only: the SLURM
# script passes --machine_rank, --num_machines and --num_processes explicitly.
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: DEEPSPEED
deepspeed_config:
  # Relative to the directory the job is submitted from (tooling/slurm_scripts),
  # like the other relative paths used by the launch script; avoids hard-coding
  # one user's home directory.
  deepspeed_config_file: "../config/ds_config.json"
  deepspeed_multinode_launcher: standard
  zero3_init_flag: false  # ZeRO stage 2 is used, so no ZeRO-3 style init
# Training runs in fp16 (see the DeepSpeed config), so no bf16 downcasting.
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
num_machines: 2
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false

Overwriting ./tooling/config/accelerate_ds_config.yaml


In [4]:
%%writefile ./tooling/slurm_scripts/ds_example.sh
#!/bin/bash

#SBATCH --job-name=training_example
#SBATCH --account=p71550
##SBATCH --account=p70824 # training account, please uncomment for training
#SBATCH --nodes=2                    # Number of nodes
#SBATCH --ntasks-per-node=1          # Number of tasks per node
#SBATCH --cpus-per-task=256          # Number of CPU cores per task (including hyperthreading if needed)
#SBATCH --partition=zen3_0512_a100x2
#SBATCH --qos=admin
##SBATCH --qos=zen3_0512_a100x2 # qos for training
#SBATCH --gres=gpu:2                 # Number of GPUs per node
#SBATCH --output=../../output/%x-%j.out  # Output file
##SBATCH --reservation=

# Launches examples/deepspeed_example.py on all allocated nodes through
# `accelerate launch`. One srun task is started per node; it runs
# ../start_train.sh with the full launch command as its single argument.
# All relative paths assume the job is submitted from tooling/slurm_scripts.

######################
### Set Environment ###
######################
module load miniconda3
eval "$(conda shell.bash hook)"
source /opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-huggingface-v2/modules  # Activate the conda environment

######################
#### Set Network #####
######################
# Use the hostname of the first allocated node as the rendezvous (master) address
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)
node_0=${nodes_array[0]}

# Total number of training processes: one per GPU on every node
NUM_PROCESSES=$(( SLURM_NNODES * SLURM_GPUS_ON_NODE ))

export MASTER_ADDR=$node_0
export MASTER_PORT=29500

######################
#### Prepare Launch ###
######################
# Configure Accelerate launch command.
# \$SLURM_PROCID is escaped on purpose: it must stay unexpanded here so that
# it can be resolved per task when the command string is evaluated later.

export LAUNCHER="accelerate launch \
    --config_file ../config/accelerate_ds_config.yaml \
    --machine_rank \$SLURM_PROCID \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --num_processes $NUM_PROCESSES \
    --num_machines $SLURM_NNODES \
    "
export PROGRAM="../../examples/deepspeed_example.py"

START=$(date +%s.%N)
echo "START TIME: $(date)"

export SRUN_ARGS="--cpus-per-task $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID"
# Split the CPUs of a task between the GPU processes running on the node,
# instead of letting every process spawn one thread per allocated CPU.
OMP_NUM_THREADS=$(( SLURM_CPUS_PER_TASK / SLURM_GPUS_ON_NODE ))
if (( OMP_NUM_THREADS < 1 )); then
    OMP_NUM_THREADS=1
fi
export OMP_NUM_THREADS
export CMD="$LAUNCHER $PROGRAM"

# Execute the command with srun to run on multiple nodes
srun $SRUN_ARGS ../start_train.sh "$CMD"
EXIT_CODE=$?  # remember the training result; the commands below would mask it

echo "END TIME: $(date)"
END=$(date +%s.%N)
RUNTIME=$(echo "$END - $START" | bc -l)
echo "Runtime: $RUNTIME"

# Report the training exit status as the job's exit status
exit $EXIT_CODE

Writing ./tooling/slurm_scripts/ds_example.sh
