
Commit

Merge branch 'feat/internlm2' into v0.2.3-internlm2
gaoyang07 committed Feb 7, 2024
2 parents a3260a8 + bfbc10f commit 0a4ffea
Showing 12 changed files with 1,406 additions and 85 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -128,6 +128,7 @@ aim_logs/
nvmelogs/
run_backup/
runs/
RUN/
runs_bak/
LLM_ALERT
small_demo/
37 changes: 37 additions & 0 deletions configs/_base_/default_runtime.py
@@ -0,0 +1,37 @@
# Copyright (c) InternLM. All rights reserved.

cudnn_deterministic = False
cudnn_benchmark = False

enable_tb = True

grad_profiling = dict(
# calculate layer norms and parameter norms, and show them on tensorboard
grad_norm_profiling=False,
# count zero gradients, and show them on tensorboard
zero_grad_profiling=False,
# [optional] layers to display on TensorBoard; if not set, all layers are displayed
# default: layers=["ScaleColumnParallelLinear"]
layers=["ScaleColumnParallelLinear"],
vocab_grad_norm_profiling=False,
interval_steps=5,
)

grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of consecutive overflow-free steps before the loss scale is increased
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
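
The grad_scaler block configures dynamic loss scaling for mixed-precision training. As a rough illustration of how these knobs interact, here is a minimal sketch of one common update policy (illustrative only, not the scaler used by the training framework): the scale grows by growth_factor after growth_interval consecutive overflow-free steps and shrinks by backoff_factor once hysteresis overflows have accumulated, clamped to [min_scale, max_scale].

# Minimal sketch of a dynamic loss scaler driven by the settings above
# (illustrative only; the real implementation lives in the training framework).
class SimpleLossScaler:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0
        self._overflows = 0

    def update(self, found_overflow: bool):
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:
                # back off once enough overflows have accumulated
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:
                # grow after a long run of overflow-free steps
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0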
59 changes: 59 additions & 0 deletions configs/_base_/models/internlm2_20B.py
@@ -0,0 +1,59 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 8 / 3
NUM_LAYER = 48

model = dict(
num_chunks=1,
checkpoint=1.0,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=True,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
rope_base=1000000,
)

hybrid_zero_optimizer = dict(
# Enable overlap communication in the low-level optimizer
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=16, fsdp=False),
tensor=2,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=True,
)
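
For the 20B settings above, the parallel sizes compose (assuming the usual decomposition) as total GPUs = pipeline * tensor * dp, with zero1 sharding optimizer states inside each dp group. A small worked example, assuming a hypothetical 128-GPU job (the GPU count is not specified anywhere in this config):

# Hypothetical sanity check of the parallel layout above.
world_size = 128                      # assumed total number of GPUs (not from the config)
tensor = 2                            # tensor parallel size (from the config)
pipeline = 1                          # pipeline parallel size (from the config)
zero1 = 16                            # zero1 partition size (from the config)

dp = world_size // (tensor * pipeline)            # data-parallel ranks: 64
assert dp % zero1 == 0, "zero1 should evenly divide the dp group"
zero_groups = dp // zero1                          # 4 ZeRO-1 shard groups of 16 ranks each
print(f"dp={dp}, zero1 groups={zero_groups} x {zero1} ranks")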
59 changes: 59 additions & 0 deletions configs/_base_/models/internlm2_7B.py
@@ -0,0 +1,59 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_LAYER = 32

model = dict(
num_chunks=1,
checkpoint=0.2,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
rope_base=1000000,
)

hybrid_zero_optimizer = dict(
# Enable overlap communication in the low-level optimizer
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=8, fsdp=False),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
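
The 7B config uses grouped-query attention: 32 query heads share 8 key/value heads. The derived shapes follow from standard GQA arithmetic (a quick illustrative sketch, not code from this repository). Separately, checkpoint=0.2 here, versus 1.0 for the 20B config, appears to apply activation checkpointing to only a fraction of the layers, trading less recompute for more activation memory.

# Derived attention shapes for the 7B config above (illustrative arithmetic only).
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8

head_dim = HIDDEN_SIZE // NUM_ATTENTION_HEAD                        # 128 per head
q_heads_per_kv_head = NUM_ATTENTION_HEAD // NUM_KV_ATTENTION_HEAD   # 4 query heads share each KV head
kv_dim = NUM_KV_ATTENTION_HEAD * head_dim                           # 1024-dim K and V projections
print(head_dim, q_heads_per_kv_head, kv_dim)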
54 changes: 54 additions & 0 deletions configs/_base_/models/internlm_20B.py
@@ -0,0 +1,54 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
NUM_LAYER = 60

model = dict(
num_chunks=1,
checkpoint=False,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
apply_post_layer_norm=False,
layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
# Enable overlap_communication
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=8, fsdp=False),
tensor=4,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
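
All of these configs set clip_grad_norm=1.0 in hybrid_zero_optimizer, i.e. gradients are clipped by their global L2 norm before the optimizer step. A minimal PyTorch sketch of generic global-norm clipping follows; in the ZeRO-sharded optimizer used here the norm would first have to be reduced across shards, so this shows only the single-process case.

import torch

# Generic global-norm gradient clipping, as configured by clip_grad_norm=1.0.
model = torch.nn.Linear(16, 16)
loss = model(torch.randn(4, 16)).sum()
loss.backward()
# Rescale all gradients so their combined L2 norm does not exceed 1.0.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)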
54 changes: 54 additions & 0 deletions configs/_base_/models/internlm_7B.py
@@ -0,0 +1,54 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32

model = dict(
num_chunks=1,
checkpoint=False,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
apply_post_layer_norm=False,
layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
# Enable overlap_communication
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=8, fsdp=False),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)