
Commit

Merge branch 'feat/internlm2' into v0.2.3-internlm2
gaoyang07 committed Feb 7, 2024
2 parents a3260a8 + bfbc10f commit 0a4ffea
Showing 12 changed files with 1,406 additions and 85 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -128,6 +128,7 @@ aim_logs/
nvmelogs/
run_backup/
runs/
RUN/
runs_bak/
LLM_ALERT
small_demo/
37 changes: 37 additions & 0 deletions configs/_base_/default_runtime.py
@@ -0,0 +1,37 @@
# Copyright (c) InternLM. All rights reserved.

cudnn_deterministic = False
cudnn_benchmark = False

enable_tb = True

grad_profiling = dict(
# calculate layer norms and parameter norms, and show them on tensorboard
grad_norm_profiling=False,
# count zero gradients, and show them on tensorboard
zero_grad_profiling=False,
# [optional] layers to display on TensorBoard; if not set, all layers are displayed
# default: layers=["ScaleColumnParallelLinear"]
layers=["ScaleColumnParallelLinear"],
vocab_grad_norm_profiling=False,
interval_steps=5,
)

grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of consecutive overflow-free steps before the loss scale is increased
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
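
The grad_scaler block configures dynamic loss scaling for mixed-precision training. As a rough illustration of how these knobs interact, here is a minimal sketch of one common update policy (illustrative only, not the scaler used by the training framework): the scale grows by growth_factor after growth_interval consecutive overflow-free steps and shrinks by backoff_factor once hysteresis overflows have accumulated, clamped to [min_scale, max_scale].

# Minimal sketch of a dynamic loss scaler driven by the settings above
# (illustrative only; the real implementation lives in the training framework).
class SimpleLossScaler:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0
        self._overflows = 0

    def update(self, found_overflow: bool):
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:
                # back off once enough overflows have accumulated
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:
                # grow after a long run of overflow-free steps
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0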
59 changes: 59 additions & 0 deletions configs/_base_/models/internlm2_20B.py
@@ -0,0 +1,59 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 8 / 3
NUM_LAYER = 48

model = dict(
num_chunks=1,
checkpoint=1.0,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=True,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
rope_base=1000000,
)

hybrid_zero_optimizer = dict(
# Enable overlap communication in the low-level optimizer
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=16, fsdp=False),
tensor=2,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=True,
)
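
For the 20B settings above, the parallel sizes compose (assuming the usual decomposition) as total GPUs = pipeline * tensor * dp, with zero1 sharding optimizer states inside each dp group. A small worked example, assuming a hypothetical 128-GPU job (the GPU count is not specified anywhere in this config):

# Hypothetical sanity check of the parallel layout above.
world_size = 128                      # assumed total number of GPUs (not from the config)
tensor = 2                            # tensor parallel size (from the config)
pipeline = 1                          # pipeline parallel size (from the config)
zero1 = 16                            # zero1 partition size (from the config)

dp = world_size // (tensor * pipeline)            # data-parallel ranks: 64
assert dp % zero1 == 0, "zero1 should evenly divide the dp group"
zero_groups = dp // zero1                          # 4 ZeRO-1 shard groups of 16 ranks each
print(f"dp={dp}, zero1 groups={zero_groups} x {zero1} ranks")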
59 changes: 59 additions & 0 deletions configs/_base_/models/internlm2_7B.py
@@ -0,0 +1,59 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_LAYER = 32

model = dict(
num_chunks=1,
checkpoint=0.2,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
rope_base=1000000,
)

hybrid_zero_optimizer = dict(
# Enable overlap communication in the low-level optimizer
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=8, fsdp=False),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
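
The 7B config uses grouped-query attention: 32 query heads share 8 key/value heads. The derived shapes follow from standard GQA arithmetic (a quick illustrative sketch, not code from this repository). Separately, checkpoint=0.2 here, versus 1.0 for the 20B config, appears to apply activation checkpointing to only a fraction of the layers, trading less recompute for more activation memory.

# Derived attention shapes for the 7B config above (illustrative arithmetic only).
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8

head_dim = HIDDEN_SIZE // NUM_ATTENTION_HEAD                        # 128 per head
q_heads_per_kv_head = NUM_ATTENTION_HEAD // NUM_KV_ATTENTION_HEAD   # 4 query heads share each KV head
kv_dim = NUM_KV_ATTENTION_HEAD * head_dim                           # 1024-dim K and V projections
print(head_dim, q_heads_per_kv_head, kv_dim)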
54 changes: 54 additions & 0 deletions configs/_base_/models/internlm_20B.py
@@ -0,0 +1,54 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
NUM_LAYER = 60

model = dict(
num_chunks=1,
checkpoint=False,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
apply_post_layer_norm=False,
layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
# Enable overlap_communication
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=8, fsdp=False),
tensor=4,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
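
All of these configs set clip_grad_norm=1.0 in hybrid_zero_optimizer, i.e. gradients are clipped by their global L2 norm before the optimizer step. A minimal PyTorch sketch of generic global-norm clipping follows; in the ZeRO-sharded optimizer used here the norm would first have to be reduced across shards, so this shows only the single-process case.

import torch

# Generic global-norm gradient clipping, as configured by clip_grad_norm=1.0.
model = torch.nn.Linear(16, 16)
loss = model(torch.randn(4, 16)).sum()
loss.backward()
# Rescale all gradients so their combined L2 norm does not exceed 1.0.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)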
54 changes: 54 additions & 0 deletions configs/_base_/models/internlm_7B.py
@@ -0,0 +1,54 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32

model = dict(
num_chunks=1,
checkpoint=False,
dtype="torch.bfloat16",
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
apply_post_layer_norm=False,
layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
# Enable overlap_communication
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, the size of the zero process group equals the size of the dp process group,
# so parameters are sharded across the whole dp group.
# 2. if zero1 == 1, zero is not used, and every dp rank keeps a full copy of the model parameters.
# 3. if zero1 > 1 and zero1 <= dp world size, the zero process group is a subset of the dp process group.
# For smaller models, sharding within a node (a setting <= 8) is usually a better choice.
# 4. fsdp: bool, whether to use PyTorch FSDP, which can be a substitute for ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=dict(size=8, fsdp=False),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)