Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
yingtongxiong committed Sep 8, 2023
2 parents 671c752 + 1ee31ff commit 9481df9
Show file tree
Hide file tree
Showing 39 changed files with 1,960 additions and 479 deletions.
23 changes: 19 additions & 4 deletions configs/7B_sft.py
@@ -1,4 +1,5 @@
JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 4096
Expand All @@ -22,13 +23,16 @@
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
# load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
load_ckpt_folder="local:llm_ckpts/",
# 'load_ckpt_info' setting guide:
# 1. 'path' indicates the ckpt path.
# 2. 'content' specifies which states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all".
# 3. 'ckpt_type' specifies the type of checkpoint to be loaded; currently only the 'normal' type is supported.
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage path.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)

Expand All @@ -52,6 +56,8 @@
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
empty_cache_and_diag_interval=10,
diag_outlier_ratio=1.1,
)

grad_scaler = dict(
Expand Down Expand Up @@ -147,3 +153,12 @@

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
# feishu alert configs
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
),
)
64 changes: 20 additions & 44 deletions doc/code-docs/locales/en/LC_MESSAGES/initialize.po
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 14:15+0800\n"
"POT-Creation-Date: 2023-09-08 15:32+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
Expand All @@ -19,36 +19,36 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"

#: ../../source/initialize.rst:2 b829330eebd24620b745072bbfc26c98
#: ../../source/initialize.rst:2
msgid "训练构建"
msgstr "Training Setup"

#: ../../source/initialize.rst:7 8c8472b4647a4de8998d75b9ec6f09eb
#: ../../source/initialize.rst:7
msgid "命令行参数解析"
msgstr "Argument Parsing"

#: ../../source/initialize.rst:8 f74176fa4aee4bbfaf989ffab9283ee7
#: ../../source/initialize.rst:9
#, fuzzy
msgid ""
"InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_"
" 库来向InternLM运行时提供命令行参数配置。用户可 使用 "
" 库来向InternLM运行时提供命令行参数配置。用户可使用 "
"``internlm.initialize.get_default_parser()`` 来获取 InternLM "
"的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。"
msgstr ""
"InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply command-line "
"configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default "
"parser with some built-in arguments; users can add custom parameters to this parser."
"InternLM uses the `argparse "
"<https://docs.python.org/3/library/argparse.html>`_ library to supply "
"command-line configuration to the InternLM runtime. Use "
"``internlm.initialize.get_default_parser()`` to get InternLM's default "
"parser with some built-in arguments; users can add custom parameters to "
"this parser."

#: 9930855b85bf41ed8712fc40e1e034f7
#: internlm.initialize.launch.get_default_parser:1 of
msgid ""
"Reads user command line and uses an argument parser to parse the input "
"arguments. Input arguments include configuration, host, port, world size,"
" local rank, backend for torch.distributed."
msgstr ""

#: 015003b013e346bea15b4514f2001a25 544472c2ce3c43bfb59317083c6b55c9
#: 7ee60ba1a92a4b9e8174049fb498a4f0 bca7c66f1a5a4517958bcea1e09d5d10
#: f5cbe452ae694c7884ac4596a7735bf6
#: internlm.initialize.initialize_trainer.initialize_trainer
#: internlm.initialize.launch.get_default_parser
#: internlm.train.training_internlm.get_train_data_loader
Expand All @@ -57,55 +57,50 @@ msgstr ""
msgid "返回"
msgstr ""

#: 9b04c3d6b98b44ee89f800b71e8d80a9
#: internlm.initialize.launch.get_default_parser:4 of
msgid ""
"Returns the parser with the default arguments, the user may add "
"customized arguments into this parser."
msgstr ""

#: 147005b197e64c4b9a96a7cfe78045bc 3634f79c9aa547a48eb3fd7f150deb51
#: d3f0aa4143c84b719cd0b53170dd86c1
#: internlm.initialize.initialize_trainer.initialize_trainer
#: internlm.initialize.launch.get_default_parser
#: internlm.train.training_internlm.initialize_model of
msgid "返回类型"
msgstr ""

#: ../../source/initialize.rst:25 db2bf9d3ff81483dbf218e63dd4bbbe4
#: ../../source/initialize.rst:25
msgid "模型初始化"
msgstr "Model Initialization"

#: 5c2e33e254d4495fbc4b0226aac1fddb
#: internlm.train.training_internlm.initialize_model:1 of
msgid "Initialize model with Automatic Mixed Precision."
msgstr ""

#: c1254615508542b680daf73374844f9e
#: internlm.train.training_internlm.initialize_model:3 of
msgid "The neural network model to be trained or evaluated."
msgstr ""

#: ../../source/initialize.rst:29 b9867771b9da40cd8f3a55ee5ab95f65
#: ../../source/initialize.rst:29
msgid "InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下:"
msgstr ""
"InternLM uses the field ``model_type`` and ``model`` in the config file "
"to control model initialization process. An example model initialization "
"configuration is defined as follows:"

#: ../../source/initialize.rst:57 984a38d7f63949ecbb0d8b2ef3459d57
#: ../../source/initialize.rst:57
msgid "字段 ``model_type`` 指明了要初始化的模型类型"
msgstr ""
"The field ``model_type`` specifies the registered model type to be "
"initialized."

#: ../../source/initialize.rst:58 9f04ad0f145f4e40bc75a3ef45c7a59d
#: ../../source/initialize.rst:58
msgid "字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置"
msgstr ""
"The parameters in field ``model`` specify the configuration settings "
"during model initialization."

#: ../../source/initialize.rst:60 d7780e355bb6429bb5151d9a0e6d7e36
#: ../../source/initialize.rst:60
msgid ""
"值得注意的是,用户可以定义新的模型类型,并使用装饰器 ``@MODEL_INITIALIZER.register_module`` "
"注册模型的初始化函数,其中 ``MODEL_INITIALIZER`` 是类 "
Expand All @@ -117,109 +112,90 @@ msgstr ""
" instantiated object of class ``internlm.util.registry.Registry``, the "
"example is shown as follows."

#: ../../source/initialize.rst:72 d863f71b208a49a09d2d00537e331962
#: ../../source/initialize.rst:72
msgid "优化器初始化"
msgstr "Optimizer Initialization"

#: acaafdc9bb96434bbd42a98f74187db1
#: internlm.train.training_internlm.initialize_optimizer:1 of
msgid "Initialize optimizer."
msgstr ""

#: 62fc4215c9a44bda8b31c933db90f270 93c398e44f6a4f708ba064250a3d253c
#: e2bebdd751724915a65dec444bb89e25
#: internlm.initialize.initialize_trainer.initialize_trainer
#: internlm.train.training_internlm.get_train_data_loader
#: internlm.train.training_internlm.initialize_optimizer of
msgid "参数"
msgstr ""

#: 2033ee96ded8423a80268b337ba9549c
#: internlm.train.training_internlm.initialize_optimizer:3 of
msgid "Your model instance to be trained or evaluated."
msgstr ""

#: df01b44c724b4326a6c85b44694262ba
#: internlm.train.training_internlm.initialize_optimizer:6 of
msgid "A tuple of (optimizer, beta2_scheduler, lr_scheduler)."
msgstr ""

#: ../../source/initialize.rst:79 0b46b890048f4758a9d56e0540759d9f
#: ../../source/initialize.rst:79
msgid "数据加载器初始化"
msgstr "Dataloader Initialization"

#: 58e39b26ab4849788e792df386f01d7e
#: internlm.train.training_internlm.get_train_data_loader:1 of
msgid "Generate and return the training data loader."
msgstr ""

#: 37a91c167e0b4e5fad4edcc3caf0d012
#: internlm.train.training_internlm.get_train_data_loader:3 of
msgid "number of subprocesses used for dataloader."
msgstr ""

#: 947aba2a4f86420d9b2660425a6043cc
#: internlm.train.training_internlm.get_train_data_loader:5 of
msgid "generate function for dataset."
msgstr ""

#: 8a8f5ee665cb4e15bc33194c0b1f346c
#: internlm.train.training_internlm.get_train_data_loader:7 of
msgid "dataset sampler for training dataloader."
msgstr ""

#: 4c3e1e896e7940bf97c124909d2e7f36
#: internlm.train.training_internlm.get_train_data_loader:9 of
msgid "collate function for training dataloader."
msgstr ""

#: d9f0740d048c48888e82c8f8a78e33cd
#: internlm.train.training_internlm.get_train_data_loader:12 of
msgid "A tuple of (train_dl, dataset_types)."
msgstr ""

#: ../../source/initialize.rst:86 1c4df708ff5c47f6abae32617bf2ed31
#: ../../source/initialize.rst:86
msgid "Trainer 初始化"
msgstr "Trainer Initialization"

#: d535583dbcb245499e19c09f3f8b534a
#: internlm.initialize.initialize_trainer.initialize_trainer:1 of
msgid ""
"Core function to wrap the essential training components with our "
"functionality based on the config which is loaded into gpc.config."
msgstr ""

#: 3e370234e4b245e4b9cae1fe235df8ff
#: internlm.initialize.initialize_trainer.initialize_trainer:4 of
msgid "Your model instance or a function to build the model."
msgstr ""

#: b716a4a264234011a7b51fa12e575651
#: internlm.initialize.initialize_trainer.initialize_trainer:6 of
msgid "Your optimizer for training."
msgstr ""

#: 6a54ce9d516f4f14bab281c9db9816e8
#: internlm.initialize.initialize_trainer.initialize_trainer:8 of
msgid "Your criterion instance."
msgstr ""

#: ff9dfd04d31b4dc6afbdd841829b4c33
#: internlm.initialize.initialize_trainer.initialize_trainer:10 of
msgid "Dataloader for training."
msgstr ""

#: de345f9a457a4a88bf60b4ee96535e31
#: internlm.initialize.initialize_trainer.initialize_trainer:12 of
msgid "Dataloader for testing."
msgstr ""

#: 64e646b25420424d9dcdfb1ad7de5e6f
#: internlm.initialize.initialize_trainer.initialize_trainer:14 of
msgid "Your lr scheduler instance, optional."
msgstr ""

#: 39c7132bfafe4e22ae373081fee711ce
#: internlm.initialize.initialize_trainer.initialize_trainer:17 of
msgid ""
"A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``"
Expand Down

0 comments on commit 9481df9

Please sign in to comment.