Commit: Reformat code and fix job trigger rules

EmmaQiaoCh authored and minseokl committed Aug 26, 2022
1 parent ba021a7 commit 4d698a3
Showing 29 changed files with 948 additions and 590 deletions.
34 changes: 31 additions & 3 deletions .gitlab-ci.yml
@@ -351,6 +351,20 @@ wdl:
DGXNNODES: 1 # node num
TEST_CMD: ./ci/integration_test/wdl/wdl.sub # test script

wdl_multi_gpu:
extends: .cluster_test_job_daily # test on selene needs to extend .cluster_test_job
needs:
- build_train_single_node
variables:
GPFSFOLDER: $LOGDIR/wdl_multi_gpu # log dir, usually $LOGDIR + job name
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} # should not change
CONT: $TRAIN_IMAGE_VERSIONED # image name
MOUNTS: ${DATASET}:${DATASET_MOUNT} # mount
SLURM_ACCOUNT: devtech # account, do not need change
WALLTIME: "00:15:00" # estimate job time. Less time, higher priority
DGXNNODES: 1 # node num
TEST_CMD: ./ci/integration_test/wdl/wdl_daily.sub

deepfm:
extends: .cluster_test_job
needs:
@@ -365,6 +379,20 @@ deepfm:
DGXNNODES: 1
TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub

deepfm_multi_gpu:
extends: .cluster_test_job_daily
needs:
- build_train_single_node
variables:
GPFSFOLDER: $LOGDIR/deepfm_multi_gpu
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
CONT: $TRAIN_IMAGE_VERSIONED
MOUNTS: ${DATASET}:${DATASET_MOUNT}
SLURM_ACCOUNT: devtech
WALLTIME: "00:15:00"
DGXNNODES: 1
TEST_CMD: ./ci/integration_test/deepfm/deepfm_daily.sub

dlrm:
extends: .cluster_test_job
needs:
@@ -731,14 +759,14 @@ hdfs_backend_test:

wdl_check:
# Push logs to gitlab
extends: .cluster_post_test_job
extends: .cluster_post_test_job_daily
needs:
- wdl
- wdl_multi_gpu
variables:
GPFSFOLDER: $LOGDIR/wdl_check
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
CONT: $TRAIN_IMAGE_VERSIONED
MOUNTS: $LOGDIR/wdl:/logs
MOUNTS: $LOGDIR/wdl_multi_gpu:/logs
SLURM_ACCOUNT: devtech
WALLTIME: "00:15:00"
DGXNNODES: 1
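The new wdl_multi_gpu and deepfm_multi_gpu jobs extend .cluster_test_job_daily, a base template that is not shown in this commit. A minimal sketch of what it presumably looks like, assuming it only layers the daily trigger rules from ci/rules.gitlab_ci.yml on top of the existing .cluster_test_job base (only the names .cluster_test_job and .default:rules:daily-test are confirmed by this diff; the rest is illustrative):

# Hypothetical sketch, not part of this commit
.cluster_test_job_daily:
  extends:
    - .cluster_test_job           # reuse the Selene/Slurm submission logic
    - .default:rules:daily-test   # but run only on the daily schedule or a new image

With the 8-GPU configurations moved into these daily jobs, the per-merge-request wdl and deepfm jobs keep only their single-GPU runs (see the wdl.sub and deepfm.sub changes further down).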
1 change: 1 addition & 0 deletions HugeCTR/core/buffer.cpp
@@ -15,6 +15,7 @@
*/

#include "buffer.hpp"

#include "HugeCTR/include/utils.hpp"
#include "core.hpp"

@@ -217,10 +217,9 @@ class FusedReluBiasFullyConnectedLayer : public Layer {
CudaDeviceContext context(get_device_id());
HCTR_LIB_THROW(cudaEventDestroy(event_overlap_));
}
}catch(const std::exception &error) {
} catch (const std::exception& error) {
HCTR_LOG(INFO, WORLD, "FusedReluBiasFullyConnectedLayer Dtor error:%s", error.what());
}

};
};
} // namespace HugeCTR
1 change: 0 additions & 1 deletion HugeCTR/src/base/debug/logger.cpp
@@ -20,7 +20,6 @@

#include <algorithm>
#include <base/debug/logger.hpp>
#include <resource_managers/resource_manager_ext.hpp>
#include <chrono>
#include <common.hpp>
#include <cstdarg>
3 changes: 1 addition & 2 deletions ci/integration_test/deepfm/deepfm.sub
@@ -3,5 +3,4 @@
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/deepfm && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_1gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_fp16_1gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_8gpu.json"
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_fp16_1gpu.json"
5 changes: 5 additions & 0 deletions ci/integration_test/deepfm/deepfm_daily.sub
@@ -0,0 +1,5 @@
#!/bin/bash

srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/deepfm && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_8gpu.json"
5 changes: 1 addition & 4 deletions ci/integration_test/wdl/wdl.sub
@@ -3,7 +3,4 @@
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/wdl && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_1gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_8gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_1gpu.json && \
mkdir /workdir/export_predictions_wdl_fp16_8gpu/ && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_8gpu.json"
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_1gpu.json"
7 changes: 7 additions & 0 deletions ci/integration_test/wdl/wdl_daily.sub
@@ -0,0 +1,7 @@
#!/bin/bash

srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/wdl && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_8gpu.json && \
mkdir /workdir/export_predictions_wdl_fp16_8gpu/ && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_8gpu.json"
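Putting the pieces of this commit together, the daily WDL chain is now wired like this (condensed from the .gitlab-ci.yml hunks above; only the fields relevant to the wiring are repeated):

wdl_multi_gpu:                       # new daily job
  extends: .cluster_test_job_daily
  variables:
    GPFSFOLDER: $LOGDIR/wdl_multi_gpu
    TEST_CMD: ./ci/integration_test/wdl/wdl_daily.sub    # the script added here

wdl_check:                           # log check now follows the daily job
  extends: .cluster_post_test_job_daily
  needs:
    - wdl_multi_gpu
  variables:
    MOUNTS: $LOGDIR/wdl_multi_gpu:/logs                   # reads the daily job's logs

The deepfm_multi_gpu job and deepfm_daily.sub follow the same pattern.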
28 changes: 14 additions & 14 deletions ci/rules.gitlab_ci.yml
@@ -60,9 +60,9 @@
- <<: *if-push-to-mr-opened
when: never
- <<: *if-default-ci-actions
when: always
# when: always
- <<: *if-new-image
when: always
# when: always
- when: never

.hugectr:rules:build:
@@ -72,11 +72,11 @@
when: never
- <<: *if-default-ci-actions
changes: *hugectr-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- <<: *if-daily-schedule
when: always
#when: always
- when: never

.sok:rules:build:
@@ -86,36 +86,36 @@
when: never
- <<: *if-default-ci-actions
changes: *sok-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- <<: *if-daily-schedule
when: always
#when: always
- when: never

# Condition for run sanity test
.hugectr:rules:sanity-test:
rules:
- <<: *if-merge-request
changes: *hugectr-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- when: never

.sok:rules:sanity-test:
rules:
- <<: *if-merge-request
changes: *sok-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- when: never

.default:rules:daily-test:
rules:
- <<: *if-new-image
when: always
#when: always
- <<: *if-daily-schedule
when: always
#when: always
- when: never
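Note that the when: always lines in these rule entries are commented out rather than deleted. Under standard GitLab CI semantics, a rules entry that matches but carries no when: falls back to the default when: on_success, so the matched jobs still run; they just wait for earlier stages to succeed instead of being forced to run regardless of earlier failures. A generic illustration of that fallback (the names are invented, not from this repository):

# Hypothetical sketch of the pattern
.example:rules:sketch:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
      # when: always  -> would run even if earlier stages failed;
      # with it commented out, the matched rule defaults to when: on_success
    - when: never       # anything that matches no earlier rule is skipped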
44 changes: 28 additions & 16 deletions ci/template.yml
@@ -1,5 +1,6 @@
stages:
- build_from_scratch
- format_check
- build
- test
- inference_benchmark
@@ -11,31 +12,40 @@ stages:
- post_test

.python_format:
stage: build
image: python:3.8-alpine
stage: format_check
tags:
- nvidia.com/cuda.driver.major=470
extends:
- .dlcluster_job
- .format:rules:check
script:
- srun -N 1 -p dgx1v,dgx1v16g,dgx1v32g bash -cx "
echo $(pwd);
docker pull python:3.8-alpine;
docker run --rm -v $(pwd):/src -w /src python:3.8-alpine sh -c 'pip install black && black --line-length 100 --check --diff --color --extend-exclude \"$EXCLUDE\" .'
"
allow_failure: true
- pwd
- ls -all
- docker pull python:3.8-alpine;
- docker run -d --rm --name python_${CI_PIPELINE_ID} -w /src python:3.8-alpine sleep infinity
- docker cp $(pwd) python_${CI_PIPELINE_ID}:/src
- docker exec python_${CI_PIPELINE_ID} sh -c 'pip install black && pwd && ls -all . '
- docker exec python_${CI_PIPELINE_ID} sh -c "black --line-length 100 --check --diff --color --extend-exclude \"$EXCLUDE\" ./hugectr"
after_script:
- docker stop python_${CI_PIPELINE_ID}
allow_failure: false
timeout: 15 minutes

.clang_format:
stage: build
stage: format_check
tags:
- nvidia.com/cuda.driver.major=470
extends:
- .dlcluster_job
- .format:rules:check
script:
- srun -N 1 -p dgx1v,dgx1v16g,dgx1v32g bash -cx "
echo $(pwd);
docker run --rm --workdir /src -v $(pwd):/src gitlab-master.nvidia.com:5005/dl/hugectr/hugectr/clang-format-lint --clang-format-executable /clang-format/$EXECUTABLE -r --exclude $EXCLUDE --style $STYLE --extensions $EXTENSIONS .
"
allow_failure: true
- pwd
- ls -all
- docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
- docker run -d --rm --name clang_${CI_PIPELINE_ID} -w /src gitlab-master.nvidia.com:5005/dl/hugectr/hugectr/clang-format-lint-new sleep infinity
- docker cp $(pwd) clang_${CI_PIPELINE_ID}:/src
- docker exec clang_${CI_PIPELINE_ID} sh -c "cd ./hugectr && /run-clang-format.py --clang-format-executable /clang-format/$EXECUTABLE -r --exclude $EXCLUDE --style $STYLE --extensions $EXTENSIONS ."
after_script:
- docker stop clang_${CI_PIPELINE_ID}
allow_failure: false
timeout: 15 minutes
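Both format checks move from a build-stage srun wrapper to a dedicated format_check stage that drives the container directly: start a sleeping container, docker cp the checkout into it, run the formatter with docker exec, and stop the container in after_script (which runs even when the check fails, so the helper container is always cleaned up). They also change from allow_failure: true to hard failures. A stripped-down sketch of the same pattern for the black check; the stage name, base templates, image, and black options come from the diff, while the job and container names are illustrative:

# Sketch only -- mirrors the new .python_format job above
.python_format_sketch:
  stage: format_check
  extends:
    - .dlcluster_job
    - .format:rules:check
  script:
    - docker run -d --rm --name black_${CI_PIPELINE_ID} -w /src python:3.8-alpine sleep infinity
    - docker cp $(pwd) black_${CI_PIPELINE_ID}:/src
    - docker exec black_${CI_PIPELINE_ID} sh -c "pip install black && black --line-length 100 --check --extend-exclude \"$EXCLUDE\" ./hugectr"
  after_script:
    - docker stop black_${CI_PIPELINE_ID}   # always stop the helper container
  allow_failure: false
  timeout: 15 minutes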

.build_nightly:
@@ -92,6 +102,8 @@ stages:
rules:
- if: $NIGHTLY == "1"
when: always
- if: $TEST_NEW_IMAGE == "1"
when: always
- when: never
timeout: 2 hours

@@ -21,4 +21,4 @@
from hierarchical_parameter_server.core.lookup_layer import LookupLayer
from hierarchical_parameter_server.core.sparse_lookup_layer import SparseLookupLayer

__all__ = [item for item in dir() if not item.startswith("__")]
__all__ = [item for item in dir() if not item.startswith("__")]
@@ -50,7 +50,7 @@ def Init(**kwargs):
.. code-block:: python
import hierarchical_parameter_server as hps
with strategy.scope():
hps.Init(**kwargs)
@@ -72,7 +72,7 @@ def Init(**kwargs):
.. code-block:: python
import hierarchical_parameter_server as hps
hps_init = hps.Init(**kwargs)
with tf.Session() as sess:
sess.run(hps_init)
@@ -83,19 +83,19 @@ def Init(**kwargs):
kwargs: dict
keyword arguments for this function.
Currently, it must contains `global_batch_size` and `ps_config_file`.
* `global_batch_size`: int, the global batch size for HPS that is deployed on multiple GPUs
* `ps_config_file`: str, the JSON configuration file for HPS initialization
An example `ps_config_file` is as follows and `global_batch_size` can be
An example `ps_config_file` is as follows and `global_batch_size` can be
configured as 16384 correspondingly:
.. code-block:: python
ps_config_file = {
"supportlonglong" : True,
"models" :
"models" :
[{
"model": "foo",
"sparse_files": ["foo_sparse.model"],
@@ -22,7 +22,7 @@ class LookupLayer(tf.keras.layers.Layer):
"""
Abbreviated as ``hps.LookupLayer(*args, **kwargs)``.
This is a wrapper class for HPS lookup layer, which basically performs
This is a wrapper class for HPS lookup layer, which basically performs
the same function as tf.nn.embedding_lookup.
Parameters