Commit: Reformat code and fix job trigger rules

EmmaQiaoCh authored and minseokl committed Aug 26, 2022
1 parent ba021a7 commit 4d698a3
Showing 29 changed files with 948 additions and 590 deletions.
34 changes: 31 additions & 3 deletions .gitlab-ci.yml
@@ -351,6 +351,20 @@ wdl:
DGXNNODES: 1 # node num
TEST_CMD: ./ci/integration_test/wdl/wdl.sub # test script

wdl_multi_gpu:
extends: .cluster_test_job_daily # test on selene needs to extend .cluster_test_job
needs:
- build_train_single_node
variables:
GPFSFOLDER: $LOGDIR/wdl_multi_gpu # log dir, usually $LOGDIR + job name
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} # should not change
CONT: $TRAIN_IMAGE_VERSIONED # image name
MOUNTS: ${DATASET}:${DATASET_MOUNT} # mount
SLURM_ACCOUNT: devtech # account, do not need change
WALLTIME: "00:15:00" # estimate job time. Less time, higher priority
DGXNNODES: 1 # node num
TEST_CMD: ./ci/integration_test/wdl/wdl_daily.sub

deepfm:
extends: .cluster_test_job
needs:
@@ -365,6 +379,20 @@ deepfm:
DGXNNODES: 1
TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub

deepfm_multi_gpu:
extends: .cluster_test_job_daily
needs:
- build_train_single_node
variables:
GPFSFOLDER: $LOGDIR/deepfm_multi_gpu
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
CONT: $TRAIN_IMAGE_VERSIONED
MOUNTS: ${DATASET}:${DATASET_MOUNT}
SLURM_ACCOUNT: devtech
WALLTIME: "00:15:00"
DGXNNODES: 1
TEST_CMD: ./ci/integration_test/deepfm/deepfm_daily.sub

dlrm:
extends: .cluster_test_job
needs:
@@ -731,14 +759,14 @@ hdfs_backend_test:

wdl_check:
# Push logs to gitlab
extends: .cluster_post_test_job
extends: .cluster_post_test_job_daily
needs:
- wdl
- wdl_multi_gpu
variables:
GPFSFOLDER: $LOGDIR/wdl_check
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
CONT: $TRAIN_IMAGE_VERSIONED
MOUNTS: $LOGDIR/wdl:/logs
MOUNTS: $LOGDIR/wdl_multi_gpu:/logs
SLURM_ACCOUNT: devtech
WALLTIME: "00:15:00"
DGXNNODES: 1
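The new wdl_multi_gpu and deepfm_multi_gpu jobs extend .cluster_test_job_daily, a base template that is not shown in this commit. A minimal sketch of what it presumably looks like, assuming it only layers the daily trigger rules from ci/rules.gitlab_ci.yml on top of the existing .cluster_test_job base (only the names .cluster_test_job and .default:rules:daily-test are confirmed by this diff; the rest is illustrative):

# Hypothetical sketch, not part of this commit
.cluster_test_job_daily:
  extends:
    - .cluster_test_job           # reuse the Selene/Slurm submission logic
    - .default:rules:daily-test   # but run only on the daily schedule or a new image

With the 8-GPU configurations moved into these daily jobs, the per-merge-request wdl and deepfm jobs keep only their single-GPU runs (see the wdl.sub and deepfm.sub changes further down).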
1 change: 1 addition & 0 deletions HugeCTR/core/buffer.cpp
@@ -15,6 +15,7 @@
*/

#include "buffer.hpp"

#include "HugeCTR/include/utils.hpp"
#include "core.hpp"

@@ -217,10 +217,9 @@ class FusedReluBiasFullyConnectedLayer : public Layer {
CudaDeviceContext context(get_device_id());
HCTR_LIB_THROW(cudaEventDestroy(event_overlap_));
}
}catch(const std::exception &error) {
} catch (const std::exception& error) {
HCTR_LOG(INFO, WORLD, "FusedReluBiasFullyConnectedLayer Dtor error:%s", error.what());
}

};
};
} // namespace HugeCTR
1 change: 0 additions & 1 deletion HugeCTR/src/base/debug/logger.cpp
@@ -20,7 +20,6 @@

#include <algorithm>
#include <base/debug/logger.hpp>
#include <resource_managers/resource_manager_ext.hpp>
#include <chrono>
#include <common.hpp>
#include <cstdarg>
3 changes: 1 addition & 2 deletions ci/integration_test/deepfm/deepfm.sub
@@ -3,5 +3,4 @@
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/deepfm && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_1gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_fp16_1gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_8gpu.json"
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_fp16_1gpu.json"
5 changes: 5 additions & 0 deletions ci/integration_test/deepfm/deepfm_daily.sub
@@ -0,0 +1,5 @@
#!/bin/bash

srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/deepfm && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_8gpu.json"
5 changes: 1 addition & 4 deletions ci/integration_test/wdl/wdl.sub
@@ -3,7 +3,4 @@
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/wdl && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_1gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_8gpu.json && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_1gpu.json && \
mkdir /workdir/export_predictions_wdl_fp16_8gpu/ && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_8gpu.json"
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_1gpu.json"
7 changes: 7 additions & 0 deletions ci/integration_test/wdl/wdl_daily.sub
@@ -0,0 +1,7 @@
#!/bin/bash

srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/wdl && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_8gpu.json && \
mkdir /workdir/export_predictions_wdl_fp16_8gpu/ && \
python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_8gpu.json"
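Putting the pieces of this commit together, the daily WDL chain is now wired like this (condensed from the .gitlab-ci.yml hunks above; only the fields relevant to the wiring are repeated):

wdl_multi_gpu:                       # new daily job
  extends: .cluster_test_job_daily
  variables:
    GPFSFOLDER: $LOGDIR/wdl_multi_gpu
    TEST_CMD: ./ci/integration_test/wdl/wdl_daily.sub    # the script added here

wdl_check:                           # log check now follows the daily job
  extends: .cluster_post_test_job_daily
  needs:
    - wdl_multi_gpu
  variables:
    MOUNTS: $LOGDIR/wdl_multi_gpu:/logs                   # reads the daily job's logs

The deepfm_multi_gpu job and deepfm_daily.sub follow the same pattern.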
28 changes: 14 additions & 14 deletions ci/rules.gitlab_ci.yml
@@ -60,9 +60,9 @@
- <<: *if-push-to-mr-opened
when: never
- <<: *if-default-ci-actions
when: always
# when: always
- <<: *if-new-image
when: always
# when: always
- when: never

.hugectr:rules:build:
@@ -72,11 +72,11 @@
when: never
- <<: *if-default-ci-actions
changes: *hugectr-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- <<: *if-daily-schedule
when: always
#when: always
- when: never

.sok:rules:build:
@@ -86,36 +86,36 @@
when: never
- <<: *if-default-ci-actions
changes: *sok-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- <<: *if-daily-schedule
when: always
#when: always
- when: never

# Condition for run sanity test
.hugectr:rules:sanity-test:
rules:
- <<: *if-merge-request
changes: *hugectr-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- when: never

.sok:rules:sanity-test:
rules:
- <<: *if-merge-request
changes: *sok-source
when: always
#when: always
- <<: *if-new-image
when: always
#when: always
- when: never

.default:rules:daily-test:
rules:
- <<: *if-new-image
when: always
#when: always
- <<: *if-daily-schedule
when: always
#when: always
- when: never
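Note that the when: always lines in these rule entries are commented out rather than deleted. Under standard GitLab CI semantics, a rules entry that matches but carries no when: falls back to the default when: on_success, so the matched jobs still run; they just wait for earlier stages to succeed instead of being forced to run regardless of earlier failures. A generic illustration of that fallback (the names are invented, not from this repository):

# Hypothetical sketch of the pattern
.example:rules:sketch:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
      # when: always  -> would run even if earlier stages failed;
      # with it commented out, the matched rule defaults to when: on_success
    - when: never       # anything that matches no earlier rule is skipped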
44 changes: 28 additions & 16 deletions ci/template.yml
@@ -1,5 +1,6 @@
stages:
- build_from_scratch
- format_check
- build
- test
- inference_benchmark
@@ -11,31 +12,40 @@ stages:
- post_test

.python_format:
stage: build
image: python:3.8-alpine
stage: format_check
tags:
- nvidia.com/cuda.driver.major=470
extends:
- .dlcluster_job
- .format:rules:check
script:
- srun -N 1 -p dgx1v,dgx1v16g,dgx1v32g bash -cx "
echo $(pwd);
docker pull python:3.8-alpine;
docker run --rm -v $(pwd):/src -w /src python:3.8-alpine sh -c 'pip install black && black --line-length 100 --check --diff --color --extend-exclude \"$EXCLUDE\" .'
"
allow_failure: true
- pwd
- ls -all
- docker pull python:3.8-alpine;
- docker run -d --rm --name python_${CI_PIPELINE_ID} -w /src python:3.8-alpine sleep infinity
- docker cp $(pwd) python_${CI_PIPELINE_ID}:/src
- docker exec python_${CI_PIPELINE_ID} sh -c 'pip install black && pwd && ls -all . '
- docker exec python_${CI_PIPELINE_ID} sh -c "black --line-length 100 --check --diff --color --extend-exclude \"$EXCLUDE\" ./hugectr"
after_script:
- docker stop python_${CI_PIPELINE_ID}
allow_failure: false
timeout: 15 minutes

.clang_format:
stage: build
stage: format_check
tags:
- nvidia.com/cuda.driver.major=470
extends:
- .dlcluster_job
- .format:rules:check
script:
- srun -N 1 -p dgx1v,dgx1v16g,dgx1v32g bash -cx "
echo $(pwd);
docker run --rm --workdir /src -v $(pwd):/src gitlab-master.nvidia.com:5005/dl/hugectr/hugectr/clang-format-lint --clang-format-executable /clang-format/$EXECUTABLE -r --exclude $EXCLUDE --style $STYLE --extensions $EXTENSIONS .
"
allow_failure: true
- pwd
- ls -all
- docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
- docker run -d --rm --name clang_${CI_PIPELINE_ID} -w /src gitlab-master.nvidia.com:5005/dl/hugectr/hugectr/clang-format-lint-new sleep infinity
- docker cp $(pwd) clang_${CI_PIPELINE_ID}:/src
- docker exec clang_${CI_PIPELINE_ID} sh -c "cd ./hugectr && /run-clang-format.py --clang-format-executable /clang-format/$EXECUTABLE -r --exclude $EXCLUDE --style $STYLE --extensions $EXTENSIONS ."
after_script:
- docker stop clang_${CI_PIPELINE_ID}
allow_failure: false
timeout: 15 minutes
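Both format checks move from a build-stage srun wrapper to a dedicated format_check stage that drives the container directly: start a sleeping container, docker cp the checkout into it, run the formatter with docker exec, and stop the container in after_script (which runs even when the check fails, so the helper container is always cleaned up). They also change from allow_failure: true to hard failures. A stripped-down sketch of the same pattern for the black check; the stage name, base templates, image, and black options come from the diff, while the job and container names are illustrative:

# Sketch only -- mirrors the new .python_format job above
.python_format_sketch:
  stage: format_check
  extends:
    - .dlcluster_job
    - .format:rules:check
  script:
    - docker run -d --rm --name black_${CI_PIPELINE_ID} -w /src python:3.8-alpine sleep infinity
    - docker cp $(pwd) black_${CI_PIPELINE_ID}:/src
    - docker exec black_${CI_PIPELINE_ID} sh -c "pip install black && black --line-length 100 --check --extend-exclude \"$EXCLUDE\" ./hugectr"
  after_script:
    - docker stop black_${CI_PIPELINE_ID}   # always stop the helper container
  allow_failure: false
  timeout: 15 minutes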

.build_nightly:
@@ -92,6 +102,8 @@ stages:
rules:
- if: $NIGHTLY == "1"
when: always
- if: $TEST_NEW_IMAGE == "1"
when: always
- when: never
timeout: 2 hours

@@ -21,4 +21,4 @@
from hierarchical_parameter_server.core.lookup_layer import LookupLayer
from hierarchical_parameter_server.core.sparse_lookup_layer import SparseLookupLayer

__all__ = [item for item in dir() if not item.startswith("__")]
__all__ = [item for item in dir() if not item.startswith("__")]
@@ -50,7 +50,7 @@ def Init(**kwargs):
.. code-block:: python
import hierarchical_parameter_server as hps
with strategy.scope():
hps.Init(**kwargs)
@@ -72,7 +72,7 @@ def Init(**kwargs):
.. code-block:: python
import hierarchical_parameter_server as hps
hps_init = hps.Init(**kwargs)
with tf.Session() as sess:
sess.run(hps_init)
@@ -83,19 +83,19 @@ def Init(**kwargs):
kwargs: dict
keyword arguments for this function.
Currently, it must contains `global_batch_size` and `ps_config_file`.
* `global_batch_size`: int, the global batch size for HPS that is deployed on multiple GPUs
* `ps_config_file`: str, the JSON configuration file for HPS initialization
An example `ps_config_file` is as follows and `global_batch_size` can be
An example `ps_config_file` is as follows and `global_batch_size` can be
configured as 16384 correspondingly:
.. code-block:: python
ps_config_file = {
"supportlonglong" : True,
"models" :
"models" :
[{
"model": "foo",
"sparse_files": ["foo_sparse.model"],
@@ -22,7 +22,7 @@ class LookupLayer(tf.keras.layers.Layer):
"""
Abbreviated as ``hps.LookupLayer(*args, **kwargs)``.
This is a wrapper class for HPS lookup layer, which basically performs
This is a wrapper class for HPS lookup layer, which basically performs
the same function as tf.nn.embedding_lookup.
Parameters