From 9c55721d3edbb14abb39ed8676a41ed6dad4e781 Mon Sep 17 00:00:00 2001 From: yumemiso Date: Wed, 27 May 2026 20:31:08 +0800 Subject: [PATCH] add common method for gpu virtualization in cloud --- GPU-Virtual-Service/gpu-remoting/README.md | 2 + .../gpu-remoting/include/chunkStructure.h | 75 ++ .../gpu-remoting/include/configure.h | 112 +++ .../gpu-remoting/include/constVar.h | 372 +++++++++ .../gpu-remoting/include/define.h | 552 +++++++++++++ .../gpu-remoting/src/common/configure.cc | 43 + .../gpu-remoting/src/common/elfHandle.cc | 732 ++++++++++++++++++ .../xpu-pool-service/.gitmodules | 7 - .../xpu-pool-service/ci/VersionSet.xml | 1 - .../xpu-pool-service/ci/app_define.json | 13 - .../xpu-pool-service/ci/at/at_deploy.sh | 31 - .../xpu-pool-service/ci/at/at_deploy.yml | 53 -- .../xpu-pool-service/ci/build.sh | 101 --- .../xpu-pool-service/ci/build.yml | 48 -- .../xpu-pool-service/ci/buildinfo.sh | 16 - .../ci/cmc/openSource_x86.xml | 24 - .../xpu-pool-service/ci/cmc/upload_cmc.xml | 25 - .../xpu-pool-service/ci/cms_signature.sh | 63 -- .../xpu-pool-service/ci/dependency.xml | 8 - .../xpu-pool-service/ci/hwp7s_signature.sh | 47 -- .../xpu-pool-service/ci/opensource.xml | 6 - .../xpu-pool-service/ci/third_party | 1 - .../ci/xpu_pool/build_x86.yml | 70 -- .../ci/xpu_pool/build_xpu_package.sh | 131 ---- .../xpu_docker_build/acl_client/Dockerfile | 10 - .../xpu_docker_build/cuda_client/Dockerfile | 10 - .../xpu_docker_build/exporter/Dockerfile | 30 - .../gpu-device-plugin/Dockerfile | 19 - .../npu-device-plugin/Dockerfile | 19 - 29 files changed, 1888 insertions(+), 733 deletions(-) create mode 100644 GPU-Virtual-Service/gpu-remoting/README.md create mode 100644 GPU-Virtual-Service/gpu-remoting/include/chunkStructure.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/configure.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/constVar.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/define.h create mode 100644 
GPU-Virtual-Service/gpu-remoting/src/common/configure.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/common/elfHandle.cc delete mode 100644 GPU-Virtual-Service/xpu-pool-service/.gitmodules delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/VersionSet.xml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/app_define.json delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.sh delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.yml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/build.sh delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/build.yml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/buildinfo.sh delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/cmc/openSource_x86.xml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/cmc/upload_cmc.xml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/cms_signature.sh delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/dependency.xml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/hwp7s_signature.sh delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/opensource.xml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/third_party delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_x86.yml delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_xpu_package.sh delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/acl_client/Dockerfile delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/cuda_client/Dockerfile delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/exporter/Dockerfile delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/gpu-device-plugin/Dockerfile delete mode 100644 GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/npu-device-plugin/Dockerfile diff --git 
a/GPU-Virtual-Service/gpu-remoting/README.md b/GPU-Virtual-Service/gpu-remoting/README.md new file mode 100644 index 0000000..12b3362 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/README.md @@ -0,0 +1,2 @@ +# Flexible GPU Virtualization in Cloud (FlexGV) + diff --git a/GPU-Virtual-Service/gpu-remoting/include/chunkStructure.h b/GPU-Virtual-Service/gpu-remoting/include/chunkStructure.h new file mode 100644 index 0000000..b49e4d6 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/chunkStructure.h @@ -0,0 +1,75 @@ +#ifndef CHUNK_STRUCTURE_H +#define CHUNK_STRUCTURE_H + +#include "constVar.h" +#include +#include +#include + +typedef struct { + char *name; + size_t paramSize; + size_t paramNum; + uint16_t *paramOffsets; + uint16_t *paramSizes; + void *host_fun; +} KernelInfo_t; // used to store kernel parmeters from client fatCubin + +struct KernelPtx_t { + std::string name; + std::string body; + + KernelPtx_t(const char* n, size_t name_len, const char* b, size_t body_len) + : name(n, name_len), body(b, body_len) {} +}; // used to store kernel body from PTX codes + +struct LdParamInfo_t { + bool isUsed; + size_t index; + size_t offset; + + LdParamInfo_t(size_t idx, size_t off) : isUsed(false), index(idx), offset(off) {} +}; + +struct BatchInfo_t { + uint8_t curType; + size_t curBatchSize; +}; + +struct HostBuffer_t { + uint8_t* hostPtr; + size_t size; +}; + +struct TensorInfo_t { + void* devPtr; + size_t size; +}; + +struct Block_t{ + uint64_t start; + uint64_t devPtr = 0; + size_t size = 0; + bool valid = false; + bool essential = false; +}; + +struct Handle_t { + uint64_t handlePtr = 0; + enum API_REQUEST_CODE_SET type; + bool valid = false; + uint64_t stream = 0; +}; + +struct Sync_t { + boost::mutex mutex; + boost::condition_variable cv; +}; + +// struct GpuInform{ +// int GpuId; +// char IpAddr [IP_STRING_LEN]; +// int Port; +// }; + +#endif //CHUNK_STRUCTURE_H \ No newline at end of file diff --git 
/**
 * @brief Endpoint and job settings read once from a configuration file.
 *
 * The constructor stores the client flag and calls ReadConf(path). Server,
 * dispatcher and monitor endpoints are always read; the client-only fields
 * (client id, priority, proxy endpoint, GPU counts, model, batch size) are only
 * filled in when the object is created with isClient == true. Every scalar
 * member therefore carries a default so that the getters return a defined value
 * on a non-client instance instead of reading an indeterminate one.
 */
class Configure {
private:
    std::string serverIp_;
    uint16_t serverPort_ = 0;

    uint64_t clientID_ = 0;
    bool isClient_ = false;
    size_t reqGPUnum_ = 0;
    size_t priority_ = 0;
    std::string proxyIp_;
    uint16_t proxyPort_ = 0;

    std::string dpcIp_;
    uint16_t dpcPort_ = 0;

    std::string monIp_;
    uint16_t monPort_ = 0;

    // 1 means "no multi-GPU override"; see GetReqGPUnum().
    size_t DDPreqGPUnum_ = 1;

    std::string model_;
    size_t batchSize_ = 0;

    /// Loads all fields from the file at @p path.
    void ReadConf(std::string path);

public:
    /**
     * @param path      location of the configuration file
     * @param isClient  when true the client-only fields are loaded as well
     */
    Configure(std::string path, bool isClient = false);

    ~Configure();

    inline const std::string& GetServerIp() const noexcept {
        return serverIp_;
    }

    inline uint16_t GetServerPort() const noexcept {
        return serverPort_;
    }

    inline uint64_t GetClientID() const noexcept {
        return clientID_;
    }

    /**
     * @return the DDP GPU count when it is greater than 1, otherwise the
     *         GPU count requested in the configuration file.
     */
    inline size_t GetReqGPUnum() const noexcept {
        if (DDPreqGPUnum_ > 1) {
            return DDPreqGPUnum_;
        }
        return reqGPUnum_;
    }

    inline size_t GetPriority() const noexcept {
        return priority_;
    }

    inline const std::string& GetProxyIp() const noexcept {
        return proxyIp_;
    }

    inline uint16_t GetProxyPort() const noexcept {
        return proxyPort_;
    }

    inline const std::string& GetDpcIp() const noexcept {
        return dpcIp_;
    }

    inline uint16_t GetDpcPort() const noexcept {
        return dpcPort_;
    }

    inline const std::string& GetMonIp() const noexcept {
        return monIp_;
    }

    inline uint16_t GetMonPort() const noexcept {
        return monPort_;
    }

    inline size_t GetDDPreqGPUnum() const noexcept {
        return DDPreqGPUnum_;
    }

    inline const std::string& GetModel() const noexcept {
        return model_;
    }

    inline size_t GetBatchSize() const noexcept {
        return batchSize_;
    }
};
// Fixed string buffer lengths for textual IP addresses and ports.
#define IP_STRING_LEN 40
#define PORT_STRING_LEN 8

#define THREAD_NUM_PER_CLIENT 100
#define IOV_MAX_NUM 1000
#define PARAM_MAX_NUM 50
// Arithmetic macros are parenthesized so that they expand as a single value
// inside larger expressions (e.g. `x % REG_PARAM_MAX_NUM`).
#define REG_PARAM_MAX_NUM (128 * 1024)
#define TEST_STRING_LEN 50
#define REQUEST_BUFFER_SIZE (3LL * 1024 * 1024 * 1024)
#define RECV_AM_SHIFT_BIT 8

// A 64-bit address is split into a block id (top BLOCK_ID_BIT bits) and an
// offset inside the block (remaining BLOCK_SHIFT_BIT bits).
#define BLOCK_ID_BIT 17
#define BLOCKS_MAX_NUM (1LL << BLOCK_ID_BIT) // 131072 (128Ki) blocks
#define BLOCK_SHIFT_BIT (64 - BLOCK_ID_BIT)  // 47 offset bits: up to 128 TiB per block

#define HANDLE_MAX_NUM 800730
#define HANDLE_PREFIX ((1LL << 54) | (1LL << 52) | (1LL << 50) | (1LL << 48)) // prefix: 0x55000000000000
#define HANDLE_MASK ((1LL << 48) - 1)

#define BACKUP_STREAM_NUM 4
#define BACKUP_PERIOD 20
#define BACKUP_API_MAX_NUM 10000
#define COMM_EVENT_TIMEOUT 50

// static const char* BACKUP_FILE_DIR = "/mnt/nvme0/FlexGV_Test/Bak/";
static const char* BACKUP_FILE_DIR = "/mnt/nvme0/Bak163/";

static const size_t CONN_RESERVED_NUM = 1024;
static const uint32_t THREAD_STACK_SIZE = 8*1024*1024;

static const size_t DEVICE_POINTER_SIZE = sizeof(void *) * 2 + 3;
static const size_t HOST_POINTER_SIZE = sizeof(void *) * 2 + 3;
static const size_t DEVICE_FUNC_INFO_SMALL_SIZE = 1 * 1024 + 512 + 256;
static const size_t DEVICE_FUNC_INFO_LARGE_SIZE = 4 * 1024;

enum SERVER_STATUS_SET {
    NORMAL_STATUS,
    // NODE_FAILURE, // cudaError or process failure
    EP_CLOSE,      // current endpoint ready to close (migration)
    COMM_ABORT     // communication abort
};
__CUDA_UNREGISTER_FAT_BINARY, + __CUDA_REGISTER_FUNCTION, + __CUDA_REGISTER_VAR, + CUDA_MALLOC, + CUDA_MEMCPY, + CUDA_MEMCPY_H2D, + CUDA_MEMCPY_D2H, + CUDA_MEMCPY_D2D, + NEW_ITERATION_REQ, // used for identifying the new iteration + CUDA_MEMCPY_ASYNC_H2D, + CUDA_MEMCPY_ASYNC_D2H, + CUDA_MEMCPY_ASYNC_D2D, + CUDA_MEMCPY_TO_SYMBOL, + CUDA_MEM_GET_INFO, + CUDA_FREE, + CUDA_STREAM_CREATE, + CUDA_STREAM_CREATE_WITH_FLAGS, + CUDA_STREAM_CREATE_WITH_PRIORITY, + CUDA_STREAM_DESTROY, + CUDA_STREAM_WAIT_EVENT, + CUDA_STREAM_SYNCHRONIZE, + CUDA_STREAM_IS_CAPTURING, + CUDA_STREAM_GET_CAPTURE_INFO, + CUDA_EVENT_CREATE, + CUDA_EVENT_CREATE_WITH_FLAGS, + CUDA_EVENT_RECORD, + CUDA_EVENT_QUERY, + CUDA_EVENT_DESTROY, + CUDA_EVENT_ELAPSED_TIME, + CUDA_LAUNCH_KERNEL, + CUDA_FUNC_GET_ATTRIBUTES, + CUDA_DEVICE_SYNCHRONIZE, + CUDA_MEMSET, + CUDA_MEMSET_ASYNC, + CUDA_SET_DEVICE, + CUDA_SET_MAIN_DEVICE, + CUDA_GET_DEVICE_COUNT, + CUDA_GET_DEVICE, + CUDA_GET_DEVICE_PROPERTIES, + CUDA_DEVICE_GET_ATTRIBUTE, + CUDA_OCCUPANCY_MAX_ACTIVE_BLOCKS_PER_MULTIPROCESSOR, + CUDA_OCCUPANCY_MAX_ACTIVE_BLOCKS_PER_MULTIPROCESSOR_WITH_FLAGS, + + /* cuBlas API */ + CUBLAS_CREATE_V2, + CUBLAS_SGEMM_V2, + CUBLAS_SGEMM_STRIDED_BATCHED, + CUBLAS_DESTROY_V2, + CUBLAS_SET_STREAM_V2, + CUBLAS_SET_WORKSPACE_V2, + CUBLAS_SET_MATH_MODE, + CUBLAS_GET_MATH_MODE, + + /* cuBlasLt API */ + CUBLASLT_CREATE, + CUBLASLT_DESTROY, + CUBLASLT_MATMULDESC_CREATE, + CUBLASLT_MATMULDESC_DESTROY, + CUBLASLT_MATMULDESC_SETATTRIBUTE, + CUBLASLT_MATRIX_LAYOUT_CREATE, + CUBLASLT_MATRIX_LAYOUT_DESTROY, + CUBLASLT_MATRIX_LAYOUT_SETATTRIBUTE, + CUBLASLT_MATMULPREFERENCE_CREATE, + CUBLASLT_MATMULPREFERENCE_DESTROY, + CUBLASLT_MATMULPREFERENCE_SETATTRIBUTE, + CUBLASLT_MATMULALGO_GETHEURISTIC, + CUBLASLT_MATMUL, + + /* cuDNN API */ + CUDNN_CREATE, + CUDNN_DESTROY, + CUDNN_CREATE_TENSOR_DESCRIPTOR, + CUDNN_DESTROY_TENSOR_DESCRIPTOR, + CUDNN_GET_TENSOR_SIZE_IN_BYTES, + CUDNN_SET_TENSOR_4D_DESCRIPTOR, + CUDNN_SET_TENSOR_ND_DESCRIPTOR, + 
CUDNN_SET_TENSOR_ND_DESCRIPTOR_EX, + CUDNN_CREATE_TENSOR_TRANSFORM_DESCRIPTOR, + CUDNN_SET_TENSOR_TRANSFORM_DESCRIPTOR, + CUDNN_DESTROY_TENSOR_TRANSFORM_DESCRIPTOR, + CUDNN_INIT_TRANSFORM_DEST, + CUDNN_TRANSFORM_TENSOR_EX, + CUDNN_TRANSFORM_FILTER, + CUDNN_CREATE_FILTER_DESCRIPTOR, + CUDNN_SET_FILTER_ND_DESCRIPTOR, + CUDNN_DESTROY_FILTER_DESCRIPTOR, + CUDNN_GET_FILTER_SIZE_IN_BYTES, + CUDNN_GET_FOLDED_CONV_BACKWARD_DATA_DESCRIPTORS, + CUDNN_SET_STREAM, + CUDNN_BATCH_NORMALIZATION_BACKWARD_EX, + CUDNN_BATCH_NORMALIZATION_FORWARD_TRAINING_EX, + CUDNN_BATCH_NORMALIZATION_FORWARD_INFERENCE, + CUDNN_BACKEND_CREATE_DESCRIPTOR, + CUDNN_BACKEND_DESTROY_DESCRIPTOR, + CUDNN_BACKEND_SET_ATTRIBUTE, + CUDNN_BACKEND_GET_ATTRIBUTE, + CUDNN_BACKEND_EXECUTE, + CUDNN_BACKEND_FINALIZE, + CUDNN_GET_BATCH_NORMALIZATION_BACKWARD_EX_WORKSPACE_SIZE, + CUDNN_GET_BATCH_NORMALIZATION_FORWARD_TRAINING_EX_WORKSPACE_SIZE, + CUDNN_GET_BATCH_NORMALIZATION_TRAINING_EX_RESERVE_SPACE_SIZE, + CUDNN_CREATE_CONVOLUTION_DESCRIPTOR, + CUDNN_DESTROY_CONVOLUTION_DESCRIPTOR, + CUDNN_SET_CONVOLUTION_GROUP_COUNT, + CUDNN_SET_CONVOLUTION_MATH_TYPE, + CUDNN_SET_CONVOLUTION_ND_DESCRIPTOR, + CUDNN_SET_CONVOLUTION_REORDER_TYPE, + CUDNN_GET_CONVOLUTION_FORWARD_ALGORITHM_V7, + CUDNN_GET_CONVOLUTION_BACKWARD_FILTER_ALGORITHM_V7, + CUDNN_GET_CONVOLUTION_BACKWARD_DATA_ALGORITHM_V7, + CUDNN_GET_CONVOLUTION_FORWARD_WORKSPACE_SIZE, + CUDNN_CONVOLUTION_FORWARD, + CUDNN_GET_CONVOLUTION_BACKWARD_DATA_WORKSPACE_SIZE, + CUDNN_CONVOLUTION_BACKWARD_FILTER, + CUDNN_GET_CONVOLUTION_BACKWARD_FILTER_WORKSPACE_SIZE, + CUDNN_CONVOLUTION_BACKWARD_DATA, + + /* NCCL API */ + NCCL_GROUP_START, + NCCL_GROUP_END, + NCCL_COMM_INIT_RANK, + NCCL_COMM_DESTROY, + NCCL_COMM_GET_ASYNC_ERROR, + NCCL_GET_UNIQUE_ID, + NCCL_GET_VERSION, + NCCL_ALL_REDUCE, + NCCL_REDUCE, + NCCL_REDUCE_SCATTER, + NCCL_ALL_GATHER, + NCCL_BROADCAST, + NCCL_SEND, + NCCL_RECV, + NCCL_COMM_COUNT, + NCCL_COMM_USER_RANK, + NCCL_COMM_CU_DEVICE, + NCCL_COMM_ABORT, + 
NCCL_COMM_INIT_ALL, + NCCL_COMM_INIT_RANK_CONFIG, + NCCL_COMM_SPLIT, + NCCL_COMM_FINALIZE, + NCCL_COMM_REGISTER, + NCCL_COMM_DEREGISTER, + NCCL_MEM_ALLOC, + NCCL_MEM_FREE, + NCCL_RED_OP_CREATE_PRE_MUL_SUM, + NCCL_RED_OP_DESTROY +}; + +enum MEMCPY_DATA_TYPE { + MEMCPY_OTHER = 0, + MEMCPY_TRAIN = 1, + MEMCPY_VALID = 2, + MEMCPY_MODEL = 3 +}; + +const int NotNeedRecordAPIs[] = { + /* CUDA Runtime API */ + // CUDA_MALLOC, // The validation phase of training sometimes needs to malloc memory + // CUDA_MEMCPY_H2D, + CUDA_MEMCPY_D2H, + CUDA_MEMCPY_D2D, + // CUDA_MEMCPY_ASYNC_H2D, + CUDA_MEMCPY_ASYNC_D2H, + // CUDA_MEMCPY_ASYNC_D2D, + CUDA_MEMCPY_TO_SYMBOL, + CUDA_MEM_GET_INFO, + CUDA_FREE, + CUDA_STREAM_IS_CAPTURING, + CUDA_STREAM_GET_CAPTURE_INFO, + CUDA_EVENT_ELAPSED_TIME, + CUDA_FUNC_GET_ATTRIBUTES, + // CUDA_MEMSET, + // CUDA_MEMSET_ASYNC, + CUDA_SET_DEVICE, + CUDA_SET_MAIN_DEVICE, + CUDA_GET_DEVICE_COUNT, + CUDA_GET_DEVICE, + CUDA_GET_DEVICE_PROPERTIES, + CUDA_DEVICE_GET_ATTRIBUTE, + CUDA_OCCUPANCY_MAX_ACTIVE_BLOCKS_PER_MULTIPROCESSOR, + CUDA_OCCUPANCY_MAX_ACTIVE_BLOCKS_PER_MULTIPROCESSOR_WITH_FLAGS, + + /* cuBlas API */ + CUBLAS_GET_MATH_MODE, + + /* cuBlasLt API */ + CUBLASLT_MATMULALGO_GETHEURISTIC, + + /* cuDNN API */ + CUDNN_GET_TENSOR_SIZE_IN_BYTES, + CUDNN_GET_FILTER_SIZE_IN_BYTES, + //todo: backend APIs + CUDNN_GET_BATCH_NORMALIZATION_BACKWARD_EX_WORKSPACE_SIZE, + CUDNN_GET_BATCH_NORMALIZATION_FORWARD_TRAINING_EX_WORKSPACE_SIZE, + CUDNN_GET_BATCH_NORMALIZATION_TRAINING_EX_RESERVE_SPACE_SIZE, + CUDNN_GET_CONVOLUTION_FORWARD_ALGORITHM_V7, + CUDNN_GET_CONVOLUTION_BACKWARD_FILTER_ALGORITHM_V7, + CUDNN_GET_CONVOLUTION_BACKWARD_DATA_ALGORITHM_V7, + CUDNN_GET_CONVOLUTION_FORWARD_WORKSPACE_SIZE, + CUDNN_GET_CONVOLUTION_BACKWARD_DATA_WORKSPACE_SIZE, + CUDNN_GET_CONVOLUTION_BACKWARD_FILTER_WORKSPACE_SIZE, + + /* NCCL API */ + // NCCL_GET_UNIQUE_ID, + NCCL_GET_VERSION, + NCCL_COMM_COUNT, + NCCL_COMM_USER_RANK, + NCCL_COMM_CU_DEVICE, + NCCL_MEM_ALLOC, + NCCL_MEM_FREE +}; 
+ +const int ComputeAPIs[] = { + /* CUDA Runtime API */ + NEW_ITERATION_REQ, + CUDA_STREAM_WAIT_EVENT, // just a temporary state + CUDA_STREAM_SYNCHRONIZE, + CUDA_EVENT_RECORD, // just a temporary state + CUDA_LAUNCH_KERNEL, + CUDA_DEVICE_SYNCHRONIZE, + + /* cuBlas API */ + CUBLAS_SGEMM_V2, + CUBLAS_SGEMM_STRIDED_BATCHED, + + /* cuBlasLt API */ + CUBLASLT_MATMUL, + + /* cuDNN API */ + CUDNN_BATCH_NORMALIZATION_BACKWARD_EX, + CUDNN_BATCH_NORMALIZATION_FORWARD_TRAINING_EX, + CUDNN_BATCH_NORMALIZATION_FORWARD_INFERENCE, + //todo: backend APIs + CUDNN_CONVOLUTION_FORWARD, + CUDNN_CONVOLUTION_BACKWARD_FILTER, + CUDNN_CONVOLUTION_BACKWARD_DATA, + + /* NCCL API */ + NCCL_GROUP_START, + NCCL_GROUP_END, + NCCL_COMM_GET_ASYNC_ERROR, + NCCL_ALL_REDUCE, + NCCL_REDUCE, + NCCL_REDUCE_SCATTER, + NCCL_ALL_GATHER, + NCCL_BROADCAST, + NCCL_SEND, + NCCL_RECV, + // NCCL_COMM_FINALIZE +}; + +const int DestroyAPIs[] = { + /* CUDA Runtime API */ + CUDA_STREAM_DESTROY, + CUDA_EVENT_DESTROY, + // CUDA_FREE, + + /* cuBlas API */ + CUBLAS_DESTROY_V2, + + /* cuBlasLt API */ + CUBLASLT_DESTROY, + CUBLASLT_MATMULDESC_DESTROY, + CUBLASLT_MATRIX_LAYOUT_DESTROY, + CUBLASLT_MATMULPREFERENCE_DESTROY, + + + /* cuDNN API */ + CUDNN_DESTROY, + CUDNN_DESTROY_TENSOR_DESCRIPTOR, + CUDNN_DESTROY_TENSOR_TRANSFORM_DESCRIPTOR, + CUDNN_DESTROY_FILTER_DESCRIPTOR, + // CUDNN_BACKEND_DESTROY_DESCRIPTOR, + CUDNN_DESTROY_CONVOLUTION_DESCRIPTOR, + + /* NCCL API */ + NCCL_COMM_DESTROY, + NCCL_COMM_DEREGISTER, + // NCCL_MEM_FREE, + NCCL_RED_OP_DESTROY +}; + +const int CreateAPIs[] = { + /* CUDA Runtime API */ + CUDA_STREAM_CREATE, + CUDA_STREAM_CREATE_WITH_FLAGS, + CUDA_STREAM_CREATE_WITH_PRIORITY, + CUDA_EVENT_CREATE, + CUDA_EVENT_CREATE_WITH_FLAGS, + + /* cuBlas API */ + CUBLAS_CREATE_V2, + + /* cuBlasLt API */ + CUBLASLT_CREATE, + CUBLASLT_MATMULDESC_CREATE, + CUBLASLT_MATRIX_LAYOUT_CREATE, + CUBLASLT_MATMULPREFERENCE_CREATE, + + /* cuDNN API */ + CUDNN_CREATE, + CUDNN_CREATE_TENSOR_DESCRIPTOR, + 
// Log categories: one bit each so they can be OR-ed into a mask. Each value is
// parenthesized so the shift survives being used inside a larger expression
// (for example `LOG_DEBUG * 2` or `level == LOG_ERROR`).
#define LOG_INFO (1 << 0)
#define LOG_COMM (1 << 1)
#define LOG_ERROR (1 << 2)
#define LOG_DEBUG (1 << 3)
#define LOG_REGS (1 << 4)

// Mask of the categories that are currently emitted.
#define LOG_CURR (LOG_INFO | LOG_ERROR)

// Compile-time feature switches; uncomment a line to enable that feature.
// #define GV_Monitor
#define GV_GPUMAP
// #define GV_Scheduler
// #define GV_eScheduler
// #define GV_MSGHANDLER

// #define GV_MEMORY
// #define GV_MEMORY_PTX
// #define GV_HANDLE
// #define GV_BACKUP

// Unit conversion factors.
static const uint64_t MB_2_B = 1000 * 1000;
static const uint64_t MiB_2_B = uint64_t(1) << 20;
static const uint64_t KB_2_B = 1000;
static const uint64_t KiB_2_B = uint64_t(1) << 10;
static const uint64_t SEC_2_US = 1000 * 1000;

#define PAGE_SIZE 4096
// Round `size` up to the next multiple of PAGE_SIZE.
#define ALIGN_UP(size) (((size) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)

// Macro arguments are parenthesized so that expressions such as `a + b` can be
// passed safely.
#define GET_BLOCK_ID(x) ((x) >> BLOCK_SHIFT_BIT) // address to block id
#define GET_BLOCK_INTER_OFFSET(x) ((x) & ((1LL << BLOCK_SHIFT_BIT) - 1)) // address to block internal offset

#define GET_HANDLE_ID(x) ((x) & HANDLE_MASK) // get the api handle id
#define CHECK_HANDLE_PREFIX(x) ((x) & HANDLE_PREFIX) // check the api handle id (prefix)


// If `_cond` is true, report "Failed to <_msg>" on stderr and run `_action`.
#define CHKERR_ACTION(_cond, _msg, _action) \
    do { \
        if (_cond) { \
            fprintf(stderr, "Failed to %s\n", _msg); \
            _action; \
        } \
    } while (0)


// Same as CHKERR_ACTION, with `goto _label` as the action.
#define CHKERR_JUMP(_cond, _msg, _label) \
    CHKERR_ACTION(_cond, _msg, goto _label)


// Like CHKERR_JUMP, additionally printing `_retval` (formatted with %d).
#define CHKERR_JUMP_RETVAL(_cond, _msg, _label, _retval) \
    do { \
        if (_cond) { \
            fprintf(stderr, "Failed to %s, return value %d\n", _msg, _retval); \
            goto _label; \
        } \
    } while (0)

// A key packs a client id above a 2-bit data type.
#define GENERATE_KEY(clientID, dataType) (((clientID) << 2) | (dataType))
#define GET_CLIENT_ID_FROM_KEY(key) ((key) >> 2)
#define GET_DATA_TYPE_FROM_KEY(key) ((key) & 0x3)


namespace tool {
    /**
     * @brief Get the elapsed time between two timestamps
     *
     * @param start_time start time
     * @param end_time end time
     * @return double the diff time (sec)
     */
    inline double GetTimeDiff(struct timeval start_time, struct timeval end_time) {
        const double elapsed_us =
            static_cast<double>(end_time.tv_sec - start_time.tv_sec) * SEC_2_US +
            end_time.tv_usec - start_time.tv_usec;
        return elapsed_us / SEC_2_US;
    }

    /**
     * @brief clamp the input into [lower, upper]
     *
     * @param input the input number
     * @param lower the lower bound of the limitation
     * @param upper the upper bound of the limitation
     * @return uint32_t lower if input <= lower, else upper if input >= upper,
     *         else input (the lower bound is checked first)
     */
    inline uint32_t CompareLimit(uint32_t input, uint32_t lower, uint32_t upper) {
        if (input <= lower) {
            return lower;
        }
        if (input >= upper) {
            return upper;
        }
        return input;
    }

    /**
     * @brief get the ceil of the division
     *
     * @param a dividend
     * @param b divisor; must not be 0 (division by zero is not guarded)
     * @return uint32_t ceil(a / b)
     */
    inline uint32_t DivCeil(uint32_t a, uint32_t b) {
        const uint32_t quotient = a / b;
        return (a % b == 0) ? quotient : (quotient + 1);
    }

    /**
     * @brief print the binary buffer to stdout as one line of hex digits
     *
     * @param buffer the pointer to the buffer
     * @param buffer_size the size of the buffer
     */
    inline void PrintBinaryArray(const uint8_t* buffer, size_t buffer_size) {
        for (size_t i = 0; i < buffer_size; i++) {
            fprintf(stdout, "%02x", buffer[i]);
        }
        fprintf(stdout, "\n");
    }

    /**
     * @brief print a buffer to stdout, 16 bytes per row: offset, hex column
     *        and printable-ASCII column ('.' for everything else)
     *
     * @param data the pointer to the buffer
     * @param size the size of the buffer
     */
    inline void HexDump(const uint8_t* data, size_t size){
        const size_t kBytesPerRow = 16;
        for (size_t row = 0; row < size; row += kBytesPerRow) {
            printf("%#05zx: ", row);
            // hex column, with an extra gap after every 4th byte
            for (size_t col = 0; col < kBytesPerRow; col++) {
                if (row + col < size) {
                    printf("%02x", data[row + col]);
                } else {
                    printf(" ");
                }
                if (col % 4 == 3) {
                    printf(" ");
                }
            }
            printf(" | ");
            // character column
            for (size_t col = 0; col < kBytesPerRow; col++) {
                if (row + col >= size) {
                    printf(" ");
                    continue;
                }
                const uint8_t byte = data[row + col];
                if (byte >= 0x20 && byte <= 0x7e) {
                    printf("%c", byte);
                } else {
                    printf(".");
                }
            }
            printf("\n");
        }
    }

    /**
     * @brief write the native call stack of the caller to a file and to stdout
     *
     * @param filename the file that receives the stack trace
     * @param append append to the file instead of truncating it
     */
    inline void PrintStackTrace(const std::string& filename, bool append = false) {
        const int maxFrames = 128; // max number of frames in the stack trace
        void* frames[maxFrames];   // return addresses filled in by backtrace()

        const int frameCount = backtrace(frames, maxFrames);
        char** symbols = backtrace_symbols(frames, frameCount);
        if (symbols == nullptr) {
            fprintf(stderr, "Failed to get the backtrace symbols\n");
            return ;
        }

        std::ofstream outFile;
        outFile.open(filename, append ? std::ios::app : std::ios::out);
        if (!outFile.is_open()) {
            std::cerr << "Failed to open file " << filename << std::endl;
            free(symbols);
            return;
        }

        outFile << "Call stack:" << std::endl;
        for (int i = 0; i < frameCount; ++i) {
            outFile << symbols[i] << std::endl;
            std::cout << symbols[i] << std::endl;
        }
        outFile << std::endl;

        outFile.close();
        free(symbols);
    }

    /**
     * @brief check whether any frame of the native call stack mentions a text
     *
     * @param target the substring searched for in each symbol string
     * @return true if at least one frame's symbol string contains target
     */
    inline bool CheckStackTrace(const std::string& target) {
        const int maxFrames = 128;
        void* frames[maxFrames];

        const int frameCount = backtrace(frames, maxFrames);
        char** symbols = backtrace_symbols(frames, frameCount);
        if (symbols == nullptr) {
            return false;
        }

        bool found = false;
        const char* target_cstr = target.c_str();
        // Valid entries are symbols[0 .. frameCount-1]; start at the last one
        // (starting at frameCount would read one element past the array).
        for (int i = frameCount - 1; i >= 0; i--) {
            if (strstr(symbols[i], target_cstr) != nullptr) {
                found = true;
                break;
            }
        }
        free(symbols);
        return found;
    }

} // namespace tool
namespace tool {

    /**
     * @brief resident memory of the current process, taken from /proc/self/stat
     *
     * The executable name (field 2) is enclosed in parentheses and may itself
     * contain spaces, so parsing resumes after the last ')' instead of counting
     * whitespace-separated tokens from the start of the line.
     *
     * @return uint64_t rss pages multiplied by the page size in KiB, or 0 when
     *         the file cannot be read or parsed
     */
    inline uint64_t ProcessMemUsage() {
        std::ifstream stat_stream("/proc/self/stat", std::ios_base::in);
        std::string line;
        if (!stat_stream.is_open() || !std::getline(stat_stream, line)) {
            return 0;
        }

        const std::string::size_type commEnd = line.rfind(')');
        if (commEnd == std::string::npos) {
            return 0;
        }
        std::istringstream fields(line.substr(commEnd + 1));

        // Fields 3..22 (state .. starttime) are not needed.
        std::string skipped;
        for (int i = 0; i < 20; ++i) {
            fields >> skipped;
        }

        // the two fields that follow: vsize (23) and rss (24)
        unsigned long vsize = 0;
        long rss = 0;
        fields >> vsize >> rss;
        if (fields.fail() || rss < 0) {
            return 0;
        }

        const long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
        return static_cast<uint64_t>(rss) * static_cast<uint64_t>(page_size_kb); // only for PM
    }

    /**
     * @brief peak resident set size of the current process
     *
     * @return uint64_t ru_maxrss as reported by getrusage(RUSAGE_SELF), in the
     *         unit the platform uses for that field; 0 if the call fails
     */
    inline uint64_t GetMaxMemoryUsage() {
        struct rusage currentUsage = {};
        if (getrusage(RUSAGE_SELF, &currentUsage) != 0) {
            return 0;
        }
        return static_cast<uint64_t>(currentUsage.ru_maxrss);
    }

    /**
     * @brief generate a random (version 4) UUID string
     *
     * The generator is a function-local static shared by all callers and is
     * used without locking.
     *
     * @return std::string 36 characters: lowercase hex in 8-4-4-4-12 grouping
     */
    inline std::string GenerateUUID() {
        static std::random_device rd;
        static std::mt19937_64 gen(rd());
        static std::uniform_int_distribution<uint64_t> dis;

        // Draw 16 random bytes. They are copied with memcpy rather than written
        // through a uint64_t* cast of the byte array, which is not a valid way
        // to access it.
        const uint64_t words[2] = {dis(gen), dis(gen)};
        std::array<uint8_t, 16> data;
        static_assert(sizeof(words) == 16, "two 64-bit words fill the UUID");
        std::memcpy(data.data(), words, sizeof(words));

        // Set the version (4) and the variant bits.
        data[6] = (data[6] & 0x0F) | 0x40;
        data[8] = (data[8] & 0x3F) | 0x80;

        // Convert to a hexadecimal string with dashes before bytes 4, 6, 8, 10.
        static const char* hex_chars = "0123456789abcdef";
        std::string uuid;
        uuid.reserve(36);
        for (int i = 0; i < 16; ++i) {
            if (i == 4 || i == 6 || i == 8 || i == 10) {
                uuid.push_back('-');
            }
            uuid.push_back(hex_chars[data[i] >> 4]);
            uuid.push_back(hex_chars[data[i] & 0x0F]);
        }
        return uuid;
    }

    /**
     * @brief check whether a path names an existing regular file
     *
     * Uses the non-throwing overload, so a path that cannot be inspected
     * yields false instead of an exception.
     *
     * @param filePath the path to check
     * @return true only if the path exists and is a regular file
     */
    inline bool FileExist(std::string filePath) {
        std::error_code ec;
        return std::filesystem::is_regular_file(filePath, ec);
    }

} // namespace tool
namespace tool {

    /**
     * @brief write the textual IP address of a socket address into a buffer
     *
     * The output is always a valid C string when a non-empty buffer is given:
     * it is empty for a null address, an address family other than
     * AF_INET / AF_INET6, or when the conversion fails (for example because
     * the buffer is too small).
     *
     * @param sock_addr the socket address to read
     * @param ip_str the output buffer; nothing happens if it is NULL
     * @param max_size the size of the output buffer in bytes
     */
    inline void GetIpStrFromSockaddr(const struct sockaddr_storage *sock_addr,
        char *ip_str, size_t max_size) {
        if (!ip_str || max_size == 0) {
            return;
        }
        ip_str[0] = '\0';
        if (!sock_addr) {
            return;
        }

        const void *raw_addr = nullptr;
        if (sock_addr->ss_family == AF_INET) {
            raw_addr = &(reinterpret_cast<const struct sockaddr_in*>(sock_addr)->sin_addr);
        }
        else if (sock_addr->ss_family == AF_INET6) {
            raw_addr = &(reinterpret_cast<const struct sockaddr_in6*>(sock_addr)->sin6_addr);
        } else {
            return; // unsupported family: leave the empty string
        }

        // On failure inet_ntop leaves the buffer unspecified, so reset it.
        if (inet_ntop(sock_addr->ss_family, raw_addr, ip_str,
                      static_cast<socklen_t>(max_size)) == nullptr) {
            ip_str[0] = '\0';
        }
    }

    /**
     * @brief write the port of a socket address, in decimal, into a buffer
     *
     * The output is empty for a null address or an address family other than
     * AF_INET / AF_INET6.
     *
     * @param sock_addr the socket address to read
     * @param port_str the output buffer; nothing happens if it is NULL
     * @param max_size the size of the output buffer in bytes
     */
    inline void GetPortStrFromSockaddr(const struct sockaddr_storage *sock_addr,
        char *port_str, size_t max_size) {
        if (!port_str || max_size == 0) {
            return; // no usable output buffer
        }
        if (!sock_addr) {
            port_str[0] = '\0';
            return;
        }

        if (sock_addr->ss_family == AF_INET) {
            const struct sockaddr_in *addr_in = reinterpret_cast<const struct sockaddr_in*>(sock_addr);
            snprintf(port_str, max_size, "%d", ntohs(addr_in->sin_port));
        }
        else if (sock_addr->ss_family == AF_INET6) {
            const struct sockaddr_in6 *addr_in6 = reinterpret_cast<const struct sockaddr_in6*>(sock_addr);
            snprintf(port_str, max_size, "%d", ntohs(addr_in6->sin6_port));
        } else {
            port_str[0] = '\0'; // unsupported address family: empty string
        }
    }

} // namespace tool
htons(server_port); + break; + default: + fprintf(stderr, "Invalid address family"); + break; + } + } + + inline bool ReadSocketMessage(int sock, uint8_t* buffer, size_t buffer_size) { + if (sock < 0 || !buffer) { + tool::Logging(LOG_ERROR, "ReadSocketMessage", "Invalid socket or buffer\n"); + return false; + } + size_t bytesRead = 0; + while (bytesRead < buffer_size) { + int ret = read(sock, buffer + bytesRead, buffer_size - bytesRead); + if (ret < 0) { + tool::Logging(LOG_ERROR, "ReadSocketMessage", "Failed to read from socket\n"); + return false; + } else if (ret == 0) { + tool::Logging(LOG_ERROR, "ReadSocketMessage", "Socket closed\n"); + return false; + } + bytesRead += ret; + } + return true; + } + +} // namespace tool +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/common/configure.cc b/GPU-Virtual-Service/gpu-remoting/src/common/configure.cc new file mode 100644 index 0000000..8318a6e --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/common/configure.cc @@ -0,0 +1,43 @@ +#include "../../include/configure.h" + +Configure::~Configure() { +} + +Configure::Configure(std::string path, bool isClient) { + isClient_ = isClient; + this->ReadConf(path); +} + +void Configure::ReadConf(std::string path) { + using namespace boost; + using namespace boost::property_tree; + + ptree root; + read_json(path, root); + + serverIp_ = root.get("ServerConfig.serverIp_"); + serverPort_ = root.get("ServerConfig.serverPort_"); + + dpcIp_ = root.get("DispatcherConfig.dpcIp_"); + dpcPort_ = root.get("DispatcherConfig.dpcPort_"); + + monIp_ = root.get("MonitorConfig.monitorIp_"); + monPort_ = root.get("MonitorConfig.monitorPort_"); + + if (isClient_) { + const char* envClientID = std::getenv("FLEXGV_CLIENT_ID"); + const char* envPriority = std::getenv("FLEXGV_PRIORITY"); + const char* envReqGPUnum = std::getenv("FLEXGV_REQ_NUM"); + const char* envModel = std::getenv("FLEXGV_MODEL"); + const char* envBatchSize = 
std::getenv("FLEXGV_BATCH_SIZE"); + clientID_ = envClientID ? std::stoull(envClientID) : root.get("ClientConfig.clientID_"); + reqGPUnum_ = root.get("ClientConfig.requestGPUnum_"); + priority_ = envPriority ? std::stoull(envPriority) : root.get("ClientConfig.priority_"); + proxyIp_ = root.get("ClientConfig.proxyIp_"); + proxyPort_ = root.get("ClientConfig.proxyPort_"); + DDPreqGPUnum_ = envReqGPUnum ? std::stoull(envReqGPUnum) : 1; + model_ = envModel ? envModel : "resnet18"; + batchSize_ = envBatchSize ? std::stoull(envBatchSize) : 32; + } + return ; +} diff --git a/GPU-Virtual-Service/gpu-remoting/src/common/elfHandle.cc b/GPU-Virtual-Service/gpu-remoting/src/common/elfHandle.cc new file mode 100644 index 0000000..ca06d2b --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/common/elfHandle.cc @@ -0,0 +1,732 @@ +#include "../../include/hook/elfHandle.h" + +static const char* myName = "elfHandle"; + +int InitElf2(void){ + if (elf_version(EV_CURRENT) == EV_NONE) { + tool::Logging(LOG_ERROR, myName, "ELF library initialization failed: %s\n", elf_errmsg(-1)); + return -1; + } + return 0; +} + +static int GetStrByElfFlag(char** str, uint64_t flag) +{ + return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s", + (flag & FATBIN_FLAG_64BIT) ? "yes" : "no", + (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no", + (flag & FATBIN_FLAG_LINUX) ? "yes" : "no", + (flag & FATBIN_FLAG_COMPRESS) ? 
"yes" : "no"); +} + +static void PrintFatTextHeader(FatTextHeader_t *th) +{ + char* flagstr = NULL; + GetStrByElfFlag(&flagstr, th->flags); + + tool::Logging(LOG_REGS, myName, "text_header: fatbin_kind: %#x, header_size %#x, size %#zx, compressed_size %#x,\ + minor %#x, major %#x, arch %d, decompressed_size %#zx\n\tflags: %s\n", + th->kind, + th->header_size, + th->size, + th->compressed_size, + th->minor, + th->major, + th->arch, + th->decompressed_size, + flagstr); + tool::Logging(LOG_REGS, myName, "\tunknown fields: unknown1: %#x, unknown2: %#x, zeros: %#zx\n", + th->unknown1, + th->unknown2, + th->zero); + + free(flagstr); +} + +/** Check the header of a fatbin + * Performs some integrity checks and returns the elf header + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + * @param compressed_data Pointer to a variable that will be set to point to the compressed data +*/ +static int GetFatElfHeader(const uint8_t* fatbin_data, size_t fatbin_size, FatElfHeader_t **elf_header) +{ + FatElfHeader_t *eh = NULL; + + if (fatbin_data == NULL || elf_header == NULL) { + tool::Logging(LOG_ERROR, myName, "fatbin_data is NULL\n"); + return 1; + } + + // if (fatbin_size < sizeof(struct fat_elf_header)) { + // tool::Logging(LOG_ERROR, myName, "fatbin_size is too small"); + // return 1; + // } + + eh = (FatElfHeader_t*) fatbin_data; + if (eh->magic != FATBIN_TEXT_MAGIC) { + tool::Logging(LOG_ERROR, myName, "Invalid magic number: expected %#x but got %#x\n", FATBIN_TEXT_MAGIC, eh->magic); + return 1; + } + + if (eh->version != 1 || eh->header_size != sizeof(FatElfHeader_t)) { + tool::Logging(LOG_ERROR, myName, "fatbin text version is wrong or header size is inconsistent.\ + This is a sanity check to avoid reading a new fatbinary format\n"); + return 1; + } + + *elf_header = eh; + return 0; +} + +/** Check the text header of a 
fatbin + * Performs some integrity checks and returns the text header + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + * @param compressed_data Pointer to a variable that will be set to point to the compressed data +*/ +static int GetFatTextHeader(const uint8_t* fatbin_data, size_t fatbin_size, FatTextHeader_t **text_header) +{ + FatTextHeader_t *th = NULL; + + if (fatbin_data == NULL || text_header == NULL) { + tool::Logging(LOG_ERROR, myName, "fatbin_data is NULL\n"); + return 1; + } + + // if (fatbin_size < sizeof(struct fat_text_header)) { + // tool::Logging(LOG_ERROR, myName, "fatbin_size is too small"); + // return 1; + // } + + th = (FatTextHeader_t*)fatbin_data; + + if(th->obj_name_offset != 0) { + if (((char*)th)[th->obj_name_offset + th->obj_name_len] != '\0') { + tool::Logging(LOG_REGS, myName, "Fatbin object name is not null terminated\n"); + } else { + char *obj_name = (char*)th + th->obj_name_offset; + tool::Logging(LOG_REGS, myName, "Fatbin object name: %s (len:%#x)\n", obj_name, th->obj_name_len); + } + } + + *text_header = th; + return 0; +} + +/** Decompresses a fatbin file + * @param input Pointer compressed input data + * @param input_size Size of compressed data + * @param output preallocated memory where decompressed output should be stored + * @param output_size size of output buffer. 
Should be equal to the size of the decompressed data + */ +static size_t DecompressFatbin(const uint8_t* input, size_t input_size, uint8_t* output, size_t output_size) +{ + size_t ipos = 0, opos = 0; + uint64_t next_nclen; // length of next non-compressed segment + uint64_t next_clen; // length of next compressed segment + uint64_t back_offset; // negative offset where redudant data is located, relative to current opos + + while (ipos < input_size) { + next_nclen = (input[ipos] & 0xf0) >> 4; + next_clen = 4 + (input[ipos] & 0xf); + if (next_nclen == 0xf) { + do { + next_nclen += input[++ipos]; + } while (input[ipos] == 0xff); + } + + if (memcpy(output + opos, input + (++ipos), next_nclen) == NULL) { + tool::Logging(LOG_ERROR, myName, "copying data\n"); + return 0; + } +#ifdef FATBIN_DECOMPRESS_DEBUG + printf("%#04zx nocompress (len:%#x):\n", opos, next_nclen); + tool::HexDump(output + opos, next_nclen); +#endif + ipos += next_nclen; + opos += next_nclen; + if (ipos >= input_size || opos >= output_size) { + break; + } + back_offset = input[ipos] + (input[ipos + 1] << 8); + ipos += 2; + if (next_clen == 0xf+4) { + do { + next_clen += input[ipos++]; + } while (input[ipos - 1] == 0xff); + } +#ifdef FATBIN_DECOMPRESS_DEBUG + printf("%#04zx compress (decompressed len: %#x, back_offset %#x):\n", opos, next_clen, back_offset); +#endif + if (next_clen <= back_offset) { + if (memcpy(output + opos, output + opos - back_offset, next_clen) == NULL) { + tool::Logging(LOG_ERROR, myName, "Error copying data\n"); + return 0; + } + } else { + if (memcpy(output + opos, output + opos - back_offset, back_offset) == NULL) { + tool::Logging(LOG_ERROR, myName, "Error copying data\n"); + return 0; + } + for (size_t i = back_offset; i < next_clen; i++) { + output[opos + i] = output[opos + i - back_offset]; + } + } +#ifdef FATBIN_DECOMPRESS_DEBUG + tool::HexDump(output + opos, next_clen); +#endif + opos += next_clen; + } + tool::Logging(LOG_REGS, myName, "ipos: %#zx, opos: %#zx, ilen: %#zx, 
olen: %#zx\n", ipos, opos, input_size, output_size); + return opos; +} + + +static ssize_t DecompressSingleSection(const uint8_t *input, uint8_t **output, size_t *output_size, + FatElfHeader_t *eh, FatTextHeader_t *th) +{ + size_t padding; + size_t input_read = 0; + size_t output_written = 0; + size_t decompress_ret = 0; + const uint8_t zeroes[8] = {0}; + + if (input == NULL || output == NULL || eh == NULL || th == NULL) { + tool::Logging(LOG_ERROR, myName, "invalid parameters\n"); + return 1; + } + + // add max padding of 7 bytes + if ((*output = (uint8_t*)malloc(th->decompressed_size + 7)) == NULL) { + tool::Logging(LOG_ERROR, myName, "Error allocating memory of size %#zx for output buffer: %s\n", + th->decompressed_size, strerror(errno)); + goto error; + } + PrintFatTextHeader(th); + + if ((decompress_ret = DecompressFatbin(input, th->compressed_size, *output, th->decompressed_size)) != th->decompressed_size) { + tool::Logging(LOG_ERROR, myName, "Decompression failed: decompressed size is %#zx, but header says %#zx\n", + decompress_ret, th->decompressed_size); + tool::Logging(LOG_ERROR, myName, "input pos: %#zx, output pos: %#zx\n", input - (uint8_t*)eh, *output); + tool::HexDump(input, 0x160); + if (decompress_ret >= 0x60) + tool::HexDump((*output) + decompress_ret - 0x60, 0x60); + goto error; + } + input_read += th->compressed_size; + output_written += th->decompressed_size; + + padding = ((8 - (size_t)(input + input_read)) % 8); + if (memcmp(input + input_read, zeroes, padding) != 0) { + tool::Logging(LOG_ERROR, myName, "expected %#zx zero bytes, got:\n", padding); + tool::HexDump(input + input_read, 0x60); + goto error; + } + input_read += padding; + + padding = ((8 - (size_t)th->decompressed_size) % 8); + // Because we always allocated enough memory for one more elf_header and this is smaller than + // the maximal padding of 7, we do not have to reallocate here. 
+ memset(*output, 0, padding); + output_written += padding; + + *output_size = output_written; + return input_read; + error: + free(*output); + *output = NULL; + return -1; +} + +int GetFatbinInfo(FatHeader_t *fatbin, std::vector *kernel_list, uint8_t** fatbin_mem, size_t* fatbin_size) { + FatElfHeader_t* eh; + FatTextHeader_t* th; + const uint8_t *input_pos = NULL; + const uint8_t *fatbin_data = NULL; + uint8_t *text_data = NULL; + size_t text_data_size = 0; + size_t fatbin_total_size = 0; + int ret = -1; + if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) { + tool::Logging(LOG_ERROR, myName, "at least one parameter is NULL\n"); + goto error; + } + fatbin_data = input_pos = (const uint8_t*)fatbin->text; + if (fatbin->magic != FATBIN_STRUCT_MAGIC) { + tool::Logging(LOG_ERROR, myName, "fatbin struct magic number is wrong. Got %llx, expected %llx.\n", fatbin->magic, FATBIN_STRUCT_MAGIC); + goto error; + } + tool::Logging(LOG_REGS, myName, "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx\n", + fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero); + + if (GetFatElfHeader((uint8_t*)fatbin->text, sizeof(FatElfHeader_t), &eh) != 0) { + tool::Logging(LOG_ERROR, myName, "Something went wrong while checking the elf header.\n"); + goto error; + } + // tool::Logging(LOG_REGS, myName, "elf header: magic: %#x, version: %#x, header_size: %#x, size: %#zx", + // eh->magic, eh->version, eh->header_size, eh->size); + + input_pos += eh->header_size; + fatbin_total_size = eh->header_size + eh->size; + do { + if (GetFatTextHeader(input_pos, *fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) { + tool::Logging(LOG_ERROR, myName, "Something went wrong while checking the text header.\n"); + goto error; + } + //print_header(th); + input_pos += th->header_size; + if (th->kind != 2) { // section does not cotain device code (but e.g. 
PTX) + input_pos += th->size; + continue; + } + if (th->flags & FATBIN_FLAG_DEBUG) { + tool::Logging(LOG_REGS, myName, "fatbin contains debug information.\n"); + } + + if (th->flags & FATBIN_FLAG_COMPRESS) { + ssize_t input_read; + + tool::Logging(LOG_REGS, myName, "fatbin contains compressed device code. Decompressing...\n"); + if ((input_read = DecompressSingleSection(input_pos, &text_data, &text_data_size, eh, th)) < 0) { + tool::Logging(LOG_ERROR, myName, "Something went wrong while decompressing text section.\n"); + goto error; + } + input_pos += input_read; + //hexdump(text_data, text_data_size); + } else { + text_data = (uint8_t*)input_pos; + text_data_size = th->size; + input_pos += th->size; + } + // print_header(th); + if (GetParameterInfo(kernel_list, text_data , text_data_size) != 0) { + tool::Logging(LOG_ERROR, myName, "error getting parameter info\n"); + goto error; + } + if (th->flags & FATBIN_FLAG_COMPRESS) { + free(text_data); + } + } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size); + + // if (get_elf_header((uint8_t*)fatbin->text2, sizeof(struct fat_elf_header), &eh) != 0) { + // tool::Logging(LOG_ERROR, myName, "Something went wrong while checking the header."); + // goto error; + // } + // fatbin_total_size += eh->header_size + eh->size; + + *fatbin_mem = (uint8_t*)fatbin->text; + *fatbin_size = fatbin_total_size; + ret = 0; + error: + return ret; +} + +static int GetSectionByName(Elf *elf, const char *name, Elf_Scn **section) +{ + Elf_Scn *scn = NULL; + GElf_Shdr shdr; + char *section_name = NULL; + size_t str_section_index; + + if (elf == NULL || name == NULL || section == NULL) { + tool::Logging(LOG_ERROR, myName, "invalid argument\n"); + return -1; + } + + if (elf_getshdrstrndx(elf, &str_section_index) != 0) { + tool::Logging(LOG_ERROR, myName, "elf_getshstrndx failed\n"); + return -1; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + if (gelf_getshdr(scn, &shdr) != &shdr) { + tool::Logging(LOG_ERROR, myName, 
"gelf_getshdr failed\n"); + return -1; + } + if ((section_name = elf_strptr(elf, str_section_index, shdr.sh_name)) == NULL) { + tool::Logging(LOG_ERROR, myName, "elf_strptr failed\n"); + return -1; + } + if (strcmp(section_name, name) == 0) { + *section = scn; + return 0; + } + } + return -1; +} + +static char* GetKernelSectionFromKernelName(const char *kernel_name) +{ + char *section_name = NULL; + if (kernel_name == NULL) { + tool::Logging(LOG_ERROR, myName, "invalid argument\n"); + return NULL; + } + + if (kernel_name[0] == '$') { + const char *p; + if ((p = strchr(kernel_name+1, '$')) == NULL) { + tool::Logging(LOG_ERROR, myName, "invalid kernel name\n"); + return NULL; + } + int len = (p - kernel_name) - 1; + if (asprintf(§ion_name, ".nv.info.%.*s", len, kernel_name+1) == -1) { + tool::Logging(LOG_ERROR, myName, "asprintf failed\n"); + return NULL; + } + } else { + if (asprintf(§ion_name, ".nv.info.%s", kernel_name) == -1) { + tool::Logging(LOG_ERROR, myName, "asprintf failed\n"); + return NULL; + } + } + return section_name; +} + +static int GetParaForKernel(Elf *elf, KernelInfo_t *kernel, void* memory, size_t memsize) +{ + struct __attribute__((__packed__)) nv_info_kernel_entry { + uint8_t format; + uint8_t attribute; + uint16_t values_size; + uint32_t values; + }; + struct __attribute__((__packed__)) nv_info_kparam_info { + uint32_t index; + uint16_t ordinal; + uint16_t offset; + uint16_t unknown : 12; + uint8_t cbank : 6; + uint16_t size : 14; + // missing are "space" (possible padding info?), and "Pointee's logAlignment" + // these were always 0 in the kernels I tested + }; + int ret = -1; + char *section_name = NULL; + Elf_Scn *section = NULL; + Elf_Data *data = NULL; + size_t secpos=0; + int i=0; + + if (kernel == NULL || kernel->name == NULL || memory == NULL) { + tool::Logging(LOG_ERROR, myName, "at least one parameter is NULL\n"); + goto cleanup; + } + kernel->paramNum = 0; + kernel->paramSize = 0; + kernel->paramOffsets = NULL; + kernel->paramSizes 
= NULL; + + if ((section_name = GetKernelSectionFromKernelName(kernel->name)) == NULL) { + tool::Logging(LOG_ERROR, myName, "GetKernelSectionFromKernelName failed\n"); + goto cleanup; + } + + if (GetSectionByName(elf, section_name, §ion) != 0) { + tool::Logging(LOG_ERROR, myName, "section %s not found\n", section_name); + goto cleanup; + } + + if ((data = elf_getdata(section, NULL)) == NULL) { + tool::Logging(LOG_ERROR, myName, "error getting section data\n"); + goto cleanup; + } + + while (secpos < data->d_size) { + struct nv_info_kernel_entry *entry = (struct nv_info_kernel_entry*)((uint8_t*)data->d_buf+secpos); + // printf("entry %d: format: %#x, attr: %#x, ", i++, entry->format, entry->attribute); + if (entry->format == EIFMT_SVAL && entry->attribute == EIATTR_KPARAM_INFO) { + if (entry->values_size != 0xc) { + tool::Logging(LOG_ERROR, myName, "EIATTR_KPARAM_INFO values size has not the expected value of 0xc\n"); + goto cleanup; + } + struct nv_info_kparam_info *kparam = (struct nv_info_kparam_info*)&entry->values; + // printf("kparam: index: %#x, ordinal: %#x, offset: %#x, unknown: %#0x, cbank: %#0x, size: %#0x\n", + // kparam->index, kparam->ordinal, kparam->offset, kparam->unknown, kparam->cbank, kparam->size); + tool::Logging(LOG_REGS, myName, "param %d: offset: %#x, size: %#x\n", kparam->ordinal, kparam->offset, kparam->size); + if (kparam->ordinal >= kernel->paramNum) { + kernel->paramOffsets = (uint16_t*)realloc(kernel->paramOffsets, + (kparam->ordinal+1)*sizeof(uint16_t)); + kernel->paramSizes = (uint16_t*)realloc(kernel->paramSizes, + (kparam->ordinal+1)*sizeof(uint16_t)); + kernel->paramNum = kparam->ordinal+1; + } + kernel->paramOffsets[kparam->ordinal] = kparam->offset; + kernel->paramSizes[kparam->ordinal] = kparam->size; + secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4; + } else if (entry->format == EIFMT_HVAL && entry->attribute == EIATTR_CBANK_PARAM_SIZE) { + kernel->paramSize = entry->values_size; + tool::Logging(LOG_REGS, 
myName, "cbank_param_size: %#0x\n", entry->values_size); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else if (entry->format == EIFMT_HVAL) { + // printf("hval: %#x(%d)\n", entry->values_size, entry->values_size); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else if (entry->format == EIFMT_SVAL) { + // printf("sval_size: %#x ", entry->values_size); + // for (int j=0; j*sizeof(uint32_t) < entry->values_size; j++) { + // printf("val%d: %#x(%d) ", j, (&entry->values)[j], (&entry->values)[j]); + // } + // printf("\n"); + secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4; + } else if (entry->format == EIFMT_NVAL) { + // printf("nval\n"); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else { + tool::Logging(LOG_REGS, myName, "unknown format: %#x\n", entry->format); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } + } + // printf("remaining: %d\n", data->d_size % sizeof(struct nv_info_kernel_entry)); + ret = 0; + cleanup: + free(section_name); + return ret; +} + +static int GetSymbolTable(Elf *elf, Elf_Data **symbol_table_data, size_t *symbol_table_size, GElf_Shdr *symbol_table_shdr) +{ + GElf_Shdr shdr; + Elf_Scn *section = NULL; + + if (elf == NULL || symbol_table_data == NULL || symbol_table_size == NULL) { + tool::Logging(LOG_ERROR, myName, "invalid argument\n"); + return -1; + } + + if (GetSectionByName(elf, ".symtab", §ion) != 0) { + tool::Logging(LOG_ERROR, myName, "could not find .symtab section\n"); + return -1; + } + + if (gelf_getshdr(section, &shdr) == NULL) { + tool::Logging(LOG_ERROR, myName, "gelf_getshdr failed\n"); + return -1; + } + + if (symbol_table_shdr != NULL) { + *symbol_table_shdr = shdr; + } + + if(shdr.sh_type != SHT_SYMTAB) { + tool::Logging(LOG_ERROR, myName, "not a symbol table: %d\n", shdr.sh_type); + return -1; + } + + if ((*symbol_table_data = elf_getdata(section, NULL)) == NULL) { + tool::Logging(LOG_ERROR, myName, "elf_getdata failed\n"); + return -1; + } + + *symbol_table_size = 
shdr.sh_size / shdr.sh_entsize; + + return 0; +} + +static int CheckElf(Elf *elf) +{ + Elf_Kind ek; + GElf_Ehdr ehdr; + + int elfclass; + char *id; + size_t program_header_num; + size_t sections_num; + size_t section_str_num; + int ret = -1; + + if ((ek = elf_kind(elf)) != ELF_K_ELF) { + tool::Logging(LOG_ERROR, myName, "elf_kind is not ELF_K_ELF, but %d\n", ek); + goto cleanup; + } + + if (gelf_getehdr(elf, &ehdr) == NULL) { + tool::Logging(LOG_ERROR, myName, "gelf_getehdr failed\n"); + goto cleanup; + } + + if ((elfclass = gelf_getclass(elf)) == ELFCLASSNONE) { + tool::Logging(LOG_ERROR, myName, "gelf_getclass failed\n"); + goto cleanup; + } + + if ((id = elf_getident(elf, NULL)) == NULL) { + tool::Logging(LOG_ERROR, myName, "elf_getident failed\n"); + goto cleanup; + } + + tool::Logging(LOG_REGS, myName, "elfclass: %d-bit; elf ident[0..%d]: %7s\n", + (elfclass == ELFCLASS32) ? 32 : 64, + EI_ABIVERSION, id); + + if (elf_getshdrnum(elf, §ions_num) != 0) { + tool::Logging(LOG_ERROR, myName, "elf_getphdrnum failed\n"); + goto cleanup; + } + + if (elf_getphdrnum(elf, &program_header_num) != 0) { + tool::Logging(LOG_ERROR, myName, "elf_getshdrnum failed\n"); + goto cleanup; + } + + if (elf_getshdrstrndx(elf, §ion_str_num) != 0) { + tool::Logging(LOG_ERROR, myName, "elf_getshstrndx Wfailed\n"); + goto cleanup; + } + + tool::Logging(LOG_REGS, myName, "elf contains %d sections, %d program_headers, string table section: %d\n", + sections_num, program_header_num, section_str_num); + + ret = 0; +cleanup: + return ret; +} + +int GetParameterInfo(std::vector *kernel_list, void* memory, size_t memsize){ + struct __attribute__((__packed__)) nv_info_entry{ + uint8_t format; + uint8_t attribute; + uint16_t values_size; + uint32_t kernel_id; + uint32_t value; + }; + + Elf *elf = NULL; + Elf_Scn *section = NULL; + Elf_Data *data = NULL, *symbol_table_data = NULL; + GElf_Shdr symtab_shdr; + size_t symnum; + int i = 0; + GElf_Sym sym; + + int ret = -1; + KernelInfo_t *ki = NULL; + 
const char *kernel_str; + + if (memory == NULL || memsize == 0) { + tool::Logging(LOG_ERROR, myName, "memory was NULL or memsize was 0\n"); + return -1; + } + +// #define ELF_DUMP_TO_FILE 1 + +// #ifdef ELF_DUMP_TO_FILE + // FILE* fd2 = fopen("flexgv-elf-dump", "wb"); + // fwrite(memory, memsize, 1, fd2); + // fclose(fd2); +// #endif + + if ((elf = elf_memory((char*)memory, memsize)) == NULL) { + tool::Logging(LOG_ERROR, myName, "elf_memory failed\n"); + goto cleanup; + } + + if (CheckElf(elf) != 0) { + tool::Logging(LOG_ERROR, myName, "check_elf failed\n"); + goto cleanup; + } + + if (GetSymbolTable(elf, &symbol_table_data, &symnum, &symtab_shdr) != 0) { + tool::Logging(LOG_ERROR, myName, "could not get symbol table\n"); + goto cleanup; + } + + if (GetSectionByName(elf, ".nv.info", §ion) != 0) { + tool::Logging(LOG_REGS, myName, "could not find .nv.info section. This means this binary does not contain any kernels.\n"); + ret = 0; // This is not an error. + goto cleanup; + } + + if ((data = elf_getdata(section, NULL)) == NULL) { + tool::Logging(LOG_ERROR, myName, "elf_getdata failed\n"); + goto cleanup; + } + + for (size_t secpos=0; secpos < data->d_size; secpos += sizeof(struct nv_info_entry)) { + struct nv_info_entry *entry = (struct nv_info_entry *)((uint8_t*)data->d_buf+secpos); + // tool::Logging(LOG_REGS, myName, "%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)", + // i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id, + // entry->value, entry->value); + + if (entry->values_size != 8) { + tool::Logging(LOG_ERROR, myName, "unexpected values_size: %#x\n", entry->values_size); + continue; + } + + if (entry->attribute != EIATTR_FRAME_SIZE) { + continue; + } + + if (entry->kernel_id >= symnum) { + tool::Logging(LOG_ERROR, myName, "kernel_id out of bounds: %#x\n", entry->kernel_id); + continue; + } + + if (gelf_getsym(symbol_table_data, entry->kernel_id, &sym) == NULL) { + tool::Logging(LOG_ERROR, myName, "gelf_getsym 
failed for entry %d\n", entry->kernel_id); + continue; + } + + if ((kernel_str = elf_strptr(elf, symtab_shdr.sh_link, sym.st_name) ) == NULL) { + tool::Logging(LOG_ERROR, myName, "strptr failed for entry %d\n", entry->kernel_id); + continue; + } + + /* When using (some?) intrinsics, nvcc adds symbols for them in the .nv.info table. + * They are prefixed with $__internal_7_$ and are not kernels. We skip them he + */ + const char *intrinsics_prefix = "$__internal_"; + if (strncmp(kernel_str, intrinsics_prefix, strlen(intrinsics_prefix)) == 0) { + continue; + } + + if (GetKernelInfoByKernelName(kernel_list, kernel_str) != NULL) { + continue; + } + + tool::Logging(LOG_REGS, myName, "found new kernel: %s (symbol table id: %#x)\n", kernel_str, entry->kernel_id); + + ki = (KernelInfo_t*)malloc(sizeof(KernelInfo_t)); + kernel_list->push_back(ki); + + size_t buflen = strlen(kernel_str)+1; + if ((ki->name = (char*)malloc(buflen)) == NULL) { + tool::Logging(LOG_ERROR, myName, "malloc failed\n"); + goto cleanup; + } + if (strncpy(ki->name, kernel_str, buflen) != ki->name) { + tool::Logging(LOG_ERROR, myName, "strncpy failed\n"); + goto cleanup; + } + + if (GetParaForKernel(elf, ki, memory, memsize) != 0) { + tool::Logging(LOG_ERROR, myName, "GetParaForKernel failed for kernel %s\n", kernel_str); + goto cleanup; + } + } + + ret = 0; + cleanup: + if (elf != NULL) { + elf_end(elf); + } + return ret; +} + +KernelInfo_t* GetKernelInfoByKernelName(std::vector *kernel_list, const char* kernelName) { + if (kernel_list == NULL) { + tool::Logging(LOG_ERROR, myName, "kernelMap is NULL\n"); + return NULL; + } + if (kernel_list->empty()) { + return NULL; + } + for (auto ki = kernel_list->begin(); ki != kernel_list->end(); ki++) { + if (strcmp((*ki)->name, kernelName) == 0) { + return *ki; + } + } + return NULL; +} \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/.gitmodules b/GPU-Virtual-Service/xpu-pool-service/.gitmodules deleted file mode 100644 index 
2e03ba5..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/.gitmodules +++ /dev/null @@ -1,7 +0,0 @@ -[submodule "third_party/kubernetes"] - path = third_party/kubernetes - url = https://szv-open.codehub.huawei.com/OpenSourceCenter/kubernetes/kubernetes.git - -[submodule "third_party/runtime"] - path = third_party/runtime - url = https://szv-open.codehub.huawei.com/OpenBaize/Ascend/runtime.git \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/VersionSet.xml b/GPU-Virtual-Service/xpu-pool-service/ci/VersionSet.xml deleted file mode 100644 index 4ba8653..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/VersionSet.xml +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/app_define.json b/GPU-Virtual-Service/xpu-pool-service/ci/app_define.json deleted file mode 100644 index 6626ee2..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/app_define.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "fileVersion": "1", - "name": "XPUPoolService", - "serviceId": "b6a16627d7cd405697786962a38457d5", - "description": "", - "version": "1.0.0", - "type": "microService", - "processes": { - "XPUPoolService": { - "subscribes": [] - } - } - } \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.sh b/GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.sh deleted file mode 100644 index 22440bc..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# Copyright (C) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. 
-set -e - -function arch_config() { - arch=$(uname -m) - if [[ ${arch} == "x86_64" ]]; then - platform="x86" - elif [[ ${arch} == "aarch64" ]]; then - platform="arm" - else - echo "incorrect arch mode" - exit 1 - fi -} - -build_version=$(cat buildInfo.properties | sed -n 's/.*=//p') -echo ${build_version} - -# get product from CI pipeline for this project: -artget pull 56e9abca9a9045a98c283fd0cc958ffc ${build_version} -ca snapshot -at cloudartifact -ap deploy - -ssh -o "StrictHostKeyChecking no" ${execute_environment} "rm -rf /data/ci/at" -cd ${WORKSPACE}/${branch}/test -scp -r at ${execute_environment}:/data/ci/ - -arch_config -upload_arch=$(echo ${arch} | sed 's/_/-/g') -cd ${WORKSPACE}/deploy/software -scp ${upload_version}_${upload_arch}.zip ${execute_environment}:/data/ci/at/ -ssh ${execute_environment} "cd /data/ci/at && sh runtest.sh --artifact ${upload_version}_${upload_arch}.zip" \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.yml b/GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.yml deleted file mode 100644 index c8df6a8..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/at/at_deploy.yml +++ /dev/null @@ -1,53 +0,0 @@ ---- -version: 2.0 - -buildspace: - fixed: true - path: /usr1/workspace - -envs: - - condition: env_type == 'docker' - resource: - type: docker - image: ${image_name_x86} - resource_class: 16U32G - mode: toolbox - - condition: env_type == "vpc" - resource: - type: docker - image: ${image_name_x86} - pool: ${img_pool_x86} - resource_class: 16U32G - -buildspace: - fixed: true - path: /usr1/workspace - -params: - - name: product - value: cloudbuild2.0 - - name: CB_AUTO_CHECK_VERSION - value: 2.0 - - name: CB_META_ENABLE_SWBOM - value: true - - name: CB_META_ENABLE_FILE_SWBOM - value: true - - name: CB_META_CMC_DEPENDENCY_V2 - value: true - -steps: - PRE_BUILD: - - checkout: - path: ${branch} - BUILD: - - build_execute: - command: | - sh ${branch}/ci/buildinfo.sh - sh 
${branch}/ci/at/at_deploy.sh - accelerate: false - enhance: - - feature: md5_source_tracement - build_tools: [maven] - check: - auto: true - mode: sync \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/build.sh b/GPU-Virtual-Service/xpu-pool-service/ci/build.sh deleted file mode 100644 index 60273c5..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/build.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -# Copyright (C) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. -set -e - -WORK_DIR=$(cd $(dirname $0); pwd) -DEST_DIR=$WORK_DIR/../xpu_pool/xpu_docker_build/ - -function prepare() { - mkdir -p ${DEST_DIR}/cuda_client/GPU_client/ - mkdir -p ${DEST_DIR}/acl_client/NPU_client/ - mkdir -p ${WORK_DIR}/../XPU_symbols/ -} - -function handle_spdlog() { - mkdir -m 750 -p /usr/local/include - cd third_party/spdlog - cp -P --remove-destination -rf include/spdlog /usr/local/include - chmod 750 -R /usr/local/include - cd ${WORK_DIR} -} - -function compile_client() { - cd ${WORK_DIR} && rm -rf build && mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release ../../ && make -j -} - -function strip_gotest_codes() { - if [ ! -d "$1" ]; then - echo "Error: Directory '$1' does not exist." - return - fi - cd "$1" - find . 
-name *_test.go | xargs rm -rf - sed -i '/gomonkey/d' go.mod - go mod tidy -} - -function compile_device_plugin() { - # strip gomoney related codes to make SwInfoTree happy - strip_gotest_codes "${WORK_DIR}/../GPU-device-plugin/" - cd ${WORK_DIR}/../GPU-device-plugin/ && make -j -} - -function compile_xpu_exporter() { - # strip gomoney related codes to make SwInfoTree happy - strip_gotest_codes "${WORK_DIR}/../xpu-exporter/" - cd ${WORK_DIR}/../xpu-exporter/ && make clean && make -j -} - -function strip_symbols() { - cd ${WORK_DIR}/build/direct/cuda - objcopy --only-keep-debug libcuda_direct.so libcuda_direct.sym - objcopy --only-keep-debug gpu-monitor gpu-monitor.sym - objcopy --strip-all libcuda_direct.so - objcopy --strip-all gpu-monitor - - cd ${WORK_DIR}/build/direct/acl - objcopy --only-keep-debug libruntime_direct.so libruntime_direct.sym - objcopy --only-keep-debug npu-monitor npu-monitor.sym - objcopy --strip-all libruntime_direct.so - objcopy --strip-all npu-monitor -} - -function copy_to_build_dir() { - cd ${WORK_DIR}/build - cp -P --remove-destination -r direct/cuda/libcuda_direct.so ${DEST_DIR}/cuda_client/GPU_client/ - cp -P --remove-destination -r direct/cuda/gpu-monitor.so ${DEST_DIR}/cuda_client/GPU_client/ - cp -P --remove-destination -r $WORK_DIR/../client_update/cuda-client-update.sh ${DEST_DIR}/cuda_client/GPU_client/ - - cp -P --remove-destination -r direct/cuda/*.sym ${WORK_DIR}/../XPU_symbols/ - - cp -P --remove-destination -r direct/acl/libruntime_direct.so ${DEST_DIR}/acl_client/NPU_client/ - cp -P --remove-destination -r direct/acl/npu-monitor ${DEST_DIR}/acl_client/NPU_client/ - cp -P --remove-destination -r $WORK_DIR/../client_update/acl-client-update.sh ${DEST_DIR}/acl_client/NPU_client/ - - cp -P --remove-destination -r direct/acl/*.sym ${WORK_DIR}/../XPU_symbols/ - - cd ${WORK_DIR}/../GPU-device-plugin/ - cp -P --remove-destination -r gpu-device-plugin ${DEST_DIR}/gpu-device-plugin - cp -P --remove-destination -r npu-device-plugin 
${DEST_DIR}/npu-device-plugin - cp -P --remove-destination -r xpu-client-tool ${DEST_DIR}/cuda_client/GPU_client/ - cp -P --remove-destination -r xpu-client-tool ${DEST_DIR}/acl_client/NPU_client/ - - - cd ${WORK_DIR}/../xpu-exporter/ - cp -P --remove-destination -r xpu-exporter ${DEST_DIR}/exporter - - cd ${WORK_DIR}/../XPU_symbols && tar -czvf XPU_symbols.tar.gz XPU_symbols -} - -function main() { - prepare - handle_spdlog - compile_client - compile_device_plugin - compile_xpu_exporter - strip_symbols - copy_to_build_dir -} - -main "$@" \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/build.yml b/GPU-Virtual-Service/xpu-pool-service/ci/build.yml deleted file mode 100644 index 48665ef..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/build.yml +++ /dev/null @@ -1,48 +0,0 @@ ---- -version: 2.0 - -buildspace: - fixed: true - path: /usr1/workspace - -params: - - name: image_name_x86 - value: szvecr02.his.huawei.com:80/ecr-build/modelengine_xpupool:x86_v6 - - name: image_name_arm - value: szvecr02.his.huawei.com:80/ecr-build/acs_xpupool:arm_v03 - - name: env_type - value: docker - - name: img_pool_x86 - value: docker-sz-service-x86-ondocker-16u-01 - - name: img_pool_arm - value: docker-sz-service-arm-ondocker-64u-02 - -envs: - - condition: env_type == 'label' - label: ${eulerx86_label} - - condition: env_type == 'docker' - resource: - type: docker - image: ${image_name_x86} - resource_class: 16U32G - mode: toolbox - - condition: env_type == "vpc" - resource: - type: docker - image: ${image_name_x86} - pool: ${img_pool_x86} - resource_class: 16U32G - -buildflow: - strategy: Eager - flow_metadata: - from: job_xpu_pool_build_x86 - attach_workspace: - path: pub_dir - resource: efs - jobs: - - job: job_xpu_pool_build_x86 - params: - - name: key1 - value: value1 - build_ref: ci/xpu_pool/build_x86.yml \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/buildinfo.sh 
b/GPU-Virtual-Service/xpu-pool-service/ci/buildinfo.sh deleted file mode 100644 index 15149ee..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/buildinfo.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# Copyright (C) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. -set -e - -echo "Release is ${ENV_IS_RELEASE}" - -# 判断当前构建是否为版本构建,以决定构建变量 -if [ "${ENV_IS_RELEASE}" == "false" ]; then - SERVICE_VERSION='1.0.0-SNAPSHOT' - echo "buildVersion=${SERVICE_VERSION}.${ENV_PIPELINE_STARTTIME}">"${WORKSPACE}"/buildInfo.properties -else - if [ "${ENV_IS_RELEASE}" == "true" ]; then - SERVICE_VERSION=${ENV_RELEASE_VERSION} - echo "buildVersion=${ENV_RELEASE_VERSION}">"${WORKSPACE}"/buildInfo.properties - fi -fi \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/cmc/openSource_x86.xml b/GPU-Virtual-Service/xpu-pool-service/ci/cmc/openSource_x86.xml deleted file mode 100644 index b7c59c4..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/cmc/openSource_x86.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Component - Generic - - EulerOS Server - EulerOSServerV200R013C00X86 - 2024.07.04.103000 - - - - Software/x86_64/DockerStack/EulerOS_Server_*.-docker.x86_64.tar.xz - EulerOS_Server/x86 - - - Software/x86_64/EulerOS-*-x86_64-dvd.iso - EulerOS_Server/x86 - - - - - \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/cmc/upload_cmc.xml b/GPU-Virtual-Service/xpu-pool-service/ci/cmc/upload_cmc.xml deleted file mode 100644 index 88fe24a..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/cmc/upload_cmc.xml +++ /dev/null @@ -1,25 +0,0 @@ - - - - BVersion - Generic - - eContainer - ${CMC_VERSION} - - N - - - output/software/* - - - - inner - output/inner/* - symbol - - - - - - \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/cms_signature.sh b/GPU-Virtual-Service/xpu-pool-service/ci/cms_signature.sh deleted file mode 100644 index 13bedaa..0000000 --- 
a/GPU-Virtual-Service/xpu-pool-service/ci/cms_signature.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# Copyright Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. -# 构建签名脚本 -set -e - -current_dir=$( - cd "$(dirname "$0")" || exit 1 - pwd -) -workspace=$(dirname "${current_dir}") -pkg_path=$1 -signature_jar=$(find /opt/buildtools/ -name signature.jar) - -if [ ! -d "${workspace}"/CI ]; then - mkdir -p "${workspace}"/CI -fi - -function gen_list() { - for file in "$1"/*; do - if [ -d "${file}" ]; then - gen_list "$file" - else - echo "$file" is file - if [ "$(basename "$file")"x != listx ]; then - cat <> "${pkg_path}"/list -Name: ${file##*$pkg_path/} -SHA256-Digest: $(sha256sum "${file}" | awk '{print $1}') -EOF - fi - fi - done -} - -function gen_signature_xml() { - cat << SIG_CONF > "${workspace}"/CI/signconf_cms.xml - - - - - CMS_Computing_RSA2048_CN_20220810_Huawei - - **/list - - ${pkg_path}/list.cms.crl/ - 2 - 5 - 10.29.154.209:12056 - 049944 - 260185123 - - -SIG_CONF -} - -cd "${pkg_path}" -cat <"${pkg_path}"/list -Manifest Version: 1.0 -Create By: Huawei Technology Inc. -EOF - -gen_list "${pkg_path}" -gen_signature_xml -java -jar "${signature_jar}" "${workspace}"/CI/signconf_cms.xml \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/dependency.xml b/GPU-Virtual-Service/xpu-pool-service/ci/dependency.xml deleted file mode 100644 index 7a849d1..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/dependency.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/hwp7s_signature.sh b/GPU-Virtual-Service/xpu-pool-service/ci/hwp7s_signature.sh deleted file mode 100644 index 6562fb5..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/hwp7s_signature.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# Copyright Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
-# hwp7s签名用于CMC B版本发布 -set -e - -pkg_path=$1 -current_dir=$( - cd "$(dirname "$0")" || exit 1 - pwd -) -workspace=$(dirname "${current_dir}") -signature_jar=$(find /opt/buildtools/ -name signature.jar) - -function gen_signature_xml() { - cat << EOF > "${workspace}"/CIConfig.xml - - - - - CMS_G5_Test_Sign_RSA3072PSS_CN_20220505_HUAWEI - CMS_G5_Test_TSA_RSA3072PSS_CN_20220505_HUAWEI - - **/*.zip - **/*.iso - **/*.tar - **/*.tar.gz - **/*.tgz - - ${pkg_path}/crldata.crl/ - 2 - 10.29.154.209:12056 - 5 - 049944 - 261181132 - 1 - - -EOF -} - -gen_signature_xml - -# sign -if ! java -jar "${signature_jar}" "${workspace}"/CIConfig.xml; then - echo "signature execute failed. exit." - exit 1 -fi \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/opensource.xml b/GPU-Virtual-Service/xpu-pool-service/ci/opensource.xml deleted file mode 100644 index 8c8a184..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/opensource.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/third_party b/GPU-Virtual-Service/xpu-pool-service/ci/third_party deleted file mode 100644 index b0a49d0..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/third_party +++ /dev/null @@ -1 +0,0 @@ -../third_party diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_x86.yml b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_x86.yml deleted file mode 100644 index 23322cf..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_x86.yml +++ /dev/null @@ -1,70 +0,0 @@ -version: 2.0 - -buildspace: - fixed: true - path: /usr1/workspace - -envs: - - condition: env_type == 'docker' - resource: - type: docker - image: ${image_name_x86} - resource_class: 16U32G - mode: toolbox - - condition: env_type == "vpc" - resource: - type: docker - image: ${image_name_x86} - pool: ${img_pool_x86} - resource_class: 16U32G - -steps: - PRE_BUILD: - - checkout: - path: ${branch} # 
下载子路径,可选,如果配置,则会把代码下载到子子路径下,如果不配置,则会下载到当前路径 - - manifest_checkout: - manifest_file: ci/dependency.xml - groups: spdlog,kubernetes,ascend-runtime - repo_depth: 0 - - artget: - artifact_type: cmcbinary - action: pull - dependency: ${branch}/ci/cmc/copenSource_x86.xml - version_output_path: ./ - agent: ./ - username: ${CMC_USERNAME} - password: ${CMC_PASSWORD} - - BUILD: - - build_execute: - command: | - sh ${branch}/ci/buildinfo.sh - sh ${branch}/ci/xpu_pool/build_xpu_package.sh - accelerate: false - check: - buildcheck: true - auto: true - exclude_dir: ${branch}/manager-b/deploy/agent/ - - POST_BUILD: - - artget: - artifact_type: cloudartifact - file_path: output - version_output_path: ./ - - version_set: # 记录version set - metadata: true # 开启元数据采集,结合元数据时必要 - isKiaScan: false - - when: - condition: upload_cmc == 'true' - steps: - - artget: - artifact_type: cmcbinary - action: push - params: {"CMC_VERSION": "${CMC_VERSION}"} - dependency: ${branch}/ci/cmc/upload_cmc.xml - agent: . - version_output_path: . - add_source_code: push - add_env_image: push - username: ${CMC_USERNAME} - password: ${CMC_PASSWORD} \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_xpu_package.sh b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_xpu_package.sh deleted file mode 100644 index b823854..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/build_xpu_package.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/bin/bash -# Copyright (C) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. 
-set -e - -build_target=$1 -image_name=$2 -current_dir=$( - cd "$(dirname "$0")" || exit 1 - pwd -) -top_dir=$(dirname "$(dirname "${current_dir}")") -pkg_dir=${top_dir}/pkg -host_scripts_dir=${current_dir}/host_scripts - -echo "current_dir=${current_dir}" -echo "top_dir=${top_dir}" -echo "pkg_dir=${pkg_dir}" -echo "build_target=${build_target}" -echo "image_name=${image_name}" - -function arch_config() { - arch=$(uname -m) - if [[ ${arch} == "x86_64" ]]; then - platform="x86" - elif [[ ${arch} == "aarch64" ]]; then - platform="arm" - else - echo "incorrect arch mode" - exit 1 - fi -} - -function mk_xpu_pkg_dir() { - [ -e "${pkg_dir}" ] && rm -rf "${pkg_dir}" - mkdir -p "${pkg_dir}"/images - mkdir -p "${pkg_dir}"/templates - chmod -R 750 "${pkg_dir} -} - -function build_xpu_component() { - echo "build xpu component begin" - cd ${top_dir}/ci && sh build.sh - echo "build xpu component end" -} - -function get_helm_package() { - cd ${top_dir}/install/helm && helm package gpupool - cp -P --remove-destination -rf gpupool-0.1.0.tgz "${pkg_dir}/templates" - cp -P --remove-destination -rf ../install.sh "${pkg_dir}/templates" - cp -P --remove-destination -rf ../uninstall.sh "${pkg_dir}" -} - -function mknod_func() { - loopfile_firstname="/dev/loop0" - loopfile_num=0 - loopfile_name=/dev/loop"${loopfile_num}" - while true; do - if [ -b "${loopfile_name}" ]; then - loopfile_num=$(expr ${loopfile_num} + 1) - loopfile_name=/dev/loop${loopfile_num} - else - sudo mknod ${loopfile_name} b 7 "${loopfile_num}" - sudo chmod 660 "${loopfile_name}" - sudo chown root:disk "${loopfile_name}" - echo "${loopfile_name}" - break - fi - done -} - -function make_docker_base_image() { - # CurrentDir: code_branch/XPUPoolService/ci/xpu_pool/ - mkdir -p "${current_dir}/xpu_docker_build/exporter/euler" - mkdir -p "${top_dir}/plugin-market/euler" - mknod_func - sudo mount "${top_dir}"/../EulerOS_Server/"${platform}"/EulerOS-*-dvd.iso "${current_dir}/xpu_docker_build/exporter/euler" - sudo mount 
"${top_dir}"/../EulerOS_Server/"${platform}"/EulerOS-*-dvd.iso "${top_dir}/plugin-market/euler" - cd "${current_dir}" - docker import "${top_dir}"/../EulerOS_Server/"${platform}"/EulerOS_Server_*.tar.xz euleros:econtainer -} - -function build_image() { - echo "build $2 image begin" - cd ${current_dir}/xpu_docker_build/$1 - local tag="$2:${image_tag}" - docker build --squash --no-cache -t $tag . - echo "build $2 image end" - shift 2 - for package in "$@"; do - image_export_list[$package]+="$tag" - done -} - -function export_images() { - echo "save images begin" - docker save -o "${pkg_dir}/images/gpupool_${platform}.tar" ${image_export_list[gpu]} - docker save -o "${pkg_dir}/images/npupool_${platform}.tar" ${image_export_list[npu]} - echo "save images end" -} - -function build_output_packages() { - cd "${pkg_dir}" - mkdir -p ${WORKSPACE}/output/software - upload_arch=$(echo ${arch} | sed 's/_/-/g') - zip -1 -y ${WORKSPACE}/output/software/${xpupool_plugin}_${upload_arch}.zip * - mkdir -p ${WORKSPACE}/output/inner - cp -P --remove-destination -rf ${top_dir}/XPU_symbols.tar.gz \ - ${WORKSPACE}/output/inner/${xpupool_plugin}_${upload_arch}_sym.tar.gz - cd - -} - -function main() { - local -A image_export_list - arch_config - mk_xpu_pkg_dir - build_xpu_component - get_helm_package - make_docker_base_image - cd ${top_dir}/plugin-market &&sh build_daemonset.sh - build_image "cuda_client" "cuda_client_update" gpu - build_image "acl_client" "acl_client_update" npu - build_image "gpu-device-plugin" "gpu_device_plugin" gpu - build_image "npu-device-plugin" "npu_device_plugin" npu - build_image "exporter" "xpu_exporter" gpu npu - export_images - sh ${current_dir}/../cms_signature.sh ${pkg_dir} - build_output_packages - sh ${current_dir}/../hwp7s_signature.sh ${WORKSPACE}/output/software -} - -main "$@" \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/acl_client/Dockerfile 
b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/acl_client/Dockerfile deleted file mode 100644 index 92d86a7..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/acl_client/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM euleros:econtainer - -# 复制编译好的 npu_client 文件 -COPY ./NPU_client /root - -# /root 目录下除了项目输出文件外没有非隐藏文件,以小数点(.)开头的隐藏文件不会被通配符(*)匹配 -RUN chmod 500 /root/* - -USER root -ENTRYPOINT ["/root/acl-client-update.sh"] \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/cuda_client/Dockerfile b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/cuda_client/Dockerfile deleted file mode 100644 index 4fd8bf9..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/cuda_client/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM euleros:econtainer - -# 复制编译好的 xpu_client 文件 -COPY ./GPU_client /root - -# /root 目录下除了项目输出文件外没有非隐藏文件,以小数点(.)开头的隐藏文件不会被通配符(*)匹配 -RUN chmod 500 /root/* - -USER root -ENTRYPOINT ["/root/cuda-client-update.sh"] \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/exporter/Dockerfile b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/exporter/Dockerfile deleted file mode 100644 index fa72b5e..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/exporter/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM euleros:econtainer AS xpu_exporter - -COPY ./euler/ /opt/euler/ -# 复制编译好的 xpu_exporter 文件 -COPY xpu-exporter /opt/xpu/bin/ - -RUN echo "[dvd]" >> /etc/yum.repos.d/dvd.repo \ - && echo "name=install dvd" >> /etc/yum.repos.d/dvd.repo \ - && echo "baseurl=file:///opt/euler" >> /etc/yum.repos.d/dvd.repo \ - && echo "enabled=1" >> /etc/yum.repos.d/dvd.repo \ - && echo "gpgcheck=0" >> /etc/yum.repos.d/dvd.repo \ - && yum clean all && yum makecache \ - && yum -y install openssl \ - && rm -rf /opt/euler \ - && rm -f 
/etc/yum.repos.d/dvd.repo - -RUN chmod 500 /opt/xpu/bin/xpu-exporter - -# 当前容器创建的文件可能暴露到宿主机,从而与宿主机甚至其他容器的用户id碰撞 -# 选择 10001 作为用户id/组id是为了避免与 useradd 自动生成的id碰撞 -# 在本容器内新增用户时应当注意避免id碰撞 -RUN echo "xpu:x:10001:10001:eXPUPoolService:/:/sbin/nologin" >> /etc/passwd \ - && echo "xpu:x:10001:" >> /etc/group \ - && echo "xpu:!:::::::" >> /etc/shadow \ - && chown xpu:xpu /opt/xpu/bin/xpu-exporter \ - && setcap CAP_DAC_OVERRIDE=ep /opt/xpu/bin/xpu-exporter - -USER xpu:xpu - -ENTRYPOINT ["/opt/xpu/bin/xpu-exporter"] \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/gpu-device-plugin/Dockerfile b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/gpu-device-plugin/Dockerfile deleted file mode 100644 index 59828ec..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/gpu-device-plugin/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM euleros:econtainer AS gpu_device_plugin - -# 复制编译好的 gpu-device-plugin 文件 -COPY gpu-device-plugin /opt/xpu/bin/ - -RUN chmod 500 /opt/xpu/bin/gpu-device-plugin - -# 当前容器创建的文件可能暴露到宿主机,从而与宿主机甚至其他容器的用户id碰撞 -# 选择 10001 作为用户id/组id是为了避免与 useradd 自动生成的id碰撞 -# 在本容器内新增用户时应当注意避免id碰撞 -RUN echo "xpu:x:10001:10001:eXPUPoolService:/:/sbin/nologin" >> /etc/passwd \ - && echo "xpu:x:10001:" >> /etc/group \ - && echo "xpu:!::::::::" >> /etc/shadow \ - && chown xpu:xpu /opt/xpu/bin/gpu-device-plugin \ - && setcap CAP_DAC_OVERRIDE=ep /opt/xpu/bin/gpu-device-plugin - -USER xpu:xpu - -ENTRYPOINT ["/opt/xpu/bin/gpu-device-plugin"] \ No newline at end of file diff --git a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/npu-device-plugin/Dockerfile b/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/npu-device-plugin/Dockerfile deleted file mode 100644 index 02d0558..0000000 --- a/GPU-Virtual-Service/xpu-pool-service/ci/xpu_pool/xpu_docker_build/npu-device-plugin/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM euleros:econtainer AS 
npu_device_plugin - -# 复制编译好的 npu-device-plugin 文件 -COPY npu-device-plugin /opt/xpu/bin/ - -RUN chmod 500 /opt/xpu/bin/npu-device-plugin - -# 运行时必须使用root用户,否则会报错。 -# 使用非root权限用户运行会导致npu-device-plugin运行时加载libdcml.so文件失败。原因如下: -# 1. ascended-npu-exporter在加载libdcml.so动态库时会分别尝试在 LD_LIBRARY_PATH 环境变量指定的目录和 .ldconfig 缓存项目中查找动态文件。 -# 2. 构建容器镜像时,如果指定的运行用户是非root权限用户,我们需要对npu-device-plugin二进制文件进行CAP_DAC_OVERRIDE=ep的授权操作。 -# 由于对npu-device-plugin的授权,导致程序无法获取到包含libdcml.so文件路径的"LD_LIBRARY_PATH"环境变量。 -# 程序无法通过环境变量设置的路径找到libdcml.so,这将导致在初始化的时候无法找到so文件。 -# 3. 当环境变量中找不到libdcml.so文件时,程序会执行"ldconfig"命令并获取缓存内容,但是libdcml.so文件只在运行环境中存在。 -# 构建环境下的ldconfig无法识别该文件,因此无法将其写入缓存(更新缓存内容需要使用root权限)。 -# 因此构建容器镜像时,如果指定的运行用户是非root权限用户,容器内部无法使用ldconfig更新缓存。 -USER root - -ENTRYPOINT ["/opt/xpu/bin/npu-device-plugin"] \ No newline at end of file