Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
761 changes: 761 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/conqueue/atomicops.h

Large diffs are not rendered by default.

979 changes: 979 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/conqueue/readerwriterqueue.h

Large diffs are not rendered by default.

2,544 changes: 2,544 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/hashing/robin_hood.h

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/hook/elfHandle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#ifndef ELF_HANDLE_H
#define ELF_HANDLE_H

#include <libelf.h>
#include <gelf.h>
#include <dlfcn.h>

#include "../define.h"
#include "fatBinary.h"

// ELF / fat-binary helper entry points. Only the declarations are visible in
// this header, so the notes below are inferred from names and signatures and
// should be confirmed against the implementation.
// NOTE(review): std::vector is used in these signatures but <vector> is not
// included directly by this header; presumably it arrives through
// "../define.h" or "fatBinary.h" -- confirm.

// Presumably one-time ELF library setup. The meaning of the int result is not
// visible here -- TODO confirm.
int InitElf2(void);

// Takes a fat-binary header plus three out-style pointers (kernel list,
// fatbin memory, fatbin size). Presumably fills them from the image that
// `fatbin` describes; ownership of *fatbin_mem is not visible here -- TODO confirm.
int GetFatbinInfo(FatHeader_t *fatbin, std::vector<KernelInfo_t*> *kernel_list, uint8_t** fatbin_mem, size_t* fatbin_size);

// Takes a kernel list and a raw memory range (`memory`, `memsize`).
// Presumably parses kernel parameter metadata from that range into the list
// entries -- TODO confirm.
int GetParameterInfo(std::vector<KernelInfo_t*> *kernel_list, void* memory, size_t memsize);

// Looks up an entry of `kernel_list` by name. The result for a name that is
// not present (presumably a null pointer) is not visible here -- TODO confirm.
KernelInfo_t* GetKernelInfoByKernelName(std::vector<KernelInfo_t*> *kernel_list, const char *kernelname);

#endif


80 changes: 80 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/hook/fatBinary.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#ifndef FAT_BINARY_ELF_H
#define FAT_BINARY_ELF_H

#include "../chunkStructure.h"
#include "../constVar.h"
#include <stdint.h>

/* Packed fat-binary wrapper record: two 32-bit words followed by five 64-bit
 * words, 48 bytes in total with no padding. */
typedef struct __attribute__((__packed__)) {
    uint32_t magic;
    uint32_t version;
    uint64_t text;    /* refers to the first text section */
    uint64_t data;    /* refers to a location outside of the file */
    uint64_t unknown;
    uint64_t text2;   /* refers to the second text section */
    uint64_t zero;
} FatHeader_t;

// String constants that, judging by their names, are ELF section names or
// section-name prefixes. How they are matched is not visible in this header.
#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info"
#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared."
#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text."
#define CRICKET_ELF_TEXT_PREFIX ".text."

// Name of the fat-binary section and a mangled symbol name; presumably the
// symbol is the compiler-generated CUDA registration function -- TODO confirm.
#define CRICKET_ELF_FATBIN ".nv_fatbin"
#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv"

// Magic numbers. Presumably the expected `magic` field values of
// FatHeader_t and FatElfHeader_t respectively -- verify against the parser.
#define FATBIN_STRUCT_MAGIC 0x466243b1
#define FATBIN_TEXT_MAGIC 0xBA55ED50

/* Packed 16-byte header: 32-bit magic, 16-bit version, 16-bit header size,
 * then a 64-bit size. */
typedef struct __attribute__((__packed__)) {
    uint32_t magic;
    uint16_t version;
    uint16_t header_size;
    uint64_t size;
} FatElfHeader_t;

/* Packed 64-byte per-entry header. Fields marked "unknown" or followed by a
 * question mark are guesses carried over from the original annotations. */
typedef struct __attribute__((__packed__)) {
    uint16_t kind;
    uint16_t unknown1;
    uint32_t header_size;
    uint64_t size;
    uint32_t compressed_size;   /* size of the compressed data */
    uint32_t unknown2;          /* address size for PTX? */
    uint16_t minor;
    uint16_t major;
    uint32_t arch;
    uint32_t obj_name_offset;
    uint32_t obj_name_len;
    uint64_t flags;
    uint64_t zero;              /* alignment for compression? */
    uint64_t decompressed_size; /* length of the compressed data once it is
                                 * decompressed; because there is an
                                 * uncompressed footer this is generally
                                 * smaller than `size` */
} FatTextHeader_t;

// Single-bit masks; presumably tested against FatTextHeader_t::flags -- TODO
// confirm. Note the literals carry the signed `LL` suffix although that
// field is declared uint64_t.
#define FATBIN_FLAG_64BIT 0x0000000000000001LL
#define FATBIN_FLAG_DEBUG 0x0000000000000002LL
#define FATBIN_FLAG_LINUX 0x0000000000000010LL
#define FATBIN_FLAG_COMPRESS 0x0000000000002000LL

// Attribute identifier codes. Presumably these tag entries found in the
// ".nv.info" sections of a CUDA ELF image; the parser is not visible in this
// header -- TODO confirm.
#define EIATTR_PARAM_CBANK 0xa
#define EIATTR_EXTERNS 0xf
#define EIATTR_FRAME_SIZE 0x11
#define EIATTR_MIN_STACK_SIZE 0x12
#define EIATTR_KPARAM_INFO 0x17
#define EIATTR_CBANK_PARAM_SIZE 0x19
#define EIATTR_MAX_REG_COUNT 0x1b
#define EIATTR_EXIT_INSTR_OFFSETS 0x1c
#define EIATTR_S2RCTAID_INSTR_OFFSETS 0x1d
#define EIATTR_CRS_STACK_SIZE 0x1e
#define EIATTR_SW1850030_WAR 0x2a
#define EIATTR_REGCOUNT 0x2f
#define EIATTR_SW2393858_WAR 0x30
#define EIATTR_INDIRECT_BRANCH_TARGETS 0x34
#define EIATTR_CUDA_API_VERSION 0x37

// Value-format codes; presumably they describe how an attribute's payload is
// encoded -- TODO confirm against the code that consumes them.
#define EIFMT_NVAL 0x1
#define EIFMT_HVAL 0x3
#define EIFMT_SVAL 0x4

#endif
70 changes: 70 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/hook/hook.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#ifndef GV_HOOK_H
#define GV_HOOK_H

#include <cstdint>
#include <memory>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cudnn.h>
#include <nvml.h>
#include <nccl.h>
#include <dlfcn.h>

// #include "fatBinaryCtl.h"
#include "fatBinary.h"
#include "../configure.h"
#include "../ucpConnection.h"
#include "../shmqueue/shmUtil.h"
#include "../clientEndpoint.h"

// Tag string that HookLog() passes to tool::Logging for every hook log line.
#define HOOK_LOG_TAG "GV-Hook"

// Process-wide state shared by the hook layer. These are declarations only;
// the definitions live elsewhere.
// NOTE(review): std::vector, std::once_flag, std::mutex, std::shared_mutex and
// robin_hood::unordered_flat_map are used below, but <vector>, <mutex>,
// <shared_mutex> and the robin_hood header are not included directly by this
// header -- presumably they arrive through the project headers above; confirm.
extern int mainDevIdx; // different ranks(processes) may use different devices
extern std::vector<ClientEndpoint*> clientEpList;
// Indexed by (ttID - 1) in HookLog(); a false entry makes HookLog() call
// SwitchClientEp() for the calling thread.
extern std::vector<bool> threadValidList;

extern Configure* config_;
extern UCPConnection* connectionObj;
extern SharedMemoryOpt* shmOpt;
extern GPUidMap* gpuIdMap;
// Presumably guard one-time initialization and one-time registration via
// std::call_once -- TODO confirm at the use sites.
extern std::once_flag initFlag;
extern std::once_flag registerFlag;
extern std::vector<RegisterIOV*> regIOVList;
extern std::vector<KernelInfo_t*> registeredKernels;
// Keyed by a 64-bit value; judging by the name, a host-side address mapped to
// its kernel info -- TODO confirm.
extern robin_hood::unordered_flat_map<uint64_t, KernelInfo_t*> mapHost2KernelInfo;

extern bool isReConnected;
// NOTE(review): which of the globals each mutex protects is not visible in
// this header -- check the definitions' translation unit.
extern std::mutex reConnectMutex;
extern std::shared_mutex threadSharedMutex;
extern int processID; // the process ID of the current process
extern int threadNum; // how many sub-threads have been created in this process
extern int commDevIdx; // the device index for current process's communicator
// Per-thread state (one instance per thread).
extern thread_local int threadID; // e.g, 1641432
extern thread_local int ttID; // e.g, 1, 2, 3, ...
extern thread_local ClientEndpoint* clientEpObj; // the client endpoint object for the current thread
extern thread_local int myDevIdx; // device index HookLog() hands to SwitchClientEp()
extern thread_local int lastReqType;
extern thread_local bool isTraining;
extern thread_local bool batchCollected;
extern thread_local int curTensorIdx;
extern thread_local size_t curIter;
extern thread_local std::vector<TensorInfo_t> tensorByteList;

// Lifecycle entry points of the hook layer (declarations only; behavior notes
// are inferred from names and should be confirmed against the definitions).

// Presumably establishes the connection to the dispatcher described by
// `config`. Note that `config` is taken by value (copied on each call).
void ConnectToDispatcher(Configure config);
// Selects the client endpoint for device `dev`. HookLog() passes
// `threadInit = true` when the calling thread has no clientEpObj yet.
void SwitchClientEp(int dev, bool threadInit = false);
// NOTE(review): the names "Intialize" and "DestoryResources" are misspelled
// ("Initialize", "DestroyResources"); they are part of the public interface,
// so renaming requires updating every caller.
void Intialize();
void DestoryResources();

/**
 * Emits the per-call banner line for a hooked function and, when required,
 * (re)selects the client endpoint for the calling thread first.
 *
 * @param func           Name of the hooked function, printed in the banner.
 * @param checkClientEp  When true, SwitchClientEp() is always invoked. When
 *                       false it is invoked only if this thread has a
 *                       positive ttID whose threadValidList entry is false.
 * @param debugLevel     Level forwarded to tool::Logging.
 */
inline void HookLog(const char* func, bool checkClientEp = true, int debugLevel = LOG_DEBUG) {
    // Decide whether the endpoint must be switched; the list is consulted
    // only when the caller did not already force the check.
    bool needsSwitch = checkClientEp;
    if (!needsSwitch && ttID > 0) {
        needsSwitch = !threadValidList[ttID - 1];
    }

    if (needsSwitch) {
        // A thread without an endpoint object yet is treated as thread init.
        const bool firstUseOnThread = (clientEpObj == nullptr);
        SwitchClientEp(myDevIdx, firstUseOnThread);
    }

    tool::Logging(debugLevel, HOOK_LOG_TAG, "[pid:%d, tid:%d, ttid:%d] ======== %s ========\n", processID, threadID, ttID, func);
}

// Declarations only; the semantics below are inferred from the names and the
// thread-local training state declared in this header -- TODO confirm.

// Presumably updates the per-thread tensor bookkeeping for a request of type
// `reqType`.
void CheckTensors(int reqType);
// Presumably inspects a transfer of `size` bytes to `dst` to track iteration
// boundaries; the meaning of the bool result is not visible here.
bool CheckIteration(void* dst, size_t size);

#endif
47 changes: 47 additions & 0 deletions GPU-Virtual-Service/gpu-remoting/include/ucpConnection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#ifndef UCP_CONNECTION_H
#define UCP_CONNECTION_H
#include "ucpUtil.h"
#include "configure.h"

/**
 * Holds the UCP objects (context, listener worker, listener) used to set up
 * connections, plus the epoll/worker file descriptors.
 *
 * Only the declaration is visible here; method notes are inferred from the
 * signatures and should be confirmed against the implementation.
 *
 * NOTE(review): the class declares a destructor but leaves the copy
 * operations implicit. If the destructor releases the UCP handles, copying an
 * instance would release them twice -- confirm and consider deleting copy
 * construction/assignment.
 */
class UCPConnection {
private:
    string myName_ = "UCPConnection";

    // string serverIP_; // instead of c_str(), which points to a temporary buffer with same addr
    // uint16_t serverPort_;
    // string clientIP_;
    // uint16_t clientPort_;

    int connectionType_ = CLIENT_SERVER_SEND_RECV_DEFAULT;

    // The UCP handles start out null, like every other member here that has
    // an in-class initializer, so they never hold an indeterminate pointer
    // before the constructor (or Listen) assigns them.
    ucp_context_h ucpContext_ = nullptr;   // shared by all workers
    ucp_worker_h listenWorker_ = nullptr;  // worker for listener, only used for server
    ucp_listener_h listener_ = nullptr;    // listener for server

    size_t clientNum_ = 0;

    int epollFd_ = -1;
    int workerFd_ = -1;


public:
    UCPConnection_t _listenCtx; // context for client connection, only used for server
    // struct sockaddr_storage _serverAddr;
    // struct sockaddr_storage _clientAddr;

    // Initializes the worker behind `ucpWorker`; `clientID` defaults to 0.
    void InitWorker(ucp_worker_h *ucpWorker, uint64_t clientID = 0);
    // Creates and returns a worker; presumably on the shared context -- TODO confirm.
    ucp_worker_h CreateWorker(bool is_client=false, uint64_t clientID = 0);
    void SetConnWorker(ucp_worker_h worker);

    // UCPConnection(string address_str, uint16_t port, bool is_client = false);
    UCPConnection(bool is_client);

    ~UCPConnection();

    // Starts listening on serverIP:serverPort; `callback` is the UCP
    // connection-request callback. Returns the UCS status of the operation.
    ucs_status_t Listen(const string& serverIP, uint16_t serverPort, ucp_listener_conn_callback_t callback);
    // Wait for a connection; `is_closed` is polled through a volatile pointer.
    // NOTE(review): volatile gives no inter-thread synchronization guarantees;
    // if another thread sets the flag, std::atomic<bool> would be the safe
    // choice, but changing it alters the public signature.
    void WaitConnection(volatile bool* is_closed);
    void WaitConnection(UCPConnection_t* conn, volatile bool* is_closed);
};


#endif
Loading
Loading