Skip to content

Commit

Permalink
2.20.3-1
Browse files Browse the repository at this point in the history
Add support for alternating rings, allow for cross-nic rings without
cross-rail communication.
Add support for user buffer registration for network send/recv.
Optimize aggregated operations to better utilize all channels.
Add flattening for BCM PCI gen5 switches.
Add support for inter-node NVLink communication.
Add support for port fusion in NET/IB.
Add support for ReduceScatter and AllGather using Collnet.
Update net API to v8.
Fix hang during A2A connection.
  • Loading branch information
sjeaugey committed Feb 13, 2024
1 parent b6d7438 commit b647562
Show file tree
Hide file tree
Showing 74 changed files with 4,616 additions and 2,149 deletions.
3 changes: 2 additions & 1 deletion ext-net/example/nccl/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
#define NCCL_PTR_DMABUF 0x4

// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
#define NCCL_NET_MAX_REQUESTS 32

// Log levels understood by the logger callback (ncclDebugLogger_t), numbered 0..5.
// Note that the numeric order is NONE, VERSION, WARN, INFO, ABORT, TRACE.
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
// Logging subsystem identifiers. Each named subsystem owns one distinct bit
// (1, 2, 4, ... 512); NCCL_ALL (~0) has every bit set.
// NOTE(review): presumably these are OR-ed into the logger's `flags` argument - confirm.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

// Type of the logging callback: a printf-style function (format string plus
// variadic arguments) tagged with a log level, a flags word, and the source
// file/line of the call site.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
#include "net_v5.h"
Expand Down
1 change: 1 addition & 0 deletions ext-net/example/nccl/net_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ typedef struct {
int needsProxyProgress;
} ncclNetDeviceHandle_v7_t;

// The v8 device handle is an alias of the v7 struct: same type, new versioned name.
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
// Unversioned name, also an alias of the v7 struct.
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;

#endif
2 changes: 2 additions & 0 deletions ext-net/example/nccl/net_v6.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#ifndef NCCL_NET_V6_H_
#define NCCL_NET_V6_H_

#define NCCL_NET_MAX_REQUESTS_V6 8

typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
Expand Down
2 changes: 0 additions & 2 deletions ext-net/example/nccl/net_v7.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ typedef struct {
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;

typedef ncclNetProperties_v7_t ncclNetProperties_t;

typedef struct {
// Name of the network (mainly for logs)
const char* name;
Expand Down
83 changes: 83 additions & 0 deletions ext-net/example/nccl/net_v8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_NET_V8_H_
#define NCCL_NET_V8_H_

#include "net_device.h"

// Properties of one network device, filled in by the plugin's getProperties()
// entry point (NET API v8).
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v8_t;

// Unversioned name: ncclNetProperties_t refers to the v8 layout.
typedef ncclNetProperties_v8_t ncclNetProperties_t;

// Table of entry points that make up the NET plugin API, version 8.
// Every function returns an ncclResult_t status; results are passed back
// through the pointer arguments.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// The registered length is passed as a size_t.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has been completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v8_t;

#endif // end include guard
126 changes: 100 additions & 26 deletions ext-net/example/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,37 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;

// Placeholder: ignores both arguments, leaves *path untouched and always
// reports ncclInternalError.
__hidden ncclResult_t pluginPciPath(int dev, char** path) {
  (void)dev;
  (void)path;
  return ncclInternalError;
}
// Placeholder: ignores both arguments, leaves *supportedTypes untouched and
// always reports ncclInternalError.
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
  (void)dev;
  (void)supportedTypes;
  return ncclInternalError;
}
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
//pluginPciPath(dev, &props.pciPath);
//pluginPtrSupport(dev, &props.ptrSupport);
// Report the properties of network device `dev` through `props` (v8 layout).
//
// This example does not look at `dev`: it writes the same default value into
// every field of the struct. Because the struct is fully populated on exit,
// the function returns ncclSuccess; returning an error here would make the
// v7/v6/v4 adapters bail out before copying anything to their callers.
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
  // Below are default values, if unsure don't change.

  props->name = "Example";
  // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
  props->pciPath = NULL;
  // Only used to detect NICs with multiple PCI attachments.
  props->guid = 0;
  // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
  props->ptrSupport = NCCL_PTR_HOST;
  // If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
  props->regIsGlobal = 0;
  // Speed in *Mbps*. 100000 means 100G
  props->speed = 100000;
  // Port number, used in conjunction with guid
  props->port = 0;
  // Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
  props->latency = 0;
  // Maximum number of comm objects we can create.
  props->maxComms = 1024*1024;
  // Maximum number of receive operations taken by irecv().
  props->maxRecvs = 1;
  // Coupling with NCCL network device-side code.
  props->netDeviceType = 0;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  return ncclSuccess;
}
// Placeholder: creates no listen object, leaves `handle` and *listenComm
// untouched and always reports ncclInternalError.
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) {
  (void)dev;
  (void)handle;
  (void)listenComm;
  return ncclInternalError;
}
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
// Placeholder: opens no connection, leaves *sendComm and *sendDevComm
// untouched and always reports ncclInternalError.
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) {
  (void)dev;
  (void)handle;
  (void)sendComm;
  (void)sendDevComm;
  return ncclInternalError;
}
// Placeholder: accepts nothing, leaves *recvComm and *recvDevComm untouched
// and always reports ncclInternalError.
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) {
  (void)listenComm;
  (void)recvComm;
  (void)recvDevComm;
  return ncclInternalError;
}
// Placeholder: registers nothing, leaves *mhandle untouched and always
// reports ncclInternalError. The length is taken as a size_t.
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) {
  (void)collComm;
  (void)data;
  (void)size;
  (void)type;
  (void)mhandle;
  return ncclInternalError;
}
// Placeholder: registers nothing, leaves *mhandle untouched and always
// reports ncclInternalError.
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
  (void)collComm;
  (void)data;
  (void)size;
  (void)type;
  (void)offset;
  (void)fd;
  (void)mhandle;
  return ncclInternalError;
}
// Placeholder: releases nothing and always reports ncclInternalError.
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) {
  (void)collComm;
  (void)mhandle;
  return ncclInternalError;
}
// Placeholder: sends nothing, leaves *request untouched and always reports
// ncclInternalError.
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
  (void)sendComm;
  (void)data;
  (void)size;
  (void)tag;
  (void)mhandle;
  (void)request;
  return ncclInternalError;
}
Expand All @@ -38,7 +60,7 @@ __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_m

#define PLUGIN_NAME "Plugin"

const ncclNet_v7_t ncclNetPlugin_v7 = {
const ncclNet_v8_t ncclNetPlugin_v8 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
Expand All @@ -60,10 +82,62 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
.irecvConsumed = pluginIrecvConsumed,
};

__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
//pluginPciPath(dev, &props.pciPath);
//pluginPtrSupport(dev, &props.ptrSupport);
return ncclInternalError;
// v7 adapter for getProperties.
//
// Queries the current-version pluginGetProperties() into a local struct and
// copies the result field by field into the caller's v7 struct. If the query
// fails, its status is returned and *props_v7 is left untouched.
//
// `latency` is copied as well so the caller never reads an uninitialized
// value. NOTE(review): the v7 struct is only partly visible in this hunk;
// the latency member is taken from net_v7.h. regIsGlobal is not forwarded.
__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
  if (ret != ncclSuccess) return ret;
  props_v7->name = props.name;
  props_v7->pciPath = props.pciPath;
  props_v7->guid = props.guid;
  props_v7->ptrSupport = props.ptrSupport;
  props_v7->speed = props.speed;
  props_v7->port = props.port;
  props_v7->latency = props.latency;
  props_v7->maxComms = props.maxComms;
  props_v7->maxRecvs = props.maxRecvs;
  props_v7->netDeviceType = props.netDeviceType;
  props_v7->netDeviceVersion = props.netDeviceVersion;
  return ncclSuccess;
}

// regMr adapter with an `int` length. Forwards every argument unchanged to
// pluginRegMr(), whose length parameter is a size_t, and hands its status back.
__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
  ncclResult_t status = pluginRegMr(collComm, data, size, type, mhandle);
  return status;
}

// Entry-point table for NET API v7.
// getProperties and regMr go through the _v7 adapters; every other slot
// points at the unsuffixed plugin* implementation.
const ncclNet_v7_t ncclNetPlugin_v7 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
// Adapter: fills a v7 properties struct from the current-version one.
.getProperties = pluginGetProperties_v7,
.listen = pluginListen,
.connect = pluginConnect,
.accept = pluginAccept,
// Adapter: takes the length as an int.
.regMr = pluginRegMr_v7,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
};

// v6 adapter for getProperties.
//
// Queries the current-version pluginGetProperties() into a local struct and
// copies the result field by field into the caller's v6 struct. If the query
// fails, its status is returned and *props_v6 is left untouched.
//
// `latency` is copied as well so the caller never reads an uninitialized
// value. NOTE(review): the v6 struct is only partly visible in this hunk;
// the latency member is taken from net_v6.h. regIsGlobal and the netDevice*
// fields are not forwarded.
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
  if (ret != ncclSuccess) return ret;
  props_v6->name = props.name;
  props_v6->pciPath = props.pciPath;
  props_v6->guid = props.guid;
  props_v6->ptrSupport = props.ptrSupport;
  props_v6->speed = props.speed;
  props_v6->port = props.port;
  props_v6->latency = props.latency;
  props_v6->maxComms = props.maxComms;
  props_v6->maxRecvs = props.maxRecvs;
  return ncclSuccess;
}

// Placeholder connect without a device-handle argument: opens no connection,
// leaves *sendComm untouched and always reports ncclInternalError.
__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) {
  (void)dev;
  (void)handle;
  (void)sendComm;
  return ncclInternalError;
}
Expand All @@ -77,7 +151,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
.listen = pluginListen,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr,
.regMr = pluginRegMr_v7,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
Expand All @@ -98,7 +172,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
.listen = pluginListen,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
Expand All @@ -110,17 +184,17 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
};

/* v4 Compat */
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
ncclNetProperties_v6_t props_v6;
ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
ncclNetProperties_t props;
ncclResult_t ret = pluginGetProperties(dev, &props);
if (ret != ncclSuccess) return ret;
props->name = props_v6.name;
props->pciPath = props_v6.pciPath;
props->guid = props_v6.guid;
props->ptrSupport = props_v6.ptrSupport;
props->speed = props_v6.speed;
props->port = props_v6.port;
props->maxComms = props_v6.maxComms;
props_v4->name = props.name;
props_v4->pciPath = props.pciPath;
props_v4->guid = props.guid;
props_v4->ptrSupport = props.ptrSupport;
props_v4->speed = props.speed;
props_v4->port = props.port;
props_v4->maxComms = props.maxComms;
return ncclSuccess;
}
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
Expand Down Expand Up @@ -157,7 +231,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
.listen = pluginListen,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
Expand Down Expand Up @@ -202,7 +276,7 @@ const ncclNet_v3_t ncclNetPlugin_v3 = {
.listen = pluginListen_v3,
.connect = pluginConnect_v3,
.accept = pluginAccept_v4,
.regMr = pluginRegMr,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
Expand All @@ -223,7 +297,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
.listen = pluginListen,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
Expand Down
4 changes: 2 additions & 2 deletions makefiles/version.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 19
NCCL_PATCH := 4
NCCL_MINOR := 20
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ include ../makefiles/version.mk
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \
$(wildcard graph/*.cc) \
$(wildcard misc/*.cc) \
$(wildcard transport/*.cc)
Expand Down
Loading

0 comments on commit b647562

Please sign in to comment.