In [None]:
!pip install nvcc4jupyter

In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpiimux91e".


In [9]:
%%cuda_group_save --group shared --name "helper_cuda.h"
#ifndef HELPER_CUDA_H
#define HELPER_CUDA_H

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Error checking macro: evaluates a CUDA runtime call and forwards its result
// to check() together with the stringified call text (#val) and the call
// site (__FILE__, __LINE__), so a failure can be reported with its location.
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__ )

// Reports a failed CUDA runtime call and terminates the program.
//
//   err  - status value returned by the CUDA call being checked
//   func - source text of the checked call (supplied by checkCudaErrors)
//   file - source file of the call site
//   line - line number of the call site
//
// Returns normally when err equals cudaSuccess; otherwise prints a diagnostic
// to stderr and exits with status 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;
    }
    // The numeric code is printed with %d, so cast to int (not unsigned int)
    // to keep the argument type in agreement with the format specifier.
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
            file, line, static_cast<int>(err), cudaGetErrorString(err), func);
    exit(1);
}

#endif // HELPER_CUDA_H

In [10]:
%%cuda_group_save --name "stream_example.cu" --group "stream_demo"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>
#include "helper_cuda.h"

// Owns a malloc'd array of CUDA streams. The streams are created by
// prepare() and destroyed when the object is destroyed.
class Dog
{
public:
    Dog();
    ~Dog();
    // Array of streamNum stream handles; NULL while no streams are owned.
    cudaStream_t *cuStreams;
    // Creates N streams, replacing any streams made by an earlier call.
    void prepare(int N);
private:
    // Copying would leave two objects destroying and freeing the same
    // streams, so the copy operations are declared but never defined.
    Dog(const Dog&);
    Dog& operator=(const Dog&);
    // Destroys every owned stream and frees the handle array.
    void release();
    // Number of valid handles currently stored in cuStreams.
    int streamNum;
};

// Initializers follow the member declaration order (cuStreams, streamNum);
// a freshly constructed Dog owns no streams.
Dog::Dog() : cuStreams(NULL), streamNum(0) {}

Dog::~Dog() {
    release();
}

void Dog::release() {
    if (cuStreams) {
        for (int i = 0; i < streamNum; i++) {
            checkCudaErrors(cudaStreamDestroy(cuStreams[i]));
        }
        free(cuStreams);
        cuStreams = NULL;
    }
    streamNum = 0;
}

void Dog::prepare(int N) {
    printf("Dog: creating %d streams\n", N);
    // Drop anything left from a previous prepare() so it is not leaked.
    release();
    if (N <= 0) {
        // Nothing to create; the object stays empty.
        return;
    }
    cuStreams = (cudaStream_t*) malloc(sizeof(cudaStream_t) * (size_t)N);
    if (cuStreams == NULL) {
        fprintf(stderr, "Dog: failed to allocate %d stream handles\n", N);
        exit(1);
    }
    for (int i = 0; i < N; i++) {
        checkCudaErrors(cudaStreamCreate(&cuStreams[i]));
        // Count a stream only once it exists, so release() never touches
        // an uninitialized handle.
        streamNum = i + 1;
        printf("Dog: stream = %ld\n", (long)cuStreams[i]);
    }
}

// Owns a malloc'd array of CUDA streams. The streams are created by
// prepare() and destroyed when the object is destroyed.
class Cat
{
public:
    Cat();
    ~Cat();
    // Array of streamNum stream handles; NULL while no streams are owned.
    cudaStream_t *cuStreams;
    // Creates N streams, replacing any streams made by an earlier call.
    void prepare(int N);
private:
    // Copying would leave two objects destroying and freeing the same
    // streams, so the copy operations are declared but never defined.
    Cat(const Cat&);
    Cat& operator=(const Cat&);
    // Destroys every owned stream and frees the handle array.
    void release();
    // Number of valid handles currently stored in cuStreams.
    int streamNum;
};

// Initializers follow the member declaration order (cuStreams, streamNum);
// a freshly constructed Cat owns no streams.
Cat::Cat() : cuStreams(NULL), streamNum(0) {}

Cat::~Cat() {
    release();
}

void Cat::release() {
    if (cuStreams) {
        for (int i = 0; i < streamNum; i++) {
            checkCudaErrors(cudaStreamDestroy(cuStreams[i]));
        }
        free(cuStreams);
        cuStreams = NULL;
    }
    streamNum = 0;
}

void Cat::prepare(int N) {
    printf("Cat: creating %d streams\n", N);
    // Drop anything left from a previous prepare() so it is not leaked.
    release();
    if (N <= 0) {
        // Nothing to create; the object stays empty.
        return;
    }
    cuStreams = (cudaStream_t*) malloc(sizeof(cudaStream_t) * (size_t)N);
    if (cuStreams == NULL) {
        fprintf(stderr, "Cat: failed to allocate %d stream handles\n", N);
        exit(1);
    }
    for (int i = 0; i < N; i++) {
        checkCudaErrors(cudaStreamCreate(&cuStreams[i]));
        // Count a stream only once it exists, so release() never touches
        // an uninitialized handle.
        streamNum = i + 1;
        printf("Cat: stream = %ld\n", (long)cuStreams[i]);
    }
}

int main(int argc, char **argv) {
    Dog d1;
    Cat c1;

    d1.prepare(2);
    c1.prepare(2);

    return 0;
}


In [12]:
# Build and run the sources saved in the "stream_demo" group; the quoted
# string after -c is presumably handed to nvcc as compiler flags (confirm
# against the nvcc4jupyter docs).
%cuda_group_run --group "stream_demo" -c "--gpu-architecture sm_75 -O2 --default-stream per-thread"

Dog: creating 2 streams
Dog: stream = 97235711690480
Dog: stream = 97235711833744
Cat: creating 2 streams
Cat: stream = 97235708493408
Cat: stream = 97235708493440



The output will look like this (the stream handle values are pointer values and will differ from run to run):
```
Dog: creating 2 streams
Dog: stream = 97235711690480
Dog: stream = 97235711833744
Cat: creating 2 streams
Cat: stream = 97235708493408
Cat: stream = 97235708493440
```