<a href="https://colab.research.google.com/github/JTStephens18/CUDA_Playground/blob/main/CUDA_Playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import numpy as np
import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from scipy.interpolate import interpn

In [2]:
# Slows things down, but good for development since it stops when there is an error
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [3]:
# Wurlitzer allows things to be printed from C++/CUDA code in a notebook
# Ninja is a build tool required by pytorch to compile C++/CUDA code
%pip install -q wurlitzer ninja

In [4]:
%load_ext wurlitzer

In [5]:
"""
load_inline is a great function that takes in
  a list of any of the cuda code strings you want to compile (cuda_sources)
  any plain cpp strings you want to compile (cpp_sources)
  any functions in the cpp strings you want to make available to pytorch (functions)
that compiles it all and turns it into a python module
"""
from torch.utils.cpp_extension import load_inline

In [6]:
def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False):
  return load_inline(cuda_sources=[cuda_src], cpp_sources=[cpp_src], functions=funcs,
                     extra_cuda_cflags=["-O2"] if opt else [], verbose=verbose, name='inline_ext')

In [7]:
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
// Checks input is contiguous in memory
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Ceiling division - which we can use to figure out how many blocks we need
inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

In [None]:
"""
  Timestep: 33:30 in Getting started with CUDA for Python Programmers
  Use __global__ anytime we want to call something from the CPU to run on the GPU
  Ex: __global__ void func(int x) {}

  To call a CUDA kernel:
  func<<<numBlocks, numThreads>>> (
    arguments
  );

  To check for an error call:
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  Always call it after calling a kernel to make sure no errors

  Must be careful when running a function from a CUDA kernel that it has finished
  Can check this by printing a value or .cpu() will wait for the kernel to finish and put it onto cpu
"""

## Neural Kernel Code

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
batch_size = 4
pairs = torch.randn(batch_size, 200, 200, 6).to(device)
input1 = torch.randn(batch_size, 200, 3).to(device)
input2 = torch.randn(batch_size, 200, 3).to(device)

In [16]:
print(pairs[1,12,17, 2])
print(input1[1,12,2])
print(input2[1,17,2])
flat = pairs.flatten()
print(flat.shape)

flat1 = input1.flatten()
flat2 = input2.flatten()

i = 1
j = 12
k = 17
l = 2
# Formula to calculate index of a flattened tensor given the dimensions of original multidimensional tensor and indexes
idx = i * (pairs.shape[1] * pairs.shape[2] * pairs.shape[3]) + j * (pairs.shape[2] * pairs.shape[3]) + k * (pairs.shape[3]) + l
input_idx = i * (input1.shape[1] * input1.shape[2]) + j * (input1.shape[2]) + l
input_idx2 = i * (input2.shape[1] * input2.shape[2]) + k * (input2.shape[2]) + l
print(idx)
print(flat[idx])
print(input_idx)
print(flat1[input_idx])
print(input_idx2)
print(flat2[input_idx2])

tensor(-1.0341, device='cuda:0')
tensor(1.3791, device='cuda:0')
tensor(0.9200, device='cuda:0')
torch.Size([9600])
3944
tensor(-1.0341, device='cuda:0')
98
tensor(1.3791, device='cuda:0')
113
tensor(0.9200, device='cuda:0')


In [27]:
%%time
pairs[:, :200, :, :3] = input1.unsqueeze(2)
pairs[:, :, :200, 3:] = input2.unsqueeze(1)

CPU times: user 8.69 ms, sys: 2.99 ms, total: 11.7 ms
Wall time: 15.6 ms


In [39]:
cuda_src = cuda_begin + r'''

#define BATCH 4
#define DIM1 200
#define DIM2 200
#define DIM3 3

__global__ void concat_kernel(float* input1, float* input2, float* out, int h, int w, int batch, int channels)  {
// Rows
    int r = blockIdx.y * blockDim.y + threadIdx.y;
// Columns
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if(r>=h || c>=w) return;
    for (int i = 0; i < batch; i++) {
      for (int j = 0; j < h; j++) {
        for (int k = 0; k < w; k++) {
          for (int l = 0; l < channels; l++) {
// Calculate indexes for flattened tensor based on their indexes in original multidimensions
            int idx = i * (h * w * channels) + j * (w * channels) + k * (2*channels) + l;
            int idx2 = i * (h * w * channels) + j * (w * channels) + k * (2*channels) + l+3;
            int input_idx_1 = i * (h * channels) + j * (channels) + l;
            int input_idx_2 = i * (w * channels) + k * (channels) + l;
// Assign values
            out[idx] = input1[input_idx_1];
            out[idx2] = input2[input_idx_2];
          }
        }
      }
    }
}

torch::Tensor concat(torch::Tensor input1, torch::Tensor input2, torch::Tensor output) {
  CHECK_INPUT(input1);
  CHECK_INPUT(input2);
  CHECK_INPUT(output);
// Assign variables for data dimensions
  int batch = input1.size(0);
  int h = input1.size(1);
  int w = input2.size(1);
  int channels = input1.size(2);
// Flatten tensors after shape values are stored
  auto newInput1 = torch::flatten(input1);
  auto newInput2 = torch::flatten(input2);
  dim3 tpb(16, 16);
  dim3 blocks(cdiv(w, tpb.x), cdiv(h, tpb.y));
  concat_kernel<<<blocks, tpb>>>(
    newInput1.data_ptr<float>(), newInput2.data_ptr<float>(), output.data_ptr<float>(), h,w, batch, channels);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return output;
}
'''

In [40]:
cpp_src = "torch::Tensor concat(torch::Tensor input1, torch::Tensor input2, torch::Tensor output);"

module = load_cuda(cuda_src, cpp_src, ['concat'])

In [41]:
out = torch.zeros(4, 200, 200, 6).contiguous().to(device)
input1 = torch.randn(4, 200, 3).contiguous().to(device)
input2 = torch.randn(4, 200, 3).contiguous().to(device)

In [42]:
%%time
res = module.concat(input1, input2, out.flatten()).cpu()
print(res.shape)

torch.Size([960000])
CPU times: user 223 ms, sys: 3.99 ms, total: 227 ms
Wall time: 227 ms


In [43]:
print("1st Tensor", input1[0,0])
print("2nd Tensor", input2[0,0])
res = res.reshape(4, 200, 200, 2, 3)
print("Result", res[0,0,0])

1st Tensor tensor([ 1.5131, -0.0942,  0.8795], device='cuda:0')
2nd Tensor tensor([ 0.6690, -0.5289,  0.0010], device='cuda:0')
Result tensor([[ 1.5131e+00, -9.4231e-02,  8.7950e-01],
        [ 6.6897e-01, -5.2892e-01,  1.0203e-03]])
