<a href="https://colab.research.google.com/github/ItsFreakinDay/hybridCompSys/blob/task2/Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile secondTask.cu
#include <torch/extension.h>

#include <torch/extension.h>

__global__ void d_divide(float* a, int scalar, int* c, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < n) {
        c[i] = static_cast<int>(a[i]) / scalar;
    }
}

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

const int block_size = 128;

__forceinline__ int calc_grid_size(int m) {
    return (m + block_size - 1) / block_size;
}

torch::Tensor divide_by_scalar(torch::Tensor a, int scalar) {
    CHECK_INPUT(a);

    auto host_result = torch::zeros_like(a, torch::kInt);
    int n = a.numel();

    d_divide<<<calc_grid_size(n), block_size>>>(
        a.data_ptr<float>(),
        scalar,
        host_result.data_ptr<int>(),
        n
    );

    return host_result;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("my_divide_by_scalar", &divide_by_scalar, "Custom divide by scalar operation");
}

Writing secondTask.cu


In [2]:
!pip install ninja

Collecting ninja
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/307.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m174.1/307.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.11.1.1


In [18]:
%%writefile SecondTask.py
import unittest
import torch
from torch.utils.cpp_extension import load

class LabTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.ext = load(
            name='my_extension',
            sources=['secondTask.cu'],
            extra_cuda_cflags=['-O3'],
            extra_cflags=['-O3'],
        )
        cls.rtol = 1e-5
        cls.atol = 1e-5

    def test_divide_by_scalar(self):
        n = torch.randint(low=1, high=2048, size=(1, 1))
        scalar = torch.randint(low=1, high=100, size=(1, 1)).item()

        x = torch.randint(low=1, high=100, size=(n.item(),)).to("cuda").float()
        z = LabTest.ext.my_divide_by_scalar(x, scalar)
        z_ = x // scalar

        torch.testing.assert_allclose(z, z_, rtol=self.rtol, atol=self.atol)

if __name__ == '__main__':
    unittest.main()

Overwriting SecondTask.py


In [19]:
%run SecondTask.py

  torch.testing.assert_allclose(z, z_, rtol=self.rtol, atol=self.atol)
.
----------------------------------------------------------------------
Ran 1 test in 0.041s

OK
