# GEMM on GPU

## 1. Set-up 

In [1]:
# Mount Google Drive at /content/drive so the cells below can read the
# credential files and the repository clone stored under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Credentials live in plain-text files on Drive rather than in this notebook,
# so they cannot be pushed to the repository by accident.
# Treat the token like your GitHub password: never share it with anyone!
def _first_line(path):
    """Return the first line of the file at `path`, stripped of surrounding whitespace."""
    with open(path) as fh:
        return fh.readline().strip()


# GitHub access token
token = _first_line('/content/drive/MyDrive/ece5545/token.txt')
# GitHub username, kept in its own file
handle = _first_line('/content/drive/MyDrive/ece5545/git_username.txt')

In [29]:
# Clone your github repo
YOUR_TOKEN = token
YOUR_HANDLE = handle
BRANCH = "main"

%mkdir /content/drive/MyDrive/ece5545
%cd /content/drive/MyDrive/ece5545
!git clone https://{YOUR_TOKEN}@github.com/ML-HW-SYS/a3-{YOUR_HANDLE}.git
%cd /content/drive/MyDrive/ece5545/a3-{YOUR_HANDLE}
!git checkout {BRANCH}
!git pull
%cd /content/drive/MyDrive/ece5545

PROJECT_ROOT = f"/content/drive/MyDrive/ece5545/a3-{YOUR_HANDLE}"

mkdir: cannot create directory ‘/content/drive/MyDrive/ece5545’: File exists
/content/drive/MyDrive/ece5545
fatal: destination path 'a3-NamanMakkar' already exists and is not an empty directory.
/content/drive/MyDrive/ece5545/a3-NamanMakkar
Already on 'main'
Your branch is up to date with 'origin/main'.
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 4 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (4/4), 847 bytes | 1024 bytes/s, done.
From https://github.com/ML-HW-SYS/a3-NamanMakkar
   6e3ad80..f71d951  main       -> origin/main
Updating 6e3ad80..f71d951
Fast-forward
 src/ops.py | 3 [32m+++[m
 1 file changed, 3 insertions(+)
/content/drive/MyDrive/ece5545


In [4]:
# This extension reloads all imports before running each cell, so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# List the contents of the cloned repository as a quick check that
# PROJECT_ROOT points at the right directory.
!ls {PROJECT_ROOT}

1-conv1d_cpu.ipynb   4-gemm_gpu.ipynb	    README.md
2-conv1d_gpu.ipynb   5-conv2d_dw_gpu.ipynb  src
3-conv1d_fpga.ipynb  leaderboard_id.txt     tests


## 2. Install TVM

In [6]:
!pip install tlcpack-nightly-cu102 -f https://tlcpack.ai/wheels

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://tlcpack.ai/wheels
Collecting tlcpack-nightly-cu102
  Downloading https://github.com/tlc-pack/tlcpack/releases/download/v0.12.dev/tlcpack_nightly_cu102-0.13.dev42%2Bga6f6f1100-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (408.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m408.0/408.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tlcpack-nightly-cu102
Successfully installed tlcpack-nightly-cu102-0.13.dev42+ga6f6f1100


## 3. Implement `make_conv1d_gpu_scheduler_func` function in `src.ops`

In that function, you are required to implement 1D convolution and use TVM to optimize it.
Let $x \in \mathbb{R}^m$ and $y \in \mathbb{R}^n$, then 
$$
\operatorname{conv1d}(x, y)_i = \sum_{j=-\infty}^{\infty} x[j]y[i-j], \forall i \in \{0, 1, \dots, m + n - 2\}
$$

Please use zero padding and unit stride. Please see the numpy convolution function for more detail: [link](https://numpy.org/doc/stable/reference/generated/numpy.convolve.html).

The `make_conv1d_gpu_scheduler_func` takes $m$ and $n$, which are the sizes of the two 1D input arrays.
You should return both the TVM scheduler and the TVM operators for
1. Input $x$
2. Input $y$
3. Output $out$

The scheduler should be usable to build a function with signature $func(x, y, out)$.
Please see the following cells for usage.

In [31]:
import tvm
import numpy as np
import sys
# Adding assignment 3 to the system path
# Make sure this matches your git directory
# (guarded so that re-running the cell does not stack duplicate entries)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from src.ops import make_gemm_gpu_scheduler
import os
from tvm import te

# Problem size: A is (M, K), W is (K, N) and the product B is (M, N).
M = 1024
N = 512
K = 2048
dtype = 'float32'
a_np = np.random.rand(M, K).astype(dtype)
w_np = np.random.rand(K, N).astype(dtype)
# NumPy reference result used to validate the TVM kernel.
b_np = np.matmul(a_np, w_np)

# Build the CUDA kernel from the schedule returned by the student implementation.
s, A, W, B = make_gemm_gpu_scheduler(M, K, N)
func = tvm.build(s, [A, W, B], "cuda")

# Copy the inputs to GPU 0 and allocate a zero-filled output buffer there.
dev = tvm.cuda(0)
a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
b = tvm.nd.array(np.zeros((M, N), dtype), dev)
func(a, w, b)
evaluator = func.time_evaluator(func.entry_name, dev, number=1, repeat=1)


print("Answer:", b_np)
print("Output:", b)
# Fail loudly if the kernel disagrees with NumPy instead of relying on a
# visual comparison of the two printouts. The tolerance is loose because the
# GPU accumulates K float32 products in a different order than NumPy.
np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
# .mean is in seconds; report milliseconds.
print("GEMM TVM: %f ms" % (evaluator(a, w, b).mean * 1e3))

Answer: [[504.69336 512.46545 493.93903 ... 510.84012 509.8555  514.19006]
 [523.47076 525.4437  504.1851  ... 521.49854 517.2589  519.2721 ]
 [507.28436 523.9327  492.32886 ... 510.99527 515.9947  517.83777]
 ...
 [513.4128  519.89795 494.37585 ... 522.6633  513.0862  516.14343]
 [524.838   534.55347 507.97308 ... 532.16895 532.1095  533.21375]
 [514.8642  542.1322  505.2583  ... 524.63324 525.8705  529.6918 ]]
Output: [[504.69363 512.46594 493.93878 ... 510.8406  509.85538 514.19037]
 [523.4706  525.4435  504.18497 ... 521.4987  517.2591  519.2724 ]
 [507.28412 523.93286 492.3288  ... 510.9955  515.9947  517.8378 ]
 ...
 [513.4127  519.898   494.3758  ... 522.6635  513.0858  516.143  ]
 [524.83826 534.55383 507.97324 ... 532.16895 532.109   533.21387]
 [514.8643  542.1324  505.25867 ... 524.6327  525.8708  529.6917 ]]
1DConv TVM: 6.358848 ms


In [32]:
# Print the lowered form of the schedule to inspect what TVM generates for it
# (thread bindings, local buffers, loop structure).
print(tvm.lower(s, [A, W, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 2048), "float32"), B: T.Buffer((2048, 512), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        blockIdx_y = T.launch_thread("blockIdx.y", 32)
        A_local = T.allocate([32], "float32", "local")
        B_local = T.allocate([32], "float32", "local")
        blockIdx_x = T.launch_thread("blockIdx.x", 16)
        threadIdx_x = T.env_thread("threadIdx.x")
        threadIdx_y = T.env_thread("threadIdx.y")
        C_1 = T.Buffer((524288,), data=C.data)
        with T.launch_thread(threadIdx_x, 32):
            T.launch_thread(threadIdx_y, 32)
            C_1[blockIdx_y * 16384 + threadIdx_y * 512 + blockIdx_x * 32 + threadIdx_x] = T.float32(0)
        for k_outer in range(64):
            A_local_1 = T.Buffer((32,), data=A_lo

In [33]:
# Run the GEMM GPU tests from the repository root (the test path below is
# relative to it).
%cd {PROJECT_ROOT}
!python -m pytest tests/test_gemm_gpu.py

/content/drive/MyDrive/ece5545/a3-NamanMakkar
platform linux -- Python 3.9.16, pytest-7.2.2, pluggy-1.0.0
rootdir: /content/drive/MyDrive/ece5545/a3-NamanMakkar
plugins: anyio-3.6.2
collected 20 items                                                             [0m

tests/test_gemm_gpu.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                              [100%][0m

