<a href="https://colab.research.google.com/github/NShravanReddy/DeepLearning/blob/main/triton/tanh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import triton
import triton.language as tl
import torch
import time
import torch.nn as nn

@triton.jit
def t_t_k(x_ptr,
          y_ptr,
          N0,
          BLOCK_SIZE: tl.constexpr):
    # Elementwise tanh: reads N0 values starting at x_ptr and writes
    # tanh of each value to the same offset from y_ptr.
    #
    # x_ptr / y_ptr : base pointers of the input and output buffers,
    #                 addressed linearly (element i lives at ptr + i).
    # N0            : number of elements to process.
    # BLOCK_SIZE    : compile-time number of elements per program instance.

    # Each program instance covers one run of BLOCK_SIZE consecutive offsets.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # The last block may extend past N0; masked lanes are neither used nor stored.
    mask = offsets < N0
    x = tl.load(x_ptr + offsets, mask=mask)

    # Numerically stable form. The direct formula (e^{2x} - 1) / (e^{2x} + 1)
    # overflows to inf / inf = NaN for large positive x. Working with
    # e^{-2|x|} keeps the exponential in (0, 1], and tanh being odd lets the
    # sign be restored afterwards.
    decay = tl.exp(-2.0 * tl.abs(x))
    magnitude = (1.0 - decay) / (1.0 + decay)
    y = tl.where(x < 0, -magnitude, magnitude)

    tl.store(y_ptr + offsets, y, mask=mask)

def t_t_k_h(x:torch.Tensor,BLOCK_SIZE=1024)->torch.Tensor:
    """Apply tanh elementwise to ``x`` using the ``t_t_k`` Triton kernel.

    Args:
        x: Input tensor. Any shape is accepted; it is processed as a flat
            run of ``x.numel()`` elements.
        BLOCK_SIZE: Number of elements handled by each kernel program
            instance.

    Returns:
        A new contiguous tensor with the same shape and dtype as ``x``
        holding ``tanh(x)``.
    """
    # The kernel addresses both buffers linearly, so a strided view
    # (e.g. x[::2]) must be materialised contiguously or the wrong
    # storage elements would be read. This is a no-op when x is already
    # contiguous.
    x = x.contiguous()
    y = torch.empty_like(x, memory_format=torch.contiguous_format)
    N0 = x.numel()
    if N0 == 0:
        # Nothing to compute; skip the launch entirely.
        return y
    # One program instance per BLOCK_SIZE chunk, rounding up for the tail.
    grid = lambda meta: (triton.cdiv(N0, meta['BLOCK_SIZE']),)
    t_t_k[grid](x, y, N0, BLOCK_SIZE=BLOCK_SIZE)
    return y

def benchmark(func, *args, n_warmup=10, n_iters=100):
    """Return the mean wall-clock time of ``func(*args)`` in milliseconds.

    ``func`` is first called ``n_warmup`` times untimed, then ``n_iters``
    times inside the timed region. ``torch.cuda.synchronize()`` is called
    after each phase so the timer is read only once queued GPU work has
    finished.
    """
    def _run_and_sync(repeats):
        # Issue the calls back to back, then wait for the device.
        for _ in range(repeats):
            func(*args)
        torch.cuda.synchronize()

    _run_and_sync(n_warmup)

    t0 = time.perf_counter()
    _run_and_sync(n_iters)
    t1 = time.perf_counter()

    # Seconds for the whole timed loop -> milliseconds per call.
    return (t1 - t0) / n_iters * 1000


if __name__=='__main__':
    # Compare the Triton kernel against PyTorch's tanh on 2**20 random
    # float32 values on the GPU.
    N = 1024 * 1024
    x = torch.randn(N, device='cuda', dtype=torch.float32)

    # Triton result first, then the PyTorch reference.
    y_triton = t_t_k_h(x)
    tanh = nn.Tanh()
    y_torch = tanh(x)

    # Reference, Triton output, and their elementwise absolute difference,
    # one per line.
    print(y_torch, y_triton, abs(y_torch - y_triton), sep='\n')

tensor([ 0.8139,  0.6383, -0.7585,  ..., -0.7062, -0.6467, -0.9629],
       device='cuda:0')
tensor([ 0.8139,  0.6383, -0.7585,  ..., -0.7062, -0.6467, -0.9629],
       device='cuda:0')
tensor([0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
        5.9605e-08], device='cuda:0')
