In [3]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers datasets evaluate accelerate matplotlib scikit-learn


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [14]:
%%writefile lmul_cuda_kernel.cu
#include <torch/extension.h>
#include <cuda_runtime.h>
#include <vector>

// L-mul offset function
__device__ __forceinline__ int l_offset(int m) {
    if (m <= 3) return m;
    if (m == 4) return 3;
    return 4;  // m > 4
}

// Standard matmul kernel
__global__ void standard_matmul_kernel(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

// Launcher function for PyTorch
void standard_matmul(torch::Tensor A, torch::Tensor B, torch::Tensor C) {
    int M = A.size(0);
    int K = A.size(1);
    int N = B.size(1);

    dim3 blockSize(16, 16);
    dim3 gridSize((N + blockSize.x - 1)/blockSize.x, (M + blockSize.y - 1)/blockSize.y);

    standard_matmul_kernel<<<gridSize, blockSize>>>(
        A.data_ptr<float>(),
        B.data_ptr<float>(),
        C.data_ptr<float>(),
        M, N, K
    );
}

// Add your L-Mul kernels in a similar way
// For example: lmul_addition_only, lmul_optimized_vectorized, lmul_integer_only
// Each will have a __global__ kernel and a launcher taking torch::Tensor arguments

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("standard_matmul", &standard_matmul, "Standard matrix multiplication (CUDA)");
    // m.def("lmul_addition_only", &lmul_addition_only_launcher, "L-Mul addition-only");
    // m.def("lmul_optimized", &lmul_optimized_launcher, "L-Mul optimized vectorized");
    // m.def("lmul_integer_only", &lmul_integer_only_launcher, "L-Mul integer-only");
}


Overwriting lmul_cuda_kernel.cu


In [5]:
!nvcc -Xcompiler -fPIC -shared -o lmul_cuda_kernel.so lmul_cuda_kernel.cu -O3 --use_fast_math


              int sign_mult = sign_lut[idx];
                  ^




In [7]:
!pip install ninja


Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.13.0


In [10]:
!rm -rf /root/.cache/torch_extensions/*


In [16]:
# Step 1: Install ninja (needed for PyTorch extensions)
!pip install ninja --quiet

# Step 2: Clean previous builds (optional but recommended)
!rm -rf /root/.cache/torch_extensions/*

# Step 3: Compile & load the CUDA extension
from torch.utils.cpp_extension import load

lmul_cuda = load(
    name="lmul_cuda",
    sources=["/content/lmul_cuda_kernel.cu"],  # full path
    extra_cuda_cflags=["-O3", "--use_fast_math"],
    verbose=True  # prints compilation logs
)


W1007 19:48:35.369000 747 torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
W1007 19:48:35.369000 747 torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.


In [18]:
import torch

# Define matrix sizes
M, N, K = 512, 512, 512  # example sizes

# Create input matrices on GPU
A = torch.randn(M, K, device='cuda')
B = torch.randn(K, N, device='cuda')

# Output matrix
C = torch.zeros(M, N, device='cuda')


In [19]:
lmul_cuda.standard_matmul(A, B, C)


In [20]:
import torch
import time

# Matrix dimensions
M, N, K = 512, 512, 512

# Random input tensors on GPU
A = torch.randn(M, K, device='cuda')
B = torch.randn(K, N, device='cuda')

# Output tensors
C_standard = torch.zeros(M, N, device='cuda')
C_lmul = torch.zeros(M, N, device='cuda')

# Block and grid sizes
block = (16, 16)
grid = ((N + block[0] - 1)//block[0], (M + block[1] - 1)//block[1])

In [22]:
torch.cuda.synchronize()
start = time.time()
C_standard = A @ B
torch.cuda.synchronize()
time_standard = time.time() - start
print(f"PyTorch matmul time: {time_standard*1000:.3f} ms")


PyTorch matmul time: 135.921 ms


In [24]:
import cupy as cp
import torch
import time

# Convert PyTorch tensors to CuPy arrays
A_cp = cp.asarray(A)
B_cp = cp.asarray(B)
C_cp = cp.zeros((M, N), dtype=cp.float32)

kernel_code = r'''
extern "C" __global__
void lmul_addition_only(float* A, float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            float a = A[row*K + k];
            float b = B[k*N + col];
            sum += a + b;  // simplified addition-only
        }
        C[row*N + col] = sum;
    }
}
'''

# Compile
module = cp.RawKernel(kernel_code, 'lmul_addition_only')

# Launch
block = (16, 16)
grid = ((N + block[0]-1)//block[0], (M + block[1]-1)//block[1])
start = time.time()
module(grid, block, (A_cp, B_cp, C_cp, M, N, K))
cp.cuda.Stream.null.synchronize()
print("L-Mul time:", (time.time()-start)*1000, "ms")

# Convert back to PyTorch
C_lmul = torch.as_tensor(C_cp)


L-Mul time: 246.0620403289795 ms


In [25]:
print("Max difference:", (C_standard - C_lmul).abs().max().item())


Max difference: 198.2362060546875


In [26]:
import torch
import time

# Example: standard matmul
M, K, N = 1024, 1024, 1024
A = torch.randn(M, K, device='cuda')
B = torch.randn(K, N, device='cuda')

# Measure PyTorch matmul
torch.cuda.synchronize()
start = time.time()
C_std = torch.matmul(A, B)
torch.cuda.synchronize()
time_std = time.time() - start  # seconds
power_gpu = 70  # watts
energy_std = power_gpu * time_std  # joules
print(f"Standard matmul: {time_std*1000:.2f} ms, Energy ~ {energy_std:.2f} J")

# L-Mul (addition-only) via CuPy
import cupy as cp
A_cp = cp.asarray(A)
B_cp = cp.asarray(B)
C_cp = cp.zeros((M, N), dtype=cp.float32)

kernel_code = r'''
extern "C" __global__
void lmul_addition_only(float* A, float* B, float* C, int M, int N, int K){
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N){
        float sum = 0.0f;
        for (int k=0;k<K;k++){
            sum += A[row*K+k] + B[k*N+col];  // addition-only
        }
        C[row*N+col] = sum;
    }
}
'''

module = cp.RawKernel(kernel_code, 'lmul_addition_only')
block = (16, 16)
grid = ((N + 15)//16, (M + 15)//16)

start = time.time()
module(grid, block, (A_cp, B_cp, C_cp, M, N, K))
cp.cuda.Stream.null.synchronize()
time_lmul = time.time() - start
energy_lmul = power_gpu * time_lmul

print(f"L-Mul kernel: {time_lmul*1000:.2f} ms, Energy ~ {energy_lmul:.2f} J")


Standard matmul: 1.78 ms, Energy ~ 0.12 J
L-Mul kernel: 61.55 ms, Energy ~ 4.31 J
