In [1]:
import torch
import ml_utils.profile as profile

class qlora_forward(profile.BenchmarkCandidate):
    """Benchmark candidate for the full forward pass ``X @ W + (X @ U) @ V``.

    All operands are random bfloat16 tensors created on the CUDA device.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate the four operands.

        Shapes: X (batch_size, hidden_size), W (hidden_size, hidden_size),
        U (hidden_size, rank), V (rank, hidden_size).
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.bfloat16}
        # Allocation order (X, W, U, V) is kept so the RNG stream is consumed
        # in the same sequence as before.
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Run three matmuls and one elementwise add; return the sum."""
        dense_out = torch.matmul(self.X, self.W)
        low_rank_out = torch.matmul(torch.matmul(self.X, self.U), self.V)
        return dense_out + low_rank_out

class qlora_low_rank_part(profile.BenchmarkCandidate):
    """Benchmark candidate for the low-rank branch only: ``(X @ U) @ V``.

    All operands are random bfloat16 tensors created on the CUDA device.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate X, W, U and V.

        W is allocated here as well, although ``benchmark_content`` does not
        read it.
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.bfloat16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Project X down through U, then back up through V."""
        projected = torch.matmul(self.X, self.U)
        return torch.matmul(projected, self.V)

class qlora_dense_part(profile.BenchmarkCandidate):
    """Benchmark candidate for the dense branch only: ``X @ W``.

    All operands are random bfloat16 tensors created on the CUDA device.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate X, W, U and V.

        U and V are allocated here as well, although ``benchmark_content``
        does not read them.
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.bfloat16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Return the single dense matmul of X with W."""
        return torch.matmul(self.X, self.W)

class qlora_addition_part(profile.BenchmarkCandidate):
    """Benchmark candidate for the final elementwise add only.

    Both addends (``X @ W`` and ``(X @ U) @ V``) are computed once in
    ``__init__`` so that ``benchmark_content`` performs nothing but the sum.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the sizes, allocate the operands and precompute the addends."""
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.bfloat16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

        # Matmul results are cached on the instance, outside the workload.
        self.XW = torch.matmul(self.X, self.W)
        self.XU = torch.matmul(self.X, self.U)
        self.XUV = torch.matmul(self.XU, self.V)

    def benchmark_content(self):
        """Add the two precomputed matmul results."""
        total = self.XW + self.XUV
        return total





In [4]:
# Batch-size sweep: 1024 .. 14336 in steps of 1024, hidden_size=768, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 1024)),
    hidden_size=[768],
    rank=[16],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass alongside each of its three components.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
    ),
    bench_input_list,
)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted



name,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,0.01024,0.027648,0.057344,0.024576
2048,0.017408,0.034816,0.062464,0.019456
3072,0.024576,0.036864,0.078048,0.023552
4096,0.03072,0.059392,0.114688,0.028672
5120,0.036864,0.054272,0.121856,0.033792
6144,0.044032,0.069632,0.14848,0.038912
7168,0.050176,0.07168,0.160768,0.041984
8192,0.05632,0.08608,0.187392,0.047104
9216,0.063488,0.103232,0.21504,0.052224
10240,0.070656,0.104416,0.227328,0.05632


For small models (hidden_size = 768), the forward pass splits into 3 parts (dense matmul, low-rank matmuls, addition), and each takes around 1/3 of the total runtime.

In [5]:
# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=2048, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[2048],
    rank=[16],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass alongside each of its three components.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
    ),
    bench_input_list,
)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted


name,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,0.021504,0.082944,0.12288,0.023552
3072,0.05632,0.20992,0.311296,0.0512
5120,0.09216,0.334848,0.50464,0.072704
7168,0.126976,0.49664,0.71264,0.106496
9216,0.161792,0.608256,0.868352,0.131072
11264,0.197632,0.764928,1.091584,0.152576
13312,0.231424,0.863744,1.246208,0.182272


For mid-sized models (hidden_size = 2048), the dense matmul takes most of the time.

In [None]:
# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=4096, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[4096],
    rank=[16],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass alongside each of its three components.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
    ),
    bench_input_list,
)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

name,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,0.038912,0.296864,0.367616,0.036864
3072,0.109568,0.899072,1.061888,0.082944
5120,0.1792,1.490944,1.73568,0.125952
7168,0.248832,2.153984,2.470912,0.173056
9216,0.318464,2.742272,3.168256,0.221184
11264,0.38912,3.350528,3.878912,0.269312
13312,0.458752,3.405824,4.115456,0.311296


For large models (hidden_size = 4096), the dense matmul takes most of the time.

Next, increase the LoRA rank from 16 to 128.

In [8]:

# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=768, rank=128.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[768],
    rank=[128],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass alongside each of its three components.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
    ),
    bench_input_list,
)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

name,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,0.01024,0.027648,0.052224,0.018432
3072,0.023552,0.036864,0.08192,0.0256
5120,0.036864,0.054272,0.126976,0.038912
7168,0.050176,0.070656,0.169984,0.0512
9216,0.063488,0.1024,0.228352,0.065536
11264,0.0768,0.119808,0.284672,0.091136
13312,0.090112,0.139264,0.320512,0.095232


In [9]:

# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=4096, rank=128.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[4096],
    rank=[128],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass alongside each of its three components.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
    ),
    bench_input_list,
)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

name,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,0.039936,0.300032,0.384,0.0512
3072,0.109568,0.91136,1.103872,0.10752
5120,0.1792,1.481728,1.81504,0.155648
7168,0.249776,2.183168,2.526208,0.202752
9216,0.318528,2.756608,3.318816,0.246784
11264,0.38912,3.346432,4.086784,0.307712
13312,0.458752,3.452416,4.233216,0.336896


In [11]:

# Hidden-size sweep: 1024 .. 9216 in steps of 2048, batch_size=4096, rank=128.
params_grid = dict(
    batch_size=[4096],
    hidden_size=list(range(1024, 10 * 1024, 2048)),
    rank=[128],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass alongside each of its three components.
# NOTE: the result keeps the name `batch_size_result` even though this cell
# sweeps hidden_size; the pivot below is indexed accordingly.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
    ),
    bench_input_list,
)

# One row per hidden size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="hidden_size", columns="name", values="ms")
pivoted

name,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
hidden_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,0.039936,0.084992,0.167936,0.04096
3072,0.109568,0.687104,0.891904,0.103856
5120,0.1792,1.70496,2.014208,0.152576
7168,0.249728,3.686912,4.058112,0.224256
9216,0.319488,5.302272,5.871616,0.304128


In [2]:

from merged_forward_v1 import merged_qlora_forward as v1
from merged_forward_v2 import merged_qlora_forward as v2
from merged_forward_v3 import merged_qlora_forward as v3

from triton_dense_v1 import triton_dense_forward as dense_v1



class merged_qlora_forward_v1(profile.BenchmarkCandidate):
    """Benchmark candidate that calls the imported ``v1`` merged forward.

    All operands are random bfloat16 tensors created on the CUDA device.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate X, W, U and V.

        Shapes: X (batch_size, hidden_size), W (hidden_size, hidden_size),
        U (hidden_size, rank), V (rank, hidden_size).
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.bfloat16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Invoke ``v1`` on (X, W, U, V) and return whatever it returns."""
        operands = (self.X, self.W, self.U, self.V)
        return v1(*operands)

class merged_qlora_forward_v2(profile.BenchmarkCandidate):
    """Benchmark candidate that calls the imported ``v2`` merged forward.

    All operands are random bfloat16 tensors created on the CUDA device.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate X, W, U and V.

        Shapes: X (batch_size, hidden_size), W (hidden_size, hidden_size),
        U (hidden_size, rank), V (rank, hidden_size).
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.bfloat16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Invoke ``v2`` on (X, W, U, V) and return whatever it returns."""
        operands = (self.X, self.W, self.U, self.V)
        return v2(*operands)

class merged_qlora_forward_v3(profile.BenchmarkCandidate):
    """Benchmark candidate that calls the imported ``v3`` merged forward.

    All operands are random float16 tensors created on the CUDA device.
    NOTE(review): ``qlora_forward`` in this notebook uses bfloat16 instead;
    confirm the dtype difference is intentional before comparing timings.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate X, W, U and V.

        Shapes: X (batch_size, hidden_size), W (hidden_size, hidden_size),
        U (hidden_size, rank), V (rank, hidden_size).
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.float16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Invoke ``v3`` on (X, W, U, V) and return whatever it returns."""
        operands = (self.X, self.W, self.U, self.V)
        return v3(*operands)

class triton_dense_v1(profile.BenchmarkCandidate):
    """Benchmark candidate that calls the imported ``dense_v1`` on (X, W).

    All operands are random float16 tensors created on the CUDA device.
    NOTE(review): ``qlora_dense_part`` in this notebook uses bfloat16 instead;
    confirm the dtype difference is intentional before comparing timings.
    """

    def __init__(self, batch_size, hidden_size, rank):
        """Record the problem sizes and allocate X, W, U and V.

        U and V are allocated here as well, although ``benchmark_content``
        does not read them.
        """
        self.batch_size, self.hidden_size, self.rank = batch_size, hidden_size, rank
        tensor_opts = {"device": "cuda", "dtype": torch.float16}
        self.X = torch.randn(batch_size, hidden_size, **tensor_opts)
        self.W = torch.randn(hidden_size, hidden_size, **tensor_opts)
        self.U = torch.randn(hidden_size, rank, **tensor_opts)
        self.V = torch.randn(rank, hidden_size, **tensor_opts)

    def benchmark_content(self):
        """Invoke ``dense_v1`` on X and W and return whatever it returns."""
        dense_operands = (self.X, self.W)
        return dense_v1(*dense_operands)
    

# Every benchmark candidate class in this notebook, keyed by its report name.
candidates = dict(
    qlora_forward=qlora_forward,
    qlora_low_rank_part=qlora_low_rank_part,
    qlora_dense_part=qlora_dense_part,
    qlora_addition_part=qlora_addition_part,
    merged_qlora_forward_v1=merged_qlora_forward_v1,
    merged_qlora_forward_v2=merged_qlora_forward_v2,
    merged_qlora_forward_v3=merged_qlora_forward_v3,
    triton_dense_v1=triton_dense_v1,
)

# qlora_forward (reported as "baseline") plus the wrappers around the
# imported merged_forward_v* / triton_dense_v1 functions.
only_my_impl = dict(
    baseline=qlora_forward,
    merged_qlora_forward_v1=merged_qlora_forward_v1,
    merged_qlora_forward_v2=merged_qlora_forward_v2,
    merged_qlora_forward_v3=merged_qlora_forward_v3,
    triton_dense_v1=triton_dense_v1,
)

In [3]:

# Batch-size sweep: 1024 .. 23552 in steps of 2048, hidden_size=768, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 25 * 1024, 2048)),
    hidden_size=[768],
    rank=[16],
)

# Run every registered candidate over the grid.
bench_input_list = profile.params_grid_to_list(params_grid)
batch_size_result = profile.benchmark(candidates, bench_input_list)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

Autotuning results for function merged_qlora_forward_kernel (1024, 768, 768, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16'):
config: block_M: 128, block_N: 256, block_K: 64, R: 16, GROUP_SIZE_M: 8, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None, time: [inf, inf, inf]
config: block_M: 64, block_N: 256, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.030719999223947525, 0.030719999223947525, 0.030719999223947525]
config: block_M: 128, block_N: 128, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.030719999223947525, 0.02969600073993206, 0.030719999223947525]
config: block_M: 128, block_N: 64, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.03276799991726875, 0.03174399957060814, 0.03276799991726875]
config: block_M: 64, block_N: 128, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_w

name,merged_qlora_forward_v1,merged_qlora_forward_v2,merged_qlora_forward_v3,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part,triton_dense_v1
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1024,0.03072,0.0308,0.034816,0.01024,0.028672,0.057344,0.026624,0.019456
3072,0.058368,0.060416,0.075776,0.024576,0.036864,0.077824,0.023552,0.034848
5120,0.086016,0.089088,0.119808,0.037888,0.054272,0.121856,0.033792,0.052224
7168,0.114688,0.11776,0.15872,0.050176,0.07168,0.160768,0.043008,0.069632
9216,0.157696,0.16896,0.21504,0.063488,0.103424,0.21504,0.052224,0.095232
11264,0.18944,0.197632,0.254976,0.0768,0.120832,0.254976,0.06144,0.110592
13312,0.22016,0.2304,0.300032,0.090112,0.139264,0.295936,0.070656,0.126976
15360,0.248832,0.258048,0.340992,0.1024,0.159744,0.335872,0.078848,0.14336
17408,0.278528,0.287744,0.386048,0.115712,0.181264,0.384,0.08704,0.160768
19456,0.314368,0.323584,0.428032,0.129024,0.201728,0.423936,0.095232,0.181248


In [18]:

# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=2048, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[2048],
    rank=[16],
)

# Run only the `only_my_impl` subset over the grid.
bench_input_list = profile.params_grid_to_list(params_grid)
batch_size_result = profile.benchmark(only_my_impl, bench_input_list)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

Autotuning results for function merged_qlora_forward_kernel (1024, 2048, 2048, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16'):
config: block_M: 128, block_N: 256, block_K: 64, R: 16, GROUP_SIZE_M: 8, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None, time: [inf, inf, inf]
config: block_M: 64, block_N: 256, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.16486400365829468, 0.16486400365829468, 0.1658879965543747]
config: block_M: 128, block_N: 128, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.14028799533843994, 0.14028799533843994, 0.14131200313568115]
config: block_M: 128, block_N: 64, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.1525759994983673, 0.1515520066022873, 0.1525759994983673]
config: block_M: 64, block_N: 128, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4

name,baseline,merged_qlora_forward_v1,merged_qlora_forward_v2
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1024,0.124928,0.147456,0.141312
3072,0.314368,0.350208,0.355328
5120,0.501248,0.567264,0.584704
7168,0.712704,0.78336,0.816128
9216,0.8704,1.005568,1.052672
11264,1.095616,1.235968,1.276928
13312,1.253376,1.45824,1.517568


In [16]:

# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=1536, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[1536],
    rank=[16],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Requires `only_my_impl` to be defined already (the cell that builds the
# candidate registries must have been executed in this kernel first).
batch_size_result = profile.benchmark(only_my_impl, bench_input_list)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

NameError: name 'only_my_impl' is not defined

In [7]:

# Batch-size sweep: 1024 .. 13312 in steps of 2048, hidden_size=2048, rank=16.
params_grid = dict(
    batch_size=list(range(1024, 15 * 1024, 2048)),
    hidden_size=[2048],
    rank=[16],
)
bench_input_list = profile.params_grid_to_list(params_grid)

# Full forward pass, its three components, and the v1 merged forward wrapper.
batch_size_result = profile.benchmark(
    dict(
        qlora_forward=qlora_forward,
        qlora_low_rank_part=qlora_low_rank_part,
        qlora_dense_part=qlora_dense_part,
        qlora_addition_part=qlora_addition_part,
        merged_qlora_forward_v1=merged_qlora_forward_v1,
    ),
    bench_input_list,
)

# One row per batch size, one column per candidate, filled from the "ms" column.
pivoted = batch_size_result.pivot(index="batch_size", columns="name", values="ms")
pivoted

Autotuning results for function merged_qlora_forward_kernel (1024, 2048, 2048, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16'):
config: block_M: 128, block_N: 256, block_K: 64, R: 16, GROUP_SIZE_M: 8, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None, time: [inf, inf, inf]
config: block_M: 64, block_N: 256, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.13926400244235992, 0.13926400244235992, 0.14028799533843994]
config: block_M: 128, block_N: 128, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.1372160017490387, 0.1372160017490387, 0.1372160017490387]
config: block_M: 128, block_N: 64, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None, time: [0.14745600521564484, 0.14643199741840363, 0.1476864069700241]
config: block_M: 64, block_N: 128, block_K: 32, R: 16, GROUP_SIZE_M: 8, num_warps: 4

name,merged_qlora_forward_v1,qlora_addition_part,qlora_dense_part,qlora_forward,qlora_low_rank_part
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1024,0.13824,0.021504,0.0768,0.12288,0.023552
3072,0.346112,0.057344,0.213024,0.31744,0.050176
5120,0.572416,0.09216,0.354304,0.508928,0.073728
7168,0.796672,0.126976,0.504928,0.717824,0.10752
9216,1.021952,0.161792,0.612352,0.878592,0.130048
11264,1.251328,0.196608,0.782336,1.101824,0.159744
13312,1.471968,0.231536,0.897024,1.265152,0.182272
