In [1]:
import time
import torch
import numpy as np
from torchvision import models

In [2]:
mode_list = "default reduce-overhead max-autotune".split()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 实验一：sin 函数


In [3]:
def sin_func(x):
    return torch.sin(x) + torch.cos(x)


run_times = 100000
i_data = torch.tensor(1).to(device)
for mode in mode_list:
    torch.cuda.synchronize()
    time_0 = time.time()
    module_compiled = torch.compile(sin_func, mode=mode)
    torch.cuda.synchronize()
    time_1 = time.time()
    # warmup
    sin_func(i_data)
    module_compiled(i_data)
    torch.cuda.synchronize()
    time_2 = time.time()
    for i in range(run_times):
        sin_func(i_data)
    torch.cuda.synchronize()
    time_3 = time.time()
    for i in range(run_times):
        module_compiled(i_data)
    torch.cuda.synchronize()
    time_4 = time.time()
    compile_time = time_1 - time_0
    pre_time = time_3 - time_2
    post_time = time_4 - time_3
    speedup_ratio = (pre_time - post_time)/pre_time
    print(f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [None]:
# 该代码的功能是 测试 PyTorch torch.compile 在不同模式下的编译时间和运行加速效果，并计算 编译前后的执行时间及速度提升比例。

# 确保定义了 device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 确保定义了 mode_list
mode_list = ["default", "reduce-overhead", "max-autotune"]

# 关闭错误抛出，防止 torch.compile 失败导致程序中断
torch._dynamo.config.suppress_errors = True

# 计算函数


def sin_func(x):
    return torch.sin(x) + torch.cos(x)


run_times = 100000
i_data = torch.tensor(1, dtype=torch.float32, device=device)

for mode in mode_list:
    torch.cuda.synchronize()
    time_0 = time.time()
    try:
        module_compiled = torch.compile(sin_func, mode=mode)
    except Exception as e:
        print(f"模式 {mode} 编译失败: {e}")
        continue

    torch.cuda.synchronize()
    time_1 = time.time()

    # 预热
    sin_func(i_data)
    module_compiled(i_data)
    torch.cuda.synchronize()
    time_2 = time.time()

    # 编译前运行
    for i in range(run_times):
        sin_func(i_data)
    torch.cuda.synchronize()
    time_3 = time.time()

    # 编译后运行
    for i in range(run_times):
        module_compiled(i_data)
    torch.cuda.synchronize()
    time_4 = time.time()

    compile_time = time_1 - time_0
    pre_time = time_3 - time_2
    post_time = time_4 - time_3
    speedup_ratio = (pre_time - post_time) / pre_time

    print(f"mode: {mode}, 编译耗时: {compile_time:.2f}s, 编译前运行耗时: {pre_time:.2f}s, 编译后运行耗时: {post_time:.2f}s, 速度提升比例: {speedup_ratio:.2%}")

W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] WON'T CONVERT sin_func C:\Users\Hankuke.D\AppData\Local\Temp\ipykernel_10696\3860647870.py line 15 
W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] due to: 
W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] Traceback (most recent call last):
W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packages\torch\_dynamo\output_graph.py", line 1446, in _call_user_compiler
W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]     compiled_fn = compiler_fn(gm, self.example_inputs())
W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
W0311 20:18:52.497000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packages\torch\_dynamo\rep

mode: default, 编译耗时: 0.00s, 编译前运行耗时: 2.53s, 编译后运行耗时: 3.27s, 速度提升比例: -29.40%
mode: reduce-overhead, 编译耗时: 0.00s, 编译前运行耗时: 2.34s, 编译后运行耗时: 3.06s, 速度提升比例: -30.48%
mode: max-autotune, 编译耗时: 0.00s, 编译前运行耗时: 2.33s, 编译后运行耗时: 3.16s, 速度提升比例: -35.97%


# 实验二：resnet18


In [5]:
resnet18 = models.resnet18().to(device)
resnet18.eval()
fake_img = torch.randn(16, 3, 224, 224).to(device)

In [None]:
# 测试 torch.compile 对 ResNet18 模型的加速效果。
run_times = 100
with torch.no_grad():
    for mode in mode_list:
        torch.cuda.synchronize()
        time_0 = time.time()
        module_compiled = torch.compile(resnet18, mode=mode)
        torch.cuda.synchronize()
        time_1 = time.time()

        # warmup 非常关键！
        resnet18(fake_img)
        module_compiled(fake_img)

        #
        torch.cuda.synchronize()
        time_2 = time.time()
        for i in range(run_times):
            resnet18(fake_img)

        torch.cuda.synchronize()
        time_3 = time.time()
        for i in range(run_times):
            module_compiled(fake_img)

        torch.cuda.synchronize()
        time_4 = time.time()

        compile_time = time_1 - time_0
        pre_time = time_3 - time_2
        post_time = time_4 - time_3
        speedup_ratio = (pre_time - post_time)/pre_time

        print(
            f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] WON'T CONVERT forward d:\miniforge3\envs\v3.11.6\Lib\site-packages\torchvision\models\resnet.py line 284 
W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] due to: 
W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] Traceback (most recent call last):
W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packages\torch\_dynamo\output_graph.py", line 1446, in _call_user_compiler
W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]     compiled_fn = compiler_fn(gm, self.example_inputs())
W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
W0311 20:22:00.969000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packages\torch\_dyna

mode: default, 编译耗时:0.00，编译前运行耗时:1.05, 编译后运行耗时:0.79，速度提升比例:25.47%
mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:0.79, 编译后运行耗时:0.79，速度提升比例:-0.09%
mode: max-autotune, 编译耗时:0.00，编译前运行耗时:0.79, 编译后运行耗时:0.79，速度提升比例:-0.09%


# 实验三：BERT


In [9]:
from transformers import BertModel, BertTokenizer
import time
cache_dir = "./huggingface_cache"
bert = BertModel.from_pretrained('bert-base-uncased', cache_dir=cache_dir)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', cache_dir=cache_dir)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
#  测试 torch.compile 对 BERT 模型推理的加速效果
# 准备一批输入数据
input_text = "Here is some text to encode"
inputs = tokenizer(input_text, return_tensors='pt',
                   padding=True, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}
bert.to(device)
bert.eval()

run_times = 100
with torch.no_grad():
    for mode in mode_list:

        # 编译
        torch.cuda.synchronize()
        time_0 = time.time()
        bert_compiled = torch.compile(bert, mode=mode)
        torch.cuda.synchronize()
        time_1 = time.time()

        # warmup 非常关键！
        bert(**inputs)
        bert_compiled(**inputs)

        torch.cuda.synchronize()
        time_2 = time.time()
        for _ in range(run_times):
            _ = bert(**inputs)

        torch.cuda.synchronize()
        time_3 = time.time()
        for _ in range(run_times):
            _ = bert_compiled(**inputs)

        torch.cuda.synchronize()
        time_4 = time.time()

        compile_time = time_1 - time_0
        pre_time = time_3 - time_2
        post_time = time_4 - time_3
        speedup_ratio = (pre_time - post_time)/pre_time

        print(
            f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] WON'T CONVERT forward d:\miniforge3\envs\v3.11.6\Lib\site-packages\transformers\models\bert\modeling_bert.py line 1001 
W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] due to: 
W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] Traceback (most recent call last):
W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packages\torch\_dynamo\output_graph.py", line 1446, in _call_user_compiler
W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]     compiled_fn = compiler_fn(gm, self.example_inputs())
W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
W0311 20:32:30.580000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packag

mode: default, 编译耗时:0.00，编译前运行耗时:0.76, 编译后运行耗时:0.62，速度提升比例:18.03%


  super().capture_end()


mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:0.52, 编译后运行耗时:0.86，速度提升比例:-66.98%


  super().capture_end()


mode: max-autotune, 编译耗时:0.00，编译前运行耗时:0.47, 编译后运行耗时:0.84，速度提升比例:-77.71%


# 实验四 numpy


In [None]:
# 这段代码的功能是 测试 torch.compile 对 PyTorch 计算函数的加速效果
run_times = 100


def numpy_fn2(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    return np.sum(X[:, :, None] * Y[:, None, :], axis=(-2, -1))


def numpy_fn(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    # Step 1: Normalize the input arrays to have zero mean and unit variance
    X_mean, X_std = X.mean(axis=0), X.std(axis=0)
    Y_mean, Y_std = Y.mean(axis=0), Y.std(axis=0)
    # Avoid division by zero in case of zero standard deviation
    X_std[X_std == 0] = 1
    Y_std[Y_std == 0] = 1
    X_normalized = (X - X_mean) / X_std
    Y_normalized = (Y - Y_mean) / Y_std
    # Step 2: Perform the tensor product followed by sum over last two dimensions
    intermediate_result = np.sum(
        X_normalized[:, :, None] * Y_normalized[:, None, :], axis=(-2, -1))
    # Step 3: Apply thresholding to clip values outside of [-1, 1]
    intermediate_result = np.clip(intermediate_result, -1, 1)
    # Step 4: Apply exponential function for non-linearity
    result = np.exp(intermediate_result)
    # Step 5: Add a small regularization term to avoid overfitting
    regularization_term = 0.001 * \
        np.sum(X_normalized ** 2 + Y_normalized ** 2, axis=1)
    result += regularization_term
    return result


x = np.random.randn(1024, 640)
y = np.random.randn(1024, 640)
for mode in mode_list:
    torch.cuda.synchronize()
    time_0 = time.time()
    numpy_fn_compiled = torch.compile(numpy_fn, mode=mode)
    torch.cuda.synchronize()
    time_1 = time.time()
    # warmup 非常关键！
    numpy_fn(x, y)
    numpy_fn_compiled(x, y)
    #
    torch.cuda.synchronize()
    time_2 = time.time()
    for i in range(run_times):
        numpy_fn(x, y)
    torch.cuda.synchronize()
    time_3 = time.time()
    for i in range(run_times):
        numpy_fn_compiled(x, y)
    torch.cuda.synchronize()
    time_4 = time.time()
    compile_time = time_1 - time_0
    pre_time = time_3 - time_2
    post_time = time_4 - time_3
    speedup_ratio = (pre_time - post_time)/pre_time
    print(f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] WON'T CONVERT numpy_fn C:\Users\Hankuke.D\AppData\Local\Temp\ipykernel_10696\778857968.py line 4 
W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] due to: 
W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125] Traceback (most recent call last):
W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\site-packages\torch\_inductor\cpp_builder.py", line 130, in check_compiler_exist_windows
W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]     subprocess.check_output([compiler, "/help"], stderr=subprocess.STDOUT)
W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]   File "d:\miniforge3\envs\v3.11.6\Lib\subprocess.py", line 466, in check_output
W0311 20:33:15.855000 10696 site-packages\torch\_dynamo\convert_frame.py:1125]     return run(*popen

mode: default, 编译耗时:0.00，编译前运行耗时:124.91, 编译后运行耗时:135.04，速度提升比例:-8.11%
