In [1]:
import torch
import triton
import platform

# Report the software stack this notebook is running on.
print(f"操作系统: {platform.platform()}")
print(f"Python 版本: {platform.python_version()}")
print(f"PyTorch 版本: {torch.__version__}")
print(f"Triton 版本: {triton.__version__}")

if torch.cuda.is_available():
    print("\n✅ CUDA 已就位!")
    print(f"显卡型号: {torch.cuda.get_device_name(0)}")

    # Query the (major, minor) compute capability once and reuse it below.
    capability = torch.cuda.get_device_capability(0)
    print(f"显卡算力: {capability}")
    print(f"当前显存占用: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

    # FP8 tensor-core support is not Blackwell-only: it starts at compute
    # capability 8.9 (Ada) and 9.0 (Hopper). Comparing against (10, 0) would
    # wrongly report False on those GPUs, so the threshold is (8, 9).
    print(f"硬件是否支持 FP8: {capability >= (8, 9)}")
else:
    print(f"❌ 警告: CUDA 未识别，请检查驱动！")

操作系统: Linux-5.15.0-94-generic-x86_64-with-glibc2.35
Python 版本: 3.12.3
PyTorch 版本: 2.8.0+cu128
Triton 版本: 3.4.0

✅ CUDA 已就位!
显卡型号: NVIDIA GeForce RTX 5090
显卡算力: (12, 0)
当前显存占用: 0.00 MB
硬件是否支持 FP8: True


In [None]:
import requests
import json
import sys
import re

def test_openrouter_triton_demo():
    """Stream a Triton add-kernel from an OpenRouter model and print it JSON-escaped.

    Sends a chat-completions request with ``stream=True``, echoes the streamed
    text to stdout as it arrives, extracts the first ```python fenced block
    from the full response (falling back to the whole response when no fence
    is found), and prints the result passed through ``json.dumps`` so it can
    be pasted into a JSON string field.

    The API key is read from the ``OPENROUTER_API_KEY`` environment variable.
    If it is missing, a message is printed and the function returns without
    making any request.

    Returns:
        None. All results and errors are reported via ``print``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    # 1. Configuration. The key must come from the environment; credentials
    #    must never be committed inside a notebook.
    api_key = os.environ.get("OPENROUTER_API_KEY", "").strip()
    if not api_key:
        print("请求失败: 未设置环境变量 OPENROUTER_API_KEY")
        return

    model_name = "deepseek/deepseek-v3.2"  # deepseek/deepseek-r1 is an alternative for stronger reasoning
    url = "https://openrouter.ai/api/v1/chat/completions"

    # Prompt tuned to suppress chatter: a system role is added and the model
    # is told to output only a code block, with no explanation.
    user_content = """
    Write a Triton kernel for element-wise addition as described: 
    - Kernel: `add_kernel(in_ptr0, in_ptr1, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr)`
    - Wrapper: `add_wrapper(x, y)`
    
    IMPORTANT: Output ONLY the code within a single ```python code block. 
    Do not provide any introductory text or explanation. 
    Do not say "Here is the code". Just start with the code.
    """

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "You are a professional Triton GPU programmer. Output only code, no conversational text."},
            {"role": "user", "content": user_content}
        ],
        "stream": True
    }

    print(f"--- 正在调用 {model_name} 生成 Triton 算子 ---\n")

    full_response = ""
    try:
        # (connect, read) timeouts keep a stalled connection from hanging the
        # kernel forever; the context manager releases the streamed connection
        # on every exit path.
        with requests.post(
            url,
            headers=headers,
            data=json.dumps(payload),
            stream=True,
            timeout=(10, 120),
        ) as response:
            if response.status_code != 200:
                print(f"请求失败: {response.text}")
                return

            print("AI 实时生成中:\n" + "-"*30)
            for line in response.iter_lines():
                if not line:
                    continue
                line_str = line.decode('utf-8')
                # Only "data: ..." lines carry payload; other lines are skipped.
                if not line_str.startswith("data: "):
                    continue
                data_content = line_str[6:]
                if data_content.strip() == "[DONE]":
                    break

                # Best effort: skip chunks that are not valid JSON or lack the
                # expected choices/delta shape, without swallowing unrelated
                # errors such as KeyboardInterrupt.
                try:
                    chunk_json = json.loads(data_content)
                    content = chunk_json['choices'][0].get('delta', {}).get('content', '')
                except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError):
                    continue

                if content:
                    full_response += content
                    sys.stdout.write(content)
                    sys.stdout.flush()

        print("\n" + "-"*30 + "\n生成完毕！正在格式化输出...\n")

        # Post-processing: pull out the text between ```python and ```;
        # if no fenced block is found, use the whole response.
        code_match = re.search(r"```python\s+(.*?)\s+```", full_response, re.DOTALL)
        if code_match:
            raw_code = code_match.group(1).strip()
        else:
            raw_code = full_response.strip()

        # json.dumps yields an escaped string literal (with \n, \t, ...) that
        # can be pasted directly into a JSON field.
        json_ready_string = json.dumps(raw_code)

        print("==== 一键粘贴到 JSON 'predict' 字段的内容 ====")
        print(json_ready_string)
        print("============================================")

    except Exception as e:
        print(f"发生错误: {e}")

# Run the demo only when this code is executed directly (e.g. as a notebook
# cell or script), not when it is imported as a module.
if __name__ == "__main__":
    test_openrouter_triton_demo()

--- 正在调用 deepseek/deepseek-v3.2 生成 Triton 算子 ---

AI 实时生成中:
------------------------------
```python
import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(in_ptr0, in_ptr1, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    
    x = tl.load(in_ptr0 + offsets, mask=mask)
    y = tl.load(in_ptr1 + offsets, mask=mask)
    output = x + y
    tl.store(out_ptr + offsets, output, mask=mask)


def add_wrapper(x, y):
    assert x.is_cuda and y.is_cuda
    assert x.dtype == y.dtype
    assert x.shape == y.shape
    
    n_elements = x.numel()
    out = torch.empty_like(x)
    
    if n_elements == 0:
        return out
    
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    
    BLOCK_SIZE = triton.next_power_of_2(min(1024, n_elements))
    
    add_kernel[grid](x, y, out, n_elements, BLOCK_