In [88]:
%reload_ext cython_openmp

111111111111111111111


In [92]:
%%cython_openmp -n parallel_ok_3d_loop
import numpy as np

def parallel_ok_3d_loop(n=50):
    # Cell input for the %%cython_openmp magic (target name: parallel_ok_3d_loop).
    # Accumulates (i + j + k) * 1e-6 over every (i, j, k) with each index in
    # range(n) and returns the float total.
    result = 0.0
    for i in range(n):
        for j in range(n):
            for k in range(n):
                result += (i + j + k) * 0.000001
    return result

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_ok_3d_loop
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_6hir4uu2
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_ok_3d_loop(int n, int num_threads=0):
        cdef Py_ssize_t i, j, k, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
        wi

In [93]:
import numpy as np

def baseline_3d_loop(n=50):
    """Pure-Python reference: sum (x + y + z) * 1e-6 over the n*n*n index cube.

    The terms are added in outer-to-inner loop order and the float total
    is returned.
    """
    span = range(n)
    total = 0.0
    for x in span:
        for y in span:
            for z in span:
                total = total + (x + y + z) * 0.000001
    return total

In [94]:
# Correctness check: run the pure-Python baseline and the function built by
# the %%cython_openmp cell on the same n.
n = 100
ref = baseline_3d_loop(n)
opt = parallel_ok_3d_loop(n)

# Only print whether the two results agree (absolute tolerance 1e-10)
print("true" if abs(ref - opt) < 1e-10 else "false")

true


In [95]:
import timeit

# Benchmark: mean wall time per call, measured over `repeats` calls each.
n = 200
repeats = 5

t_base = timeit.timeit(lambda: baseline_3d_loop(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_ok_3d_loop(n,num_threads=4), number=repeats)  / repeats
speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.584438s
Optimized 平均时间: 0.002994s
加速比: 195.23×


2222222222222222222

In [96]:
%%cython_openmp -n parallel_write_array
import numpy as np
def parallel_write_array(n=100):
    # Cell input for the %%cython_openmp magic (target name: parallel_write_array).
    # Builds an n x n array of zeros and sets entry [i][j] to i + j.
    arr = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            arr[i][j] = i + j
    return arr

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_write_array
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_b2izribe
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_write_array(int n, int num_threads=0):
        cdef Py_ssize_t i, j
        cdef np.ndarray[np.double_t, ndim=2] arr = np.empty((n, n), dtype=np.float64)
        cdef double[:, ::1] arr_mv = arr
        if num_threads > 0:
            omp_set_num_threads(num_threads)
        with nogil, parallel():
            for i in prange(

In [97]:
import numpy as np

def baseline_write_array(n=100):
    """Pure-Python reference: n x n float array whose entry [r][c] is r + c."""
    shape = (n, n)
    grid = np.zeros(shape)
    for r in range(n):
        for c in range(n):
            grid[r][c] = r + c
    return grid

In [98]:
# Correctness check: build the array with both versions and show them.
n = 100
ref = baseline_write_array(n)
opt = parallel_write_array(n)
print(ref)
print(opt)

# Only print whether the two results agree
print("true" if np.allclose(ref, opt) else "false")

[[  0.   1.   2. ...  97.  98.  99.]
 [  1.   2.   3. ...  98.  99. 100.]
 [  2.   3.   4. ...  99. 100. 101.]
 ...
 [ 97.  98.  99. ... 194. 195. 196.]
 [ 98.  99. 100. ... 195. 196. 197.]
 [ 99. 100. 101. ... 196. 197. 198.]]
[[  0.   1.   2. ...  97.  98.  99.]
 [  1.   2.   3. ...  98.  99. 100.]
 [  2.   3.   4. ...  99. 100. 101.]
 ...
 [ 97.  98.  99. ... 194. 195. 196.]
 [ 98.  99. 100. ... 195. 196. 197.]
 [ 99. 100. 101. ... 196. 197. 198.]]
true


In [99]:
import timeit


# Benchmark: mean wall time per call, measured over `repeats` calls each.
n = 500
repeats = 5

t_base = timeit.timeit(lambda: baseline_write_array(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_write_array(n,num_threads=4), number=repeats)  / repeats
speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.043725s
Optimized 平均时间: 0.000288s
加速比: 151.99×


3333333333333333333333333

In [100]:
%%cython_openmp -n compute_squares1
def compute_squares1(a):
    # Cell input for the %%cython_openmp magic (target name: compute_squares1).
    # Returns a list with x * x for every element x of a.
    return [x * x for x in a]


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: compute_squares1
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_xrylata1
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def compute_squares1(np.ndarray[np.double_t, ndim=1] a, int num_threads=0):
        cdef Py_ssize_t i, n
        if a.dtype != np.float64 or not a.flags.c_contiguous:
            raise ValueError("a must be float64 and C-contiguous")

        n = a.shape[0]
        cdef np.ndarray[np.double_t, ndim=1] out = np.empty(n, dtype=np.float64)
      

In [101]:
def compute_squares1_py(data):
    """Pure-Python reference: return a new list holding v * v for each v in data."""
    squares = []
    for value in data:
        squares.append(value * value)
    return squares

In [102]:
# Small correctness check: the baseline takes the list, the compiled
# function takes the same values as a float64 NumPy array.
data = [1, 2, 3, 4, 5]
import numpy as np
data_np = np.array(data, dtype=np.float64)

# Call both versions and compare the results
res_py = compute_squares1_py(data)
res_cy = compute_squares1(data_np)

print("Python result:", res_py)
print("Cython result:", res_cy)
# NOTE(review): res_py is a list and res_cy prints as an ndarray, so `==`
# compares elementwise and this shows a boolean array, not a single bool.
print("Equal?", res_py == res_cy)

Python result: [1, 4, 9, 16, 25]
Cython result: [ 1.  4.  9. 16. 25.]
Equal? [ True  True  True  True  True]


In [103]:
import numpy as np, time

# Input size: 10 million elements (~80 MB as a double array)
N = 10000000
data_list = list(range(N))                # plain Python list
data_np   = np.arange(N, dtype=np.float64)  # NumPy array

# baseline (Python list comprehension), timed with a single call
t0 = time.perf_counter()
compute_squares1_py(data_list)
t1 = time.perf_counter()
t_baseline=t1- t0
print(f"Python baseline: {t1 - t0:.3f} s")

# parallel Cython+OpenMP (8 threads), timed with a single call
t0 = time.perf_counter()
compute_squares1(data_np, num_threads=8)
t1 = time.perf_counter()
t_cython_omp=t1-t0
print(f"Cython+OpenMP:  {t1 - t0:.3f} s")
# "加速比" = speedup ratio
print(f"加速比: {t_baseline / t_cython_omp:.2f}×")




Python baseline: 0.571 s
Cython+OpenMP:  0.011 s
加速比: 53.50×


444444444444444444

In [104]:
%%cython_openmp -n parallel_ok_3d_loop
def parallel_ok_3d_loop(n=100):
    # Cell input for the %%cython_openmp magic (target name: parallel_ok_3d_loop).
    # For every (i, j, k) with each index in range(n), forms
    # temp = (i + j + k) * (i - j + k) and adds its square root to the
    # total only when temp is strictly positive.
    result = 0.0
    for i in range(n):
        for j in range(n):
            for k in range(n):
                temp = (i + j + k) * (i - j + k)
                if temp > 0:
                    result += temp ** 0.5
    return result

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_ok_3d_loop
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_4w4cxr5x
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_ok_3d_loop(int n, int num_threads=0):
        cdef Py_ssize_t i, j, k, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
        wi

In [105]:
def baseline_ok_3d_loop(n=100):
    """Pure-Python reference for the conditional square-root reduction.

    For each (x, y, z) in range(n)^3 the product (x + y + z) * (x - y + z)
    is formed; strictly positive products contribute their square root to
    the returned float total.
    """
    span = range(n)
    total = 0.0
    for x in span:
        for y in span:
            for z in span:
                prod = (x + y + z) * (x - y + z)
                if prod <= 0:
                    continue
                total = total + prod ** 0.5
    return total

In [106]:
# Correctness check: print both totals, then whether they agree to within
# an absolute tolerance of 1e-5.
n = 100
ref = baseline_ok_3d_loop(n)
opt = parallel_ok_3d_loop(n)
print(ref)
print(opt)

print("true" if abs(ref - opt) < 1e-5 else "false")

78126227.34051621
78126227.34050924
true


In [261]:
import timeit

# Benchmark: mean wall time per call, measured over `repeats` calls each.
n = 300
repeats = 5

t_base = timeit.timeit(lambda: baseline_ok_3d_loop(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_ok_3d_loop(n, num_threads=8), number=repeats)  / repeats
speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 5.494869s
Optimized 平均时间: 0.012857s
加速比: 427.37×


55555555555555555555555

In [107]:
%%cython_openmp -n parallel_write_array
import numpy as np

def parallel_write_array(n=100):
    # Cell input for the %%cython_openmp magic (target name: parallel_write_array).
    # Builds an n x n array of zeros and sets entry [i][j] to i + j.
    arr = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            arr[i][j] = i + j
    return arr

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_write_array
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_pus70u5d
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_write_array(int n, int num_threads=0):
        cdef Py_ssize_t i, j
        cdef np.ndarray[np.double_t, ndim=2] arr = np.empty((n, n), dtype=np.float64)
        cdef double[:, ::1] arr_mv = arr
        if num_threads > 0:
            omp_set_num_threads(num_threads)
        with nogil, parallel():
            for i in prange(

In [108]:
import numpy as np

def baseline_write_array(n=100):
    """Pure-Python reference: fill an n x n float array so that [row][col] == row + col."""
    side = range(n)
    table = np.zeros((n, n))
    for row in side:
        for col in side:
            table[row][col] = row + col
    return table

In [109]:
import numpy as np


# Correctness check using exact element-wise equality (np.array_equal).
n = 100
ref = baseline_write_array(n)
opt = parallel_write_array(n)

# Only print whether the two results agree
print("true" if np.array_equal(ref, opt) else "false")

true


In [110]:
import timeit


# Benchmark: mean wall time per call, measured over `repeats` calls each.
n = 300
repeats = 5

t_base = timeit.timeit(lambda: baseline_write_array(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_write_array(n,num_threads=8), number=repeats) / repeats
speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.015456s
Optimized 平均时间: 0.000028s
加速比: 555.97×


66666666666666666666

In [111]:
%%cython_openmp -n vector_norm
def vector_norm(n=1000):
    # Cell input for the %%cython_openmp magic (target name: vector_norm).
    # Sums (i * 0.001) ** 2 for i in range(n) and returns the square root
    # of that sum.
    total = 0.0
    for i in range(n):
        total += ((i * 0.001) ** 2)
    return total ** 0.5

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: vector_norm
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_j768urhg
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def vector_norm(int n, int num_threads=0):
        cdef Py_ssize_t i, t, tid
        cdef double total = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
        with nogil, parallel():
 

In [112]:
def baseline_vector_norm(n=1000):
    """Pure-Python reference: square root of the sum of (idx * 0.001) ** 2 for idx in range(n)."""
    acc = 0.0
    for idx in range(n):
        scaled = idx * 0.001
        acc = acc + scaled ** 2
    return acc ** 0.5

In [113]:

# Correctness check: print both values, then "T"/"F" for agreement within tol.
n = 1000
ref = baseline_vector_norm(n)
opt = vector_norm(n)
print(ref)
print(opt)

tol = 1e-12  # absolute tolerance
print("T" if abs(ref - opt) < tol else "F")

18.24372494859534
18.24372494859534
T


In [120]:
import timeit

# Benchmark: mean wall time per call, measured over `repeats` calls each.
n = 3000
repeats = 5

t_base = timeit.timeit(lambda: baseline_vector_norm(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: vector_norm(n,num_threads=8), number=repeats)  / repeats
speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.000493s
Optimized 平均时间: 0.000027s
加速比: 18.56×


777777777777777777777

In [121]:
%%cython_openmp -n matmul_nested
import numpy as np

def matmul_nested(A, B):
    # Cell input for the %%cython_openmp magic (target name: matmul_nested).
    # Triple-loop matrix product; every loop bound is N = A.shape[0], so as
    # written this source only covers the square N x N case.
    N = A.shape[0]
    C = np.zeros((N, N), np.float64)
    for i in range(N):
        for j in range(N):
            for k in range(N):
                C[i, j] += A[i, k] * B[k, j]
    return C

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: matmul_nested
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_ibzt7x76
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def matmul_nested(np.ndarray[np.double_t, ndim=2] A,
                    np.ndarray[np.double_t, ndim=2] B,
                    int num_threads=0):
        cdef Py_ssize_t M = A.shape[0]
        cdef Py_ssize_t K = A.shape[1]
        cdef Py_ssize_t N = B.shape[1]

        if A.dtype != np.float64 or B.dtype != np.float64:
            raise Value

In [122]:
import numpy as np

def matmul_py(A, B):
    """Pure-Python reference matrix product C = A @ B using a triple loop.

    The original version used A.shape[0] for all three loop bounds, which is
    only correct for square inputs. The bounds now come from the actual
    operand shapes, so rectangular products work too; square inputs produce
    exactly the same result as before.

    Parameters
    ----------
    A : 2-D array, indexed as A[i, k], shape (M, K).
    B : 2-D array, indexed as B[k, j], shape (K, N).

    Returns
    -------
    numpy.ndarray
        Array of shape (M, N) created with np.zeros (default float dtype).

    Raises
    ------
    ValueError
        If the inner dimensions A.shape[1] and B.shape[0] differ.
    """
    M, K = A.shape[0], A.shape[1]
    N = B.shape[1]
    if B.shape[0] != K:
        raise ValueError(
            f"inner dimensions do not match: A is {A.shape}, B is {B.shape}"
        )
    C = np.zeros((M, N))
    for i in range(M):
        for j in range(N):
            # Accumulate in k order directly into C, as the original did,
            # so floating-point rounding is unchanged for square inputs.
            for k in range(K):
                C[i, j] += A[i, k] * B[k, j]
    return C

In [123]:
import numpy as np

# Generate the test matrices (random and unseeded, so values differ per run)
N = 100
A = np.random.rand(N, N)
B = np.random.rand(N, N)

# Compute with the Python version
C_py = matmul_py(A, B)

# Compute with the Cython version
C_cy = matmul_nested(A, B)


# Check whether the results are equal (within np.allclose tolerances)
print("Are they close? ", np.allclose(C_py, C_cy, atol=1e-12))



Are they close?  True


In [124]:
# Benchmark on fresh random 50 x 50 operands; mean wall time per call.
N = 50
A = np.random.rand(N, N)
B = np.random.rand(N, N)
repeats = 5

# Time the baseline
t_base = timeit.timeit(lambda: matmul_py(A, B), number=repeats) / repeats
# Time the optimized version
t_opt  = timeit.timeit(lambda: matmul_nested(A, B,num_threads=8), number=repeats)  / repeats
speedup=t_base/t_opt
# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.046919s
Optimized 平均时间: 0.000075s
加速比: 627.09×


7777777777777777777777777777

In [125]:
%%cython_openmp -n matmul_mkn
import numpy as np

def matmul_mkn(A, B):
    # Cell input for the %%cython_openmp magic (target name: matmul_mkn).
    # Rectangular triple-loop matrix product: A is read as (M, K), B as
    # (K, N), and the (M, N) result is accumulated in place in C.
    M = A.shape[0]
    K = A.shape[1]
    N = B.shape[1]
    C = np.zeros((M, N), np.float64)
    for i in range(M):
        for j in range(N):
            for k in range(K):
                C[i, j] += A[i, k] * B[k, j]
    return C


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: matmul_mkn
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_h_nk4r7q
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def matmul_mkn(np.ndarray[np.double_t, ndim=2] A,
                    np.ndarray[np.double_t, ndim=2] B,
                    int num_threads=0):
        cdef Py_ssize_t M = A.shape[0]
        cdef Py_ssize_t K = A.shape[1]
        cdef Py_ssize_t N = B.shape[1]

        if A.dtype != np.float64 or B.dtype != np.float64:
            raise ValueError(

In [126]:
import numpy as np
# Correctness check on rectangular operands. The reference here is
# NumPy's `@` operator rather than a pure-Python loop.
M, K, N = 64, 128, 32
A = np.ascontiguousarray(np.random.rand(M, K), dtype=np.float64)
B = np.ascontiguousarray(np.random.rand(K, N), dtype=np.float64)

C_py = A @ B
C_cy = matmul_mkn(A, B)
print("allclose? ", np.allclose(C_py, C_cy, atol=1e-12))


allclose?  True


In [127]:
import numpy as np
import timeit

def matmul_mkn_py(A, B):
    """Pure-Python reference for a rectangular matrix product.

    A is indexed as A[r, t] and B as B[t, c]; the result has shape
    (A.shape[0], B.shape[1]) and dtype float64, accumulated in place in
    row -> column -> inner order.
    """
    n_rows = A.shape[0]
    n_inner = A.shape[1]
    n_cols = B.shape[1]
    out = np.zeros((n_rows, n_cols), np.float64)
    for r in range(n_rows):
        for c in range(n_cols):
            for t in range(n_inner):
                out[r, c] = out[r, c] + A[r, t] * B[t, c]
    return out


In [128]:
# Parameter setup
M, K, N = 200, 300, 150   # can be increased
A = np.ascontiguousarray(np.random.rand(M, K), dtype=np.float64)
B = np.ascontiguousarray(np.random.rand(K, N), dtype=np.float64)

repeats = 3

# Time the Python version
t_base = timeit.timeit(lambda: matmul_mkn_py(A, B), number=repeats) / repeats

# Time the Cython+OpenMP version (remember to pass num_threads)
t_opt  = timeit.timeit(lambda: matmul_mkn(A, B, num_threads=8), number=repeats) / repeats

speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"M={M}, K={K}, N={N}")
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")


M=200, K=300, N=150
Baseline 平均时间: 3.493973s
Optimized 平均时间: 0.001027s
加速比: 3402.23×


888888888888888888

In [130]:
%%cython_openmp -n test_4d_loop_py
def test_4d_loop_py(n):
    # Cell input for the %%cython_openmp magic (target name: test_4d_loop_py).
    # Adds a + b + c + d to a float total for every index 4-tuple with each
    # index in range(n).
    result = 0.0
    for a in range(n):
        for b in range(n):
            for c in range(n):
                for d in range(n):
                    result += a + b + c + d
    return result

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: test_4d_loop_py
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_t7_6fk26
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def test_4d_loop_py(int n, int num_threads=0):
        cdef Py_ssize_t a, b, c, d, t, tid
        cdef double s = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
        with nogil, 

In [131]:
def test_4d_loop_py_baseline(n):
    result = 0.0
    for a in range(n):
        for b in range(n):
            for c in range(n):
                for d in range(n):
                    result += a + b + c + d
    return result

In [132]:
# Correctness check: baseline vs. the function built by the magic cell.
n = 50
ref = test_4d_loop_py_baseline(n)
opt = test_4d_loop_py(n)

# Only print whether the two results agree
tol = 1e-12  # absolute tolerance
print("t" if abs(ref - opt) < tol else "f")

t


In [133]:
import timeit

# Benchmark: mean wall time per call, measured over `repeats` calls each.
n = 50
repeats = 3

t_base = timeit.timeit(lambda: test_4d_loop_py_baseline(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: test_4d_loop_py(n, num_threads=8), number=repeats)  / repeats
speedup = t_base / t_opt

# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.403622s
Optimized 平均时间: 0.002525s
加速比: 159.84×


99999999999

In [134]:
%%cython_openmp -n matmul_mkn
import numpy as np
def matmul_mkn(A, B):
    # Cell input for the %%cython_openmp magic (target name: matmul_mkn).
    # Rectangular matrix product with the column loop outermost and a local
    # accumulator that is stored into C once per (i, j) entry.
    M = A.shape[0]; K = A.shape[1]; N = B.shape[1]
    C = np.zeros((M, N), np.float64)
    for j in range(N):     # <- loop nesting order deliberately swapped
        for i in range(M):
            acc = 0.0
            for k in range(K):
                acc += A[i, k] * B[k, j]
            C[i, j] = acc
    return C


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: matmul_mkn
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_simj_yuo
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def matmul_mkn(np.ndarray[np.double_t, ndim=2] A,
                    np.ndarray[np.double_t, ndim=2] B,
                    int num_threads=0):
        cdef Py_ssize_t M = A.shape[0]
        cdef Py_ssize_t K = A.shape[1]
        cdef Py_ssize_t N = B.shape[1]

        if A.dtype != np.float64 or B.dtype != np.float64:
            raise ValueError(

In [135]:
import numpy as np, timeit

def matmul_mkn_py(A, B):
    """Pure-Python reference matrix product with the column loop outermost.

    Each output entry is built up in a local accumulator over the inner
    dimension and then written to the float64 result of shape
    (A.shape[0], B.shape[1]).
    """
    n_rows, n_inner = A.shape
    n_cols = B.shape[1]
    out = np.zeros((n_rows, n_cols), np.float64)
    for col in range(n_cols):
        for row in range(n_rows):
            dot = 0.0
            for t in range(n_inner):
                dot = dot + A[row, t] * B[t, col]
            out[row, col] = dot
    return out

# Random rectangular operands (unseeded) for the swapped-loop-order variant.
M, K, N = 120, 80, 100
A = np.random.rand(M, K)
B = np.random.rand(K, N)

# Correctness check
C0 = matmul_mkn_py(A, B)
C1 = matmul_mkn(A, B, 8)   # thread count passed as a positional argument
print("equal:", np.allclose(C0, C1, rtol=1e-10, atol=1e-12))

# Benchmark: mean wall time per call, measured over `repeats` calls each.
repeats = 3
t_base = timeit.timeit(lambda: matmul_mkn_py(A, B), number=repeats)/repeats
t_opt  = timeit.timeit(lambda: matmul_mkn(A, B, 8),   number=repeats)/repeats
# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


equal: True
Baseline 平均时间: 0.269418s
Optimized 平均时间: 0.000238s
加速比: 1134.23×


100000

In [136]:
%%cython_openmp -n elem2d
import numpy as np
def elem2d(A, B, D):
    # Cell input for the %%cython_openmp magic (target name: elem2d).
    # Element-wise map over A: logistic sigmoid of A[i, j], plus the fixed
    # column B[i, 3], minus the log of the fixed row D[2, j].
    M, N = A.shape
    C = np.empty((M, N), np.float64)
    for i in range(M):
        for j in range(N):
            C[i,j] = 1.0/(1.0+np.exp(-A[i,j])) + B[i,3] - np.log(D[2,j])
    return C


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: elem2d
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_2ioxfi25
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def elem2d(np.ndarray[np.double_t, ndim=2] A, np.ndarray[np.double_t, ndim=2] B, np.ndarray[np.double_t, ndim=2] D, int num_threads=0):
        cdef Py_ssize_t i, j
        if A.dtype != np.float64 or not A.flags.c_contiguous:
            raise ValueError("A must be float64 and C-contiguous")
        if B.dtype != np.float64 or not B.flags.c_contiguous:

In [137]:
import numpy as np, timeit, math

# --- baseline: pure-Python double for loop ---
def elem2d_py(A, B, D):
    """Pure-Python reference for the element-wise 2-D map.

    out[r, c] = sigmoid(A[r, c]) + B[r, 3] - log(D[2, c]), evaluated with
    math.exp / math.log, for every entry of A. The result is a float64
    array with A's shape.
    """
    n_rows, n_cols = A.shape
    out = np.empty((n_rows, n_cols), np.float64)
    for r in range(n_rows):
        for c in range(n_cols):
            sigmoid = 1.0 / (1.0 + math.exp(-A[r, c]))
            out[r, c] = sigmoid + B[r, 3] - math.log(D[2, c])
    return out

# --- test data ---
M, N = 500, 600
rng = np.random.default_rng(0)
A = rng.normal(size=(M, N))
B = rng.normal(size=(M, N))
# Safer: log-normal samples guarantee values > 0 (D is passed through log)
D = rng.lognormal(mean=0.6, sigma=0.3, size=(M, N))

# --- correctness check ---
C0 = elem2d_py(A, B, D)
C1 = elem2d(A, B, D, 8)   # ✅ pass only the arrays + the thread count
print("equal:", np.allclose(C0, C1, rtol=1e-10, atol=1e-12))

# --- performance test ---
repeats = 3
t_base = timeit.timeit(lambda: elem2d_py(A, B, D), number=repeats)/repeats
t_opt  = timeit.timeit(lambda: elem2d(A, B, D, 8),    number=repeats)/repeats
# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


equal: True
Baseline 平均时间: 0.239999s
Optimized 平均时间: 0.000823s
加速比: 291.78×


111111111

In [138]:
%%cython_openmp -n vector_norm_customret
def vector_norm_customret(n=1000):
    # Cell input for the %%cython_openmp magic (target name: vector_norm_customret).
    # Sums (i * 0.001) squared for i in range(n).
    total = 0.0
    for i in range(n):
        total += (i * 0.001) * (i * 0.001)
    # The return is not a plain sqrt(total) but a more "arbitrary" expression
    return 1.0 / (1.0 + (total ** 0.5))


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: vector_norm_customret
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_guasqx3i
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def vector_norm_customret(int n, int num_threads=0):
        cdef Py_ssize_t i, t, tid
        cdef double total = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
        with 

In [139]:
import math, timeit

def baseline_vector_norm_customret(n=1000):
    """Pure-Python reference: 1 / (1 + sqrt(sum of (idx * 0.001)^2 for idx in range(n)))."""
    acc = 0.0
    for idx in range(n):
        scaled = idx * 0.001
        acc = acc + scaled * scaled
    norm = math.sqrt(acc)
    return 1.0 / (1.0 + norm)

# --- correctness ---
n = 3000
ref = baseline_vector_norm_customret(n)
opt = vector_norm_customret(n, num_threads=8)
print("ref:", ref)
print("opt:", opt)
print("equal:", abs(ref - opt) < 1e-12)

# --- performance ---
repeats = 5
t_base = timeit.timeit(lambda: baseline_vector_norm_customret(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: vector_norm_customret(n, num_threads=8), number=repeats) / repeats
# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


ref: 0.010433554636315135
opt: 0.010433554636315139
equal: True
Baseline 平均时间: 0.000353s
Optimized 平均时间: 0.000065s
加速比: 5.39×


In [140]:
%%cython_openmp -n vector_norm_total
def vector_norm_total(n=1000, scale=1.5):
    # Cell input for the %%cython_openmp magic (target name: vector_norm_total).
    # Sums (i * 0.001) squared for i in range(n).
    total = 0.0
    for i in range(n):
        total += (i * 0.001) * (i * 0.001)
    # The return is not just sqrt(total); it also uses the extra parameter scale
    return (total ** 0.5) * scale


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: vector_norm_total
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_j2vteicj
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def vector_norm_total(int n, double scale, int num_threads=0):
        cdef Py_ssize_t i, t, tid
        cdef double total = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
       

In [141]:
import math

def baseline_vector_norm_total(n=1000, scale=1.5):
    """Pure-Python reference: sqrt(sum of (idx * 0.001)^2 for idx in range(n)) times scale."""
    acc = 0.0
    for idx in range(n):
        scaled = idx * 0.001
        acc = acc + scaled * scaled
    norm = math.sqrt(acc)
    return norm * scale


In [142]:
import timeit

# Inputs shared by the correctness check and the benchmark below.
n = 5000
scale = 2.0

# Correctness check: print both values and whether they agree within 1e-12.
ref = baseline_vector_norm_total(n, scale)
opt = vector_norm_total(n, scale, num_threads=8)
print("ref:", ref)
print("opt:", opt)
print("equal:", abs(ref - opt) < 1e-12)

# Benchmark: mean wall time per call, measured over `repeats` calls each.
repeats = 5
t_base = timeit.timeit(lambda: baseline_vector_norm_total(n, scale), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: vector_norm_total(n, scale, num_threads=8), number=repeats) / repeats
# Printed labels: "平均时间" = mean time, "加速比" = speedup ratio
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


ref: 408.18705270990694
opt: 408.18705270990665
equal: True
Baseline 平均时间: 0.000723s
Optimized 平均时间: 0.000082s
加速比: 8.84×


In [144]:
%%cython_openmp -n reduce_with_alpha
import numpy as np

def reduce_with_alpha(n=1000):
    # Cell input for the %%cython_openmp magic (target name: reduce_with_alpha).
    # Sums i * 0.001 for i in range(n).
    # `alpha` is neither a parameter nor a local of this function; the
    # generated signature shown in the cell output adds `double alpha`.
    s = 0.0
    for i in range(n):
        s += i * 0.001
    # The return expression introduces an extra scalar, alpha
    return np.sqrt(s / (n + alpha))


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: reduce_with_alpha
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_rxxcecdu
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def reduce_with_alpha(int n, double alpha, int num_threads=0):
        cdef Py_ssize_t i, t, tid
        cdef double s = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals
        wit

In [145]:
import math

# baseline reference implementation
def baseline(n, alpha):
    """Pure-Python reference: sqrt(acc / (n + alpha)) where acc sums idx * 0.001 over range(n)."""
    acc = 0.0
    for idx in range(n):
        acc = acc + idx * 0.001
    ratio = acc / (n + alpha)
    return math.sqrt(ratio)

# Test
n = 2000
alpha = 3.5

ref = baseline(n, alpha)
opt = reduce_with_alpha(n, alpha, num_threads=4)  # note: alpha was added to the signature automatically
print("baseline:", ref)
print("cython_omp:", opt)
# "差值" = difference
print("差值:", abs(ref - opt))


baseline: 0.9988763339980643
cython_omp: 0.9988763339980643
差值: 0.0
