In [165]:
%reload_ext cython_openmp

111111111111111111111


In [166]:
%%cython_openmp -n parallel_ok_3d_loop
import numpy as np

def parallel_ok_3d_loop(n=50):
    result = 0.0
    for i in range(n):
        for j in range(n):
            for k in range(n):
                result += (i + j + k) * 0.000001
    return result

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_ok_3d_loop
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_cee_c7e6
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_ok_3d_loop(int n, int num_threads=0):
        cdef Py_ssize_t i, j, k, __t, __i, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals


In [167]:
import numpy as np

def baseline_3d_loop(n=50):
    result = 0.0
    for i in range(n):
        for j in range(n):
            for k in range(n):
                result += (i + j + k) * 0.000001
    return result

In [168]:
n = 100
ref = baseline_3d_loop(n)
opt = parallel_ok_3d_loop(n)

# 仅输出是否一致
print("true" if abs(ref - opt) < 1e-10 else "false")

true


In [169]:
import timeit

n = 200
repeats = 5

t_base = timeit.timeit(lambda: baseline_3d_loop(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_ok_3d_loop(n,num_threads=4), number=repeats)  / repeats
speedup = t_base / t_opt

print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.582544s
Optimized 平均时间: 0.005665s
加速比: 102.83×


2222222222222222222

In [170]:
%%cython_openmp -n parallel_write_array
import numpy as np
def parallel_write_array(n=100):
    arr = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            arr[i][j] = i + j
    return arr

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_write_array
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_k4j3a1kz
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_write_array(int n, int num_threads=0):
        cdef Py_ssize_t i, j
        cdef np.ndarray[np.double_t, ndim=2] arr = np.empty((n, n), dtype=np.float64)
        cdef double[:, ::1] arr_mv = arr
        if num_threads > 0:
            omp_set_num_threads(num_threads)
        with nogil, parallel():
            for i in prange(

In [171]:
import numpy as np

def baseline_write_array(n=100):
    arr = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            arr[i][j] = i + j
    return arr

In [172]:
n = 100
ref = baseline_write_array(n)
opt = parallel_write_array(n)
print(ref)
print(opt)

# 仅输出是否一致
print("true" if np.allclose(ref, opt) else "false")

[[  0.   1.   2. ...  97.  98.  99.]
 [  1.   2.   3. ...  98.  99. 100.]
 [  2.   3.   4. ...  99. 100. 101.]
 ...
 [ 97.  98.  99. ... 194. 195. 196.]
 [ 98.  99. 100. ... 195. 196. 197.]
 [ 99. 100. 101. ... 196. 197. 198.]]
[[  0.   1.   2. ...  97.  98.  99.]
 [  1.   2.   3. ...  98.  99. 100.]
 [  2.   3.   4. ...  99. 100. 101.]
 ...
 [ 97.  98.  99. ... 194. 195. 196.]
 [ 98.  99. 100. ... 195. 196. 197.]
 [ 99. 100. 101. ... 196. 197. 198.]]
true


In [173]:
import timeit


n = 500
repeats = 5

t_base = timeit.timeit(lambda: baseline_write_array(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_write_array(n,num_threads=4), number=repeats)  / repeats
speedup = t_base / t_opt

print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.040908s
Optimized 平均时间: 0.000234s
加速比: 174.82×


3333333333333333333333333

In [174]:
%%cython_openmp -n compute_squares1
def compute_squares1(a):
    return [x * x for x in a]


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: compute_squares1
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_n91la8m1
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def compute_squares1(np.ndarray[np.double_t, ndim=1] a, int num_threads=0):
        cdef Py_ssize_t i, n
        if a.dtype != np.float64 or not a.flags.c_contiguous:
            raise ValueError("a must be float64 and C-contiguous")

        n = a.shape[0]
        cdef np.ndarray[np.double_t, ndim=1] out = np.empty(n, dtype=np.float64)
      

In [175]:
def compute_squares1_py(data):
    return [x * x for x in data]

In [176]:
data = [1, 2, 3, 4, 5]
import numpy as np
data_np = np.array(data, dtype=np.float64)

# 调用并对比结果
res_py = compute_squares1_py(data)
res_cy = compute_squares1(data_np)

print("Python result:", res_py)
print("Cython result:", res_cy)
print("Equal?", res_py == res_cy)

Python result: [1, 4, 9, 16, 25]
Cython result: [ 1.  4.  9. 16. 25.]
Equal? [ True  True  True  True  True]


In [177]:
import numpy as np, time

# 输入规模：1千万元素（~80MB double数组）
N = 10000000
data_list = list(range(N))                # 纯 Python list
data_np   = np.arange(N, dtype=np.float64)  # NumPy array

# baseline (Python 列表推导)
t0 = time.perf_counter()
compute_squares1_py(data_list)
t1 = time.perf_counter()
t_baseline=t1- t0
print(f"Python baseline: {t1 - t0:.3f} s")

# 并行 Cython+OpenMP (8 线程)
t0 = time.perf_counter()
compute_squares1(data_np, num_threads=8)
t1 = time.perf_counter()
t_cython_omp=t1-t0
print(f"Cython+OpenMP:  {t1 - t0:.3f} s")
print(f"加速比: {t_baseline / t_cython_omp:.2f}×")




Python baseline: 0.572 s
Cython+OpenMP:  0.009 s
加速比: 61.72×


444444444444444444

In [178]:
%%cython_openmp -n parallel_ok_3d_loop
def parallel_ok_3d_loop(n=100):
    result = 0.0
    for i in range(n):
        for j in range(n):
            for k in range(n):
                temp = (i + j + k) * (i - j + k)
                if temp > 0:
                    result += temp ** 0.5
    return result

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_ok_3d_loop
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_cc1ns6p3
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_ok_3d_loop(int n, int num_threads=0):
        cdef Py_ssize_t i, j, k, __t, __i, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals


In [179]:
def baseline_ok_3d_loop(n=100):
    result = 0.0
    for i in range(n):
        for j in range(n):
            for k in range(n):
                temp = (i + j + k) * (i - j + k)
                if temp > 0:
                    result += temp ** 0.5
    return result

In [180]:
n = 100
ref = baseline_ok_3d_loop(n)
opt = parallel_ok_3d_loop(n)
print(ref)
print(opt)

print("true" if abs(ref - opt) < 1e-5 else "false")

78126227.34051621
78126227.34050924
true


In [31]:
import timeit

n = 300
repeats = 5

t_base = timeit.timeit(lambda: baseline_ok_3d_loop(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_ok_3d_loop(n, num_threads=8), number=repeats)  / repeats
speedup = t_base / t_opt

print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 5.405891s
Optimized 平均时间: 0.012614s
加速比: 428.55×


55555555555555555555555

In [181]:
%%cython_openmp -n parallel_write_array
import numpy as np

def parallel_write_array(n=100):
    arr = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            arr[i][j] = i + j
    return arr

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: parallel_write_array
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_h6desdvl
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def parallel_write_array(int n, int num_threads=0):
        cdef Py_ssize_t i, j
        cdef np.ndarray[np.double_t, ndim=2] arr = np.empty((n, n), dtype=np.float64)
        cdef double[:, ::1] arr_mv = arr
        if num_threads > 0:
            omp_set_num_threads(num_threads)
        with nogil, parallel():
            for i in prange(

In [182]:
import numpy as np

def baseline_write_array(n=100):
    arr = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            arr[i][j] = i + j
    return arr

In [183]:
import numpy as np


n = 100
ref = baseline_write_array(n)
opt = parallel_write_array(n)

# 仅输出是否一致
print("true" if np.array_equal(ref, opt) else "false")

true


In [184]:
import timeit


n = 300
repeats = 5

t_base = timeit.timeit(lambda: baseline_write_array(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: parallel_write_array(n,num_threads=8), number=repeats) / repeats
speedup = t_base / t_opt

print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.013744s
Optimized 平均时间: 0.000026s
加速比: 523.76×


66666666666666666666

In [185]:
%%cython_openmp -n vector_norm
def vector_norm(n=1000):
    total = 0.0
    for i in range(n):
        total += ((i * 0.001) ** 2)
    return total ** 0.5

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: vector_norm
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_bpn45e7w
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def vector_norm(int n, int num_threads=0):
        cdef Py_ssize_t __t, __i, t, tid
        cdef double total = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals

        cdef Py_ssize_t _

In [186]:
def baseline_vector_norm(n=1000):
    total = 0.0
    for i in range(n):
        total += ((i * 0.001) ** 2)
    return total ** 0.5

In [187]:

n = 1000
ref = baseline_vector_norm(n)
opt = vector_norm(n)
print(ref)
print(opt)

tol = 1e-12
print("T" if abs(ref - opt) < tol else "F")

18.24372494859534
18.24372494859534
T


In [189]:
import timeit

n = 3000
repeats = 5

t_base = timeit.timeit(lambda: baseline_vector_norm(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: vector_norm(n,num_threads=8), number=repeats)  / repeats
speedup = t_base / t_opt

print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.000385s
Optimized 平均时间: 0.000048s
加速比: 8.11×


777777777777777777777

In [190]:
%%cython_openmp -n matmul_nested
import numpy as np

def matmul_nested(A, B):
    N = A.shape[0]
    C = np.zeros((N, N), np.float64)
    for i in range(N):
        for j in range(N):
            for k in range(N):
                C[i, j] += A[i, k] * B[k, j]
    return C

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: matmul_nested
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_anynz66_
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def matmul_nested(np.ndarray[np.double_t, ndim=2] A,
                    np.ndarray[np.double_t, ndim=2] B,
                    int num_threads=0):
        cdef Py_ssize_t M = A.shape[0]
        cdef Py_ssize_t K = A.shape[1]
        cdef Py_ssize_t N = B.shape[1]

        if A.dtype != np.float64 or B.dtype != np.float64:
            raise Value

In [191]:
import numpy as np

def matmul_py(A, B):
    N = A.shape[0]
    C = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            for k in range(N):
                C[i, j] += A[i, k] * B[k, j]
    return C

In [192]:
import numpy as np

# 生成测试矩阵
N = 100
A = np.random.rand(N, N)
B = np.random.rand(N, N)

# Python 版本计算
C_py = matmul_py(A, B)

# Cython 版本计算
C_cy = matmul_nested(A, B)


# 判断是否相等
print("Are they close? ", np.allclose(C_py, C_cy, atol=1e-12))



Are they close?  True


In [193]:
N = 50
A = np.random.rand(N, N)
B = np.random.rand(N, N)
repeats = 5

# 基线计时
t_base = timeit.timeit(lambda: matmul_py(A, B), number=repeats) / repeats
# 优化版计时
t_opt  = timeit.timeit(lambda: matmul_nested(A, B,num_threads=8), number=repeats)  / repeats
speedup=t_base/t_opt
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.050135s
Optimized 平均时间: 0.000082s
加速比: 610.81×


7777777777777777777777777777

In [194]:
%%cython_openmp -n matmul_mkn
import numpy as np

def matmul_mkn(A, B):
    M = A.shape[0]
    K = A.shape[1]
    N = B.shape[1]
    C = np.zeros((M, N), np.float64)
    for i in range(M):
        for j in range(N):
            for k in range(K):
                C[i, j] += A[i, k] * B[k, j]
    return C


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: matmul_mkn
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_4nhcb4hh
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def matmul_mkn(np.ndarray[np.double_t, ndim=2] A,
                    np.ndarray[np.double_t, ndim=2] B,
                    int num_threads=0):
        cdef Py_ssize_t M = A.shape[0]
        cdef Py_ssize_t K = A.shape[1]
        cdef Py_ssize_t N = B.shape[1]

        if A.dtype != np.float64 or B.dtype != np.float64:
            raise ValueError(

In [195]:
import numpy as np
M, K, N = 64, 128, 32
A = np.ascontiguousarray(np.random.rand(M, K), dtype=np.float64)
B = np.ascontiguousarray(np.random.rand(K, N), dtype=np.float64)

C_py = A @ B
C_cy = matmul_mkn(A, B)
print("allclose? ", np.allclose(C_py, C_cy, atol=1e-12))


allclose?  True


In [196]:
import numpy as np
import timeit

def matmul_mkn_py(A, B):
    M = A.shape[0]
    K = A.shape[1]
    N = B.shape[1]
    C = np.zeros((M, N), np.float64)
    for i in range(M):
        for j in range(N):
            for k in range(K):
                C[i, j] += A[i, k] * B[k, j]
    return C


In [197]:
# 参数设置
M, K, N = 200, 300, 150   # 可改大一些
A = np.ascontiguousarray(np.random.rand(M, K), dtype=np.float64)
B = np.ascontiguousarray(np.random.rand(K, N), dtype=np.float64)

repeats = 3

# Python 版本计时
t_base = timeit.timeit(lambda: matmul_mkn_py(A, B), number=repeats) / repeats

# Cython+OpenMP 版本计时 (记得传 num_threads)
t_opt  = timeit.timeit(lambda: matmul_mkn(A, B, num_threads=8), number=repeats) / repeats

speedup = t_base / t_opt

print(f"M={M}, K={K}, N={N}")
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")


M=200, K=300, N=150
Baseline 平均时间: 3.463087s
Optimized 平均时间: 0.001184s
加速比: 2925.40×


888888888888888888

In [198]:
%%cython_openmp -n test_4d_loop_py
def test_4d_loop_py(n):
    result = 0.0
    for a in range(n):
        for b in range(n):
            for c in range(n):
                for d in range(n):
                    result += a + b + c + d
    return result

🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: test_4d_loop_py
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_zyg2br89
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def test_4d_loop_py(int n, int num_threads=0):
        cdef Py_ssize_t a, b, c, d, __t, __i, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals

    

In [199]:
def test_4d_loop_py_baseline(n):
    result = 0.0
    for a in range(n):
        for b in range(n):
            for c in range(n):
                for d in range(n):
                    result += a + b + c + d
    return result

In [200]:
n = 50
ref = test_4d_loop_py_baseline(n)
opt = test_4d_loop_py(n)

# 仅输出是否一致
tol = 1e-12
print("t" if abs(ref - opt) < tol else "f")

t


In [201]:
import timeit

n = 50
repeats = 3

t_base = timeit.timeit(lambda: test_4d_loop_py_baseline(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: test_4d_loop_py(n, num_threads=8), number=repeats)  / repeats
speedup = t_base / t_opt

print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")

Baseline 平均时间: 0.411470s
Optimized 平均时间: 0.002548s
加速比: 161.48×


99999999999

In [202]:
%%cython_openmp -n matmul_mkn
import numpy as np
def matmul_mkn(A, B):
    M = A.shape[0]; K = A.shape[1]; N = B.shape[1]
    C = np.zeros((M, N), np.float64)
    for j in range(N):     # ← 故意换循环层序
        for i in range(M):
            acc = 0.0
            for k in range(K):
                acc += A[i, k] * B[k, j]
            C[i, j] = acc
    return C


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: matmul_mkn
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_4wb2eszy
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def matmul_mkn(np.ndarray[np.double_t, ndim=2] A,
                    np.ndarray[np.double_t, ndim=2] B,
                    int num_threads=0):
        cdef Py_ssize_t M = A.shape[0]
        cdef Py_ssize_t K = A.shape[1]
        cdef Py_ssize_t N = B.shape[1]

        if A.dtype != np.float64 or B.dtype != np.float64:
            raise ValueError(

In [203]:
import numpy as np, timeit

def matmul_mkn_py(A, B):
    M, K = A.shape; N = B.shape[1]
    C = np.zeros((M, N), np.float64)
    for j in range(N):
        for i in range(M):
            acc = 0.0
            for k in range(K):
                acc += A[i, k] * B[k, j]
            C[i, j] = acc
    return C

M, K, N = 120, 80, 100
A = np.random.rand(M, K)
B = np.random.rand(K, N)

C0 = matmul_mkn_py(A, B)
C1 = matmul_mkn(A, B, 8)   # 位置参数传线程数
print("equal:", np.allclose(C0, C1, rtol=1e-10, atol=1e-12))

repeats = 3
t_base = timeit.timeit(lambda: matmul_mkn_py(A, B), number=repeats)/repeats
t_opt  = timeit.timeit(lambda: matmul_mkn(A, B, 8),   number=repeats)/repeats
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


equal: True
Baseline 平均时间: 0.269047s
Optimized 平均时间: 0.000181s
加速比: 1487.00×


100000

In [204]:
%%cython_openmp -n elem2d
import numpy as np
def elem2d(A, B, D):
    M, N = A.shape
    C = np.empty((M, N), np.float64)
    for i in range(M):
        for j in range(N):
            C[i,j] = 1.0/(1.0+np.exp(-A[i,j])) + B[i,3] - np.log(D[2,j])
    return C


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: elem2d
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_40ucazqa
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def elem2d(np.ndarray[np.double_t, ndim=2] A, np.ndarray[np.double_t, ndim=2] B, np.ndarray[np.double_t, ndim=2] D, int num_threads=0):
        cdef Py_ssize_t i, j
        if A.dtype != np.float64 or not A.flags.c_contiguous:
            raise ValueError("A must be float64 and C-contiguous")
        if B.dtype != np.float64 or not B.flags.c_contiguous:

In [205]:
import numpy as np, timeit, math

# --- baseline: 纯 Python 双层 for ---
def elem2d_py(A, B, D):
    M, N = A.shape
    C = np.empty((M, N), np.float64)
    for i in range(M):
        for j in range(N):
            C[i,j] = 1.0/(1.0+math.exp(-A[i,j])) + B[i,3] - math.log(D[2,j])
    return C

# --- 测试数据 ---
M, N = 500, 600
rng = np.random.default_rng(0)
A = rng.normal(size=(M, N))
B = rng.normal(size=(M, N))
# 更稳妥：保证 > 0
D = rng.lognormal(mean=0.6, sigma=0.3, size=(M, N))

# --- 正确性检查 ---
C0 = elem2d_py(A, B, D)
C1 = elem2d(A, B, D, 8)   # ✅ 只传数组 + 线程数
print("equal:", np.allclose(C0, C1, rtol=1e-10, atol=1e-12))

# --- 性能测试 ---
repeats = 3
t_base = timeit.timeit(lambda: elem2d_py(A, B, D), number=repeats)/repeats
t_opt  = timeit.timeit(lambda: elem2d(A, B, D, 8),    number=repeats)/repeats
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


equal: True
Baseline 平均时间: 0.242830s
Optimized 平均时间: 0.000832s
加速比: 291.89×


111111111

In [206]:
%%cython_openmp -n vector_norm_customret
def vector_norm_customret(n=1000):
    total = 0.0
    for i in range(n):
        total += (i * 0.001) * (i * 0.001)
    # 这里不是单纯 sqrt(total)，而是更“随意”的返回表达式
    return 1.0 / (1.0 + (total ** 0.5))


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: vector_norm_customret
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_qeqyn62a
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def vector_norm_customret(int n, int num_threads=0):
        cdef Py_ssize_t __t, __i, t, tid
        cdef double total = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals

     

In [207]:
import math, timeit

def baseline_vector_norm_customret(n=1000):
    total = 0.0
    for i in range(n):
        total += (i * 0.001) * (i * 0.001)
    return 1.0 / (1.0 + math.sqrt(total))

# --- 正确性 ---
n = 3000
ref = baseline_vector_norm_customret(n)
opt = vector_norm_customret(n, num_threads=8)
print("ref:", ref)
print("opt:", opt)
print("equal:", abs(ref - opt) < 1e-12)

# --- 性能 ---
repeats = 5
t_base = timeit.timeit(lambda: baseline_vector_norm_customret(n), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: vector_norm_customret(n, num_threads=8), number=repeats) / repeats
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


ref: 0.010433554636315135
opt: 0.010433554636315139
equal: True
Baseline 平均时间: 0.000452s
Optimized 平均时间: 0.000091s
加速比: 4.96×


In [208]:
%%cython_openmp -n vector_norm_total
def vector_norm_total(n=1000, scale=1.5):
    total = 0.0
    for i in range(n):
        total += (i * 0.001) * (i * 0.001)
    # return 里不只是 sqrt(total)，还引入了额外参数 scale
    return (total ** 0.5) * scale


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: vector_norm_total
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_k8rb1e03
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def vector_norm_total(int n, double scale, int num_threads=0):
        cdef Py_ssize_t __t, __i, t, tid
        cdef double total = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals


In [209]:
import math

def baseline_vector_norm_total(n=1000, scale=1.5):
    total = 0.0
    for i in range(n):
        total += (i * 0.001) * (i * 0.001)
    return math.sqrt(total) * scale


In [210]:
import timeit

n = 5000
scale = 2.0

ref = baseline_vector_norm_total(n, scale)
opt = vector_norm_total(n, scale, num_threads=8)
print("ref:", ref)
print("opt:", opt)
print("equal:", abs(ref - opt) < 1e-12)

repeats = 5
t_base = timeit.timeit(lambda: baseline_vector_norm_total(n, scale), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: vector_norm_total(n, scale, num_threads=8), number=repeats) / repeats
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {t_base/t_opt:.2f}×")


ref: 408.18705270990694
opt: 408.18705270990665
equal: True
Baseline 平均时间: 0.000665s
Optimized 平均时间: 0.000064s
加速比: 10.33×


In [211]:
%%cython_openmp -n reduce_with_alpha
import numpy as np

def reduce_with_alpha(n=1000):
    s = 0.0
    for i in range(n):
        s += i * 0.001
    # return 表达式里引入额外标量 alpha
    return np.sqrt(s / (n + alpha))


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: reduce_with_alpha
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_ox2_432j
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def reduce_with_alpha(int n, double alpha, int num_threads=0):
        cdef Py_ssize_t __t, __i, t, tid
        cdef double s = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals

   

In [212]:
import math

# baseline 参考实现
def baseline(n, alpha):
    s = 0.0
    for i in range(n):
        s += i * 0.001
    return math.sqrt(s / (n + alpha))

# 测试
n = 2000
alpha = 3.5

ref = baseline(n, alpha)
opt = reduce_with_alpha(n, alpha, num_threads=4)  # 注意：alpha 已经自动出现在签名里
print("baseline:", ref)
print("cython_omp:", opt)
print("差值:", abs(ref - opt))


baseline: 0.9988763339980643
cython_omp: 0.9988763339980643
差值: 0.0


In [157]:
%reload_ext cython_openmp

In [213]:
%%cython_openmp -n range_fullform_sum
import numpy as np

def range_fullform_sum(n=10):
    result = 0.0
    # 使用三参数 range(start, stop, step)
    for i in range(1, 2*n, 2):   # i = 1,3,5,...,2n-1
        result += i * 0.1
    return result


🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: range_fullform_sum
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_3q3ymlqe
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def range_fullform_sum(int n, int num_threads=0):
        cdef Py_ssize_t __t, __i, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals

        cd

In [214]:
import math
import timeit
import numpy as np

# 纯 Python 版本（基线）
def range_fullform_sum_py(n=10):
    result = 0.0
    for i in range(1, 2*n, 2):   # 1,3,5,..., 2n-1   共 n 项
        result += i * 0.1
    return result


In [215]:
# 随机几个 n + 解析公式一起校验
for n in [1, 10, 100, 1000, 1234]:
    ref_py = range_fullform_sum_py(n)
    ref_formula = 0.1 * n * n          # 和(1+3+...+2n-1)=n^2
    opt = range_fullform_sum(n, num_threads=4)

    ok_py = abs(ref_py - opt) < 1e-12
    ok_formula = abs(ref_formula - opt) < 1e-12
    print(f"n={n:5d} | 与Python一致: {ok_py!s:5s} | 与解析公式一致: {ok_formula!s:5s} | 值={opt:.12f}")


n=    1 | 与Python一致: True  | 与解析公式一致: True  | 值=0.100000000000
n=   10 | 与Python一致: True  | 与解析公式一致: True  | 值=10.000000000000
n=  100 | 与Python一致: True  | 与解析公式一致: True  | 值=1000.000000000000
n= 1000 | 与Python一致: False | 与解析公式一致: True  | 值=100000.000000000000
n= 1234 | 与Python一致: True  | 与解析公式一致: True  | 值=152275.600000000006


In [216]:
import timeit
import math

# 原始 Python 版本（基准）
def baseline_range_fullform_sum(n=10):
    result = 0.0
    for i in range(1, 2*n, 2):   # 1,3,5,...,2n-1  共 n 次
        result += i * 0.1
    return result

# ---- 正确性：与 Python/解析公式一致性 ----
def expected_by_formula(n):
    # 1+3+...+(2n-1) = n^2
    return 0.1 * (n**2)

for n in [1, 10, 100, 1234]:
    py = baseline_range_fullform_sum(n)
    cy = range_fullform_sum(n, num_threads=4)   # 如需改线程数，改这里
    fx = expected_by_formula(n)
    print(f"n={n:5d} | 与Python一致: {abs(py-cy) < 1e-12} | 与解析公式一致: {abs(cy-fx) < 1e-12} | 值={cy:.12f}")

# ---- 计时与加速比 ----
N = 5_000_000    # 规模（可根据机器性能调整，比如 1_000_000 / 10_000_000）
repeats = 3      # 重复次数
threads = 4      # OpenMP 线程数（根据 CPU 修改）

t_base = timeit.timeit(lambda: baseline_range_fullform_sum(N), number=repeats) / repeats
t_opt  = timeit.timeit(lambda: range_fullform_sum(N, num_threads=threads), number=repeats) / repeats
speedup = t_base / t_opt

print("\n=== 基准结果 ===")
print(f"N = {N:,}, repeats = {repeats}, threads = {threads}")
print(f"Baseline 平均时间: {t_base:.6f}s")
print(f"Optimized 平均时间: {t_opt:.6f}s")
print(f"加速比: {speedup:.2f}×")


n=    1 | 与Python一致: True | 与解析公式一致: True | 值=0.100000000000
n=   10 | 与Python一致: True | 与解析公式一致: True | 值=10.000000000000
n=  100 | 与Python一致: True | 与解析公式一致: True | 值=1000.000000000000
n= 1234 | 与Python一致: True | 与解析公式一致: True | 值=152275.600000000006

=== 基准结果 ===
N = 5,000,000, repeats = 3, threads = 4
Baseline 平均时间: 0.280990s
Optimized 平均时间: 0.001950s
加速比: 144.08×


In [217]:
%%cython_openmp -n nested_fullform_sum
import numpy as np

def nested_fullform_sum(n=100):
    result = 0.0
    for i in range(1, 2*n, 2):   # 外层三参 range
        for j in range(n):       # 内层单参 range
            result += (i + j) * 0.01
    return result



🔧 Cython + OpenMP 并行编译器 v2 (magic: %cython_openmp)
📌 目标函数名: nested_fullform_sum
🧩 临时构建目录: C:\Users\xiong\AppData\Local\Temp\cython_omp_g8mq_wns
📐 prange 调度: static

📄 生成的 Cython 并行代码如下：


    # cython: boundscheck=False, wraparound=False, initializedcheck=False, nonecheck=False
    # cython: cdivision=True, infer_types=True
    from cython.parallel cimport prange, parallel
    from openmp cimport omp_get_max_threads, omp_get_thread_num, omp_set_num_threads
    from libc.math cimport sqrt, pow, fabs as c_fabs, exp as c_exp, log as c_log, sin as c_sin, cos as c_cos, tanh as c_tanh
    cimport cython
    import numpy as np
    cimport numpy as np


    def nested_fullform_sum(int n, int num_threads=0):
        cdef Py_ssize_t i, j, __t, __i, t, tid
        cdef double result = 0.0
        cdef int PAD = 16
    
        cdef int T = omp_get_max_threads()
        cdef np.ndarray[np.double_t, ndim=1] _locals = np.zeros(T * PAD, dtype=np.float64)
        cdef double[:] locals_mv = _locals

  

In [218]:
import time
import numpy as np

# Python 参考版
def nested_fullform_sum_py(n=100):
    result = 0.0
    for i in range(1, 2*n, 2):
        for j in range(n):
            result += (i + j) * 0.01
    return result

# 闭式公式（“金标准”）
def nested_fullform_sum_formula(n):
    return 0.005 * (3*n**3 - n**2)

def bench(n=200, num_threads=8, repeat=3):
    # 预热
    nested_fullform_sum(n, num_threads=num_threads)
    nested_fullform_sum_py(100)

    # 计时：Python
    best_py = 1e9
    for _ in range(repeat):
        t0 = time.perf_counter()
        rp = nested_fullform_sum_py(n)
        best_py = min(best_py, time.perf_counter() - t0)

    # 计时：Cython+OpenMP
    best_cy = 1e9
    for _ in range(repeat):
        t0 = time.perf_counter()
        rc = nested_fullform_sum(n, num_threads=num_threads)
        best_cy = min(best_cy, time.perf_counter() - t0)

    rf = nested_fullform_sum_formula(n)

    print(f"n={n}, threads={num_threads}")
    print(f"  py: {rp:.12f}   t={best_py*1e3:.2f} ms")
    print(f"  cy: {rc:.12f}   t={best_cy*1e3:.2f} ms")
    print(f"  fm: {rf:.12f}")
    print(f"  abs(cy-py)={abs(rc-rp):.3e}, abs(cy-fm)={abs(rc-rf):.3e}")
    print(f"  speedup(py/cy) ≈ {best_py / best_cy:.2f}×")

# 跑一下
bench(n=200, num_threads=8, repeat=5)


n=200, threads=8
  py: 119800.000000000000   t=2.52 ms
  cy: 119800.000000000000   t=0.02 ms
  fm: 119800.000000000000
  abs(cy-py)=0.000e+00, abs(cy-fm)=0.000e+00
  speedup(py/cy) ≈ 154.58×
