加载cython

In [None]:
%load_ext Cython

测试Cython变量定义

In [31]:
##"%%cython"声明cython使用cdef来定义c变量和c函数
%%cython

import numpy as np
cimport numpy as cnp
import pandas as pd

cdef:
    int i = 1
    unsigned long j = 2
    signed short k = 3
    long long ll = 4LL
    bint tflag = True

安装profiler，方便效率分析

In [33]:
pip install line_profiler

Collecting line_profiler
[?25l  Downloading https://files.pythonhosted.org/packages/66/eb/417ace64f45fee7a0394946f8e1f90f925420fd9b14f1f09abb5284a0ca4/line_profiler-3.1.0-cp36-cp36m-manylinux2010_x86_64.whl (63kB)
[K     |████████████████████████████████| 71kB 3.2MB/s 
Installing collected packages: line-profiler
Successfully installed line-profiler-3.1.0


加载profiler

In [34]:
%load_ext line_profiler

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd

def cy_target_mean_v1(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray: 
  cdef:
    int data_shape = data.shape[0]
    cnp.ndarray[cnp.float64_t] result = np.zeros(data_shape, dtype=np.float64)
    dict value_dict = {}
    dict count_dict = {}
    cnp.ndarray[cnp.int_t] x_val_array = data[x_name].values
    cnp.ndarray[cnp.int_t] y_val_array = data[y_name].values

  for i in range(data_shape):
    data_loc_x = x_val_array[i]
    data_loc_y = y_val_array[i]
    if data_loc_x not in value_dict:
      value_dict[data_loc_x] = data_loc_y
      count_dict[data_loc_x] = 1
    else:
      value_dict[data_loc_x] += data_loc_y
      count_dict[data_loc_x] += 1
  for i in range(data_shape):
    count = count_dict[x_val_array[i]] - 1
    result[i] = (value_dict[x_val_array[i]] - y_val_array[i]) / count

  return result

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython
from cython.parallel import prange 

cpdef cy_target_mean_v2(data, cnp.str y_name, cnp.str x_name): 
  cdef:
    int data_shape = data.shape[0]
    double[:,] result = np.zeros(data_shape, dtype=np.float64)
    double[:,] value_dict = np.zeros(10, dtype=np.float64)
    double[:,] count_dict = np.zeros(10, dtype=np.float64)
    long[:,] x_val_array = data[x_name].values
    long[:,] y_val_array = data[y_name].values
    int i = 0 

  for i in prange(data_shape, nogil=True):
    value_dict[x_val_array[i]] += y_val_array[i]
    count_dict[x_val_array[i]] += 1
  for i in prange(data_shape, nogil=True):
    result[i] = (value_dict[x_val_array[i]] - y_val_array[i]) / (count_dict[x_val_array[i]] - 1)

  return result

添加一个分割线

————————————————————————————————————————————————————————————————————————————————

将python函数转换成cython的cpdef函数，并使用prange来代替range进行遍历，为了打开Gil的锁，使用并行进行加速

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython
from cython.parallel import prange 

cpdef cy_target_mean_v8(data, cnp.str y_name, cnp.str x_name): 
  cdef:
    int data_shape = data.shape[0]
    double[:,] result = np.zeros(data_shape, dtype=np.float64)
    double[:,] value_dict = np.zeros(10, dtype=np.float64)
    double[:,] count_dict = np.zeros(10, dtype=np.float64)
    long[:,] x_val_array = data[x_name].values
    long[:,] y_val_array = data[y_name].values
    int i = 0 

  for i in prange(data_shape, nogil=True):
    value_dict[x_val_array[i]] += y_val_array[i]
    count_dict[x_val_array[i]] += 1
  for i in prange(data_shape, nogil=True):
    result[i] = (value_dict[x_val_array[i]] - y_val_array[i]) / (count_dict[x_val_array[i]] - 1)

  return result

用memoryview代替的数组指针

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython
from cython.parallel import prange 

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cy_target_mean_v4(data, cnp.str y_name, cnp.str x_name): 
  cdef:
    int data_shape = data.shape[0]
    double[::1] result = np.zeros(data_shape, dtype=np.float64)
    double[::1] value_dict = np.zeros(10, dtype=np.float64)
    long[::1] count_dict = np.zeros(10, dtype=np.int64)
    long[::1] x_val_array = np.asfortranarray(data[x_name].values, dtype=np.int64)
    long[::1] y_val_array = np.asfortranarray(data[y_name].values, dtype=np.int64)
    int i = 0 
    long x

  for i in prange(data_shape, nogil=True):
    x = x_val_array[i]
    value_dict[x] += y_val_array[i]
    count_dict[x] += 1
  for i in prange(data_shape, nogil=True):
    x = x_val_array[i]
    result[i] = (value_dict[x] - y_val_array[i]) / (count_dict[x] - 1)

  return result

使用指针和memoryview版本进行对比

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython
from cython.parallel import prange 

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cy_target_mean_v5(data, cnp.str y_name, cnp.str x_name): 
  cdef:
    int data_shape = data.shape[0]
    double[::1] result = np.zeros(data_shape, dtype=np.float64)
    double[::1] value_dict = np.zeros(10, dtype=np.float64)
    long[::1] count_dict = np.zeros(10, dtype=np.int64)
    long[::1] x_val_array = np.asfortranarray(data[x_name].values, dtype=np.int64)
    long[::1] y_val_array = np.asfortranarray(data[y_name].values, dtype=np.int64)
    int i = 0 
    long x

  for i in prange(data_shape, nogil=True):
    x = x_val_array[i]
    value_dict[x] += y_val_array[i]
    count_dict[x] += 1
  for i in prange(data_shape, nogil=True):
    x = x_val_array[i]
    result[i] = (value_dict[x] - y_val_array[i]) / (count_dict[x] - 1)

  return result

In [62]:
y = np.random.randint(2, size=(100000, 1))
x = np.random.randint(10, size=(100000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [None]:
%lprun -f cy_target_mean_v1 cy_target_mean_v1(data, 'y', 'x')

In [61]:
%lprun -f cy_target_mean_v2 cy_target_mean_v2(data, 'y', 'x')

  profile = LineProfiler(*funcs)


%lprun结果均为：Timer unit: 1e-06 ，原因：无法提取代码对象

改用%%timeit评估

In [58]:
%%timeit
cy_target_mean_v1(data, 'y', 'x')


10 loops, best of 3: 23.9 ms per loop


In [51]:
%%timeit
cy_target_mean_v2(data, 'y', 'x')

1000 loops, best of 3: 1.09 ms per loop


In [57]:
%%timeit
cy_target_mean_v3(data, 'y', 'x')

The slowest run took 5.65 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 1.11 ms per loop


In [67]:
%%timeit
cy_target_mean_v4(data, 'y', 'x')

1000 loops, best of 3: 884 µs per loop


In [70]:
%%timeit
cy_target_mean_v5(data, 'y', 'x')

The slowest run took 8.39 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 878 µs per loop
