In [1]:
%load_ext Cython

In [2]:
import numpy as np
import pandas as pd
import time

def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        x_val=data.loc[i, x_name]
        y_val=data.loc[i, y_name]
        if x_val not in value_dict.keys():
            value_dict[x_val] = y_val
            count_dict[x_val] = 1
        else:
            value_dict[x_val] += y_val
            count_dict[x_val] += 1
    for i in range(data.shape[0]):
        val_loc=data.loc[i, x_name]
        result[i] = (value_dict[val_loc] - data.loc[i, y_name]) / (count_dict[val_loc] - 1)
    return result

def timeCount(func,data):
    start=time.time()
    result=func(data,'y','x')
    end=time.time()
    return result,end-start

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
result2=timeCount(target_mean_v2,data)
print(result2[1])

0.37527894973754883


In [6]:
%%cython -a
import numpy as np
import pandas as pd
import time
cimport numpy as np
import cython
cimport cython
from cython.parallel import prange

cpdef convert_demo(matrix,num_dim=1):
    cdef np.ndarray[double, ndim=1, mode='fortran'] arg = np.asfortranarray(matrix, dtype=np.float64)
    return arg


cpdef target_mean_v3(data, y_name, x_name):                  
    result = convert_demo(np.zeros(data.shape[0]))
    value_dict = dict()
    count_dict = dict()
    
    cdef x_val,y_val,i,val_loc
    
    for i in range(data.shape[0]):
        x_val=data.loc[i, x_name]
        y_val=data.loc[i, y_name]
        if x_val not in value_dict.keys():
            value_dict[x_val] = y_val
            count_dict[x_val] = 1
        else:
            value_dict[x_val] += y_val
            count_dict[x_val] += 1
    for i in range(data.shape[0]):
        val_loc=data.loc[i, x_name]
        result[i] = (value_dict[val_loc] - data.loc[i, y_name]) / (count_dict[val_loc] - 1)
    return result


# @cython.boundscheck(False)
# @cython.wraparound(False)
cpdef target_mean_v4(np.ndarray[long,ndim=2] data):        #cdef parameters and func
    cdef double[:] result = np.zeros(data.shape[0])
    cdef dict value_dict={}
    cdef dict count_dict={}
    
    cdef int x_val
    cdef int y_val
    cdef int i
    cdef int val_loc
    cdef int num=data.shape[0]
    
    for i in range(num):
        x_val=data[i][1]
        y_val=data[i][0]
        if x_val not in value_dict.keys():
            value_dict[x_val] = y_val
            count_dict[x_val] = 1
        else:
            value_dict[x_val] += y_val
            count_dict[x_val] += 1
    for i in range(num):
        val_loc=data[i][1]
        result[i] = (value_dict[val_loc] - data[i][0]) / (count_dict[val_loc] - 1)
    return result




cpdef target_mean_v5(long[:,:] data):                       #magic func
    cdef double[:] result = np.zeros(data.shape[0])
    cdef dict value_dict={}
    cdef dict count_dict={}
    cdef int x_val
    cdef int y_val
    cdef int i
    cdef int val_loc
    
    for i in range(data.shape[0]):
        x_val=data[i][1]
        y_val=data[i][0]
        if x_val not in value_dict.keys():
            value_dict[x_val] = y_val
            count_dict[x_val] = 1
        else:
            value_dict[x_val] += y_val
            count_dict[x_val] += 1
    for i in range(data.shape[0]):
        val_loc=data[i][1]
        result[i] = (value_dict[val_loc] - data[i][0]) / (count_dict[val_loc] - 1)
    return result

cpdef target_mean_v6(np.ndarray[long,ndim=2] data):    #prange,nogil
    cdef int num=data.shape[0]
    cdef double[:] result = np.zeros(num)
    cdef np.ndarray[long,ndim=1] value_dict=np.zeros(10)
    cdef np.ndarray[long,ndim=1] count_dict=np.zeros(10)
    cdef int x_val,y_val,i,val_loc
    cdef np.ndarray[long,ndim=1] y_value=data[i][0]
    cdef np.ndarray[long,ndim=1] x_value=data[i][1]
    
    for i in prange(num,nogil=True):
        value_dict[x_value[i]] += y_value[i]
        count_dict[x_value[i]] += 1
        
    for i in prange(num,nogil=True):
        result[i] = (value_dict[i] - y_value[i]) / (count_dict[i] - 1)
    return result




y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data0=np.concatenate([y, x], axis=1)
data = pd.DataFrame(data0, columns=['y', 'x'])

start=time.time()
result=target_mean_v3(data,'y','x')
end=time.time()
print(end-start)

start4=time.time()
result4=target_mean_v4(data0)
end4=time.time()
print(end4-start4)

start5=time.time()
result5=target_mean_v5(data0)
end5=time.time()

print(end5-start5)


start6=time.time()
result6=target_mean_v6(data0)
end6=time.time()

print(end6-start6)





0.37260890007019043
0.014346122741699219
0.0023381710052490234


ValueError: Buffer dtype mismatch, expected 'long' but got 'double'

***nogil and prange实现并行***

In [7]:
%%timeit
target_mean_v5(data0)

20.4 ms ± 1.36 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
import numpy as np
a=np.array([])
a=np.append(a,1)
a