In [2]:
# coding = 'utf-8'
import numpy as np
import pandas as pd

In [3]:
%load_ext Cython

In [4]:
y = np.random.randint(2, size=(500, 1))
x = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [5]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [6]:
def target_mean_v2(data, y_name, x_name):
    d1 = {}
    d2 = {}
    result = []
    for x,y in zip(data[x_name],data[y_name]):
        if x not in d1:
            d1[x] = y
            d2[x] = 1
        else:
            d1[x] += y
            d2[x] += 1
    for x,y in zip(data[x_name],data[y_name]):
        feature = (d1[x]-y)/(d2[x]-1)
        result.append(feature)
    return result

In [7]:
def target_mean_v3(data, y_name, x_name):
    d1 = {}
    d2 = {}
    result = []
    x, y = data[x_name],data[y_name]
    for i in range(data.shape[0]):
        if x[i] not in d1:
            d1[x[i]] = y[i]
            d2[x[i]] = 1
        else:
            d1[x[i]] += y[i]
            d2[x[i]] += 1
    for i in range(data.shape[0]):
        feature = (d1[x[i]]-y[i])/(d2[x[i]]-1)
        result.append(feature)
    return result

In [8]:
%%cython
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel import prange

cpdef target_mean_v4(np.ndarray x, np.ndarray y):
    cdef int feature_size = 10
    cdef int i
    cdef int n = x.shape[0]

    cdef np.ndarray memo_sum = np.zeros(feature_size, dtype=int)
    cdef np.ndarray memo_cnt = np.zeros(feature_size, dtype=int)
    cdef np.ndarray result = np.zeros(n, dtype=float)
    
    for i in range(n):
        memo_sum[x[i]] += y[i]
        memo_cnt[x[i]] += 1
    for i in range(n):
        result[i] = (memo_sum[x[i]]-y[i])/(memo_cnt[x[i]]-1)
    return result

In [47]:
%%cython
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)   # Deactivate negative indexing.
cpdef target_mean_v5(int[:] x, int[:] y):
    cdef int feature_size = 10
    cdef int i
    cdef int n = x.shape[0]
    cdef int[:] memo_cnt=np.zeros(feature_size).astype(np.intc)
    cdef int[:] memo_sum=np.zeros(feature_size).astype(np.intc)
    cdef double[:] result=np.zeros(n)
    
    for i in prange(n, nogil=True):
        memo_sum[x[i]] += y[i]
        memo_cnt[x[i]] += 1
    for i in prange(n, nogil=True):
        result[i] = (memo_sum[x[i]]-y[i])/(memo_cnt[x[i]]-1)
    return result

In [15]:
answer = target_mean_v1(data, 'y', 'x')

In [8]:
%%timeit
target_mean_v1(data, 'y', 'x')

1.53 s ± 58.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
result2 = target_mean_v2(data, 'y', 'x')
np.linalg.norm(result2-answer)

0.0

In [10]:
%%timeit
target_mean_v2(data, 'y', 'x')

209 µs ± 2.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
result3 = target_mean_v3(data, 'y', 'x')
np.linalg.norm(result3-answer)

0.0

In [12]:
%%timeit
target_mean_v3(data, 'y', 'x')

21.5 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [73]:
result4 = target_mean_v4(data['x'].values, data['y'].values)
np.linalg.norm(result4-answer)

0.0

In [20]:
%%timeit
target_mean_v4(data['x'].values, data['y'].values)

771 µs ± 26.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [49]:
result5 = target_mean_v5(data['x'].values.astype(np.intc), data['y'].values.astype(np.intc))
np.linalg.norm(result5-answer)

0.0

In [50]:
%%timeit
target_mean_v5(data['x'].values.astype(np.intc), data['y'].values.astype(np.intc))

14.2 µs ± 368 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
