In [2]:
# coding = 'utf-8'
import numpy as np
import pandas as pd
import timeit 

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [4]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [5]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
    # result_1 = target_mean_v1(data, 'y', 'x')
    # result_2 = target_mean_v2(data, 'y', 'x')
    # diff = np.linalg.norm(result_1 - result_2)

In [6]:
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 24.8 s per loop


In [26]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 263 ms per loop


In [12]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [28]:
%%cython

import numpy as np
cimport numpy as cnp 
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def target_mean_v3(data,y_name,x_name):
    cdef cnp.ndarray[cnp.double_t] result
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [31]:
%%timeit
result_3 = target_mean_v3(data, 'y', 'x')

1 loop, best of 3: 265 ms per loop


In [59]:
%%cython

import numpy as np
cimport numpy as cnp 
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v4(data,str y_name, str x_name):
    cdef cnp.ndarray[cnp.double_t] result
    cdef cnp.ndarray[long] x_value = data[x_name].values
    cdef cnp.ndarray[long] y_value = data[y_name].values
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef int data_len = data.shape[0]
    cdef int i=0

    result = np.zeros(data_len)
    
    for i from 0<=i<data_len:
      index = x_value[i]
      if index not in value_dict.keys():
          value_dict[index] = y_value[i]
          count_dict[index] = 1
      else:
          value_dict[index] += y_value[i]
          count_dict[index] += 1
        
    for i from 0<=i<data_len:
        index = x_value[i]
        result[i] = (value_dict[index] - y_value[i]) / (count_dict[index] - 1)

    return result

In [34]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')
result_3 = target_mean_v3(data, 'y', 'x')
diff = np.linalg.norm(result_3 - result_2)

1 loop, best of 3: 529 ms per loop


In [35]:
%%timeit
result_3 = target_mean_v3(data, 'y', 'x')

1 loop, best of 3: 268 ms per loop


In [60]:
%%timeit
result_4 = target_mean_v4(data, 'y', 'x')

1000 loops, best of 3: 1.15 ms per loop


In [65]:
%%cython

import numpy as np
cimport numpy as cnp 
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5(data,str y_name, str x_name):
    cdef cnp.ndarray[cnp.double_t] result
    cdef cnp.ndarray[long] x_value = data[x_name].values
    cdef cnp.ndarray[long] y_value = data[y_name].values
    cdef long[:] value_arr = np.zeros(10).astype(long)
    cdef long[:] count_arr = np.zeros(10).astype(long)
    cdef int data_len = data.shape[0]
    cdef int i=0

    result = np.zeros(data_len)
    
    for i in prange(data_len, nogil=True):
      value_arr[x_value[i]] += y_value[i]
      count_arr[x_value[i]] += 1
        
    for i in prange(data_len, nogil=True):
        result[i] = (value_arr[x_value[i]] - y_value[i]) / (count_arr[x_value[i]] - 1)

    return result

In [66]:
%%timeit
result_5 = target_mean_v5(data, 'y', 'x')

The slowest run took 45.98 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 45.4 µs per loop
