In [1]:
import numpy as np
import pandas as pd


In [2]:
%load_ext Cython

In [4]:
def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean, brute-force baseline.

    For every row, the frame is regrouped with that row removed and the mean
    of ``y_name`` for the row's own ``x_name`` category is looked up.  This is
    deliberately naive (one full groupby per row, i.e. quadratic in the number
    of rows) and only serves as the reference the faster versions are compared
    against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target and the categorical column.
    y_name : str
        Name of the numeric target column.
    x_name : str
        Name of the categorical column to encode.

    Returns
    -------
    numpy.ndarray
        One float per row, in row order.  ``NaN`` when the row is the only
        member of its category (there is nothing left to average).
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    x_values = data[x_name].to_numpy()
    # Boolean row mask, toggled in place so rows are excluded by position and
    # the function does not depend on the frame having a 0..n-1 index.
    keep = np.ones(n_rows, dtype=bool)
    for i in range(n_rows):
        keep[i] = False
        # Series indexed by category value -> mean of y over the other rows.
        group_means = data.loc[keep].groupby(x_name)[y_name].mean()
        keep[i] = True
        key = x_values[i]
        # A category whose only member is row i vanishes from the groupby.
        result[i] = group_means.loc[key] if key in group_means.index else np.nan
    return result

In [30]:
# Seed the legacy global RNG so the benchmark inputs, and therefore every
# equality check between implementations, are the same on each fresh run.
np.random.seed(0)
# Column 0 is the binary target y, column 1 the category code x in [0, 10).
y = np.random.randint(2, size=(1000000, 1))
x = np.random.randint(10, size=(1000000, 1))
data1 = np.concatenate([y, x], axis=1)
# Build the frame from data1 instead of concatenating a second time, so the
# array-based and DataFrame-based implementations see the same values.
data = pd.DataFrame(data1, columns=['y', 'x'])

In [6]:
# target_mean_v1 regroups the whole frame once per row, so timing it on a
# large frame does not finish in reasonable time; the run recorded below
# ended with a KeyboardInterrupt.
#result_1 = target_mean_v1(data, 'y', 'x')
%timeit target_mean_v1(data, 'y', 'x')

KeyboardInterrupt: ignored

In [8]:
def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean using per-category running totals.

    A first pass accumulates, for every distinct value of ``x_name``, the sum
    of ``y_name`` and the number of rows.  A second pass subtracts each row's
    own contribution: ``(sum - y_i) / (count - 1)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target and the categorical column.
    y_name : str
        Name of the numeric target column.
    x_name : str
        Name of the categorical column to encode (values must be hashable).

    Returns
    -------
    numpy.ndarray
        One float per row, in row order.  ``NaN`` when the row is the only
        member of its category.
    """
    # Pull each column out once; scalar ``data.loc[i, col]`` lookups inside
    # the loops dominate the runtime otherwise.  Rows are read by position,
    # so the frame does not need a 0..n-1 index.
    keys = data[x_name].to_numpy().tolist()
    targets = data[y_name].to_numpy().tolist()
    result = np.zeros(len(keys))

    value_dict = {}
    count_dict = {}
    for key, target in zip(keys, targets):
        value_dict[key] = value_dict.get(key, 0) + target
        count_dict[key] = count_dict.get(key, 0) + 1

    for i, (key, target) in enumerate(zip(keys, targets)):
        others = count_dict[key] - 1
        # A singleton category has no other rows to average over.
        result[i] = (value_dict[key] - target) / others if others else np.nan
    return result

In [9]:
%timeit target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 266 ms per loop


In [10]:

%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [11]:
%%cython -a
import cython
cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v4(cnp.ndarray[long, ndim=2] data):
  """Leave-one-out target mean, accumulated in typed NumPy buffers.

  Parameters
  ----------
  data : 2-D array of C ``long``
      Column 0 is the target ``y``; column 1 is a non-negative integer
      category code ``x``.

  Returns
  -------
  numpy.ndarray of float64, one entry per row
      For each row, the mean of ``y`` over the *other* rows that share its
      ``x``; ``NaN`` when the row is the only member of its category.

  Raises
  ------
  ValueError
      If ``data`` has fewer than two columns or a category code is negative.
  """
  cdef Py_ssize_t n_rows = data.shape[0]
  cdef Py_ssize_t i
  cdef long xx
  cdef long yy
  cdef long n_groups = 0
  cdef cnp.ndarray[double] result
  cdef cnp.ndarray[long] value_dict
  cdef cnp.ndarray[long] count_dict
  if data.shape[1] < 2:
    raise ValueError("data must have at least two columns (y, x)")
  # Bounds checking is disabled, so the category codes are validated here and
  # the accumulators are sized by the largest code rather than the row count.
  for i in range(n_rows):
    xx = data[i, 1]
    if xx < 0:
      raise ValueError("category codes in column 1 must be non-negative")
    if xx >= n_groups:
      n_groups = xx + 1
  # Pre-filled with NaN: rows in singleton categories are never overwritten.
  result = np.full(n_rows, np.nan, dtype=np.double)
  # Type code 'l' is C long, matching the declared buffer element type.
  value_dict = np.zeros(n_groups, dtype=np.dtype('l'))
  count_dict = np.zeros(n_groups, dtype=np.dtype('l'))
  for i in range(n_rows):
    xx = data[i, 1]
    yy = data[i, 0]
    value_dict[xx] += yy
    count_dict[xx] += 1
  for i in range(n_rows):
    xx = data[i, 1]
    yy = data[i, 0]
    if count_dict[xx] > 1:
      # Cast first so the division is floating-point, not integer.
      result[i] = <double>(value_dict[xx] - yy) / (count_dict[xx] - 1)
  return result



In [12]:
# Sanity check: the Cython version run on the raw array should reproduce the
# dict-based result computed from the DataFrame, so every difference should be 0.
diff=target_mean_v2(data,'y','x') - target_mean_v4(data1)
diff

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# y = np.random.randint(2, size=(1000000, 1))
# x = np.random.randint(10, size=(1000000, 1))
# data1 = np.concatenate([y, x], axis=1)

In [13]:
%timeit target_mean_v4(data1)

10000 loops, best of 3: 27.4 µs per loop


In [14]:
%%cython -a
import cython
cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5(cnp.ndarray[long, ndim=2] data):
  """Leave-one-out target mean; variant that reads the target in place.

  Same algorithm as the buffer-based version, but the target value is read
  straight from ``data[i, 0]`` instead of being copied into a temporary.

  Parameters
  ----------
  data : 2-D array of C ``long``
      Column 0 is the target ``y``; column 1 is a non-negative integer
      category code ``x``.

  Returns
  -------
  numpy.ndarray of float64, one entry per row
      For each row, the mean of ``y`` over the *other* rows that share its
      ``x``; ``NaN`` when the row is the only member of its category.

  Raises
  ------
  ValueError
      If ``data`` has fewer than two columns or a category code is negative.
  """
  cdef Py_ssize_t n_rows = data.shape[0]
  cdef Py_ssize_t i
  cdef long xx
  cdef long n_groups = 0
  cdef cnp.ndarray[double] result
  cdef cnp.ndarray[long] value_dict
  cdef cnp.ndarray[long] count_dict
  if data.shape[1] < 2:
    raise ValueError("data must have at least two columns (y, x)")
  # Bounds checking is disabled, so the category codes are validated here and
  # the accumulators are sized by the largest code rather than the row count.
  for i in range(n_rows):
    xx = data[i, 1]
    if xx < 0:
      raise ValueError("category codes in column 1 must be non-negative")
    if xx >= n_groups:
      n_groups = xx + 1
  # Pre-filled with NaN: rows in singleton categories are never overwritten.
  result = np.full(n_rows, np.nan, dtype=np.double)
  # Type code 'l' is C long, matching the declared buffer element type.
  value_dict = np.zeros(n_groups, dtype=np.dtype('l'))
  count_dict = np.zeros(n_groups, dtype=np.dtype('l'))
  # A typed index with range() compiles to the same C loop as the legacy
  # ``for i from 0 <= i < n`` form, without the deprecated syntax.
  for i in range(n_rows):
    xx = data[i, 1]
    value_dict[xx] += data[i, 0]
    count_dict[xx] += 1
  for i in range(n_rows):
    xx = data[i, 1]
    if count_dict[xx] > 1:
      # Cast first so the division is floating-point, not integer.
      result[i] = <double>(value_dict[xx] - data[i, 0]) / (count_dict[xx] - 1)
  return result




In [15]:
%timeit target_mean_v5(data1)

The slowest run took 126.10 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 28.1 µs per loop


In [None]:
# Sanity check: v5 and v4 run on the same array, so every difference should be 0.
diff =target_mean_v5(data1)-target_mean_v4(data1)
diff

array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
%%cython -a
import cython
cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v6(long[:,:] data):
  """Leave-one-out target mean; variant taking a typed memoryview.

  The input is a ``long[:, :]`` memoryview, while the accumulators and the
  result are NumPy buffers created with ``np.empty`` and filled explicitly.

  Parameters
  ----------
  data : 2-D memoryview of C ``long``
      Column 0 is the target ``y``; column 1 is a non-negative integer
      category code ``x``.

  Returns
  -------
  numpy.ndarray of float64, one entry per row
      For each row, the mean of ``y`` over the *other* rows that share its
      ``x``; ``NaN`` when the row is the only member of its category.

  Raises
  ------
  ValueError
      If ``data`` has fewer than two columns or a category code is negative.
  """
  cdef Py_ssize_t n_rows = data.shape[0]
  cdef Py_ssize_t i
  cdef long xx
  cdef long n_groups = 0
  cdef cnp.ndarray[double] result
  cdef cnp.ndarray[long] value_dict
  cdef cnp.ndarray[long] count_dict
  if data.shape[1] < 2:
    raise ValueError("data must have at least two columns (y, x)")
  # Bounds checking is disabled, so the category codes are validated here and
  # the accumulators are sized by the largest code rather than the row count.
  for i in range(n_rows):
    xx = data[i, 1]
    if xx < 0:
      raise ValueError("category codes in column 1 must be non-negative")
    if xx >= n_groups:
      n_groups = xx + 1
  result = np.empty(n_rows, dtype=np.double)
  # Type code 'l' is C long, matching the declared buffer element type.
  value_dict = np.empty(n_groups, dtype=np.dtype('l'))
  count_dict = np.empty(n_groups, dtype=np.dtype('l'))
  # NaN marks rows in singleton categories; they are never overwritten below.
  result.fill(np.nan)
  value_dict.fill(0)
  count_dict.fill(0)
  for i in range(n_rows):
    xx = data[i, 1]
    value_dict[xx] += data[i, 0]
    count_dict[xx] += 1
  for i in range(n_rows):
    xx = data[i, 1]
    if count_dict[xx] > 1:
      # Cast first so the division is floating-point, not integer.
      result[i] = <double>(value_dict[xx] - data[i, 0]) / (count_dict[xx] - 1)
  return result



In [17]:
%timeit target_mean_v6(data1)

The slowest run took 12.27 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 28.1 µs per loop


In [None]:
# Sanity check: v6 and v5 run on the same array, so every difference should be 0.
diff=target_mean_v6(data1)-target_mean_v5(data1)
diff

array([0., 0., 0., ..., 0., 0., 0.])

In [27]:
%%cython -a
import cython
cimport cython
cimport numpy as cnp
import numpy as np
from math import exp  
from libc.math cimport exp as c_exp 
from cython.parallel import prange,parallel
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef target_mean_v7(long[:,:] data):
  """Leave-one-out target mean with a ``prange`` output pass.

  The per-category sums and counts are accumulated serially, because many
  rows update the same accumulator slot and an unsynchronised ``+=`` from
  several threads would lose updates.  Only the final per-row pass, where each
  iteration writes its own ``result[i]``, is run under ``prange``.

  Parameters
  ----------
  data : 2-D memoryview of C ``long``
      Column 0 is the target ``y``; column 1 is a non-negative integer
      category code ``x``.

  Returns
  -------
  memoryview of double, one entry per row
      For each row, the mean of ``y`` over the *other* rows that share its
      ``x``; ``NaN`` when the row is the only member of its category.

  Raises
  ------
  ValueError
      If ``data`` has fewer than two columns or a category code is negative.
  """
  cdef Py_ssize_t n_rows = data.shape[0]
  cdef Py_ssize_t i
  cdef long x
  cdef long y
  cdef long n_groups = 0
  cdef long[:] xx
  cdef long[:] yy
  cdef long[:] value_dict
  cdef long[:] count_dict
  cdef double[:] result
  if data.shape[1] < 2:
    raise ValueError("data must have at least two columns (y, x)")
  xx = data[:, 1]
  yy = data[:, 0]
  # Bounds checking is disabled, so the category codes are validated here and
  # the accumulators are sized by the largest code rather than the row count.
  for i in range(n_rows):
    x = xx[i]
    if x < 0:
      raise ValueError("category codes in column 1 must be non-negative")
    if x >= n_groups:
      n_groups = x + 1
  # Pre-filled with NaN: rows in singleton categories are never overwritten.
  result = np.full(n_rows, np.nan, dtype=np.double)
  # Type code 'l' is C long, matching the memoryview element type.
  value_dict = np.zeros(n_groups, dtype=np.dtype('l'))
  count_dict = np.zeros(n_groups, dtype=np.dtype('l'))
  # Serial on purpose: see the docstring.
  for i in range(n_rows):
    x = xx[i]
    value_dict[x] += yy[i]
    count_dict[x] += 1
  # x and y are thread-private inside prange, so each iteration must load its
  # own row instead of reusing values left over from an earlier loop.
  # NOTE: prange only runs multi-threaded when the cell is compiled and linked
  # with OpenMP enabled; otherwise it behaves like a plain range loop.
  for i in prange(n_rows, nogil=True):
    x = xx[i]
    y = yy[i]
    if count_dict[x] > 1:
      # The guard above makes the C-style (unchecked) division safe.
      result[i] = <double>(value_dict[x] - y) / (count_dict[x] - 1)
  return result



In [31]:
%timeit target_mean_v7(data1)

100 loops, best of 3: 8.91 ms per loop
