<a href="https://colab.research.google.com/github/John1Tang/ML-000/blob/main/optimize_target_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Lib

In [None]:
!pip3 install cython
!pip3 install line_profiler

## Generate Sample Data

In [None]:
# coding = 'utf-8'
import numpy as np
import pandas as pd

In [None]:
y = np.random.randint(2, size=(500, 1))
x = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

### Original Function

In [None]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

### Improved During the Class

In [None]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

### Reduce Time to index

In [None]:
def target_mean_v3(data, y_name, x_name):
    data_shape = data.shape[0]
    result = np.zeros(data_shape)
    value_dict = dict()
    count_dict = dict()

    loc_x_list = data.loc[:, x_name].values
    loc_y_list = data.loc[:, y_name].values
    for i in range(data_shape):
        loc_x = loc_x_list[i]
        loc_y = loc_y_list[i]
        value_dict.setdefault(loc_x, 0.0)
        value_dict[loc_x] += loc_y
        count_dict.setdefault(loc_x, 0)
        count_dict[loc_x] += 1

    for i in range(data_shape):
        loc_x = loc_x_list[i]
        loc_y = loc_y_list[i]
        result[i] = (value_dict[loc_x] - loc_y) / (count_dict[loc_x] - 1)
    return result

### Cost

In [17]:
%%timeit -n 10
target_mean_v1(data, 'y', 'x')

10 loops, best of 3: 2.69 s per loop


### Cost

In [18]:
%%timeit -n 10
target_mean_v2(data, 'y', 'x')

10 loops, best of 3: 32.4 ms per loop


### Cost

In [19]:
%%timeit -n 10
target_mean_v3(data, 'y', 'x')

10 loops, best of 3: 1.1 ms per loop


### Check Correctness

In [20]:
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')
result_3 = target_mean_v3(data, 'y', 'x')

diff = np.linalg.norm(result_1 - result_2)
print(diff)

diff13 = np.linalg.norm(result_1 - result_3)
print(diff)

0.0
0.0


## Profile

In [21]:
from line_profiler import LineProfiler

profile = LineProfiler(target_mean_v3)
profile.enable()
target_mean_v3(data, 'y', 'x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.005611 s
File: <ipython-input-16-93be6b75cd46>
Function: target_mean_v3 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v3(data, y_name, x_name):
     2         1         20.0     20.0      0.4      data_shape = data.shape[0]
     3         1          8.0      8.0      0.1      result = np.zeros(data_shape)
     4         1          1.0      1.0      0.0      value_dict = dict()
     5         1          1.0      1.0      0.0      count_dict = dict()
     6                                           
     7         1        258.0    258.0      4.6      loc_x_list = data.loc[:, x_name].values
     8         1        269.0    269.0      4.8      loc_y_list = data.loc[:, y_name].values
     9       501        393.0      0.8      7.0      for i in range(data_shape):
    10       500        537.0      1.1      9.6          loc_x = loc_x_list[i]
    11       500       

## Use Cython

In [22]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [23]:
%%cython -a

import cython
import numpy as np
cimport numpy as cnp
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v4(cnp.ndarray[int, ndim=2] data):
    cdef cnp.ndarray[int] x = data[:,1]
    cdef cnp.ndarray[int] y = data[:,0]
    cdef int data_shape = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(data_shape)
    cdef cnp.ndarray[int] value_dict = np.zeros(data_shape).astype(np.intc)
    cdef cnp.ndarray[int] count_dict = np.zeros(data_shape).astype(np.intc)
    cdef int i
    for i in range(data_shape):
        value_dict[x[i]] += y[i]
        count_dict[x[i]] += 1
    for i in range(data_shape):
        result[i] = (value_dict[x[i]] - y[i]) / (count_dict[x[i]] - 1)
    return result

### Cost

In [24]:
%%timeit -n 10
target_mean_v4(data.values.astype(np.intc))

10 loops, best of 3: 31.2 µs per loop


### Run in Parallel with OpenMP

In [25]:
%%cython -a

import cython
import numpy as np
cimport numpy as cnp
import pandas as pd
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v5(cnp.ndarray[int, ndim=2] data):
    cdef cnp.ndarray[int] x = data[:,1]
    cdef cnp.ndarray[int] y = data[:,0]
    cdef int data_shape = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(data_shape)
    cdef cnp.ndarray[int] value_dict = np.zeros(data_shape).astype(np.intc)
    cdef cnp.ndarray[int] count_dict = np.zeros(data_shape).astype(np.intc)
    cdef int i
    for i in prange(data_shape, nogil=True):
        value_dict[x[i]] += y[i]
        count_dict[x[i]] += 1
    for i in prange(data_shape, nogil=True):
        result[i] = (value_dict[x[i]] - y[i]) / (count_dict[x[i]] - 1)
    return result

### Cost

In [26]:
%%timeit -n 10
target_mean_v5(data.values.astype(np.intc))

The slowest run took 6.62 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 3: 21.5 µs per loop
