# Compare numeric values

Import numeric modules.

In [16]:
from __future__ import division, print_function

import pandas
import numpy


import recordlinkage

## Sample data

In [2]:
size = (1000000,4)

df_A = pandas.DataFrame(
    numpy.random.random(size)*360, 
    columns = ['a', 'b', 'c', 'd']
)

df_B = pandas.DataFrame(
    numpy.random.random(size)*360, 
    columns = ['a', 'b', 'c', 'd']
)

pairs = pandas.MultiIndex.from_arrays([df_A.index.values, df_B.index.values])

## Current implementation

In [3]:
comp = recordlinkage.Compare(pairs, df_A, df_B )
%timeit -n10 comp.geo('a', 'b', 'a', 'b', 'linear', scale=10, offset=20)

%prun comp.geo('a', 'b', 'a', 'b', 'linear', scale=10, offset=20)

10 loops, best of 3: 354 ms per loop
 

## Numexpr implementation

Use the package ``numexpr`` to compute the haversine. ``numexpr`` uses multicore processing and reformats the mathematical expression. 

In [4]:
haversine_expr = '2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+cos(lat1*to_rad)*cos(lat2*to_rad)*(sin((lng2*to_rad-lng1*to_rad)/2))**2))'

lat1, lng1 = df_A['a'], df_A['b']
lat2, lng2 = df_B['a'], df_B['b']

to_rad = numpy.deg2rad(1)

print("The default implementation")
%timeit -n10 pandas.eval(haversine_expr)

print("The python implementation")
%timeit -n10 pandas.eval(haversine_expr, engine='python')

print("The numexpr implementation")
%timeit -n10 pandas.eval(haversine_expr, engine='numexpr')

The default implementation
10 loops, best of 3: 62.7 ms per loop
The python implementation
10 loops, best of 3: 192 ms per loop
The numexpr implementation
10 loops, best of 3: 66.6 ms per loop


## Cython
http://doublemap.github.io/blog/2015/05/29/optimizing-python/

Use Cython to make a C-version of the haversine distance computation.

In [27]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [32]:
%%cython --annotate

from libc.math cimport sin, cos, asin, sqrt

import numpy as np
cimport numpy as np
cimport cython

ctypedef np.float_t DTYPE_t

@cython.wraparound(False)
@cython.cdivision(True)
@cython.nonecheck(False)
@cython.boundscheck(False)
cdef np.ndarray[DTYPE_t, ndim=1] haversine_c(np.ndarray[DTYPE_t, ndim=1] lat1_array, 
                 np.ndarray[DTYPE_t, ndim=1] lat2_array, 
                 np.ndarray[DTYPE_t, ndim=1] lng1_array, 
                 np.ndarray[DTYPE_t, ndim=1] lng2_array):
    
    cdef int samples = lat1_array.shape[0]
    cdef int x
    cdef float to_rad = 0.01745329251
    
    cdef np.ndarray[DTYPE_t, ndim=1] h = np.zeros((samples,), dtype=np.float)

    cdef float lat1, lat2, lng1, lng2
    
    for x in range(samples):
        
        lat1 = lat1_array[x]
        lat2 = lat2_array[x]
        lng1 = lng1_array[x]
        lng2 = lng2_array[x]
        
        h[x] += 2*6371*asin(sqrt((sin(((lat2-lat1)*to_rad)/2))**2+cos(lat1*to_rad)*cos(lat2*to_rad)*(sin(((lng2-lng1)*to_rad)/2))**2))
    
    return h

In [33]:
%timeit haversine_c(df_A['a'].values, df_A['b'].values, df_B['a'].values, df_B['b'].values)

NameError: global name 'haversine_c' is not defined

## Parallel computing

In [34]:
from multiprocessing import Pool, cpu_count
print ("The number of cpu's", cpu_count())

The number of cpu's 4


In [40]:
import math
def f(x):
    y = [1]*1000000
    [math.exp(i) for i in y]
def g(x):
    y = numpy.ones(1000000)
    numpy.exp(y)

In [38]:
r_times = range(0, 100)

# Use 4 cores. 
p = Pool(cpu_count()-1)

# numpy implementation
%timeit -n3 p.map(g, r_times)
# python implementation
%timeit -n3 p.map(f, r_times)

%timeit -n3 [g(i) for i in r_times]

%timeit -n3 [f(i) for i in r_times]

3 loops, best of 3: 35.2 ms per loop
3 loops, best of 3: 12.6 s per loop
3 loops, best of 3: 72.6 ms per loop


In [39]:
%timeit -n3 [f(i) for i in r_times]

3 loops, best of 3: 26.2 s per loop


In [25]:
p = Pool(cpu_count()-1)

p.map(haversine_c, df_A['a'].values, df_A['b'].values, df_B['a'].values, df_B['b'].values)

NameError: name 'haversine_c' is not defined