In [129]:
from __future__ import division

import pandas
import numpy

In [150]:
size = (1000000,4)

A = numpy.random.random(size)*360

In [151]:
df_A = pandas.DataFrame(A)
df_A.columns = ['a', 'b', 'c', 'd']
df_A.head()

Unnamed: 0,a,b,c,d
0,14.238735,15.102979,41.594584,224.523794
1,200.947209,31.165473,165.932666,193.240572
2,57.028766,168.284683,99.401927,323.019624
3,117.752589,126.194713,130.261305,232.500418
4,227.028106,54.814648,67.165154,92.759663


In [114]:
b = numpy.random.randint(2, size=size).astype(bool)

In [104]:
df_A = df_A.mask(b)

In [115]:
%timeit 2*6371*numpy.arcsin(numpy.sqrt((numpy.sin((df_A['a']-df_A['b'])/2))**2+numpy.cos(df_A['a'])*numpy.cos(df_A['b'])*(numpy.sin((df_A['c']-df_A['d'])/2))**2))

1 loop, best of 3: 1.49 s per loop


In [155]:
lng1 = df_A['a']
lng2 = df_A['b']

lat1 = df_A['c']
lat2 = df_A['d']

to_rad = 1/360*numpy.pi*2

%timeit pandas.eval('2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+cos(lat1*to_rad)*cos(lat2*to_rad)*(sin((lng2*to_rad-lng1*to_rad)/2))**2))')

a = pandas.eval('2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+cos(lat1*to_rad)*cos(lat2*to_rad)*(sin((lng2*to_rad-lng1*to_rad)/2))**2))')
a.head()

10 loops, best of 3: 42.5 ms per loop


0    19681.897146
1    18907.250259
2    13688.466294
3    11396.274402
4     2235.453928
dtype: float64

In [154]:
%timeit pandas.eval('((lat1-lng1) <= 10) & ((lat1-lng2) >= 100)')

%timeit ((lat1-lng1) <= 10) & ((lat1-lng2) >= 100)

100 loops, best of 3: 5.39 ms per loop
100 loops, best of 3: 12.6 ms per loop


## Cython
http://doublemap.github.io/blog/2015/05/29/optimizing-python/

In [160]:
%load_ext Cython

In [168]:
from numpy import sin, cos, arccos


def haversine(coord1, coord2):
    """Given two (lat, lng) tuples, returns the distance between them in
    meters."""
    lat1, lng1 = coord1
    lat2, lng2 = coord2

    phi1 = (90.0 - lat1) * 0.0174532925
    phi2 = (90.0 - lat2) * 0.0174532925
    theta1 = lng1 * 0.0174532925
    theta2 = lng2 * 0.0174532925

    c = (sin(phi1) * sin(phi2) * cos(theta1 - theta2) + cos(phi1) * cos(phi2))
    arc = arccos(c)
    return arc * 6367444.7

In [170]:
%timeit haversine((lat1, lng1), (lat2, lng2))

10 loops, best of 3: 164 ms per loop


In [303]:
%%cython --annotate

from __future__ import division

from libc.math cimport sin, cos, asin, sqrt

import numpy as np
cimport numpy as np
cimport cython

ctypedef np.float_t DTYPE_t

@cython.wraparound(False)
@cython.cdivision(True)
@cython.nonecheck(False)
@cython.boundscheck(False)
def haversine_c(np.ndarray[DTYPE_t, ndim=1] lat1_array, np.ndarray[DTYPE_t, ndim=1] lat2_array, np.ndarray[DTYPE_t, ndim=1] lng1_array, np.ndarray[DTYPE_t, ndim=1] lng2_array):
    
    cdef int samples = lat1_array.shape[0]
    cdef int x
    cdef float to_rad = 0.01745329251
    
    cdef np.ndarray[DTYPE_t, ndim=1] h = np.zeros((samples,), dtype=np.float)
    
    cdef float lat1, lat2, lng1, lng2
    
    for x in range(samples):
        
        lat1 = lat1_array[x]
        lat2 = lat2_array[x]
        lng1 = lng1_array[x]
        lng2 = lng2_array[x]
        
        h[x] += 2*6371*asin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+cos(lat1*to_rad)*cos(lat2*to_rad)*(sin((lng2*to_rad-lng1*to_rad)/2))**2))
    
    return h

In [314]:
%%cython --annotate

from __future__ import division

from libc.math cimport sin, cos, asin, sqrt

import numpy as np
cimport numpy as np
cimport cython

ctypedef np.float_t DTYPE_t

@cython.wraparound(False)
@cython.cdivision(True)
@cython.nonecheck(False)
@cython.boundscheck(False)
def haversine_c2(np.ndarray[DTYPE_t, ndim=2] latlngarray):
    
    cdef int samples = latlngarray.shape[0]
    cdef int x
    cdef double to_rad = 0.01745329251
    
    cdef np.ndarray[DTYPE_t, ndim=1] h = np.zeros((samples,), dtype=np.float)
    
    cdef float lat1, lat2, lng1, lng2
    
    for x in range(samples):
        
        latlngarray[x,:]
        
        lat1 = latlngarray[x,0]
        lat2 = latlngarray[x,1]
        lng1 = latlngarray[x,2]
        lng2 = latlngarray[x,3]
        
        h[x] += 2*6371*asin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+cos(lat1*to_rad)*cos(lat2*to_rad)*(sin((lng2*to_rad-lng1*to_rad)/2))**2))
    
    return h

In [313]:
df_A.values.ndim

2

In [317]:
%timeit haversine_c(lat1.values, lng1.values, lat2.values, lng2.values)
%timeit haversine_c2(df_A.values)

10 loops, best of 3: 103 ms per loop
10 loops, best of 3: 85.8 ms per loop


In [256]:
haversine_c(lat1.values, lng1.values, lat2.values, lng2.values)

array([[ 13111.23695717],
       [ 17968.12471423],
       [  2753.14456285],
       ..., 
       [  8460.04859533],
       [  5697.33040221],
       [ 14965.26653564]])

In [295]:
%%cython --annotate

from __future__ import division

from libc.math cimport sin, cos, asin, sqrt

import numpy as np
cimport numpy as np

cimport cython

ctypedef np.float_t DTYPE_t

@cython.wraparound(False)
@cython.cdivision(True)
@cython.nonecheck(False)
@cython.boundscheck(False)
def haversine_c(np.ndarray[DTYPE_t, ndim=1] lat1_array):
    
    cdef int samples = lat1_array.shape[0]
    
    cdef np.ndarray[DTYPE_t, ndim=1] h = np.zeros((100,), dtype=np.float)
    
    cdef int x
        
    for x in range(samples):
        
        h[x]
        
    return h