In [40]:
import itertools

import recordlinkage
from recordlinkage.datasets import load_febrl1, load_febrl3

import pandas as pd
import numpy as np

In [44]:
# Small benchmark dataset (FEBRL 1 loader from recordlinkage.datasets).
a = load_febrl1()
# Displayed as the cell output: (rows, columns).
a.shape

(1000, 10)

In [45]:
# Larger benchmark dataset (FEBRL 3 loader from recordlinkage.datasets).
b = load_febrl3()
# Displayed as the cell output: (rows, columns).
b.shape

(5000, 10)

## Linking two dataframes

In [175]:
# Cost of building the full cartesian product of index labels, measured
# on the small frame (a) and on the large frame (b).
%timeit pd.MultiIndex.from_product([a.index.values, a.index.values])
%timeit pd.MultiIndex.from_product([b.index.values, b.index.values])

100 loops, best of 3: 8.29 ms per loop
1 loop, best of 3: 189 ms per loop


## Linking one dataframe

In [49]:
def current_full(df):
    """Return every pair of index labels ``(l, r)`` of ``df`` with ``l < r``.

    The complete cartesian product of the index with itself is
    materialised first and then reduced with a boolean mask, so the
    intermediate object holds ``len(df) ** 2`` entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are paired with themselves.

    Returns
    -------
    pandas.MultiIndex
        Two-level index containing only the pairs whose first label
        compares strictly less than the second.
    """
    values = df.index.values
    pairs = pd.MultiIndex.from_product([values, values])

    left = pairs.get_level_values(0)
    right = pairs.get_level_values(1)

    # The comparison is on label values, not on row positions.
    keep = left < right
    return pairs[keep]

In [164]:
def current_full_numepxr(df):
    """Product-then-filter pair generation with the mask computed by ``pd.eval``.

    Builds the cartesian product of ``df.index.values`` with itself and
    keeps the pairs whose first label is strictly less than the second.
    The only difference from a plain ``<`` comparison is that the mask is
    evaluated through ``pd.eval`` with ``engine='numexpr'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are paired with themselves.

    Returns
    -------
    pandas.MultiIndex
        The subset of the full product selected by the ``ia < ib`` mask.

    Notes
    -----
    NOTE(review): ``engine='numexpr'`` presumably needs the optional
    ``numexpr`` package to be installed -- confirm before relying on it.
    """
    x = pd.MultiIndex.from_product([df.index.values, df.index.values])
    
    # The names `ia` and `ib` must stay as they are: the expression string
    # passed to pd.eval below refers to these local variables by name.
    ia = x.get_level_values(0)
    ib = x.get_level_values(1)
    return x[pd.eval('ia < ib', engine='numexpr')]

In [169]:
def new_full(df):
    """Return all unordered pairs of rows of ``df`` as a two-level MultiIndex.

    Rather than materialising the full ``n x n`` product and filtering it,
    the integer codes of the strict upper triangle (``i < j`` by row
    position) are generated directly with ``np.triu_indices`` and attached
    to the index values as levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are paired with each other.

    Returns
    -------
    pandas.MultiIndex
        ``n * (n - 1) / 2`` pairs ``(index[i], index[j])`` with ``i < j``.

    Notes
    -----
    Pairs are selected by row *position*, not by comparing label values,
    so for an index that is not sorted the orientation of a pair can
    differ from a label-based ``<`` filter.

    The index values are used as levels without validation
    (``verify_integrity=False``); this assumes the index labels are
    unique -- TODO confirm for the data passed in.
    """
    levels = [df.index.values, df.index.values]
    codes = list(np.triu_indices(len(df.index), k=1))

    try:
        # pandas >= 0.24 names this argument `codes`; the former `labels`
        # keyword was removed in pandas 1.0.
        return pd.MultiIndex(levels=levels, codes=codes, verify_integrity=False)
    except TypeError:
        # pandas < 0.24 only accepts the old keyword name.
        return pd.MultiIndex(levels=levels, labels=codes, verify_integrity=False)

In [203]:
def new_itertools(df):
    """Return all unordered pairs of rows of ``df`` using ``itertools.combinations``.

    The positional pairs ``(i, j)`` with ``i < j`` are enumerated in pure
    Python, converted to an integer array and used as the codes of a
    two-level MultiIndex whose levels are the index values of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are paired with each other.

    Returns
    -------
    pandas.MultiIndex
        ``n * (n - 1) / 2`` pairs ``(index[i], index[j])`` with ``i < j``;
        empty when ``df`` has fewer than two rows.

    Notes
    -----
    The index values are used as levels without validation
    (``verify_integrity=False``); this assumes the index labels are
    unique -- TODO confirm for the data passed in.
    """
    positions = itertools.combinations(np.arange(len(df)), 2)

    # Force an integer (n_pairs, 2) array. With fewer than two rows there are
    # no combinations, and a bare np.array([]) would be 1-D float, which
    # cannot be column-sliced below.
    pair_codes = np.array(list(positions), dtype=np.intp).reshape(-1, 2)

    levels = [df.index.values, df.index.values]
    codes = [pair_codes[:, 0], pair_codes[:, 1]]

    try:
        # pandas >= 0.24 names this argument `codes`; the former `labels`
        # keyword was removed in pandas 1.0.
        return pd.MultiIndex(levels=levels, codes=codes, verify_integrity=False)
    except TypeError:
        # pandas < 0.24 only accepts the old keyword name.
        return pd.MultiIndex(levels=levels, labels=codes, verify_integrity=False)


In [192]:
# Sanity check on the small frame: the number of pairs should be
# n * (n - 1) / 2, i.e. 499500 for n = 1000.
r = current_full(a)
print (r.shape)
# print (r[0:10])

(499500,)


In [198]:
# Sanity check on the large frame: the number of pairs should be
# n * (n - 1) / 2, i.e. 12497500 for n = 5000.
r = current_full(b)
print (r.shape)
# print (r[0:10])

(12497500,)


In [194]:
# The pd.eval-based variant should yield the same number of pairs as
# current_full on the small frame.
r = current_full_numepxr(a)
print (r.shape)
# print (r[0:10])

(499500,)


In [156]:
# The triu_indices-based variant should yield n * (n - 1) / 2 pairs on
# the small frame.
r = new_full(a)
print (r.shape)
# print (r[0:10])

(499500,)


In [174]:
# Same pair-count check for the triu_indices-based variant on the large frame.
r = new_full(b)
print (r.shape)
# print (r[0:10])

(12497500,)


In [205]:
# The itertools.combinations-based variant should yield n * (n - 1) / 2
# pairs on the small frame.
r = new_itertools(a)
print (r.shape)
# print (r[0:10])

(499500,)


In [162]:
# Baseline timing: build the full product, then filter with a boolean mask.
%timeit current_full(a);
%timeit current_full(b);

1 loop, best of 3: 188 ms per loop
1 loop, best of 3: 4.95 s per loop


In [165]:
# Same approach as the baseline, with the mask evaluated via pd.eval / numexpr.
%timeit current_full_numepxr(a);
%timeit current_full_numepxr(b);

1 loop, best of 3: 186 ms per loop
1 loop, best of 3: 4.79 s per loop


In [170]:
# Candidate replacement: codes generated directly with np.triu_indices,
# without materialising the full product.
%timeit new_full(a);
%timeit new_full(b);

100 loops, best of 3: 7.34 ms per loop
1 loop, best of 3: 255 ms per loop


In [206]:
# Alternative: codes enumerated in pure Python with itertools.combinations.
%timeit new_itertools(a);
%timeit new_itertools(b);

1 loop, best of 3: 241 ms per loop
1 loop, best of 3: 6.15 s per loop
