In [1]:
import numpy as np
import pandas as pd
import timeit

**This notebook captures different ways in which to join 2 pandas dataframes and compares the execution time of these methods.**

We will:
1. Generate a set of dataframes with different number of rows (1, 10, 100, 1_000, 10_000, 100_000). One dataframe will have duplicated strings or numbers, and the other will contain unique values.
2. Create different join methods
3. Calculate the execution time for each and determine how much worse are the rest compared to the fastest (1-method time/fastest time)

**Helper functions**
*Plotting/presenting*

In [2]:
def style_negative(v, props=''):
    return props if v < 0 else None

def highlight_min(s, props=''):
    return np.where(s == np.nanmin(s.values), props, '')

def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

*Data creation*

In [6]:
def random_dataframe(rows):
    repeated_key_numeric = np.random.randint(low=0, high=rows/10, size=rows)
    unique_key_numeric = np.arange(start=0, stop=rows, step=1, dtype=int)

    repeated_key_string = ['id' + str(i) for i in repeated_key_numeric]
    unique_key_string = ['id' + str(i) for i in unique_key_numeric]
    df = pd.DataFrame({'r_numeric': repeated_key_numeric,
                       'u_numeric': unique_key_numeric,
                       'r_string': repeated_key_string,
                       'u_string': unique_key_string})

    return df

d1 = random_dataframe(rows=10)
d2 = random_dataframe(rows=100)
d3 = random_dataframe(rows=1_000)
d4 = random_dataframe(rows=10_000)
d5 = random_dataframe(rows=100_000)
d6 = random_dataframe(rows=1_000_000)
d7 = random_dataframe(rows=10_000_000)

# Example output
print(d1, d2, sep='\n\n')

   r_numeric  u_numeric r_string u_string
0          0          0      id0      id0
1          0          1      id0      id1
2          0          2      id0      id2
3          0          3      id0      id3
4          0          4      id0      id4
5          0          5      id0      id5
6          0          6      id0      id6
7          0          7      id0      id7
8          0          8      id0      id8
9          0          9      id0      id9

    r_numeric  u_numeric r_string u_string
0           3          0      id3      id0
1           6          1      id6      id1
2           5          2      id5      id2
3           7          3      id7      id3
4           1          4      id1      id4
..        ...        ...      ...      ...
95          5         95      id5     id95
96          2         96      id2     id96
97          1         97      id1     id97
98          5         98      id5     id98
99          3         99      id3     id99

[100 rows x 4 column

## Filter methods

In [50]:
class JoinMethods:
    def __init__(self, pd_df):
        self.pd_df = pd_df
        self.left_df = self.pd_df[['u_numeric', 'u_string']]
        self.right_df = self.pd_df[['r_numeric', 'r_string']].drop_duplicates()

    def join_merge_numeric(self):
        join_df = pd.merge(self.left_df, self.right_df, how='left', left_on=['u_numeric'], right_on=['r_numeric'])
        return join_df

    def join_merge_string(self):
        join_df = pd.merge(self.left_df, self.right_df, how='left', left_on=['u_string'], right_on=['r_string'])
        return join_df

    def join_indexing_numeric(self):
        return self.left_df.set_index('u_numeric').join(self.right_df.set_index('r_numeric'), how='left')

    def join_indexing_string(self):
        return self.left_df.set_index('u_string').join(self.right_df.set_index('r_string'), how='left')

    def join_set_index_numeric(self):
        self.left_df.set_index('u_numeric')
        self.right_df.set_index('r_numeric')
        return self.left_df.join(self.right_df, how='left')

    def join_set_index_string(self):
        self.left_df.set_index('u_string')
        self.right_df.set_index('r_string')
        return self.left_df.join(self.right_df, how='left')

    def join_set_index_sorted_numeric(self):
        self.left_df.set_index('u_numeric')
        self.left_df.sort_index()
        self.right_df.set_index('r_numeric')
        self.right_df.sort_index()
        return self.left_df.join(self.right_df, how='left')

    def join_set_index_sorted_string(self):
        self.left_df.set_index('u_string')
        self.left_df.sort_index()
        self.right_df.set_index('r_string')
        self.right_df.sort_index()
        return self.left_df.join(self.right_df, how='left')

fm = JoinMethods(pd_df=d2)

In [51]:
fm.join_merge_numeric()
# fm.join_merge_string()
# fm.join_indexing_numeric()
# fm.join_indexing_string()
# fm.join_set_index_numeric()
# fm.join_set_index_string()
# fm.join_set_index_sorted_numeric()
# fm.join_set_index_sorted_string()

Unnamed: 0,u_numeric,u_string,r_numeric,r_string
0,0,id0,0.0,id0
1,1,id1,1.0,id1
2,2,id2,2.0,id2
3,3,id3,3.0,id3
4,4,id4,4.0,id4
...,...,...,...,...
95,95,id95,,
96,96,id96,,
97,97,id97,,
98,98,id98,,


In [49]:
# fm.ifelse_lambda()

In [50]:
# fm.ifelse_loc()

In [51]:
# fm.ifelse_where()

In [53]:
# fm.ifelse_select()

# Execution time summary - numeric
We display the final output as the ratio of each execution method vs the fastest method. Instead of looking at the absolute execution time, we prefer the relative time. This is because you might run the code in different machines, and it might be more interesting to know that method A was 3x slower, than to know it was 35 seconds slower.

In [52]:
results_dict = {'rows': [], 'merge': [], 'indexing': [], 'set_index': [], 'set_index_sorted': []}

n = 10

for dataframe in [d1, d2, d3, d4, d5, d6, d7]:
    results_dict['rows'].append(dataframe.shape[0])

    fm = JoinMethods(pd_df=dataframe)
    results_dict['merge'].append(timeit.timeit(lambda: fm.join_merge_numeric(), number=n))
    results_dict['indexing'].append(timeit.timeit(lambda: fm.join_indexing_numeric(), number=n))
    results_dict['set_index'].append(timeit.timeit(lambda: fm.join_set_index_numeric(), number=n))
    results_dict['set_index_sorted'].append(timeit.timeit(lambda: fm.join_set_index_sorted_numeric(), number=n))

results_df = pd.DataFrame(results_dict)
results_df.index = results_df['rows']
results_df = results_df.drop('rows', axis=1)

results_df['min_time_of_all_methods'] = results_df.min(axis=1)
results_df['merge_deterioration'] = (results_df['merge']/results_df['min_time_of_all_methods'])-1
results_df['indexing_deterioration'] = (results_df['indexing']/results_df['min_time_of_all_methods'])-1
results_df['set_index_deterioration'] = (results_df['set_index']/results_df['min_time_of_all_methods'])-1
results_df['set_index_sorted_deterioration'] = (results_df['set_index_sorted']/results_df['min_time_of_all_methods'])-1

In [53]:
# Execution time proportion between of method/fastest.
present_df = results_df[['merge_deterioration','indexing_deterioration','set_index_deterioration','set_index_sorted_deterioration']].transpose()
present_df = (present_df.style.format(precision=2)
              .apply(highlight_min, props='color:white;background-color:darkgreen', axis=0)
              .apply(highlight_max, props='color:white;background-color:red', axis=0)
             )
present_df

rows,10,100,1000,10000,100000,1000000,10000000
merge_deterioration,0.1,0.06,0.08,0.72,2.64,2.8,5.41
indexing_deterioration,0.0,0.08,0.04,0.17,0.66,0.51,1.35
set_index_deterioration,0.17,0.0,0.0,0.0,0.0,0.0,0.0
set_index_sorted_deterioration,0.2,0.15,0.13,0.18,0.36,0.24,0.25


# Execution time summary - string

In [54]:
results_dict = {'rows': [], 'merge': [], 'indexing': [], 'set_index': [], 'set_index_sorted': []}

n = 10

for dataframe in [d1, d2, d3, d4, d5, d6, d7]:
    results_dict['rows'].append(dataframe.shape[0])

    fm = JoinMethods(pd_df=dataframe)
    results_dict['merge'].append(timeit.timeit(lambda: fm.join_merge_string(), number=n))
    results_dict['indexing'].append(timeit.timeit(lambda: fm.join_indexing_string(), number=n))
    results_dict['set_index'].append(timeit.timeit(lambda: fm.join_set_index_string(), number=n))
    results_dict['set_index_sorted'].append(timeit.timeit(lambda: fm.join_set_index_sorted_string(), number=n))

results_df = pd.DataFrame(results_dict)
results_df.index = results_df['rows']
results_df = results_df.drop('rows', axis=1)

results_df['min_time_of_all_methods'] = results_df.min(axis=1)
results_df['merge_deterioration'] = (results_df['merge']/results_df['min_time_of_all_methods'])-1
results_df['indexing_deterioration'] = (results_df['indexing']/results_df['min_time_of_all_methods'])-1
results_df['set_index_deterioration'] = (results_df['set_index']/results_df['min_time_of_all_methods'])-1
results_df['set_index_sorted_deterioration'] = (results_df['set_index_sorted']/results_df['min_time_of_all_methods'])-1

In [55]:
# Execution time proportion between of method/fastest.
present_df = results_df[['merge_deterioration','indexing_deterioration','set_index_deterioration','set_index_sorted_deterioration']].transpose()
present_df = (present_df.style.format(precision=2)
              .apply(highlight_min, props='color:white;background-color:darkgreen', axis=0)
              .apply(highlight_max, props='color:white;background-color:red', axis=0)
             )
present_df

rows,10,100,1000,10000,100000,1000000,10000000
merge_deterioration,0.0,0.0,0.05,0.59,2.76,4.67,13.3
indexing_deterioration,0.21,0.16,0.1,0.68,1.18,3.54,10.41
set_index_deterioration,0.06,0.02,0.06,0.0,0.0,0.0,0.0
set_index_sorted_deterioration,0.1,0.1,0.0,0.27,0.0,0.24,0.25
