In [1]:
import numpy as np
import pandas as pd
import timeit

**This notebook captures different ways in which to filter data dataframes with string data and compares the execution time of these methods.**

We will:
1. Generate a set of dataframes with different number of rows (1, 10, 100, 1_000, 10_000, 100_000)
2. Create different filter methods
3. Calculate the execution time for each and determine how much worse are the rest compared to the fastest (1-method time/fastest time)

**Helper functions**
*Plotting/presenting*

In [2]:
def style_negative(v, props=''):
    return props if v < 0 else None

def highlight_min(s, props=''):
    return np.where(s == np.nanmin(s.values), props, '')

def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

*Data creation*

In [3]:
def random_array(rows, columns):
    return np.random.rand(int(rows), columns)

a1 = random_array(1, 3)
a2 = random_array(10, 3)
a3 = random_array(100, 3)
a4 = random_array(1_000, 3)
a5 = random_array(10_000, 3)
a6 = random_array(100_000, 3)

# Example output
print(a1, a2, sep='\n\n')

[[0.00976712 0.63378325 0.10083864]]

[[0.78120957 0.76874644 0.75871791]
 [0.60406791 0.78987309 0.17163633]
 [0.32820779 0.2259688  0.24204104]
 [0.89634332 0.59647016 0.94259758]
 [0.78333348 0.93288204 0.83350076]
 [0.7686362  0.91446473 0.10439522]
 [0.53504098 0.36766139 0.69741891]
 [0.1843358  0.4890348  0.96643135]
 [0.26312983 0.08539515 0.02822771]
 [0.1133378  0.01583675 0.08775055]]


In [11]:
def random_dataframe(np_array):
    df = pd.DataFrame(np_array)
    df.columns = 'Col' + df.columns.astype(str)

    df['cat'] = np.where(df['Col0']<0.2, 'blue', np.where(df['Col0'] > 0.8, 'green', 'red'))

    return df

d1 = random_dataframe(a1)
d2 = random_dataframe(a2)
d3 = random_dataframe(a3)
d4 = random_dataframe(a4)
d5 = random_dataframe(a5)
d6 = random_dataframe(a6)

# Example output
print(d1, d2, sep='\n\n')

       Col0      Col1      Col2   cat
0  0.009767  0.633783  0.100839  blue

       Col0      Col1      Col2    cat
0  0.781210  0.768746  0.758718    red
1  0.604068  0.789873  0.171636    red
2  0.328208  0.225969  0.242041    red
3  0.896343  0.596470  0.942598  green
4  0.783333  0.932882  0.833501    red
5  0.768636  0.914465  0.104395    red
6  0.535041  0.367661  0.697419    red
7  0.184336  0.489035  0.966431   blue
8  0.263130  0.085395  0.028228    red
9  0.113338  0.015837  0.087751   blue


## Filter methods

In [31]:
class FilterMethods:
    def __init__(self, pd_df, cat1='blue', cat2='green'):
        self.pd_df = pd_df
        self.cat1 = cat1
        self.cat2 = cat2

    def filter_booleans(self):
        return self.pd_df[(self.pd_df['cat']==self.cat1) | (self.pd_df['cat']==self.cat2)]

    def filter_isin(self):
        return self.pd_df[self.pd_df['cat'].isin([self.cat1, self.cat2])]

    def filter_contains(self):
        return self.pd_df[self.pd_df['cat'].str.contains(f'{self.cat1}|{self.cat2}')]

    def filter_query_booleans(self):
        return self.pd_df.query(f'cat == "{self.cat1}" or cat == "{self.cat2}"')

    def filter_query_in(self):
        return self.pd_df.query(f'cat in ["{self.cat1}" , "{self.cat2}"]')

fm = FilterMethods(pd_df=d2)

In [32]:
fm.filter_booleans()

Unnamed: 0,Col0,Col1,Col2,cat
3,0.896343,0.59647,0.942598,green
7,0.184336,0.489035,0.966431,blue
9,0.113338,0.015837,0.087751,blue


In [33]:
fm.filter_isin()

Unnamed: 0,Col0,Col1,Col2,cat
3,0.896343,0.59647,0.942598,green
7,0.184336,0.489035,0.966431,blue
9,0.113338,0.015837,0.087751,blue


In [34]:
fm.filter_contains()

Unnamed: 0,Col0,Col1,Col2,cat
3,0.896343,0.59647,0.942598,green
7,0.184336,0.489035,0.966431,blue
9,0.113338,0.015837,0.087751,blue


In [35]:
fm.filter_query_booleans()

Unnamed: 0,Col0,Col1,Col2,cat
3,0.896343,0.59647,0.942598,green
7,0.184336,0.489035,0.966431,blue
9,0.113338,0.015837,0.087751,blue


In [36]:
fm.filter_query_in()

Unnamed: 0,Col0,Col1,Col2,cat
3,0.896343,0.59647,0.942598,green
7,0.184336,0.489035,0.966431,blue
9,0.113338,0.015837,0.087751,blue


# Execution time summary
We display the final output as the ratio of each execution method vs the fastest method. Instead of looking at the absolute execution time, we prefer the relative time. This is because you might run the code in different machines, and it might be more interesting to know that method A was 3x slower, than to know it was 35 seconds slower.

In [37]:
results_dict = {'rows': [], 'booleans': [], 'isin': [], 'contains': [], 'query_booleans': [], 'query_in': []}

n = 10

for dataframe in [d1, d2, d3, d4, d5, d6]:
    results_dict['rows'].append(dataframe.shape[0])

    fm = FilterMethods(pd_df=dataframe)
    results_dict['booleans'].append(timeit.timeit(lambda: fm.filter_booleans(), number=n))
    results_dict['isin'].append(timeit.timeit(lambda: fm.filter_isin(), number=n))
    results_dict['contains'].append(timeit.timeit(lambda: fm.filter_contains(), number=n))
    results_dict['query_booleans'].append(timeit.timeit(lambda: fm.filter_query_booleans(), number=n))
    results_dict['query_in'].append(timeit.timeit(lambda: fm.filter_query_in(), number=n))

results_df = pd.DataFrame(results_dict)
results_df.index = results_df['rows']
results_df = results_df.drop('rows', axis=1)

results_df['min_time_of_all_methods'] = results_df.min(axis=1)
results_df['booleans_deterioration'] = (results_df['booleans']/results_df['min_time_of_all_methods'])-1
results_df['isin_deterioration'] = (results_df['isin']/results_df['min_time_of_all_methods'])-1
results_df['contains_deterioration'] = (results_df['contains']/results_df['min_time_of_all_methods'])-1
results_df['query_booleans_deterioration'] = (results_df['query_booleans']/results_df['min_time_of_all_methods'])-1
results_df['query_in_deterioration'] = (results_df['query_in']/results_df['min_time_of_all_methods'])-1

In [39]:
# Execution time proportion between of method/fastest.
present_df = results_df[['booleans_deterioration','isin_deterioration','contains_deterioration','query_booleans_deterioration','query_in_deterioration']].transpose()
present_df = (present_df.style.format(precision=2)
              .apply(highlight_min, props='color:white;background-color:darkgreen', axis=0)
              .apply(highlight_max, props='color:white;background-color:red', axis=0)
             )
present_df

rows,1,10,100,1000,10000,100000
booleans_deterioration,0.52,0.68,0.75,0.98,1.02,1.19
isin_deterioration,0.0,0.0,0.0,0.0,0.0,0.0
contains_deterioration,0.09,0.16,0.56,1.09,4.37,5.04
query_booleans_deterioration,3.49,3.88,4.94,3.79,1.07,0.2
query_in_deterioration,1.8,2.49,3.64,2.53,0.74,0.04
