In [None]:
import numpy as np
import pandas as pd
import timeit

**This notebook captures different ways to filter dataframes that contain string data and compares the execution time of these methods.**

We will:
1. Generate a set of dataframes with different number of rows (1, 10, 100, 1_000, 10_000, 100_000)
2. Create different filter methods
3. Calculate the execution time of each method and determine how much worse the others are compared to the fastest (method time / fastest time - 1)

**Helper functions**
*Plotting/presenting*

In [12]:
def style_negative(v, props=''):
    """Return ``props`` when ``v`` is below zero, otherwise ``None``."""
    if v < 0:
        return props
    return None

def highlight_min(s, props=''):
    """Flag the smallest entries of ``s``.

    Returns an array with ``props`` wherever an entry of ``s`` equals the
    NaN-ignoring minimum of ``s.values`` and ``''`` everywhere else.
    """
    smallest = np.nanmin(s.values)
    is_smallest = s == smallest
    return np.where(is_smallest, props, '')

def highlight_max(s, props=''):
    """Flag the largest entries of ``s``.

    Returns an array with ``props`` wherever an entry of ``s`` equals the
    NaN-ignoring maximum of ``s.values`` and ``''`` everywhere else.
    """
    largest = np.nanmax(s.values)
    is_largest = s == largest
    return np.where(is_largest, props, '')

*Data creation*

In [13]:
def random_array(rows, columns):
    """Return a ``rows`` x ``columns`` array produced by ``np.random.rand``.

    ``rows`` is passed through ``int()`` first, so a float such as ``1e3``
    is accepted as a row count.
    """
    n_rows = int(rows)
    return np.random.rand(n_rows, columns)

# Seed NumPy's global RNG so that a fresh "Restart & Run All" regenerates
# exactly the same arrays (random_array draws from np.random.rand).
SEED = 42
np.random.seed(SEED)

# One array per benchmark size, three columns each.
a1 = random_array(1, 3)
a2 = random_array(10, 3)
a3 = random_array(100, 3)
a4 = random_array(1_000, 3)
a5 = random_array(10_000, 3)
a6 = random_array(100_000, 3)

# Example output
print(a1, a2, sep='\n\n')

[[0.74390648 0.56213611 0.62668242]]

[[0.81485235 0.36854371 0.41038085]
 [0.40843746 0.49671776 0.76371875]
 [0.06417739 0.48729016 0.38110063]
 [0.63517912 0.80929681 0.04016806]
 [0.58092426 0.25081664 0.77342315]
 [0.87532778 0.71843202 0.63076544]
 [0.91690661 0.42540879 0.84439086]
 [0.160405   0.38311527 0.7100582 ]
 [0.21769687 0.26837316 0.55061125]
 [0.37342344 0.30770088 0.84103679]]


In [14]:
def random_dataframe(np_array):
    """Wrap ``np_array`` in a DataFrame and add a string ``cat`` column.

    Columns are renamed to ``Col<original label>`` (``Col0``, ``Col1``, ...).
    ``cat`` is derived from ``Col0``: ``'Blue'`` below 0.2, ``'green'`` above
    0.8 and ``'red'`` for everything else.
    """
    frame = pd.DataFrame(np_array)
    frame.columns = [f'Col{label}' for label in frame.columns]

    first_col = frame['Col0']
    # Start from the green/red split, then overwrite the low values with 'Blue'.
    labels = np.where(first_col > 0.8, 'green', 'red')
    labels = np.where(first_col < 0.2, 'Blue', labels)
    frame['cat'] = labels

    return frame

# Build one labelled dataframe per random array, in the same order as a1..a6.
d1, d2, d3, d4, d5, d6 = map(random_dataframe, (a1, a2, a3, a4, a5, a6))

# Example output
print(d1, d2, sep='\n\n')

       Col0      Col1      Col2
0  0.743906  0.562136  0.626682

       Col0      Col1      Col2
0  0.814852  0.368544  0.410381
1  0.408437  0.496718  0.763719
2  0.064177  0.487290  0.381101
3  0.635179  0.809297  0.040168
4  0.580924  0.250817  0.773423
5  0.875328  0.718432  0.630765
6  0.916907  0.425409  0.844391
7  0.160405  0.383115  0.710058
8  0.217697  0.268373  0.550611
9  0.373423  0.307701  0.841037


## Filter methods

In [15]:
class FilterMethods:
    """Three interchangeable ways of keeping rows whose value is <= a threshold.

    Every ``filter_*`` method returns the rows of ``pd_df`` whose value in
    ``column`` is less than or equal to ``threshold``; they differ only in how
    the selection is carried out, which is what the notebook benchmarks.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Frame to filter. It is never modified.
    threshold : float, default 0.2
        Upper bound (inclusive) for the values that are kept.
    column : str, default 'Col0'
        Name of the column the threshold is compared against.
    """

    def __init__(self, pd_df, threshold=0.2, column='Col0'):
        self.pd_df = pd_df
        self.threshold = threshold
        self.column = column

    def filter_for_loops(self):
        """Filter with an explicit Python loop over row positions.

        The per-row ``iloc[i][column]`` lookup is kept on purpose: this method
        is the naive baseline the vectorised variants are measured against.
        """
        positions_to_keep = []
        for i in range(self.pd_df.shape[0]):
            # Same `<=` test as the other methods, so NaN rows are excluded
            # consistently by all three.
            if self.pd_df.iloc[i][self.column] <= self.threshold:
                positions_to_keep.append(i)

        # Select by position: the loop counter is a position, not an index
        # label, so label-based `drop` would be wrong for non-default indexes.
        return self.pd_df.iloc[positions_to_keep]

    def filter_booleans(self):
        """Filter with a boolean mask."""
        return self.pd_df[self.pd_df[self.column] <= self.threshold]

    def filter_query(self):
        """Filter with ``DataFrame.query``.

        The column name is wrapped in backticks so that names which are not
        valid Python identifiers can also be used.
        """
        return self.pd_df.query(f'`{self.column}` <= {self.threshold}')

In [16]:
# Testing: run each filter on the 10-row dataframe (d2) so the three results
# can be compared by eye. First the row-by-row loop implementation.
fm = FilterMethods(pd_df=d2, threshold=0.2)
fm.filter_for_loops()

Unnamed: 0,Col0,Col1,Col2
2,0.064177,0.48729,0.381101
7,0.160405,0.383115,0.710058


In [17]:
# Boolean-mask implementation on the same FilterMethods instance.
fm.filter_booleans()

Unnamed: 0,Col0,Col1,Col2
2,0.064177,0.48729,0.381101
7,0.160405,0.383115,0.710058


In [18]:
# DataFrame.query implementation on the same FilterMethods instance.
fm.filter_query()

Unnamed: 0,Col0,Col1,Col2
2,0.064177,0.48729,0.381101
7,0.160405,0.383115,0.710058


# Execution time summary
We display the final output as the ratio of each method's execution time to that of the fastest method. Instead of looking at the absolute execution time, we prefer the relative time. This is because you might run the code on different machines, and it is more interesting to know that method A was 3x slower than to know it was 35 seconds slower.

In [30]:
# Timing column (also the suffix of the FilterMethods method) -> name of the
# column holding its slowdown relative to the fastest method.
deterioration_columns = {
    'for_loops': 'for_loop_deterioration',
    'booleans': 'booleans_deterioration',
    'query': 'query_deterioration',
}
results_dict = {key: [] for key in ['rows', *deterioration_columns]}

n = 10  # number of calls timed per method and per dataframe

for dataframe in (d1, d2, d3, d4, d5, d6):
    results_dict['rows'].append(len(dataframe))

    fm = FilterMethods(pd_df=dataframe, threshold=0.2)
    for method_name in deterioration_columns:
        elapsed = timeit.timeit(getattr(fm, f'filter_{method_name}'), number=n)
        results_dict[method_name].append(elapsed)

# One row per dataframe size, one column per method.
results_df = pd.DataFrame(results_dict).set_index('rows')

# Fastest total time per dataframe size, taken over the three timing columns
# only (the deterioration columns are added afterwards).
fastest = results_df.min(axis=1)
results_df['min_time_of_all_methods'] = fastest
for timing_col, ratio_col in deterioration_columns.items():
    # 0 means "this is the fastest method"; 1 means "takes twice as long".
    results_df[ratio_col] = results_df[timing_col] / fastest - 1

In [31]:
# Slowdown of each method relative to the fastest one: one row per method,
# one column per dataframe size. Per column, the smallest value is shown on
# dark green and the largest on red.
deterioration_cols = ['for_loop_deterioration', 'booleans_deterioration', 'query_deterioration']
deterioration_by_rows = results_df[deterioration_cols].T
present_df = (
    deterioration_by_rows.style
    .format(precision=2)
    .apply(highlight_min, props='color:white;background-color:darkgreen', axis=0)
    .apply(highlight_max, props='color:white;background-color:red', axis=0)
)
present_df

rows,1,10,100,1000,10000,100000
for_loop_deterioration,0.84,2.88,28.2,273.39,1982.91,7131.77
booleans_deterioration,0.0,0.0,0.0,0.0,0.0,0.0
query_deterioration,2.16,2.69,2.64,2.43,2.1,0.59
