**This notebook demonstrates different ways to filter pandas DataFrames containing numeric data and compares the execution times of these methods.**

# Import required libraries

In [1]:
import numpy as np
import pandas as pd
import timeit

# Helper functions


## Plotting/presenting

In [19]:
def style_negative(v, props=''):
    """Return ``props`` when ``v`` is negative, otherwise ``None``.

    Intended as an element-wise styling callback: ``props`` is the CSS
    string to apply to negative values.
    """
    if v < 0:
        return props
    return None

def highlight_min(s, props=''):
    """Return an array holding ``props`` where ``s`` equals its minimum.

    The minimum is computed with ``np.nanmin`` so NaN entries are ignored;
    every other position gets an empty string.
    """
    smallest = np.nanmin(s.values)
    is_smallest = s == smallest
    return np.where(is_smallest, props, '')

def highlight_max(s, props=''):
    """Return an array holding ``props`` where ``s`` equals its maximum.

    The maximum is computed with ``np.nanmax`` so NaN entries are ignored;
    every other position gets an empty string.
    """
    largest = np.nanmax(s.values)
    is_largest = s == largest
    return np.where(is_largest, props, '')

## Data creation

In [3]:
def random_array(rows, columns, rng=None):
    """Return a ``rows`` x ``columns`` array of uniform random floats in [0, 1).

    Parameters
    ----------
    rows, columns : int or integer-valued float
        Array dimensions. Both are passed through ``int`` so that values
        such as ``1e3`` are accepted.
    rng : numpy.random.Generator, optional
        Generator to draw from. When omitted, the legacy global NumPy random
        state is used (``np.random.rand``), as in the original helper.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(int(rows), int(columns))``.
    """
    shape = (int(rows), int(columns))
    if rng is None:
        return np.random.rand(*shape)
    return rng.random(shape)


# Fixed seed so that Restart & Run All regenerates the same benchmark data.
SEED = 42
rng = np.random.default_rng(SEED)

a1 = random_array(1, 3, rng)
a2 = random_array(10, 3, rng)
a3 = random_array(100, 3, rng)
a4 = random_array(1000, 3, rng)
a5 = random_array(10000, 3, rng)

# Example output
print(a1, a2, sep='\n\n')

[[0.44269621 0.48588314 0.43797785]]

[[0.3397854  0.33246919 0.09022564]
 [0.46119931 0.60575487 0.40024575]
 [0.53936067 0.99958495 0.77276803]
 [0.71741707 0.90390641 0.31956542]
 [0.58070596 0.02177606 0.15787757]
 [0.71603873 0.75159181 0.22766796]
 [0.768755   0.87832289 0.17592466]
 [0.05665252 0.4957822  0.32560798]
 [0.71831049 0.6586061  0.93823641]
 [0.88893692 0.75479253 0.70612879]]


In [5]:
def random_dataframe(np_array):
    """Wrap ``np_array`` in a DataFrame whose column labels are prefixed with 'Col'.

    The default integer column labels 0, 1, 2, ... become 'Col0', 'Col1',
    'Col2', ...
    """
    frame = pd.DataFrame(np_array)
    frame.columns = [f'Col{label}' for label in frame.columns]
    return frame

# Wrap each benchmark array (1 to 10,000 rows) in a DataFrame with 'Col'-prefixed columns.
d1, d2, d3, d4, d5 = (random_dataframe(array) for array in (a1, a2, a3, a4, a5))

# Example output
print(d1, d2, sep='\n\n')

       Col0      Col1      Col2
0  0.442696  0.485883  0.437978

       Col0      Col1      Col2
0  0.339785  0.332469  0.090226
1  0.461199  0.605755  0.400246
2  0.539361  0.999585  0.772768
3  0.717417  0.903906  0.319565
4  0.580706  0.021776  0.157878
5  0.716039  0.751592  0.227668
6  0.768755  0.878323  0.175925
7  0.056653  0.495782  0.325608
8  0.718310  0.658606  0.938236
9  0.888937  0.754793  0.706129


## Filter methods

In [6]:
class FilterMethods:
    """Three interchangeable ways of keeping the rows with ``column <= threshold``.

    All three methods return a new DataFrame and leave ``pd_df`` untouched.
    They apply the same predicate, so rows whose value is NaN are excluded
    by every method.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.2
        Rows are kept when their value in ``column`` is <= this number.
    column : str, default 'Col0'
        Label of the column the threshold is compared against.
    """

    def __init__(self, pd_df, threshold=0.2, column='Col0'):
        self.pd_df = pd_df
        self.threshold = threshold
        self.column = column

    def filter_for_loops(self):
        """Filter with an explicit Python loop over the rows (naive baseline)."""
        positions_to_keep = []
        for i in range(self.pd_df.shape[0]):
            if self.pd_df.iloc[i][self.column] <= self.threshold:
                positions_to_keep.append(i)

        # Select by position: the loop counter is a position, not an index
        # label, so this stays correct for non-default or duplicated indexes.
        return self.pd_df.iloc[positions_to_keep]

    def filter_booleans(self):
        """Filter with a vectorised boolean mask."""
        return self.pd_df[self.pd_df[self.column] <= self.threshold]

    def filter_query(self):
        """Filter with ``DataFrame.query``."""
        # `@threshold` makes query read this local variable instead of parsing
        # the number out of the expression text (which fails for nan/inf).
        threshold = self.threshold
        # Backticks let the column label contain spaces or other
        # non-identifier characters.
        return self.pd_df.query(f'`{self.column}` <= @threshold')

In [7]:
# Testing: run each filter on the 10-row frame `d2`; the three cells
# below should all display the same rows.
fm = FilterMethods(pd_df=d2, threshold=0.2)
fm.filter_for_loops()

Unnamed: 0,Col0,Col1,Col2
7,0.056653,0.495782,0.325608


In [8]:
# Boolean-mask variant on the same `fm` instance.
fm.filter_booleans()

Unnamed: 0,Col0,Col1,Col2
7,0.056653,0.495782,0.325608


In [9]:
# DataFrame.query variant on the same `fm` instance.
fm.filter_query()

Unnamed: 0,Col0,Col1,Col2
7,0.056653,0.495782,0.325608


# Execution time summary

In [11]:
# Time every filter method on each frame size.
# Each recorded value is the total time in seconds of `n` consecutive calls.
n = 10

# Timing column -> name of the FilterMethods method measured for it.
timed_methods = {
    'for_loops': 'filter_for_loops',
    'booleans': 'filter_booleans',
    'query': 'filter_query',
}

results_dict = {'rows': [], 'for_loops': [], 'booleans': [], 'query': []}

for dataframe in [d1, d2, d3, d4, d5]:
    results_dict['rows'].append(dataframe.shape[0])

    # Use a dedicated name so the `fm` instance created in the testing cells
    # is not overwritten by the benchmark loop.
    benchmark_fm = FilterMethods(pd_df=dataframe, threshold=0.2)
    for timing_column, method_name in timed_methods.items():
        # Pass the bound method directly; a lambda wrapper would only add
        # an extra call to every measured iteration.
        elapsed = timeit.timeit(getattr(benchmark_fm, method_name), number=n)
        results_dict[timing_column].append(elapsed)

results_df = pd.DataFrame(results_dict).set_index('rows')

# Fastest method per frame size; must be computed before the derived columns
# are added so that only the raw timings take part in the minimum.
results_df['min_time_of_all_methods'] = results_df.min(axis=1)

# Deterioration = (method time / fastest time) - 1, i.e. 0 for the fastest
# method and 1.0 for a method that takes twice as long.
deterioration_by_timing = {
    'for_loops': 'for_loop_deterioration',
    'booleans': 'booleans_deterioration',
    'query': 'query_deterioration',
}
for timing_column, deterioration_column in deterioration_by_timing.items():
    results_df[deterioration_column] = (
        results_df[timing_column] / results_df['min_time_of_all_methods']
    ) - 1

In [20]:
# Slowdown of each method relative to the fastest one, per row count
# (0 means the method was the fastest for that size).
shown_columns = ['for_loop_deterioration', 'booleans_deterioration', 'query_deterioration']
deterioration_by_method = results_df[shown_columns].T

# Per column (one column per row count): best method in dark blue, worst in red.
present_df = (
    deterioration_by_method.style
    .apply(highlight_min, props='color:white;background-color:darkblue', axis=0)
    .apply(highlight_max, props='color:white;background-color:red', axis=0)
)
present_df

rows,1,10,100,1000,10000
for_loop_deterioration,0.055619,1.119378,27.34915,224.614964,1855.693705
booleans_deterioration,0.0,0.0,0.0,0.0,0.0
query_deterioration,3.268267,2.015899,3.846454,3.254351,3.029263
