This notebook demonstrates different ways to filter data (NumPy arrays, pandas DataFrames and lists) and compares the execution time of each method.

#### Import required libraries

In [10]:
import numpy as np
import pandas as pd
import timeit

**To-do list**
* Filter on a continuous variable in pandas
* Filter strings in pandas
* Filter lists
* Filter NumPy arrays

#### Helper functions
##### Data creation

In [11]:
def random_array(rows, columns, seed=None):
    """Return a ``rows`` x ``columns`` array of uniform random floats in [0, 1).

    Args:
        rows: number of rows; coerced with ``int()`` so values such as ``1e3`` work.
        columns: number of columns; coerced with ``int()`` in the same way.
        seed: optional seed. When given, the values come from a dedicated
            generator seeded with it, so the same seed always yields the same
            array. When ``None`` (default), NumPy's global random state is used.

    Returns:
        A 2-D ``numpy.ndarray`` of shape ``(int(rows), int(columns))``.
    """
    shape = (int(rows), int(columns))

    if seed is None:
        # Global state: a np.random.seed() call made by the caller still applies.
        return np.random.rand(*shape)

    return np.random.default_rng(seed).random(shape)

# Seed NumPy's global generator so the arrays below (and everything derived from
# them) are identical on every Restart & Run All.
np.random.seed(42)

# Arrays with 3 columns and 1, 10, 100, 1,000 and 10,000 rows
a1 = random_array(1, 3)
a2 = random_array(10, 3)
a3 = random_array(100, 3)
a4 = random_array(1000, 3)
a5 = random_array(10000, 3)

# Example output
print(a1, a2, sep='\n\n')

[[0.10825116 0.50528468 0.45775421]]

[[0.80769024 0.621332   0.19212521]
 [0.04525548 0.61689994 0.04908778]
 [0.03305066 0.20014432 0.11344962]
 [0.02120582 0.42039958 0.78732384]
 [0.24121096 0.90254837 0.65840456]
 [0.34529858 0.35916948 0.07903664]
 [0.30154708 0.88742273 0.75994022]
 [0.9047726  0.844301   0.18578411]
 [0.84955282 0.38876652 0.05343542]
 [0.68943409 0.22901931 0.56031095]]


In [12]:
# def random_list(np_array):
#     return np_array.flatten().tolist()
#
# # Testing output
# l1 = random_list(a1)
# l2 = random_list(a2)
# l3 = random_list(a3)
#
# print(l1, l2, l3, sep='\n\n')

In [13]:
def random_dataframe(np_array, add_categorical_variable=False):
    """Wrap ``np_array`` in a DataFrame whose columns are named 'Col0', 'Col1', ...

    Args:
        np_array: data passed straight to ``pd.DataFrame``.
        add_categorical_variable: when truthy, append a 'cat' column that is
            'Blue' where ``Col0 <= 0.2`` and 'Red' otherwise.

    Returns:
        The resulting DataFrame.
    """
    frame = pd.DataFrame(np_array)
    # Prefix every default column label with 'Col'
    frame.columns = [f'Col{label}' for label in frame.columns]

    if not add_categorical_variable:
        return frame

    is_low = frame['Col0'] <= 0.2
    frame['cat'] = np.where(is_low, 'Blue', 'Red')
    return frame

# One dataframe per random array, in the same order as the arrays
d1, d2, d3, d4, d5 = (random_dataframe(array) for array in (a1, a2, a3, a4, a5))

# Example output
print(d1, d2, sep='\n\n')

       Col0      Col1      Col2
0  0.108251  0.505285  0.457754

       Col0      Col1      Col2
0  0.807690  0.621332  0.192125
1  0.045255  0.616900  0.049088
2  0.033051  0.200144  0.113450
3  0.021206  0.420400  0.787324
4  0.241211  0.902548  0.658405
5  0.345299  0.359169  0.079037
6  0.301547  0.887423  0.759940
7  0.904773  0.844301  0.185784
8  0.849553  0.388767  0.053435
9  0.689434  0.229019  0.560311


In [45]:
class FilterMethods:
    """Alternative implementations of the same row filter, kept side by side for timing.

    Every method returns the rows of ``pd_df`` whose 'Col0' value is less than or
    equal to ``threshold``; they differ only in how the filter is evaluated.
    """

    def __init__(self, pd_df, threshold=0.2):
        """Store the dataframe to filter and the inclusive upper bound for 'Col0'.

        Args:
            pd_df: pandas DataFrame containing a 'Col0' column.
            threshold: rows with ``Col0 <= threshold`` are kept (default 0.2).
        """
        self.pd_df = pd_df
        self.threshold = threshold

    def filter_for_loops(self):
        """Filter by visiting one row at a time in a Python loop.

        Returns:
            DataFrame with the rows whose 'Col0' value is ``<= threshold``.
        """
        positions_to_keep = []
        for i in range(self.pd_df.shape[0]):
            # Same predicate as the other methods, so all three return the same rows
            if self.pd_df.iloc[i]['Col0'] <= self.threshold:
                positions_to_keep.append(i)

        # Select by position: the loop counter is a row position, not an index
        # label, so a label-based drop() would fail or remove the wrong rows on
        # any dataframe whose index is not the default 0..n-1 range.
        return self.pd_df.iloc[positions_to_keep]

    def filter_booleans(self):
        """Filter with a boolean mask built from a vectorised comparison.

        Returns:
            DataFrame with the rows whose 'Col0' value is ``<= threshold``.
        """
        return self.pd_df[self.pd_df['Col0'] <= self.threshold]

    def filter_query(self):
        """Filter with ``DataFrame.query``.

        Returns:
            DataFrame with the rows whose 'Col0' value is ``<= threshold``.
        """
        # Bind to a local name so query() can resolve it through '@threshold'
        # instead of formatting the value into the expression text.
        threshold = self.threshold
        return self.pd_df.query('Col0 <= @threshold')



In [46]:
# Testing: apply the loop-based filter to d2 with a threshold of 0.2
fm = FilterMethods(pd_df=d2, threshold=0.2)
fm.filter_for_loops()

Unnamed: 0,Col0,Col1,Col2
1,0.045255,0.6169,0.049088
2,0.033051,0.200144,0.11345
3,0.021206,0.4204,0.787324


In [47]:
# Same filter expressed as a boolean mask, using the `fm` instance created earlier
fm.filter_booleans()

Unnamed: 0,Col0,Col1,Col2
1,0.045255,0.6169,0.049088
2,0.033051,0.200144,0.11345
3,0.021206,0.4204,0.787324


In [48]:
# Same filter expressed with DataFrame.query, using the `fm` instance created earlier
fm.filter_query()

Unnamed: 0,Col0,Col1,Col2
1,0.045255,0.6169,0.049088
2,0.033051,0.200144,0.11345
3,0.021206,0.4204,0.787324


In [17]:
# Single-method timing example (disabled); `fm` is the FilterMethods instance built on d2 above
# filter_for_loops_time = timeit.timeit(fm.filter_for_loops, number=100)
# filter_for_loops_time

# Execution time summary

In [51]:
# Collect, for every dataframe size, the total time each filtering method takes
results_dict = {'rows': [], 'for_loops': [], 'booleans': [], 'query': []}

# Number of times each method is executed per timing measurement
n = 10

for dataframe in [d1, d2, d3, d4, d5]:
    results_dict['rows'].append(dataframe.shape[0])

    fm = FilterMethods(pd_df=dataframe, threshold=0.2)
    # Bound methods are callables already, so they go to timeit without a lambda wrapper
    results_dict['for_loops'].append(timeit.timeit(fm.filter_for_loops, number=n))
    results_dict['booleans'].append(timeit.timeit(fm.filter_booleans, number=n))
    results_dict['query'].append(timeit.timeit(fm.filter_query, number=n))

results_df = pd.DataFrame(results_dict)

# Take the minimum over the timing columns only: including 'rows' would report the
# row count as the "fastest time" whenever every measured time exceeds it.
timing_columns = ['for_loops', 'booleans', 'query']
results_df['min_time_of_all_methods'] = results_df[timing_columns].min(axis=1)

# 1 - (method time / fastest time): 0 for the fastest method, negative for slower ones.
# The output prefix for 'for_loops' is 'for_loop' (singular).
for timing_column, prefix in [('for_loops', 'for_loop'), ('booleans', 'booleans'), ('query', 'query')]:
    results_df[f'{prefix}_improvement_over_min'] = 1 - (
        results_df[timing_column] / results_df['min_time_of_all_methods']
    )

In [52]:
# Display the timing summary table
results_df

Unnamed: 0,rows,for_loops,booleans,query,min_time_of_all_methods,for_loop_improvement_over_min,booleans_improvement_over_min,query_improvement_over_min
0,1,0.006506,0.004149,0.013841,0.004149,-0.568106,0.0,-2.335966
1,10,0.012817,0.00264,0.010056,0.00264,-3.854755,0.0,-2.808979
2,100,0.068911,0.002351,0.008841,0.002351,-28.305672,0.0,-2.759692
3,1000,0.682075,0.002712,0.009003,0.002712,-250.535376,0.0,-2.320013
4,10000,5.460063,0.002551,0.007823,0.002551,-2139.093543,0.0,-2.066232
