In [33]:
import numpy as np
import pandas as pd
import timeit

**This notebook captures different ways in which to create new fields with if-else statements in dataframes and compares the execution time of these methods.**

We will:
1. Generate a set of dataframes with different number of rows (1, 10, 100, 1_000, 10_000, 100_000)
2. Create different if-else methods
3. Calculate the execution time for each and determine how much worse are the rest compared to the fastest (1-method time/fastest time)

**Helper functions**
*Plotting/presenting*

In [34]:
def style_negative(v, props=''):
    return props if v < 0 else None

def highlight_min(s, props=''):
    return np.where(s == np.nanmin(s.values), props, '')

def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

*Data creation*

In [35]:
def random_array(rows, columns):
    return np.random.rand(int(rows), columns)

a1 = random_array(1, 2)
a2 = random_array(10, 2)
a3 = random_array(100, 2)
a4 = random_array(1_000, 2)
a5 = random_array(10_000, 2)
a6 = random_array(100_000, 2)
a7 = random_array(1_000_000, 2)
a8 = random_array(10_000_000, 2)

# Example output
print(a1, a2, sep='\n\n')

[[0.24932918 0.19839753]]

[[0.15676042 0.2415717 ]
 [0.68507674 0.76818031]
 [0.56051054 0.8186549 ]
 [0.29662026 0.48085418]
 [0.09488237 0.82308512]
 [0.19580033 0.97157598]
 [0.1620105  0.74571507]
 [0.89065339 0.65271713]
 [0.13194536 0.35161891]
 [0.33391404 0.91563293]]


In [36]:
def random_dataframe(np_array):
    df = pd.DataFrame(np_array)
    df.columns = 'Col' + df.columns.astype(str)

    return df

d1 = random_dataframe(a1)
d2 = random_dataframe(a2)
d3 = random_dataframe(a3)
d4 = random_dataframe(a4)
d5 = random_dataframe(a5)
d6 = random_dataframe(a6)
d7 = random_dataframe(a7)
d8 = random_dataframe(a8)

# Example output
print(d1, d2, sep='\n\n')

       Col0      Col1
0  0.249329  0.198398

       Col0      Col1
0  0.156760  0.241572
1  0.685077  0.768180
2  0.560511  0.818655
3  0.296620  0.480854
4  0.094882  0.823085
5  0.195800  0.971576
6  0.162011  0.745715
7  0.890653  0.652717
8  0.131945  0.351619
9  0.333914  0.915633


## Filter methods

In [47]:
class IfElseMethods:
    def __init__(self, pd_df):
        self.pd_df = pd_df

    def _new_column(self, row):
        if row['Col0'] < 0.2:
            return 'blue'
        elif row['Col0'] > 0.8:
            return 'green'
        else:
            return 'red'

    def ifelse_apply(self):
        self.pd_df['new_column'] = self.pd_df.apply(self._new_column, axis=1)
        return self.pd_df

    def ifelse_lambda(self):
        self.pd_df['new_column'] = self.pd_df.apply(lambda x: 'blue' if x['Col0'] < 0.2 else ('green' if x['Col0'] > 0.8 else 'red'), axis=1)
        return self.pd_df

    def ifelse_loc(self):
        self.pd_df.loc[self.pd_df['Col0'] < 0.2, 'new_column'] = 'blue'
        self.pd_df.loc[self.pd_df['Col0'] > 0.8, 'new_column'] = 'green'
        self.pd_df.loc[(self.pd_df['Col0'] >= 0.2) & (self.pd_df['Col0'] <= 0.8), 'new_column'] = 'red'
        return self.pd_df

    def ifelse_where(self):
        self.pd_df['new_column'] = np.where(self.pd_df['Col0']<0.2, 'blue', np.where(self.pd_df['Col0'] > 0.8, 'green', 'red'))
        return self.pd_df

    def ifelse_select(self):
        self.pd_df['new_column'] = np.select(condlist=[self.pd_df['Col0'] < 0.2, self.pd_df['Col0']>0.8],
                                             choicelist=['blue', 'green'],
                                             default='red')
        return self.pd_df

fm = IfElseMethods(pd_df=d2)

In [48]:
fm.ifelse_apply()

Unnamed: 0,Col0,Col1,new_column
0,0.15676,0.241572,blue
1,0.685077,0.76818,red
2,0.560511,0.818655,red
3,0.29662,0.480854,red
4,0.094882,0.823085,blue
5,0.1958,0.971576,blue
6,0.162011,0.745715,blue
7,0.890653,0.652717,green
8,0.131945,0.351619,blue
9,0.333914,0.915633,red


In [49]:
# fm.ifelse_lambda()

In [50]:
# fm.ifelse_loc()

In [51]:
# fm.ifelse_where()

In [53]:
# fm.ifelse_select()

# Execution time summary
We display the final output as the ratio of each execution method vs the fastest method. Instead of looking at the absolute execution time, we prefer the relative time. This is because you might run the code in different machines, and it might be more interesting to know that method A was 3x slower, than to know it was 35 seconds slower.

In [54]:
results_dict = {'rows': [], 'apply': [], 'lambda': [], 'loc': [], 'where': [], 'select': []}

n = 10

for dataframe in [d1, d2, d3, d4, d5, d6, d7, d8]:
    results_dict['rows'].append(dataframe.shape[0])

    fm = IfElseMethods(pd_df=dataframe)
    results_dict['apply'].append(timeit.timeit(lambda: fm.ifelse_apply(), number=n))
    results_dict['lambda'].append(timeit.timeit(lambda: fm.ifelse_lambda(), number=n))
    results_dict['loc'].append(timeit.timeit(lambda: fm.ifelse_loc(), number=n))
    results_dict['where'].append(timeit.timeit(lambda: fm.ifelse_where(), number=n))
    results_dict['select'].append(timeit.timeit(lambda: fm.ifelse_select(), number=n))

results_df = pd.DataFrame(results_dict)
results_df.index = results_df['rows']
results_df = results_df.drop('rows', axis=1)

results_df['min_time_of_all_methods'] = results_df.min(axis=1)
results_df['apply_deterioration'] = (results_df['apply']/results_df['min_time_of_all_methods'])-1
results_df['lambda_deterioration'] = (results_df['lambda']/results_df['min_time_of_all_methods'])-1
results_df['loc_deterioration'] = (results_df['loc']/results_df['min_time_of_all_methods'])-1
results_df['where_deterioration'] = (results_df['where']/results_df['min_time_of_all_methods'])-1
results_df['select_deterioration'] = (results_df['select']/results_df['min_time_of_all_methods'])-1

In [55]:
# Execution time proportion between of method/fastest.
present_df = results_df[['apply_deterioration','lambda_deterioration','loc_deterioration','where_deterioration','select_deterioration']].transpose()
present_df = (present_df.style.format(precision=2)
              .apply(highlight_min, props='color:white;background-color:darkgreen', axis=0)
              .apply(highlight_max, props='color:white;background-color:red', axis=0)
             )
present_df

rows,1,10,100,1000,10000,100000,1000000,10000000
apply_deterioration,0.63,0.78,2.14,36.08,121.19,145.26,139.92,165.93
lambda_deterioration,0.48,0.79,2.63,29.03,119.87,146.8,132.59,153.75
loc_deterioration,2.79,2.82,2.81,2.22,0.82,0.0,0.0,0.0
where_deterioration,0.0,0.0,0.13,0.0,0.08,0.58,0.57,0.84
select_deterioration,0.33,0.09,0.0,0.09,0.0,0.48,0.52,0.76
