# Pandas tips and tricks - Efficient data manipulation

In [1]:
import pandas as pd
import numpy as np

In [2]:
SEED = 0  # fixed seed so a fresh kernel regenerates exactly the same synthetic data
size = 100000
multipliers = {'A':1.1, 'B':1.5, 'C':1.8}
cats = np.array(['A', 'B', 'C'])

# A single seeded generator drives both random draws, so the category
# assignment and the consumption values are reproducible together.
rng = np.random.default_rng(SEED)
idx = rng.integers(3, size=size)  # values in [0, 3): one index into `cats` per row
category = cats[idx]
consumption = np.abs(rng.standard_normal(size)*10)

data_original = pd.DataFrame({'category':category, 'consumption':consumption})
# Use NaN rather than None as the "not computed yet" placeholder: None would
# make final_cost an object-dtype column, and the costs written into it later
# would stay object dtype instead of float64.
data_original['final_cost'] = np.nan
data = data_original.copy()

In [3]:
# Show the per-category multipliers, then the generated frame
# (final_cost has not been computed yet at this point).
print(multipliers)
data

{'A': 1.1, 'B': 1.5, 'C': 1.8}


Unnamed: 0,category,consumption,final_cost
0,C,0.444007,
1,B,2.278810,
2,C,7.353666,
3,A,4.252215,
4,C,0.508390,
...,...,...,...
99995,A,21.751370,
99996,C,1.721286,
99997,A,5.354282,
99998,C,13.814207,


# Explicit looping

In [4]:
# Reset to an untouched copy so the timed loop starts with final_cost unfilled.
data = data_original.copy()

In [5]:
%%timeit
# Row-by-row baseline: fetch each row's category and consumption one scalar at
# a time and write the product back with a scalar .loc assignment.
# .iloc is positional and .loc is label-based; they address the same row here
# because the frame was built without an explicit index (labels 0..n-1).
n_rows = len(data)
for row in range(n_rows):
    row_category = data['category'].iloc[row]
    row_consumption = data['consumption'].iloc[row]
    data.loc[row, 'final_cost'] = multipliers[row_category] * row_consumption

2min 35s ± 1.38 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Inspect the frame after the loop has written final_cost.
data

Unnamed: 0,category,consumption,final_cost
0,C,0.444007,0.799213
1,B,2.278810,3.418215
2,C,7.353666,13.236599
3,A,4.252215,4.677437
4,C,0.508390,0.915102
...,...,...,...
99995,A,21.751370,23.926507
99996,C,1.721286,3.098315
99997,A,5.354282,5.88971
99998,C,13.814207,24.865572


# Pandas apply function on rows

In [7]:
# Reset to an untouched copy before timing the apply-based approach.
data = data_original.copy()

In [8]:
def calculate_cost(x):
    """Return the final cost for a single row.

    Args:
        x: A row-like object indexable by 'category' and 'consumption'.

    Returns:
        The entry of the module-level ``multipliers`` mapping for the row's
        category, multiplied by the row's consumption.

    Raises:
        KeyError: If the row's category has no entry in ``multipliers``.
    """
    category, consumption = x['category'], x['consumption']
    return multipliers[category] * consumption

In [9]:
%%timeit
# axis=1 makes apply call calculate_cost once per row, passing that row in;
# the returned values become the final_cost column.
data['final_cost'] = data.apply(calculate_cost, axis=1)

1.11 s ± 8.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Inspect the frame after apply has filled final_cost.
data

Unnamed: 0,category,consumption,final_cost
0,C,0.444007,0.799213
1,B,2.278810,3.418215
2,C,7.353666,13.236599
3,A,4.252215,4.677437
4,C,0.508390,0.915102
...,...,...,...
99995,A,21.751370,23.926507
99996,C,1.721286,3.098315
99997,A,5.354282,5.889710
99998,C,13.814207,24.865572


# Pandas boolean indexing

In [11]:
# Reset to an untouched copy before timing the boolean-indexing approach.
data = data_original.copy()

In [12]:
%%timeit
# One vectorised assignment per category: build a boolean mask selecting the
# rows of that category, then scale their consumption by the category's rate.
# Iterating over `multipliers` keeps categories and rates in a single place
# instead of repeating the same statement for each hard-coded label.
for cat, rate in multipliers.items():
    mask = data['category'] == cat
    data.loc[mask, 'final_cost'] = data.loc[mask, 'consumption'] * rate

44.4 ms ± 8.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# Inspect the frame after the masked assignments have filled final_cost.
data

Unnamed: 0,category,consumption,final_cost
0,C,0.444007,0.799213
1,B,2.278810,3.418215
2,C,7.353666,13.236599
3,A,4.252215,4.677437
4,C,0.508390,0.915102
...,...,...,...
99995,A,21.751370,23.926507
99996,C,1.721286,3.098315
99997,A,5.354282,5.88971
99998,C,13.814207,24.865572


## Reference: https://realpython.com/fast-flexible-pandas/