# High-Performance Pandas: eval and query

In [1]:
import numpy as np
import pandas as pd

## pandas.eval for Efficient Operations

In [2]:
# Seeded generator so every random frame below is reproducible on re-run.
rng = np.random.default_rng(seed=42)

In [7]:
# Four large frames of uniform random floats, used for the timing comparison.
rows, cols = 1_000_000, 100
df1, df2, df3, df4 = [
    pd.DataFrame(rng.random((rows, cols)))
    for _ in range(4)
]

In [9]:
# Time the plain pandas expression that sums the four DataFrames.
%timeit df1 + df2 + df3 + df4

933 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Time the same sum when the expression is handed to pd.eval as a string.
%timeit pd.eval('df1 + df2 + df3 + df4')

402 ms ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Check that the string-evaluated sum matches the plain pandas sum.
np.allclose(
    df1 + df2 + df3 + df4,
    pd.eval('df1 + df2 + df3 + df4'),
)

True

In [14]:
# Rebind df1..df4 (and add df5) to small 100x3 integer frames with values
# drawn from [0, 1000); these replace the large float frames created earlier.
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.integers(low=0, high=1000, size=(100, 3)))
    for _ in range(5)
]

__Here’s a summary of the operations `pd.eval` supports:__

*Arithmetic operators*

In [16]:
# The same arithmetic expression, once as a pd.eval string and once as
# ordinary pandas operators; the two results are then compared.
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
result1 = -df1 * df2 / (df3 + df4) - df5
np.allclose(result1, result2)

True

*Comparison operators*

In [17]:
# pd.eval accepts the chained comparison form; plain pandas needs each
# comparison written out and combined with `&`.
result2 = pd.eval('df1 < df2 <= df3 != df4')
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
np.allclose(result1, result2)

True

*Bitwise operators (`&`, `|`) and the equivalent literal `and`/`or`*

In [18]:
# Element-wise `&` / `|` on Boolean frames, as a pd.eval string and as plain
# pandas. Note: df1..df4 hold integers here, so `< 0.5` is only true where
# the value is 0.
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
np.allclose(result1, result2)

True

In [22]:
# Same Boolean expression as result1, spelled with the literal `and` / `or`
# keywords inside the pd.eval string instead of `&` / `|`.
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
# result1 comes from the previous cell (the `&` / `|` version).
np.allclose(result1, result3)

True

*Object attributes and indices*

In [23]:
# Attribute access (`.T`, `.iloc`) and indexing (`[0]`, `[1]`) used inside a
# pd.eval string, compared with the same expression in plain pandas.
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
result1 = df2.T[0] + df3.iloc[1]
np.allclose(result1, result2)

True

## DataFrame.eval for Column-Wise Operations

In [28]:
# A 1000-row frame of uniform random floats with labeled columns A, B, C.
df = pd.DataFrame(
    rng.random(size=(1000, 3)),
    columns=['A', 'B', 'C'],
)
df.head()

Unnamed: 0,A,B,C
0,0.139519,0.651117,0.708733
1,0.905253,0.457656,0.143186
2,0.46175,0.248981,0.440648
3,0.206901,0.404336,0.76717
4,0.72738,0.640151,0.707613


In [29]:
# With pd.eval the columns are reached as attributes of df inside the string;
# the plain pandas version uses bracket indexing.
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
result1 = (df['A'] + df['B']) / (df['C'] - 1)
np.allclose(result1, result2)

True

In [30]:
# DataFrame.eval lets the expression name the columns A, B, C directly,
# without the `df.` prefix used in the pd.eval version.
result3 = df.eval('(A + B) / (C - 1)')
# result1 is the plain pandas value computed in the preceding cell.
np.allclose(result1, result3)

True

### Assignment in DataFrame.eval

In [31]:
# Show the first rows of df before a new column is assigned.
df.head()

Unnamed: 0,A,B,C
0,0.139519,0.651117,0.708733
1,0.905253,0.457656,0.143186
2,0.46175,0.248981,0.440648
3,0.206901,0.404336,0.76717
4,0.72738,0.640151,0.707613


In [32]:
# Create column D from A, B and C. inplace=True modifies df itself rather
# than returning a new DataFrame.
df.eval('D = (A + B) / C', inplace=True)

In [33]:
# Display df again: it now carries the new column D.
df.head()

Unnamed: 0,A,B,C,D
0,0.139519,0.651117,0.708733,1.115562
1,0.905253,0.457656,0.143186,9.51848
2,0.46175,0.248981,0.440648,1.612923
3,0.206901,0.404336,0.76717,0.796742
4,0.72738,0.640151,0.707613,1.932597


In [34]:
# Assigning to an existing column name overwrites it: D is recomputed here
# as (A - B) / C, again modifying df in place.
df.eval('D = (A - B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.139519,0.651117,0.708733,-0.721849
1,0.905253,0.457656,0.143186,3.125987
2,0.46175,0.248981,0.440648,0.482854
3,0.206901,0.404336,0.76717,-0.257355
4,0.72738,0.640151,0.707613,0.123272


### Local Variables in DataFrame.eval

In [35]:
# Despite its name, column_mean is computed along axis 1, i.e. it averages
# across the columns and holds one value per row.
column_mean = df.mean(axis=1)
# The `@` prefix makes DataFrame.eval look the name up among the Python
# variables instead of among the column names.
result2 = df.eval('A + @column_mean')
result1 = df['A'] + column_mean
np.allclose(result1, result2)

True

## The DataFrame.query Method

In [36]:
# Row filtering with a Boolean mask, as a pd.eval string and in plain pandas.
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
np.allclose(result1, result2)

True

In [40]:
# The same filter expressed with DataFrame.query, in two spellings: the
# literal `and` keyword and the `&` operator.
result3 = df.query('A < 0.5 and B < 0.5')
result2 = df.query('(A < 0.5) & (B < 0.5)')
# result1 is the masking result from the preceding cell.
np.allclose(result1, result2)

True

In [41]:
# result3 is the query written with the literal `and` keyword; compare it
# against the Boolean-mask result as well.
np.allclose(result1, result3)

True

In [42]:
# Scalar mean of column C, referenced from the query string with `@`.
Cmean = df['C'].mean()
result2 = df.query('A < @Cmean and B < @Cmean')
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
np.allclose(result1, result2)

True

## Performance: When to Use These Functions

In [43]:
# Size in bytes of the array backing df
# (1000 rows x 4 columns x 8 bytes per value = 32000).
df.to_numpy().nbytes

32000