# <b>High-Performance Pandas: eval and query</b>

## <b>Motivating query and eval: Compound Expressions</b>

In [51]:
import numpy as np
# Seeded generator so the random inputs are the same on every run.
rng = np.random.default_rng(seed=42)
# Two arrays of one million random floats each, drawn in this order.
x = rng.random(size=1_000_000)
y = rng.random(size=1_000_000)
# Baseline: time the vectorized NumPy addition of the two arrays.
%timeit x + y

3.83 ms ± 318 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [52]:
# Same element-wise sum, but produced one pair at a time by a Python generator
# over zip(x, y) and collected with np.fromiter, for comparison with `x + y`.
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)),dtype=x.dtype, count=len(x))

281 ms ± 20.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
# Compound boolean condition: True where x is above 0.5 and y is below 0.5.
mask = np.logical_and(x > 0.5, y < 0.5)
mask

array([False, False,  True, ...,  True,  True,  True])

In [54]:
import numexpr

# Evaluate the same compound condition through numexpr (the expression is
# passed as a string) and confirm that it reproduces the NumPy `mask`
# element for element.
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.all(mask == mask_numexpr)  # expected: True

In [55]:
# Time the compound mask on the one-million-element arrays:
# first with plain NumPy operators, then with numexpr.
%timeit (x > 0.5) & (y < 0.5)
%timeit numexpr.evaluate('(x > 0.5) & (y < 0.5)')

3.05 ms ± 462 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.46 ms ± 496 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [56]:
# Larger inputs (100 million samples each); x_big is drawn first, then y_big.
x_big, y_big = (rng.random(100_000_000) for _ in range(2))

# Repeat the NumPy-vs-numexpr timing of the compound mask on the larger arrays.
%timeit (x_big > 0.5) & (y_big < 0.5)
%timeit numexpr.evaluate('(x_big > 0.5) & (y_big < 0.5)')

268 ms ± 27.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
846 ms ± 376 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
# A heavier compound expression (sin, cos and a product combined by addition),
# timed with NumPy and then with the equivalent numexpr string.
%timeit np.sin(x_big) + np.cos(y_big) + x_big * y_big
%timeit numexpr.evaluate('sin(x_big) + cos(y_big) + x_big * y_big')

4.76 s ± 2.86 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.3 s ± 183 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## <b>pandas.eval for Efficient Operations</b>

In [58]:
import pandas as pd
# Four DataFrames of random floats, all with the same (nrows, ncols) shape.
nrows, ncols = 100_000, 100
df1, df2, df3, df4 = (
    pd.DataFrame(rng.random(size=(nrows, ncols)))
    for _ in range(4)
)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.864304,0.994176,0.925138,0.589267,0.395323,0.898974,0.076748,0.881996,0.771740,0.918453,...,0.337325,0.027868,0.731685,0.651146,0.018243,0.797288,0.113844,0.325229,0.207676,0.914333
1,0.212896,0.216168,0.241444,0.700461,0.536945,0.322931,0.047400,0.687690,0.039041,0.607538,...,0.648092,0.857048,0.541596,0.005089,0.571168,0.381035,0.593158,0.415402,0.508927,0.874329
2,0.097849,0.596627,0.509653,0.675959,0.437531,0.154627,0.108378,0.205098,0.343155,0.996423,...,0.941251,0.627711,0.481594,0.012181,0.127897,0.680059,0.581453,0.177522,0.308676,0.417417
3,0.463539,0.895614,0.910606,0.216624,0.033773,0.382093,0.465512,0.675046,0.610316,0.069334,...,0.194444,0.937532,0.955805,0.195826,0.227486,0.988852,0.481580,0.051215,0.210700,0.164242
4,0.728646,0.416906,0.580005,0.650507,0.902602,0.955853,0.302167,0.368274,0.750120,0.069743,...,0.411232,0.197739,0.012801,0.815844,0.010336,0.725907,0.788061,0.677759,0.076500,0.656609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.929685,0.888725,0.794680,0.824149,0.993765,0.826668,0.334048,0.747966,0.367914,0.204277,...,0.258448,0.169234,0.865453,0.765889,0.955857,0.665990,0.846364,0.553839,0.426605,0.725445
99996,0.872481,0.514576,0.472775,0.840819,0.103963,0.570674,0.353627,0.776929,0.623622,0.888449,...,0.872222,0.558077,0.995347,0.061331,0.539346,0.103438,0.601383,0.694609,0.892246,0.178290
99997,0.505345,0.372089,0.491240,0.021446,0.436812,0.035385,0.277333,0.571239,0.257051,0.874096,...,0.327053,0.125249,0.563284,0.101264,0.507357,0.209379,0.533454,0.090097,0.160280,0.752975
99998,0.404638,0.311857,0.151602,0.906683,0.194243,0.994947,0.185744,0.726206,0.237058,0.174387,...,0.148056,0.636325,0.527506,0.029223,0.743040,0.952026,0.293487,0.175485,0.162583,0.998070


In [59]:
# Time the four-frame sum written with ordinary pandas operators.
%timeit df1 + df2 + df3 + df4

111 ms ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [60]:
# Time the same sum when it is handed to pd.eval as a string expression.
%timeit pd.eval('df1 + df2 + df3 + df4')

The slowest run took 4.59 times longer than the fastest. This could mean that an intermediate result is being cached.
89.4 ms ± 55.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [61]:
# Tolerance-based check that the operator form and the pd.eval form agree.
np.allclose(
    df1 + df2 + df3 + df4,
    pd.eval('df1 + df2 + df3 + df4'),
)

True

In [62]:
# Exact element-wise equality between the operator form and the pd.eval form.
np.all(
    (df1 + df2 + df3 + df4) == pd.eval('df1 + df2 + df3 + df4')
)

True

In [63]:
import numpy as np

# 0.1 + 0.2 is not exactly 0.3 in binary floating point, so an exact
# comparison fails while a tolerance-based comparison succeeds.
a = np.array([0.1 + 0.2])
b = np.array([0.3])

# First line printed: exact equality (False).
# Second line printed: np.allclose, i.e. equal within tolerance (True).
print(np.all(a == b), np.allclose(a, b), sep='\n')


False
True


In [64]:
# Five small integer DataFrames (100 rows x 3 columns) drawn from rng.integers(0, 1000).
# Note: this rebinds df1..df4 from the large float frames created earlier.
df1, df2, df3, df4, df5 = (
    pd.DataFrame(rng.integers(0, 1000, (100, 3)))
    for _ in range(5)
)

In [65]:
# Arithmetic operators: compute the expression once with pandas operators and
# once by passing the identical expression to pd.eval as a string, then check
# that both results agree within floating-point tolerance.
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)

True

In [66]:
# Comparison operators: pd.eval accepts a chained comparison, which is written
# out here as three pairwise comparisons joined with `&` for the pandas version.
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
# Both results are boolean frames, so compare them exactly rather than with
# a floating-point tolerance.
np.array_equal(result1, result2)

True

In [67]:
# Plain-text dump of the first two integer frames, separated by a blank line.
print(df1, df2, sep=' \n\n ')

      0    1    2
0   779  446  470
1   891  626  866
2   189  136  216
3   855   88   65
4   114  920  905
..  ...  ...  ...
95  463   36  275
96  442  638  304
97  591  975  914
98   26  788  785
99   61  503  875

[100 rows x 3 columns] 

       0    1    2
0   110   53  709
1   343  685  420
2   876  550  713
3   390  283  307
4   894  367  125
..  ...  ...  ...
95  546  684  545
96  667   26  991
97  441  675  661
98  981  704  354
99  520  288  225

[100 rows x 3 columns]


In [68]:
# Plain-text dump of the third and fourth integer frames, separated by a blank line.
print(df3, df4, sep=' \n\n ')

      0    1    2
0   167  837  968
1   824  406   88
2   546  510   90
3    18  976  972
4   979  925  687
..  ...  ...  ...
95  195  747  994
96  390  671  834
97  349  505  838
98  158   21  106
99  145   98   51

[100 rows x 3 columns] 

       0    1    2
0   452  910  383
1    93  768  785
2   970  271  268
3   810  914  542
4   446  121  200
..  ...  ...  ...
95  304  821  217
96  132  777  565
97  301  360  579
98  933  809   31
99  746  625  856

[100 rows x 3 columns]


In [69]:
# Boolean frame from (df1 < df2) & (df2 <= df3) & (df3 != df4); compare its
# entries against the printed frames above.
result1

Unnamed: 0,0,1,2
0,False,False,True
1,False,False,False
2,False,False,False
3,False,True,True
4,True,False,False
...,...,...,...
95,False,True,True
96,False,False,False
97,False,False,False
98,False,False,False


In [70]:
# Bitwise operators: `&` and `|` combine boolean frames element-wise, both in
# plain pandas and inside the pd.eval string.
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
# Both results are boolean frames, so compare them exactly rather than with
# a floating-point tolerance.
np.array_equal(result1, result2)

True

In [72]:
# Boolean frame from (df1 < 0.5) & (df2 < 0.5) | (df3 < df4).
result1

Unnamed: 0,0,1,2
0,True,True,False
1,False,True,True
2,True,False,True
3,True,False,False
4,False,False,False
...,...,...,...
95,True,True,False
96,False,True,False
97,False,False,False
98,True,True,False


In [73]:
# Inside a pd.eval string the keywords `and` / `or` can be used in place of
# `&` / `|`; the result should match the operator version held in result1.
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
# Boolean frames: compare exactly rather than with a floating-point tolerance.
np.array_equal(result1, result3)

True

In [91]:
# A 1000 x 3 frame of random floats with named columns A, B and C.
df = pd.DataFrame(
    data=rng.random(size=(1000, 3)),
    columns=['A', 'B', 'C'],
)
df.head()

Unnamed: 0,A,B,C
0,0.674118,0.062189,0.787252
1,0.73414,0.173477,0.310231
2,0.431117,0.32726,0.481619
3,0.994307,0.528563,0.388215
4,0.878129,0.963458,0.639574


In [92]:
# Column-wise arithmetic written two ways: with bracket indexing in plain
# pandas, and as a pd.eval string that reaches the columns as attributes of df.
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [95]:
type(result1)   # arithmetic between DataFrame columns yields a Series

pandas.core.series.Series

In [96]:
# With DataFrame.eval the column names A, B, C are used directly as variables
# in the expression string, without the `df.` prefix needed by pd.eval above.
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)

True

In [97]:
# The Series returned by df.eval('(A + B) / (C - 1)').
result3

0     -3.460942
1     -1.315826
2     -1.462972
3     -2.489225
4     -5.109470
         ...   
995   -0.728613
996   -1.942860
997   -1.273018
998   -1.934016
999   -2.549037
Length: 1000, dtype: float64

In [98]:
# The eval call above returned a new Series; df itself still shows the same
# first rows as when it was created.
df.head()

Unnamed: 0,A,B,C
0,0.674118,0.062189,0.787252
1,0.73414,0.173477,0.310231
2,0.431117,0.32726,0.481619
3,0.994307,0.528563,0.388215
4,0.878129,0.963458,0.639574


## <b>Local variables in DataFrame.eval</b>

In [99]:
# Rows x columns of df; the mean taken along axis 1 in the next cell therefore
# has one entry per row.
df.shape

(1000, 3)

In [100]:
# axis=1 averages across the columns, giving one mean per row of df
# (despite the variable's name, this is a row-wise mean).
column_mean = df.mean(axis=1)
result1 = df['A'] + column_mean
# The '@' prefix marks column_mean as a Python variable rather than a column.
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

In [105]:
# result1 is the sum of column A and column_mean.
type(result1)

pandas.core.series.Series

In [106]:
# Bracket-style access to column A.
df['A']

0      0.674118
1      0.734140
2      0.431117
3      0.994307
4      0.878129
         ...   
995    0.004401
996    0.779812
997    0.729935
998    0.815992
999    0.773512
Name: A, Length: 1000, dtype: float64

In [107]:
# Attribute-style access to the same column; the output matches df['A'].
df.A

0      0.674118
1      0.734140
2      0.431117
3      0.994307
4      0.878129
         ...   
995    0.004401
996    0.779812
997    0.729935
998    0.815992
999    0.773512
Name: A, Length: 1000, dtype: float64

In [108]:
# Keep only the rows whose value in column A is below 0.5.
df.loc[df['A'] < 0.5]

Unnamed: 0,A,B,C
2,0.431117,0.327260,0.481619
5,0.448055,0.586663,0.332335
9,0.498257,0.256380,0.857488
10,0.446259,0.668615,0.989573
11,0.286742,0.064709,0.769330
...,...,...,...
990,0.224539,0.084635,0.004360
991,0.034647,0.897595,0.481074
992,0.486439,0.629186,0.471220
993,0.100243,0.447944,0.743824


In [101]:
# A single column is one-dimensional: one value per row of df.
df['A'].shape

(1000,)

In [102]:
# Element type of column A.
df['A'].dtype

dtype('float64')

In [103]:
# Selecting one column by label returns a Series, not a DataFrame.
type(df['A'])

pandas.core.series.Series

In [109]:
# Row filtering with a compound boolean mask, written directly in pandas and
# as the same expression passed to pd.eval; np.allclose checks that the two
# filtered frames hold the same values.
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [110]:
# Use the mean of column C as a threshold on columns A and B.
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
# In the query string, bare names (A, B) are columns of df and the '@' prefix
# refers to the Python variable Cmean.
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)

True

In [111]:
# The scalar threshold used in the query above (mean of column C).
Cmean

0.49715648015408964