## query() 与eval()的设计动机：如何代数式

In [2]:
import numpy as np
import pandas as pd

In [6]:
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)
%timeit x+y

3.95 ms ± 90.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# 复合代数运算,耗时长
%timeit np.fromiter((xi+yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

612 ms ± 79.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
mask = (x> 0.5) & (y<0.5)
mask

# Numpy 会计算每个代数子式，所以上面的式子相当于
temp1 = x > 0.5
temp2 = y < 0.5
mask = temp1 & temp2

也就是说，中间过程都需要显示地分配内存。如果x、y数据非常大，那么消耗的资源是巨大的  
而`Numexpr`程序库可以在不为中间过程分配内存的前提下，完成复合代数运算

In [16]:
import numexpr

In [30]:
mask_numexpr = numexpr.evaluate('(x>0.5) & (y<0.5)')
# 判断两个数组是否完全一样
np.allclose(mask, mask_numexpr)

True

In [29]:
import sys
print(sys.getsizeof(mask.size))
print(sys.getsizeof(mask_numexpr))


28
1000096


In [32]:
%timeit mask = (x> 0.5) & (y<0.5)
%timeit mask_numexpr = numexpr.evaluate('(x>0.5) & (y<0.5)')


3.67 ms ± 605 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.02 ms ± 41.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 用Pandas.eval()实现高性能运算

In [34]:
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
                      for i in range(4))

In [36]:
%timeit df1 + df2 + df3 + df4

143 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
%timeit pd.eval('df1 + df2 + df3 + df4')

67 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
np.allclose(df1+df2+df3+df4, 
           pd.eval('df1 + df2 + df3 + df4'))

True

### pd.eval()支持的运算

In [40]:
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 100, (100,3)))
                          for i in range(5))

#### 算数运算符

In [41]:
result = -df1 * df2 / (df3 + df4) - df5

In [43]:
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')

In [44]:
np.allclose(result, result2)

True

#### 比较运算符(包括链式代数式)

In [45]:
result1 = (df1 > df2) & (df2 <= df3) | (df3 != df4)
result2 = pd.eval('(df1 > df2) & (df2 <= df3) | (df3 != df4)')
np.allclose(result1, result2)

True

#### 位运算符(& | 等)


In [46]:
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)

True

In [47]:
# 可以在布尔类型的代数式中使用and 和 or
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)

True

#### 对象属性与索引

In [48]:
result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)

True

#### 其他运算
pd.eval()暂时不支持函数调用、条件语句、循环以及复杂的运算

## 用`DataFrame.eval()`实现列间运算

In [52]:
df = pd.DataFrame(rng.rand(1000, 3), columns=['a','b','c'])
df.head()

Unnamed: 0,a,b,c
0,0.05248,0.767476,0.263192
1,0.10516,0.987673,0.196751
2,0.361128,0.764004,0.368646
3,0.596964,0.257546,0.840027
4,0.375106,0.908979,0.125058


In [58]:
result1 = (df['a'] + df['b']) / (df['c'] - 1)
result2 = pd.eval('(df.a + df.b) / (df.c - 1) ')
np.allclose(result1, result2)

True

In [59]:
# 也可以用如下：
result3 = df.eval('(a+b) / (c-1)')
np.allclose(result1, result2)

True

\# pd.eval("df.loc[:,'a'] + 1")  
这样使用会报错

### 用`DataFrame.eval()`新增列

In [65]:
df.head()

Unnamed: 0,a,b,c
0,0.05248,0.767476,0.263192
1,0.10516,0.987673,0.196751
2,0.361128,0.764004,0.368646
3,0.596964,0.257546,0.840027
4,0.375106,0.908979,0.125058


In [66]:
df.eval('d = (a+b) / (c)', inplace=True)
df.head()

Unnamed: 0,a,b,c,d
0,0.05248,0.767476,0.263192,3.11543
1,0.10516,0.987673,0.196751,5.554388
2,0.361128,0.764004,0.368646,3.052065
3,0.596964,0.257546,0.840027,1.017242
4,0.375106,0.908979,0.125058,10.267946


In [68]:
# 修改已有的列
df.eval('c = (a+d)', inplace=True)
df.head()

Unnamed: 0,a,b,c,d
0,0.05248,0.767476,3.167909,3.11543
1,0.10516,0.987673,5.659549,5.554388
2,0.361128,0.764004,3.413193,3.052065
3,0.596964,0.257546,1.614206,1.017242
4,0.375106,0.908979,10.643052,10.267946


### `DataFrame.eval()`使用局部变量 
通过`@var`符号使用python局部变量  
`@`只能在`DataFrame.eval()`中使用，不能再`pandas.eval()`使用

In [73]:
column_mean = df.mean(1)

result1 = df['a'] + column_mean
result2 = df.eval('a + @column_mean')
np.allclose(result1, result2)

True

## `DataFrame.query()`方法

In [79]:
result1 = df[(df.a < 0.5) & (df.b <0.5)]
result2 = pd.eval('df[(df.a < 0.5) & (df.b <0.5)]')
np.allclose(result1, result2)

True

\# `df.eval('df[a < 0.5) & (b <0.5)]'`  
再df无法使用


In [86]:
# result3 = df.eval('a<0.5 & b<0.5')
# np.allclose(result1, result3)

In [87]:
result3 = df.query('a<0.5 & b<0.5')
np.allclose(result1, result3)

True

`query()`也支持使用`@`符号引用局部变量


In [101]:
columns_mean = df.mean(axis=1)
result1 = df[(df.a > column_mean-1) & (df.c < column_mean+1)]
result2 = df.query('(a>@column_mean-1) & (c<@column_mean+1)')
np.allclose(result1, result2)

True

## 性能决定使用时机

In [104]:
x = df[(df.a < 0.5) & (df.b <0.5)]

等价于

In [105]:
tmp1 = df.a < 0.5
tmp2 = df.b < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]