In [None]:
# HIGH PERFORMANCE PANDAS

In [4]:
import numpy as np


In [5]:
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)
%timeit x + y

3.24 ms ± 274 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

219 ms ± 8.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# pandas.eval() for Efficient Operations


In [8]:
import pandas as pd

In [10]:
# Build four equally shaped frames of uniform random values, all drawn in
# sequence from a single generator seeded with 42.
nrows = 100000
ncols = 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = [
    pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4)
]

In [11]:
# pd.eval() takes the whole expression as a single string; time the
# four-frame sum when it is evaluated that way.
%timeit pd.eval('df1 + df2 + df3 + df4')

The slowest run took 5.20 times longer than the fastest. This could mean that an intermediate result is being cached.
238 ms ± 165 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# The eval() version of this expression is about 50% faster (and uses much less memory), while giving the same result
# Native sum first, pd.eval() sum second; the cell displays whether they agree.
np.allclose(
    df1 + df2 + df3 + df4,
    pd.eval('df1 + df2 + df3 + df4'),
)

True

In [13]:
# Operations supported by pd.eval()
# Rebind df1..df4 (and add df5) to small 100 x 3 integer frames with values
# drawn from randint(0, 1000); the operator examples below use these.
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.randint(0, 1000, (100, 3))) for _ in range(5)
]

In [14]:
# Arithmetic operators
# Evaluate one arithmetic expression through pd.eval() and natively, then
# display whether the two results agree.
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
result1 = ((-df1 * df2) / (df3 + df4)) - df5
np.allclose(result1, result2)

True

In [15]:
# Comparison operators
# pd.eval() accepts a chained comparison; the native form needs each pairwise
# comparison spelled out (here via .lt/.le/.ne) and combined with `&`.
result2 = pd.eval('df1 < df2 <= df3 != df4')
result1 = df1.lt(df2) & df2.le(df3) & df3.ne(df4)
np.allclose(result1, result2)

True

In [17]:
# Bitwise operators
# pd.eval() understands `&` and `|`. `&` binds tighter than `|`, which the
# native form below makes explicit with parentheses.
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
result1 = (df1.lt(0.5) & df2.lt(0.5)) | df3.lt(df4)
np.allclose(result1, result2)

True

In [19]:
# pd.eval() also accepts the literal keywords `and` / `or` in Boolean
# expressions. The result is compared against `result1`, which this cell
# expects to still hold the natively computed mask from the bitwise cell.
result3 = pd.eval('(df1< 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)

True

In [21]:
# Object attributes and indices
# Inside a pd.eval() string, attributes are reached with obj.attr and items
# with obj[index]; the native expression below computes the same sum.
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
result1 = df2.transpose()[0] + df3.iloc[1]
np.allclose(result1, result2)

True

In [22]:
# DataFrame.eval() for Column-Wise Operations
# A 1000 x 3 frame of uniform random values with labelled columns A, B, C.
df = pd.DataFrame(rng.rand(1000, 3), columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789
...,...,...,...
995,0.082646,0.036840,0.439733
996,0.008826,0.896578,0.723374
997,0.907270,0.916424,0.978655
998,0.758995,0.535431,0.347766


In [23]:
# With pd.eval() the three columns are reached as attributes of `df` inside
# the expression string; the native computation uses item access instead.
result2 = pd.eval('(df.A + df.B) / (df.C - 1)')
result1 = df['A'].add(df['B']) / df['C'].sub(1)
np.allclose(result1, result2)

True