# Написание расширений на языке C для pandas

## "Чистый" Python

In [1]:
# импортируем numpy и pandas
import numpy as np
import pandas as pd
# Fix the RNG state so the generated columns are reproducible.
np.random.seed(12345)
# Demo frame: two float columns from randn, one integer column drawn from
# [100, 1000), and one constant string column.
df = pd.DataFrame(dict(
    a=np.random.randn(1000),
    b=np.random.randn(1000),
    N=np.random.randint(100, 1000, size=1000),
    x='x',
))

In [2]:
df.head(5)

Unnamed: 0,N,a,b,x
0,826,-0.204708,-0.983505,x
1,220,0.478943,0.930944,x
2,401,-0.519439,-0.811676,x
3,363,-0.55573,-1.830156,x
4,235,1.965781,-0.13873,x


In [3]:
def f(x):
    """Return the integrand value ``x * (x - 1)`` at ``x``."""
    shifted = x - 1
    return x * shifted

In [4]:
def integrate_f(a, b, N):
    """Approximate the integral of ``f`` over ``[a, b]`` with a left Riemann sum.

    The interval is split into ``N`` equal steps and ``f`` is sampled at the
    left edge of each step.  ``N`` is used both as a divisor and as the
    argument of ``range``.
    """
    step = (b - a) / N
    total = 0
    for k in range(N):
        total += f(a + k * step)
    return total * step

In [5]:
%timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)

1 loop, best of 3: 203 ms per loop


In [6]:
%prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)

 

## Обычный Cython

In [7]:
%load_ext Cython

In [8]:
%%cython
def f_plain(x):
    """Evaluate the integrand ``x * (x - 1)`` (no Cython type declarations)."""
    shifted = x - 1
    return x * shifted


def integrate_f_plain(a, b, N):
    """Left Riemann sum of ``f_plain`` over ``[a, b]`` using ``N`` equal steps.

    Same algorithm as the pure-Python ``integrate_f``; the source is left
    untyped and only passed through the ``%%cython`` cell magic.
    """
    step = (b - a) / N
    total = 0
    for k in range(N):
        total += f_plain(a + k * step)
    return total * step

In [9]:
%timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1)

10 loops, best of 3: 118 ms per loop


# Использование библиотеки Numba

## Jit

In [10]:
import numba

@numba.jit
def f_plain(x):
    """Integrand ``x * (x - 1)``, decorated with ``numba.jit``.

    NOTE(review): the name repeats ``f_plain`` from the ``%%cython`` cell, so
    running this cell presumably rebinds that name in the notebook -- confirm
    that this is intended.
    """
    shifted = x - 1
    return x * shifted

@numba.jit
def integrate_f_numba(a, b, N):
    """Left Riemann sum of ``f_plain`` over ``[a, b]`` using ``N`` equal steps.

    ``f_plain`` is sampled at the left edge of each of the ``N`` steps and
    the accumulated sum is scaled by the step width.
    """
    step = (b - a) / N
    total = 0
    for k in range(N):
        total += f_plain(a + k * step)
    return total * step

@numba.jit
def apply_integrate_f_numba(col_a, col_b, col_N):
    """Apply ``integrate_f_numba`` element-wise to three equal-length arrays.

    Returns a new float64 array whose ``i``-th entry is
    ``integrate_f_numba(col_a[i], col_b[i], col_N[i])``.  The three inputs
    are asserted to have the same length.
    """
    size = len(col_N)
    out = np.empty(size, dtype='float64')
    assert len(col_a) == len(col_b) == size
    for row in range(size):
        out[row] = integrate_f_numba(col_a[row], col_b[row], col_N[row])
    return out

def compute_numba(df):
    """Integrate every row of ``df`` and return the results as a Series.

    The ``a``, ``b`` and ``N`` columns are handed to
    ``apply_integrate_f_numba`` as the arrays obtained from ``.values``.
    The returned Series reuses ``df.index`` and is named ``'result'``.
    """
    col_a = df['a'].values
    col_b = df['b'].values
    col_n = df['N'].values
    integrals = apply_integrate_f_numba(col_a, col_b, col_n)
    return pd.Series(integrals, name='result', index=df.index)

In [11]:
%timeit compute_numba(df)

The slowest run took 438.90 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 1.09 ms per loop


## Vectorize

In [12]:
# импортируем vectorize
from numba import vectorize

# Plain-Python baseline: doubles a value without any numba involvement.
def double_every_value_nonumba(x):
    """Return ``x`` multiplied by 2."""
    product = x * 2
    return product

# Same doubling operation, wrapped with numba's ``vectorize`` decorator.
@vectorize
def double_every_value_withnumba(x):
    """Return ``x`` multiplied by 2."""
    product = x * 2
    return product

In [13]:
# применяем самостоятельно написанную функцию 
# без использования numba
%timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba)

1000 loops, best of 3: 605 µs per loop


In [14]:
# применяем реализацию по умолчанию (работает быстрее
# самостоятельно написанной функции, не использующей
# numba)
%timeit df['col1_doubled'] = df.a*2

1000 loops, best of 3: 292 µs per loop


In [15]:
# применяем самостоятельно написанную функцию 
# с использованием библиотеки numba
%timeit df['col1_doubled'] = double_every_value_withnumba(df.a.values)

The slowest run took 556.64 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 172 µs per loop


# Вычисление выражений с помощью функции eval()

## Поддерживаемый синтаксис

## Примеры использования функции eval()

In [16]:
# Four independent random frames of identical shape, used by the timing
# comparisons in the following cells.
nrows = 20000
ncols = 100
df1, df2, df3, df4 = (
    pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)
)

In [17]:
%timeit df1 + df2 + df3 + df4

10 loops, best of 3: 31 ms per loop


In [18]:
%timeit pd.eval('df1 + df2 + df3 + df4')

10 loops, best of 3: 16.2 ms per loop


In [19]:
%timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)

10 loops, best of 3: 67 ms per loop


In [20]:
%timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')

10 loops, best of 3: 23.1 ms per loop


In [21]:
s = pd.Series(np.random.randn(50))

In [22]:
%timeit df1 + df2 + df3 + df4 + s

10 loops, best of 3: 52.3 ms per loop


In [23]:
%timeit pd.eval('df1 + df2 + df3 + df4 + s')

10 loops, best of 3: 17.4 ms per loop


## Метод DataFrame.eval()

In [24]:
# Reseed so the five-row example frame is reproducible.
np.random.seed(12345)
df = pd.DataFrame(columns=['a', 'b'], data=np.random.randn(5, 2))

In [25]:
df.eval('a + b')

0    0.274236
1   -1.075169
2    3.359186
3    0.374654
4    2.015457
dtype: float64

In [26]:
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))

In [27]:
df.eval('c = a + b', inplace=True)

In [28]:
df.eval('d = a + b + c', inplace=True)

In [29]:
df.eval('a = 1', inplace=True)

In [30]:
df

Unnamed: 0,a,b,c,d
0,1,5,5,10
1,1,6,7,14
2,1,7,9,18
3,1,8,11,22
4,1,9,13,26


In [31]:
df

Unnamed: 0,a,b,c,d
0,1,5,5,10
1,1,6,7,14
2,1,7,9,18
3,1,8,11,22
4,1,9,13,26


In [32]:
df.eval('e = a - c', inplace=False)

Unnamed: 0,a,b,c,d,e
0,1,5,5,10,-4
1,1,6,7,14,-6
2,1,7,9,18,-8
3,1,8,11,22,-10
4,1,9,13,26,-12


In [33]:
df

Unnamed: 0,a,b,c,d
0,1,5,5,10
1,1,6,7,14
2,1,7,9,18
3,1,8,11,22
4,1,9,13,26


In [34]:
# Rebuild the two-column frame so the assignments below start from the
# original ``a`` and ``b`` values.
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
# Several assignments are passed to a single eval() call as one multi-line
# string.  With inplace=False the resulting frame is returned (and displayed
# by the notebook) rather than written back into ``df``.
df.eval("""
c = a + b
d = a + b + c
a = 1""", inplace=False)

Unnamed: 0,a,b,c,d
0,1,5,5,10
1,1,6,7,14
2,1,7,9,18
3,1,8,11,22
4,1,9,13,26


In [35]:
# Plain-pandas equivalent of the multi-line eval() example: the same three
# assignments written as ordinary column operations.
df = pd.DataFrame({'a': range(5), 'b': range(5, 10)})
df['c'] = df['a'] + df['b']
df['d'] = df['a'] + df['b'] + df['c']
df['a'] = 1
df

Unnamed: 0,a,b,c,d
0,1,5,5,10
1,1,6,7,14
2,1,7,9,18
3,1,8,11,22
4,1,9,13,26
