In [2]:
print("""
@File         : chaining_with_.pipe.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-12-28 22:14:48
@Email        : cuixuanstephen@gmail.com
@Description  : Chaining with .pipe
""")


@File         : chaining_with_.pipe.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-12-28 22:14:48
@Email        : cuixuanstephen@gmail.com
@Description  : Chaining with .pipe



In [3]:
import pandas as pd
import numpy as np

在编写 pandas 代码时，开发人员遵循两种主要的风格形式。第一种方法是在整个程序中自由使用变量，无论这是否意味着创建新变量：

```python
df = pd.DataFrame(...)
df1 = do_something(df)
df2 = do_another_thing(df1)
df3 = do_yet_another_thing(df2)
```

或者简单地重复重新分配给同一个变量：

```python
df = pd.DataFrame(...)
df = do_something(df)
df = do_another_thing(df)
df = do_yet_another_thing(df)
```

另一种方法是将代码表达为 `pipeline`，其中每个步骤接受并返回一个 `pd.DataFrame`。

```python
(
    pd.DataFrame(...)
    .pipe(do_something)
    .pipe(do_another_thing)
    .pipe(do_yet_another_thing)
)
```

In [11]:
df = pd.DataFrame({
    "col1": pd.Series([1, 2, 3], dtype=pd.Int64Dtype()),
    "col2": pd.Series(["a", "b", "c"], dtype=pd.StringDtype()),
})
df

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c


In [12]:
def change_col1(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(col1=pd.Series([4, 5, 6], dtype=pd.Int64Dtype()))


def change_col2(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(col2=pd.Series(['X', 'Y', 'Z'], dtype=pd.StringDtype()))

In [13]:
df2 = change_col1(df)
df3 = change_col2(df2)
df3

Unnamed: 0,col1,col2
0,4,X
1,5,Y
2,6,Z


In [14]:
change_col2(change_col1(df))

Unnamed: 0,col1,col2
0,4,X
1,5,Y
2,6,Z


In [15]:
df.pipe(change_col1).pipe(change_col2)

Unnamed: 0,col1,col2
0,4,X
1,5,Y
2,6,Z


如果要在管道中应用的任何函数需要接受更多参数，`pd.DataFrame.pipe` 能够转发它们。

In [16]:
from typing import Literal

def change_col2(df: pd.DataFrame, str_case: Literal['upper', 'lower']):
    
    if str_case == 'upper':
        values = ['X', 'Y', 'Z']
    else:
        values = ['x', 'y', 'z']
    return values

In [17]:
df.pipe(change_col2, str_case='lower')

['x', 'y', 'z']

> 在管道中能不能用前面管道返回的变量名？

In [21]:
def return_new_col(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(new_col=df['col1'] + 5)

def use_new_col(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(new_col2=df['new_col'] * 5)

In [22]:
df = pd.DataFrame(
    np.random.randn(10, 2), columns=['col1', 'col2'],
    dtype=pd.Float64Dtype()
)

In [23]:
df.pipe(return_new_col).pipe(use_new_col)

Unnamed: 0,col1,col2,new_col,new_col2
0,-1.715276,0.13583,3.284724,16.423621
1,1.09693,0.692924,6.09693,30.484649
2,0.611829,-0.885114,5.611829,28.059143
3,-0.270849,-1.109456,4.729151,23.645757
4,0.930956,-0.433878,5.930956,29.654778
5,-1.271204,0.302288,3.728796,18.643978
6,-1.381792,1.026883,3.618208,18.09104
7,-0.35725,-0.07354,4.64275,23.213749
8,0.133714,0.514869,5.133714,25.668569
9,0.977295,0.06798,5.977295,29.886473


> 完全可以使用，这对我使用来说是个好处。