In [2]:
# Notebook metadata header; printed so that it shows up in the cell output.
print("""
@File         : exercise_9_tax_planning.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-10-03 15:31:05
@Email        : cuixuanstephen@gmail.com
@Description  : 税务规划
""")


@File         : exercise_9_tax_planning.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-10-03 15:31:05
@Email        : cuixuanstephen@gmail.com
@Description  : 税务规划



我们以想要创建的列的名称作为键，对数据框进行赋值，即可添加新列。通常赋给它一个 Series，但也可以是 NumPy 数组或列表，只要其长度与现有列相同即可。列名是唯一的——因此就像字典一样，对已有的列名赋值会用新列替换原有的列。

In [3]:
import pandas as pd
import numpy as np

## Adding columns with assign

In [4]:
# Product catalogue, built column by column (one list per column).
df = pd.DataFrame(
    {
        "product_id": [23, 96, 97, 15, 87],
        "name": ["computer", "Python Workout", "Pandas Workout", "banana", "sandwich"],
        "wholesale_price": [500, 35, 35, 0.5, 3],
        "retail_price": [1000, 75, 75, 1, 5],
        "sales": [100, 1000, 500, 200, 300],
    }
)

# Net income per product: per-unit margin times the number of units sold.
df["current_net"] = df["retail_price"].sub(df["wholesale_price"]).mul(df["sales"])

In [5]:
# Same computation through `assign`, which returns a new frame instead of modifying `df`.
# The callable receives the frame being assigned to.
df.assign(
    current_net=lambda frame: (frame["retail_price"] - frame["wholesale_price"]) * frame["sales"]
)

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,current_net
0,23,computer,500.0,1000,100,50000.0
1,96,Python Workout,35.0,75,1000,40000.0
2,97,Pandas Workout,35.0,75,500,20000.0
3,15,banana,0.5,1,200,100.0
4,87,sandwich,3.0,5,300,600.0


In [6]:
# Fraction of the net income that is kept under each candidate tax rate
# (15%, 20% and 25%), keyed by the column that stores the result.
kept_fraction_by_column = {"after_15": 0.85, "after_20": 0.80, "after_25": 0.75}
for column, kept_fraction in kept_fraction_by_column.items():
    df[column] = df["current_net"] * kept_fraction
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,current_net,after_15,after_20,after_25
0,23,computer,500.0,1000,100,50000.0,42500.0,40000.0,37500.0
1,96,Python Workout,35.0,75,1000,40000.0,34000.0,32000.0,30000.0
2,97,Pandas Workout,35.0,75,500,20000.0,17000.0,16000.0,15000.0
3,15,banana,0.5,1,200,100.0,85.0,80.0,75.0
4,87,sandwich,3.0,5,300,600.0,510.0,480.0,450.0


In [7]:
# Total net income before tax and under each of the three tax scenarios.
df.loc[:, ["current_net", "after_15", "after_20", "after_25"]].sum()

current_net    110700.0
after_15        94095.0
after_20        88560.0
after_25        83025.0
dtype: float64

### Beyond the exercise

In [8]:
# Rebuild the product table from scratch (one tuple per product) so that this
# section starts from a frame holding only the base columns and `current_net`.
df = pd.DataFrame(
    [
        (23, "computer", 500, 1000, 100),
        (96, "Python Workout", 35, 75, 1000),
        (97, "Pandas Workout", 35, 75, 500),
        (15, "banana", 0.5, 1, 200),
        (87, "sandwich", 3, 5, 300),
    ],
    columns=["product_id", "name", "wholesale_price", "retail_price", "sales"],
)

# Net income per product: units sold times the per-unit margin.
df["current_net"] = df["sales"] * (df["retail_price"] - df["wholesale_price"])
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,current_net
0,23,computer,500.0,1000,100,50000.0
1,96,Python Workout,35.0,75,1000,40000.0
2,97,Pandas Workout,35.0,75,500,20000.0
3,15,banana,0.5,1,200,100.0
4,87,sandwich,3.0,5,300,600.0


In [9]:
# Net income at or below 20,000 is untouched; anything above keeps 75% (a 25% tax).
df["current_net"].apply(lambda net: net if not net > 20_000 else net * 0.75).sum()

88200.0

In [10]:
def calculate_tax(x, threshold=20_000, keep_rate=0.75):
    """Return the net income that remains after the threshold-based tax.

    Despite its name, this returns the after-tax amount, not the tax itself.

    Parameters
    ----------
    x : int or float
        Net income of a single product.
    threshold : int or float, default 20_000
        Income strictly greater than this value is taxed.
    keep_rate : float, default 0.75
        Fraction of the income that is kept when it is taxed
        (0.75 corresponds to a 25% tax).

    Returns
    -------
    int or float
        ``x * keep_rate`` when ``x > threshold``; otherwise ``x`` unchanged.
    """
    return x * keep_rate if x > threshold else x
    
# Apply calculate_tax to each product's net income, then total the results.
df['current_net'].apply(calculate_tax).sum()

88200.0

In [11]:
# Map each product's retail price to the fraction of net income kept after tax:
#   (0, 30]    -> 1     (no tax)
#   (30, 80]   -> 0.9   (10% tax)
#   (80, inf]  -> 0.75  (25% tax)
# The top bin is open-ended (np.inf) instead of ending at the observed maximum
# price: with the maximum as the last edge, pd.cut raises a ValueError as soon
# as no price exceeds 80, because the bin edges are no longer increasing.
df["after_tax"] = pd.cut(
    df["retail_price"],
    bins=[0, 30, 80, np.inf],
    labels=[1, 0.9, 0.75],
).astype(np.float64)  # the labels come back as a categorical; convert to plain floats
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,current_net,after_tax
0,23,computer,500.0,1000,100,50000.0,0.75
1,96,Python Workout,35.0,75,1000,40000.0,0.9
2,97,Pandas Workout,35.0,75,500,20000.0,0.9
3,15,banana,0.5,1,200,100.0,1.0
4,87,sandwich,3.0,5,300,600.0,1.0


In [12]:
# Final net income: pre-tax net income scaled by the kept fraction in `after_tax`.
df["final_net"] = df["current_net"].mul(df["after_tax"])
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,current_net,after_tax,final_net
0,23,computer,500.0,1000,100,50000.0,0.75,37500.0
1,96,Python Workout,35.0,75,1000,40000.0,0.9,36000.0
2,97,Pandas Workout,35.0,75,500,20000.0,0.9,18000.0
3,15,banana,0.5,1,200,100.0,1.0,100.0
4,87,sandwich,3.0,5,300,600.0,1.0,600.0


In [13]:
# pandas display option: show floats with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

In [14]:
# Display the frame again after the float format option was set.
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,current_net,after_tax,final_net
0,23,computer,500.0,1000,100,50000.0,0.75,37500.0
1,96,Python Workout,35.0,75,1000,40000.0,0.9,36000.0
2,97,Pandas Workout,35.0,75,500,20000.0,0.9,18000.0
3,15,banana,0.5,1,200,100.0,1.0,100.0
4,87,sandwich,3.0,5,300,600.0,1.0,600.0


## Retrieving and assigning with `loc`

如果我们只想检索行的一部分该怎么办？更重要的是，我们如何才能只对行的一部分设置值？

In [34]:
# 5x5 demo grid holding 10, 20, ..., 250, with rows labelled a-e and columns v-z.
# Built directly from integers, so no float-to-int conversion is needed.
grid_values = np.arange(10, 260, 10).reshape(5, 5)
df = pd.DataFrame(grid_values, index=list("abcde"), columns=list("vwxyz"))
df

Unnamed: 0,v,w,x,y,z
a,10,20,30,40,50
b,60,70,80,90,100
c,110,120,130,140,150
d,160,170,180,190,200
e,210,220,230,240,250


In [35]:
# Scalar lookup: row label 'a', column label 'x'.
df.loc['a', 'x']

30

In [36]:
df.loc['a', # Row selector
       'x'] # Column selector
# For complex selectors, writing the row and column selectors on separate lines makes them easier to tell apart.

30

![](../IMAGES/loc1.png)

In [39]:
df.loc[['a', 'c'], # Row selector: a list of row labels
       'x'] # Column selector: a single column label

a     30
c    130
Name: x, dtype: int32

![](../IMAGES/loc2.png)

In [40]:
df.loc['a', # Row selector: a single row label
       ['v', 'y']] # Column selector: a list of column labels

v    10
y    40
Name: a, dtype: int32

![](../IMAGES/loc3.png)

In [41]:
df.loc[['a', 'c'], # Row selector: a list of row labels
       ['v', 'y']] # Column selector: a list of column labels

Unnamed: 0,v,y
a,10,40
c,110,140


![](../IMAGES/loc4.png)

In [42]:
# Row selector only: a boolean mask built from column 'x'; all columns are returned.
df.loc[df['x'] > 200]

Unnamed: 0,v,w,x,y,z
e,210,220,230,240,250


In [43]:
df.loc[df['x'] > 200, # Row selector: boolean mask built from column 'x'
       df.loc['c'] > 135] # Column selector: boolean mask built from row 'c'

Unnamed: 0,y,z
e,240,250


![](../IMAGES/loc5.png)

In [44]:
df.loc['b', # Row selector: a single row label
       df.loc['c'] > 135] # Column selector: boolean mask built from row 'c'

y     90
z    100
Name: b, dtype: int32

![](../IMAGES/loc6.png)

我们的情况可能比这些复杂得多。但只要记住：逗号前的部分用于选择行，逗号后的部分用于选择列，就没问题了。

如果我们想要修改检索到的值该怎么办？只需将检索查询放在赋值语句的左侧即可。唯一的要求是右边的值必须是标量（这种情况下它会被广播并赋给所有匹配的元素），或者具有匹配的形状（即相同的行数和列数）。

In [45]:
# Assign a scalar to the single cell at row 'b', column 'y'.
df.loc['b', 'y'] = 123

![](../IMAGES/loc7.png)

In [46]:
df.loc['b', df.loc['c'] > 125] = [123, 456, 789]
# The catch here: we do not know in advance how many columns the column selector will match.

Of course, this requires knowing precisely how many values will be needed.

![](../IMAGES/loc8.png)

In [47]:
df.loc['b', # Row selector: a single row label
       df.loc['c'] % 3 == 0] *= 2 # Doubles the selected cells in place; re-running this cell doubles them again

![](../IMAGES/loc9.png)

In [49]:
df.loc[df['v'] > 100, # Row selector: boolean mask built from column 'v'
       df.loc['d'] > 180] = 987 # The scalar is assigned to every selected cell

![](../IMAGES/loc10.png)