In [1]:
# Notebook metadata banner; printed so it shows up in the rendered output.
_banner = """
@File         : Binning algorithms.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-12-28 14:56:49
@Email        : cuixuanstephen@gmail.com
@Description  : Binning algorithms
"""
print(_banner)


@File         : Binning algorithms.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-12-28 14:56:49
@Email        : cuixuanstephen@gmail.com
@Description  : Binning algorithms



In [2]:
import pandas as pd
import numpy as np

分箱是将连续变量归类到离散存储桶的过程。将可能无限数量的值转换为有限数量的“存储桶”以供分析非常有用。

In [3]:
# Sample data: one (name, age) record per person.
records = [
    ("Jane", 34),
    ("John", 18),
    ("Jamie", 22),
    ("Jessica", 36),
    ("Jackie", 33),
    ("Steve", 40),
    ("Sam", 30),
    ("Stephanie", 66),
    ("Sarah", 55),
    ("Aaron", 22),
    ("Erin", 28),
    ("Elsa", 37),
]

# Build the frame and switch to pandas' nullable extension dtypes in one chain.
df = (
    pd.DataFrame(records, columns=["name", "age"])
    .convert_dtypes(dtype_backend="numpy_nullable")
)
df.head()

Unnamed: 0,name,age
0,Jane,34
1,John,18
2,Jamie,22
3,Jessica,36
4,Jackie,33


In [4]:
# An integer `bins` lets pandas derive that many bins from the observed ages.
pd.cut(x=df["age"], bins=4)

0       (30.0, 42.0]
1     (17.952, 30.0]
2     (17.952, 30.0]
3       (30.0, 42.0]
4       (30.0, 42.0]
5       (30.0, 42.0]
6     (17.952, 30.0]
7       (54.0, 66.0]
8       (54.0, 66.0]
9     (17.952, 30.0]
10    (17.952, 30.0]
11      (30.0, 42.0]
Name: age, dtype: category
Categories (4, interval[float64, right]): [(17.952, 30.0] < (30.0, 42.0] < (42.0, 54.0] < (54.0, 66.0]]

无论 pandas 选择哪种算法来确定存储桶，第一个存储桶的下界 17.952 对 pandas 内部来说是有意义的，但对我们来说并无意义，因为我们知道处理的是整数年龄。幸运的是，可以通过 `precision` 关键字参数来控制分箱边界保留的小数位数，从而去掉这些多余的小数：

In [5]:
# `precision=0` keeps zero decimal places when the bin edges are labelled.
pd.cut(x=df["age"], bins=4, precision=0)

0     (30.0, 42.0]
1     (18.0, 30.0]
2     (18.0, 30.0]
3     (30.0, 42.0]
4     (30.0, 42.0]
5     (30.0, 42.0]
6     (18.0, 30.0]
7     (54.0, 66.0]
8     (54.0, 66.0]
9     (18.0, 30.0]
10    (18.0, 30.0]
11    (30.0, 42.0]
Name: age, dtype: category
Categories (4, interval[float64, right]): [(18.0, 30.0] < (30.0, 42.0] < (42.0, 54.0] < (54.0, 66.0]]

`pd.cut` 并不局限于生成大小相同的分箱。如果我们想手动指定区间边界，可以将这些边界作为第二个参数（`bins`）传入：

In [6]:
# Explicit bin edges every 10 years: 10, 20, ..., 70.
decade_edges = list(range(10, 80, 10))
pd.cut(df["age"], bins=decade_edges)

0     (30, 40]
1     (10, 20]
2     (20, 30]
3     (30, 40]
4     (30, 40]
5     (30, 40]
6     (20, 30]
7     (60, 70]
8     (50, 60]
9     (20, 30]
10    (20, 30]
11    (30, 40]
Name: age, dtype: category
Categories (6, interval[int64, right]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 70]]

可以用 `np.inf` 作为最后一个边界，来表示“大于某个值”的开放区间（同理，`-np.inf` 可用来表示“小于某个值”）：

In [7]:
# Use np.inf as the last edge so the final bin is open-ended (everything above 60).
# The bin list must only be passed to pd.cut; subscripting the result with the
# same list would look those values up as index labels and raise a KeyError.
pd.cut(df["age"], [10, 20, 30, 40, 50, 60, np.inf])

0     (30.0, 40.0]
1     (10.0, 20.0]
2     (20.0, 30.0]
3     (30.0, 40.0]
4     (30.0, 40.0]
5     (30.0, 40.0]
6     (20.0, 30.0]
7      (60.0, inf]
8     (50.0, 60.0]
9     (20.0, 30.0]
10    (20.0, 30.0]
11    (30.0, 40.0]
Name: age, dtype: category
Categories (6, interval[float64, right]): [(10.0, 20.0] < (20.0, 30.0] < (30.0, 40.0] < (40.0, 50.0] < (50.0, 60.0] < (60.0, inf]]

但是 `(60.0, inf]` 这样的区间不便于阅读，可以通过 `labels` 参数手动为每个分箱指定标签。

In [8]:
# Replace the interval notation with human-readable labels, one per bin.
pd.cut(
    x=df["age"],
    bins=[10, 20, 30, 40, 50, 60, np.inf],
    labels=["10-20", "20-30", "30-40", "40-50", "50-60", "60+"],
)

0     30-40
1     10-20
2     20-30
3     30-40
4     30-40
5     30-40
6     20-30
7       60+
8     50-60
9     20-30
10    20-30
11    30-40
Name: age, dtype: category
Categories (6, object): ['10-20' < '20-30' < '30-40' < '40-50' < '50-60' < '60+']

但是，我们上面的标签并不完全正确。请注意，我们提供了 `30-40` 和 `40-50`，但如果某人正好是 40 岁，会发生什么情况？他应该被放在哪个分箱里？

In [9]:
# Attach the bin of each row as a new column; the original frame is left untouched.
df.assign(
    age_bin=lambda frame: pd.cut(frame["age"], bins=[10, 20, 30, 40, 50, 60, 999])
)

Unnamed: 0,name,age,age_bin
0,Jane,34,"(30, 40]"
1,John,18,"(10, 20]"
2,Jamie,22,"(20, 30]"
3,Jessica,36,"(30, 40]"
4,Jackie,33,"(30, 40]"
5,Steve,40,"(30, 40]"
6,Sam,30,"(20, 30]"
7,Stephanie,66,"(60, 999]"
8,Sarah,55,"(50, 60]"
9,Aaron,22,"(20, 30]"


默认情况下，分箱是右闭区间，这意味着每个分箱的范围可以理解为“直到并包含”其右端点的值。如果我们想要“直到但不包含”右端点的行为，可以使用 `right` 参数来控制：

In [10]:
# right=False makes every bin left-closed / right-open, e.g. [30, 40).
df.assign(
    age_bin=lambda frame: pd.cut(
        frame["age"],
        bins=[10, 20, 30, 40, 50, 60, 999],
        right=False,
    )
)

Unnamed: 0,name,age,age_bin
0,Jane,34,"[30, 40)"
1,John,18,"[10, 20)"
2,Jamie,22,"[20, 30)"
3,Jessica,36,"[30, 40)"
4,Jackie,33,"[30, 40)"
5,Steve,40,"[40, 50)"
6,Sam,30,"[30, 40)"
7,Stephanie,66,"[60, 999)"
8,Sarah,55,"[50, 60)"
9,Aaron,22,"[20, 30)"
