### 离散化与分箱

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

In [2]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Target age groups: 18-25, 26-35, 36-60 and over 60.
# pd.cut builds right-closed intervals by default, so these edges give
# (18, 25], (25, 35], (35, 60], (60, 100]. The third edge has to be 35:
# with 36, an age of 36 would be labelled 'youthadult' instead of 'middleaged'.
bins = [18, 25, 35, 60, 100]
group_names = ['youth', 'youthadult', 'middleaged', 'senior']
# Pass right=False to pd.cut to get left-closed, right-open intervals instead.
groups = pd.cut(ages, bins, labels=group_names)
groups

['youth', 'youth', 'youth', 'youthadult', 'youth', ..., 'youthadult', 'senior', 'middleaged', 'middleaged', 'youthadult']
Length: 12
Categories (4, object): ['youth' < 'youthadult' < 'middleaged' < 'senior']

In [3]:
# pd.cut returns a special Categorical object; .categories holds its category labels
groups.categories

Index(['youth', 'youthadult', 'middleaged', 'senior'], dtype='object')

In [4]:
# Integer code of each element, i.e. the position of its label in groups.categories
groups.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [5]:
# Count how many ages fall into each bin. The top-level pd.value_counts is
# deprecated (pandas 2.1+), so wrap the Categorical in a Series and call the
# Series method instead.
pd.Series(groups).value_counts()

youth         5
middleaged    3
youthadult    3
senior        1
dtype: int64

如果传给cut的是箱的个数(一个整数)而不是显式的箱边，pandas将根据数据中的最小值和最大值计算出等长的箱:

In [6]:
# 1000 standard-normal samples stored in a single column named 'data'
frame = pd.DataFrame({'data': np.random.randn(1000)})
frame.head()

Unnamed: 0,data
0,0.496714
1,-0.138264
2,0.647689
3,1.52303
4,-0.234153


In [7]:
# An integer bin count asks pd.cut for that many equal-width bins;
# precision=2 sets the precision used for the bin labels.
groups = pd.cut(frame['data'], bins=5, precision=2)
groups

0       (-0.4, 1.02]
1       (-0.4, 1.02]
2       (-0.4, 1.02]
3       (1.02, 2.43]
4       (-0.4, 1.02]
           ...      
995     (-0.4, 1.02]
996     (1.02, 2.43]
997     (-0.4, 1.02]
998    (-1.82, -0.4]
999     (-0.4, 1.02]
Name: data, Length: 1000, dtype: category
Categories (5, interval[float64]): [(-3.25, -1.82] < (-1.82, -0.4] < (-0.4, 1.02] < (1.02, 2.43] < (2.43, 3.85]]

cut返回的分类结果(此处是一个category类型的Series)可以直接传递给groupby:

In [8]:
def get_stats(group):
    """Summarise one group as a dict of descriptive statistics.

    Parameters
    ----------
    group : object exposing ``min()``, ``max()``, ``count()`` and ``mean()``
        In this notebook, the values of one groupby group.

    Returns
    -------
    dict
        Keys ``'min'``, ``'max'``, ``'count'`` and ``'mean'`` (in that order),
        each mapped to the result of the method of the same name.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}


# Group the values by their bin. observed=False is spelled out so that every
# category of the grouping key is kept and the result does not depend on the
# installed pandas version's default for `observed`.
grouped = frame.data.groupby(groups, observed=False)
# get_stats returns one dict per group; unstack() moves the statistic names
# from the inner index level into columns.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.25, -1.82]",-3.241267,-1.840874,26.0,-2.173346
"(-1.82, -0.4]",-1.804882,-0.408075,324.0,-0.918304
"(-0.4, 1.02]",-0.40122,1.01437,498.0,0.270195
"(1.02, 2.43]",1.029156,2.314659,142.0,1.489463
"(2.43, 3.85]",2.445752,3.852731,10.0,2.730883


为了根据样本分位数计算出等大小的桶，则需要使用qcut；通过传递labels=False，可以只返回各桶的整数编号(而不是区间标签):

In [9]:
# labels=False makes qcut return integer bin numbers instead of interval labels
grouping = pd.qcut(frame['data'], q=5, labels=False)
grouped = frame['data'].groupby(grouping)
# One row per quantile bin, one column per statistic from get_stats
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-3.241267,-0.808298,200.0,-1.327204
1,-0.802277,-0.241236,200.0,-0.521204
2,-0.240325,0.248221,200.0,0.019183
3,0.249384,0.81351,200.0,0.515885
4,0.813517,3.852731,200.0,1.41


qcut基于样本分位数进行分箱，因此得到的是大小相等(每个箱内样本数相同)的箱，而不是等长的箱:

In [10]:
data = np.random.randn(2000)
# Cut at the sample quartiles, so each of the 4 bins receives the same number of points
cuts = pd.qcut(data, 4)
# The top-level pd.value_counts is deprecated (pandas 2.1+); use the Series method instead
pd.Series(cuts).value_counts()

(0.683, 3.926]      500
(0.0241, 0.683]     500
(-0.624, 0.0241]    500
(-3.021, -0.624]    500
dtype: int64

In [11]:
# Pass custom quantiles (numbers between 0 and 1, inclusive) as the bin edges.
# The top-level pd.value_counts is deprecated (pandas 2.1+); use the Series method instead.
pd.Series(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])).value_counts()

(0.0241, 1.302]     800
(-1.221, 0.0241]    800
(1.302, 3.926]      200
(-3.021, -1.221]    200
dtype: int64