In [1]:
import pandas as pd

# 数据划分区间(cut函数)

**pd.cut(x, bins, right=True, labels=None)**<br/>x：要划分区间的数据（必须是一维的）<br/>bins：区间边界值（也可以传入一个整数，表示等宽划分的区间个数）<br/>right：默认为 True，表示划分区间时左开右闭；如果需要左闭右开则设置为 False<br/>labels：默认为 None，用于设置每个区间对应的显示标签，标签个数必须与区间个数相同

In [2]:
# Load the tips dataset from a CSV file (path relative to the notebook) into a DataFrame
tips = pd.read_csv('./data/tips.csv')
# Bare last expression so the notebook renders the frame
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Inspect the columns: index range, column names, non-null counts, dtypes and memory usage
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


**查看 total_bill 这一列的各种统计值**

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the total_bill column
tips['total_bill'].describe()

count    244.000000
mean      19.785943
std        8.902412
min        3.070000
25%       13.347500
50%       17.795000
75%       24.127500
max       50.810000
Name: total_bill, dtype: float64

**将 tips 数据按照 total_bill 列的值划分为 3 个区间：0-10、10-30、30-60**

In [5]:
# Bin edges: each pair of neighbouring values delimits one interval.
# With the default right=True the intervals are (0, 10], (10, 30], (30, 60].
bins = [0, 10, 30, 60]
pd.cut(x=tips['total_bill'], bins=bins)

0      (10, 30]
1      (10, 30]
2      (10, 30]
3      (10, 30]
4      (10, 30]
         ...   
239    (10, 30]
240    (10, 30]
241    (10, 30]
242    (10, 30]
243    (10, 30]
Name: total_bill, Length: 244, dtype: category
Categories (3, interval[int64]): [(0, 10] < (10, 30] < (30, 60]]

In [6]:
# right=False makes every interval closed on the left and open on the right: [a, b)
pd.cut(x=tips['total_bill'], bins=bins, right=False)

0      [10, 30)
1      [10, 30)
2      [10, 30)
3      [10, 30)
4      [10, 30)
         ...   
239    [10, 30)
240    [10, 30)
241    [10, 30)
242    [10, 30)
243    [10, 30)
Name: total_bill, Length: 244, dtype: category
Categories (3, interval[int64]): [[0, 10) < [10, 30) < [30, 60)]

In [7]:
# Display labels for the intervals, one per interval and in the same order as the bin edges
labels = ['<10', '<30', '<60']
pd.cut(
    x=tips['total_bill'],
    bins=bins,
    right=False,
    labels=labels,
)

0      <30
1      <30
2      <30
3      <30
4      <30
      ... 
239    <30
240    <30
241    <30
242    <30
243    <30
Name: total_bill, Length: 244, dtype: category
Categories (3, object): ['<10' < '<30' < '<60']

In [8]:
# Attach the labelled spending bracket of every bill to tips as a new column
tips['bill_group'] = pd.cut(
    x=tips['total_bill'],
    bins=bins,
    right=False,
    labels=labels,
)
# Bare last expression so the notebook renders the extended frame
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,bill_group
0,16.99,1.01,Female,No,Sun,Dinner,2,<30
1,10.34,1.66,Male,No,Sun,Dinner,3,<30
2,21.01,3.50,Male,No,Sun,Dinner,3,<30
3,23.68,3.31,Male,No,Sun,Dinner,2,<30
4,24.59,3.61,Female,No,Sun,Dinner,4,<30
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,<30
240,27.18,2.00,Female,Yes,Sat,Dinner,2,<30
241,22.67,2.00,Male,Yes,Sat,Dinner,2,<30
242,17.82,1.75,Male,No,Sat,Dinner,2,<30


In [9]:
# Count how many bills fall into each spending bracket.
# bill_group is a categorical column, so observed is passed explicitly:
# observed=False keeps every bracket in the result (an empty one shows as 0)
# and avoids depending on the groupby default, which newer pandas versions
# warn about and plan to change.
tips.groupby('bill_group', observed=False).size()

bill_group
<10     17
<30    195
<60     32
dtype: int64