In [1]:
import numpy as np
import pandas as pd

# 数据分组操作

## MultiIndex 多级标签

**加载 gapminder.tsv 数据**

In [2]:
# Read the tab-separated file ./data/gapminder.tsv into a DataFrame.
gapminder = pd.read_csv('./data/gapminder.tsv', sep='\t')
# Last expression of the cell: renders the frame as the cell output.
gapminder

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


**将 year 和 country 两列设置为行标签**

In [6]:
# Build a new frame whose row labels are the (year, country) pairs;
# `gapminder` itself is not modified because the result is assigned to a new name.
multiindex_gapminder = gapminder.set_index(['year', 'country'])
# Preview the first rows of the re-indexed frame.
multiindex_gapminder.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,continent,lifeExp,pop,gdpPercap
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,Afghanistan,Asia,28.801,8425333,779.445314
1957,Afghanistan,Asia,30.332,9240934,820.85303
1962,Afghanistan,Asia,31.997,10267083,853.10071
1967,Afghanistan,Asia,34.02,11537966,836.197138
1972,Afghanistan,Asia,36.088,13079460,739.981106


In [7]:
# Inspect the row labels: after set_index(['year', 'country']) this is a
# MultiIndex whose entries are (year, country) tuples.
multiindex_gapminder.index

MultiIndex([(1952, 'Afghanistan'),
            (1957, 'Afghanistan'),
            (1962, 'Afghanistan'),
            (1967, 'Afghanistan'),
            (1972, 'Afghanistan'),
            (1977, 'Afghanistan'),
            (1982, 'Afghanistan'),
            (1987, 'Afghanistan'),
            (1992, 'Afghanistan'),
            (1997, 'Afghanistan'),
            ...
            (1962,    'Zimbabwe'),
            (1967,    'Zimbabwe'),
            (1972,    'Zimbabwe'),
            (1977,    'Zimbabwe'),
            (1982,    'Zimbabwe'),
            (1987,    'Zimbabwe'),
            (1992,    'Zimbabwe'),
            (1997,    'Zimbabwe'),
            (2002,    'Zimbabwe'),
            (2007,    'Zimbabwe')],
           names=['year', 'country'], length=1704)

**根据一级行标签获取数据，示例：获取 1952 年的数据**

In [8]:
# Example: select every row whose first-level label (year) is 1952.
# The label is passed inside a list; the displayed result keeps both the
# year and country index levels.
multiindex_gapminder.loc[[1952]]

Unnamed: 0_level_0,Unnamed: 1_level_0,continent,lifeExp,pop,gdpPercap
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,Afghanistan,Asia,28.801,8425333,779.445314
1952,Albania,Europe,55.230,1282697,1601.056136
1952,Algeria,Africa,43.077,9279525,2449.008185
1952,Angola,Africa,30.015,4232095,3520.610273
1952,Argentina,Americas,62.485,17876956,5911.315053
1952,...,...,...,...,...
1952,Vietnam,Asia,40.412,26246839,605.066492
1952,West Bank and Gaza,Asia,43.160,1030585,1515.592329
1952,"Yemen, Rep.",Asia,32.548,4963829,781.717576
1952,Zambia,Africa,42.038,2672000,1147.388831


**根据一级、二级行标签获取数据，示例：获取 1952 年中国的数据**

In [9]:
# Example: select the row for China in 1952 by giving both index levels
# as a (year, country) tuple. Wrapping the tuple in a list makes .loc
# return a one-row DataFrame, as the cell output shows.
multiindex_gapminder.loc[[(1952, 'China')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,continent,lifeExp,pop,gdpPercap
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,China,Asia,44.0,556263527,400.448611


## 分组聚合操作

**加载 gapminder.tsv 数据集**

In [3]:
# Reload the tab-separated gapminder data so this section starts from the
# plain frame with its default integer row labels.
gapminder = pd.read_csv('./data/gapminder.tsv', sep='\t')
# Preview the first rows.
gapminder.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


### 内置聚合函数使用

**示例：计算每年预期寿命的平均值**

In [11]:
# Example: mean life expectancy for each year.
#
# The same per-year mean can also be obtained as a Series with either
#   gapminder.groupby('year')['lifeExp'].mean()
#   gapminder.groupby('year').lifeExp.mean()
#
# Only the last expression of a cell is displayed, so the aggregation is
# written once, in the dict form that returns a DataFrame.
# The string alias 'mean' is used rather than the np.mean callable:
# pandas 2.x emits a FutureWarning when a NumPy reduction function is
# passed to GroupBy.agg and recommends the string name instead.
gapminder.groupby('year').agg({'lifeExp': 'mean'})

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


**示例：统计每年预期寿命的最小值、最大值和平均值**

In [12]:
# Example: per-year minimum, maximum and mean of life expectancy.
# Attribute-style spelling of the same call:
#   gapminder.groupby('year').lifeExp.agg(['min', 'max', 'mean'])
gapminder.groupby('year')['lifeExp'].agg(['min', 'max', 'mean'])

Unnamed: 0_level_0,min,max,mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,28.801,72.67,49.05762
1957,30.332,73.47,51.507401
1962,31.997,73.68,53.609249
1967,34.02,74.16,55.67829
1972,35.4,74.72,57.647386
1977,31.22,76.11,59.570157
1982,38.445,77.11,61.533197
1987,39.906,78.67,63.212613
1992,23.599,79.36,64.160338
1997,36.087,80.69,65.014676


**示例：统计每年的平均预期寿命和人均GDP的最大值**

In [13]:
# Example: for each year, the mean life expectancy together with the
# largest GDP per capita, displayed under descriptive column labels.
ret = gapminder.groupby('year').agg(
    {
        'lifeExp': 'mean',
        'gdpPercap': 'max',
    }
)
# rename returns a relabelled copy for display; `ret` keeps its original
# column names.
ret.rename({'lifeExp': '人均寿命', 'gdpPercap': '最高GDP'}, axis='columns')

Unnamed: 0_level_0,人均寿命,最高GDP
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1952,49.05762,108382.3529
1957,51.507401,113523.1329
1962,53.609249,95458.11176
1967,55.67829,80894.88326
1972,57.647386,109347.867
1977,59.570157,59265.47714
1982,61.533197,33693.17525
1987,63.212613,31540.9748
1992,64.160338,34932.91959
1997,65.014676,41283.16433


### 自定义聚合函数使用

**示例：计算每年预期寿命的平均值(自定义聚合函数)**

In [14]:
def my_mean(values):
    """Return the arithmetic mean of ``values``.

    The items are added one at a time, starting from 0, and the total is
    divided by ``len(values)``. ``values`` must therefore support both
    ``len()`` and iteration; an empty input raises ``ZeroDivisionError``.
    """
    total = 0
    # Number of items taking part in the average.
    count = len(values)
    for item in values:
        total += item
    return total / count

# Apply the custom aggregation to each year's life-expectancy values.
# Attribute-style spelling of the same call:
#   gapminder.groupby('year').lifeExp.agg(my_mean)
gapminder.groupby('year')['lifeExp'].agg(my_mean)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

**示例：统计每年的平均预期寿命与总体平均预期寿命的差值(自定义聚合函数)**

## 分组 transform 转换

##### transform 转换，需要把 DataFrame 中的值传递给一个函数， 而后由该函数"转换"数据
##### aggregate(聚合) 返回单个聚合值，但 transform 不会减少数据量

### transform 基本使用

**需求：按年分组，并计算组内每个国家的预期寿命和该组平均预期寿命的差值**

### transform 案例：分组填充缺失值

**加载 tips.csv 数据集，并从其中随机取出 10 条数据**

**构建缺失值**

**分组查看缺失情况**

**定义函数，按性别分组填充缺失值**

## 分组 filter 过滤

##### 使用 groupby 方法还可以过滤分组数据：调用 filter 方法，传入一个返回布尔值的函数，返回 False 的分组数据会被过滤掉

**使用 tips.csv 用餐数据集，加载数据并统计不同用餐人数的数量**

In [4]:
# Read ./data/tips.csv into a DataFrame.
tips = pd.read_csv('./data/tips.csv')
# Last expression of the cell: renders the frame as the cell output.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


**统计不同用餐人数的数量**

## DataFrameGroupBy 对象

### 分组对象的基本属性和方法

**加载 tips.csv 数据集，随机取出其中的 10 条数据**

In [5]:
# Read ./data/tips.csv and draw 10 rows at random; random_state=42 fixes
# the seed so the same rows are selected on every run.
tips_10 = pd.read_csv('./data/tips.csv').sample(10, random_state=42)
# Display the sampled rows.
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


### 遍历分组对象

### 按照多列进行分组