In [1]:
import numpy as np
import pandas as pd

#### DataFrame.dropna(axis=0, how='any', thresh=None, subset=None,inplace=False)
- 删除缺失值所在的行或列
- axis：0表示删除包含缺失值的行，1表示删除包含缺失值的列
- how：'any'表示如果存在任何缺失值，则删除该行或列；'all'表示如果所有值都是缺失值，则删除该行或列
- thresh：只保留至少n个非NaN值的行或列，n由该参数指定
- subset：定义要根据哪些列（行）中的缺失值来删除行（列），和axis成行列对应关系
- inplace：如果为 True 表示对原数据操作，返回 None

In [2]:
# Sample frame with missing values in the 'name' and 'age' columns.
d = dict(name=['Tom', np.nan, 'Bob'],
         age=[np.nan, np.nan, 19],
         height=[177, 182, 179])
df = pd.DataFrame(d)
print(df)

  name   age  height
0  Tom   NaN     177
1  NaN   NaN     182
2  Bob  19.0     179


In [3]:
# First two results: drop every row that contains a missing value
# (axis=0 and how='any' are the defaults, so both calls are equivalent).
# Last two results: the same rule applied to columns via axis=1.
print(df.dropna(),
      '*' * 30,
      df.dropna(how='any'),
      '*' * 30,
      df.dropna(axis=1),
      '*' * 30,
      df.dropna(axis=1, how='any'),
      sep='\n')

  name   age  height
2  Bob  19.0     179
******************************
  name   age  height
2  Bob  19.0     179
******************************
   height
0     177
1     182
2     179
******************************
   height
0     177
1     182
2     179


In [4]:
# Modify the data: blank out two more cells so that row 1 and the 'age'
# column consist of missing values only.
df.loc[2, 'age'] = np.nan
df.loc[1, 'height'] = np.nan
print(df,
      '*' * 30,
      # how='all' drops only the rows in which every value is missing
      df.dropna(how='all'),
      '*' * 30,
      # the same rule applied to columns
      df.dropna(axis=1, how='all'),
      sep='\n')

  name  age  height
0  Tom  NaN   177.0
1  NaN  NaN     NaN
2  Bob  NaN   179.0
******************************
  name  age  height
0  Tom  NaN   177.0
2  Bob  NaN   179.0
******************************
  name  height
0  Tom   177.0
1  NaN     NaN
2  Bob   179.0


#### thresh（threshold）：阈值/门槛，即保留某行（列）所需的最少非缺失值个数

In [5]:
# Put back the two values that were blanked out earlier.
df.loc[2, 'age'] = 19
df.loc[1, 'height'] = 182
print(df,
      '*' * 30,
      # thresh=2 on the default axis keeps only the ROWS that have
      # at least 2 non-NaN values
      df.dropna(thresh=2),
      '*' * 30,
      # with axis=1 the same threshold is applied to the COLUMNS
      df.dropna(axis=1, thresh=2),
      sep='\n')

  name   age  height
0  Tom   NaN   177.0
1  NaN   NaN   182.0
2  Bob  19.0   179.0
******************************
  name   age  height
0  Tom   NaN   177.0
2  Bob  19.0   179.0
******************************
  name  height
0  Tom   177.0
1  NaN   182.0
2  Bob   179.0


In [6]:
print(
    # drop a row only when its 'name' or 'height' value is missing
    df.dropna(subset=['name', 'height']),
    '*' * 30,
    # drop a column only when it has a missing value in row 0 or row 2
    df.dropna(axis=1, subset=[0, 2]),
    sep='\n',
)

  name   age  height
0  Tom   NaN   177.0
2  Bob  19.0   179.0
******************************
  name  height
0  Tom   177.0
1  NaN   182.0
2  Bob   179.0


In [7]:
print(df, '*' * 30, sep='\n')
# inplace=True modifies df itself (the call returns None), so the second
# print shows the frame after every column containing a NaN was removed.
df.dropna(axis=1, inplace=True)
print(df)

  name   age  height
0  Tom   NaN   177.0
1  NaN   NaN   182.0
2  Bob  19.0   179.0
******************************
   height
0   177.0
1   182.0
2   179.0


#### DataFrame.fillna(value=None, method=None, axis=None, inplace=False,limit=None)
- value：需要填充的数据
- method：填充方式。'pad'/'ffill' 表示用前一个非缺失值去填充该缺失值；'backfill'/'bfill' 表示用后一个非缺失值填充该缺失值
- axis：指定填充方向
- inplace：如果为 True 表示对原数据操作，返回 None
- limit：限制填充个数

In [8]:
# Same 4x4 frame, written column by column; column 'C' is entirely missing.
df = pd.DataFrame({'A': [np.nan, 3, np.nan, np.nan],
                   'B': [2, 4, np.nan, 3],
                   'C': [np.nan, np.nan, np.nan, np.nan],
                   'D': [0, 1, np.nan, 4]})
print(df,
      '*' * 30,
      # scalar fill: every NaN becomes 0
      df.fillna(0),
      sep='\n')

     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  NaN NaN  NaN
3  NaN  3.0 NaN  4.0
******************************
     A    B    C    D
0  0.0  2.0  0.0  0.0
1  3.0  4.0  0.0  1.0
2  0.0  0.0  0.0  0.0
3  0.0  3.0  0.0  4.0


In [9]:
# Dict fill: each key names a column, each value replaces the NaNs there.
dic = dict(A=6, B=7)
print(df.fillna(value=dic),
      '*' * 30,
      # fill 'A' with 6 and 'C' with 7, at most 2 cells per column
      df.fillna(value={'A': 6, 'C': 7}, limit=2),
      sep='\n')

     A    B   C    D
0  6.0  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  6.0  7.0 NaN  NaN
3  6.0  3.0 NaN  4.0
******************************
     A    B    C    D
0  6.0  2.0  7.0  0.0
1  3.0  4.0  7.0  1.0
2  6.0  NaN  NaN  NaN
3  NaN  3.0  NaN  4.0


In [10]:
# When the fill value is itself a DataFrame, replacement values are looked
# up by matching row and column labels.
np.random.seed(3)
arr = np.random.randint(100, 200, (3, 5))
df2 = pd.DataFrame(data=arr, columns=['D', 'H', 'A', 'T', 'C'])
print(df2, '*' * 30, df.fillna(df2), sep='\n')

     D    H    A    T    C
0  124  103  156  172  100
1  121  119  174  141  110
2  121  138  196  120  144
******************************
       A    B      C      D
0  156.0  2.0  100.0    0.0
1    3.0  4.0  110.0    1.0
2  196.0  NaN  144.0  121.0
3    NaN  3.0    NaN    4.0


In [11]:
# Forward fill: each NaN takes the previous non-missing value, first down
# every column ('pad'), then along every row ('ffill' with axis=1).
# NOTE(review): recent pandas releases reportedly deprecate
# fillna(method=...) in favour of DataFrame.ffill()/bfill() - confirm for
# the pandas version in use.
print(df.fillna(method='pad'), df.fillna(method='ffill', axis=1), sep='\n')

     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  3.0  4.0 NaN  1.0
3  3.0  3.0 NaN  4.0
     A    B    C    D
0  NaN  2.0  2.0  0.0
1  3.0  4.0  4.0  1.0
2  NaN  NaN  NaN  NaN
3  NaN  3.0  3.0  4.0


In [12]:
# Show df again: the fillna calls above were not in-place, so it is unchanged.
print(df)

     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  NaN NaN  NaN
3  NaN  3.0 NaN  4.0


In [13]:
# Backward fill: each NaN takes the next non-missing value, first up every
# column ('backfill'), then along every row ('bfill' with axis=1).
# NOTE(review): recent pandas releases reportedly deprecate
# fillna(method=...) in favour of DataFrame.ffill()/bfill() - confirm for
# the pandas version in use.
print(df.fillna(method='backfill'), df.fillna(method='bfill', axis=1), sep='\n')

     A    B   C    D
0  3.0  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  3.0 NaN  4.0
3  NaN  3.0 NaN  4.0
     A    B    C    D
0  2.0  2.0  0.0  0.0
1  3.0  4.0  1.0  1.0
2  NaN  NaN  NaN  NaN
3  3.0  3.0  4.0  4.0


#### DataFrame.info(verbose=None, show_counts=None)
- 打印 DataFrame 的简明摘要
- verbose：是否打印完整的摘要，为None时表示打印完整摘要，为False则打印简短摘要
- show_counts：是否显示Non-Null Count，为None时表示显示，为False则不显示

In [14]:
# Three labelled rows; the last 'name' is missing so info() reports
# 2 non-null values for that column.
df = pd.DataFrame(
    {'name': ['Tom', 'Bob', np.nan],
     'age': [18, 19, 17],
     'height': [167, 177, 178]},
    index=['n%d' % k for k in (1, 2, 3)],
)
print(df, '*' * 30, sep='\n')
df.info()

   name  age  height
n1  Tom   18     167
n2  Bob   19     177
n3  NaN   17     178
******************************
<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, n1 to n3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    2 non-null      object
 1   age     3 non-null      int64 
 2   height  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 96.0+ bytes


In [15]:
# verbose=False prints the short summary: a column range instead of the per-column table.
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, n1 to n3
Columns: 3 entries, name to height
dtypes: int64(2), object(1)
memory usage: 96.0+ bytes


In [16]:
# show_counts=False leaves the "Non-Null Count" column out of the summary.
df.info(show_counts=False)

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, n1 to n3
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   name    object
 1   age     int64 
 2   height  int64 
dtypes: int64(2), object(1)
memory usage: 96.0+ bytes


#### DataFrame.describe(percentiles=None, include=None, exclude=None)
- 返回描述性统计
- percentiles：默认值为 [.25, .5, .75]，它返回第 25、第 50 和第 75个百分位数
- include：包含在结果中的数据类型；默认None表示结果将包括所有数字列；'all'表示包括所有列；'number'表示包括所有数字列；'object'表示包括所有字符列
- exclude：不包含在结果中的数据类型；默认None表示结果不会排除任何列；'number'表示不包括所有数字列；'object'表示不包括所有字符列

In [17]:
# One text column (with a repeated value) and two numeric columns.
df = pd.DataFrame(
    {'name': ['Tom', 'Bob', 'Bob'],
     'age': [18, 19, 17],
     'height': [167, 177, 178]},
    index=['n%d' % k for k in (1, 2, 3)],
)
print(df)

   name  age  height
n1  Tom   18     167
n2  Bob   19     177
n3  Bob   17     178


In [18]:
# By default only numeric columns are summarised, so excluding the
# 'object' columns prints the same table.
print(df.describe(), '*' * 30, df.describe(exclude='object'), sep='\n')

        age      height
count   3.0    3.000000
mean   18.0  174.000000
std     1.0    6.082763
min    17.0  167.000000
25%    17.5  172.000000
50%    18.0  177.000000
75%    18.5  177.500000
max    19.0  178.000000
******************************
        age      height
count   3.0    3.000000
mean   18.0  174.000000
std     1.0    6.082763
min    17.0  167.000000
25%    17.5  172.000000
50%    18.0  177.000000
75%    18.5  177.500000
max    19.0  178.000000


In [19]:
# Summarise only the text column: either include 'object' explicitly
# or exclude everything numeric.
print(df.describe(include='object'), '*' * 30, df.describe(exclude='number'), sep='\n')

       name
count     3
unique    2
top     Bob
freq      2
******************************
       name
count     3
unique    2
top     Bob
freq      2


In [20]:
print(df.describe(include='all'),
      '*' * 30,
      # include also accepts a list combining several dtype selectors
      df.describe(include=['number', 'object']),
      sep='\n')

       name   age      height
count     3   3.0    3.000000
unique    2   NaN         NaN
top     Bob   NaN         NaN
freq      2   NaN         NaN
mean    NaN  18.0  174.000000
std     NaN   1.0    6.082763
min     NaN  17.0  167.000000
25%     NaN  17.5  172.000000
50%     NaN  18.0  177.000000
75%     NaN  18.5  177.500000
max     NaN  19.0  178.000000
******************************
       name   age      height
count     3   3.0    3.000000
unique    2   NaN         NaN
top     Bob   NaN         NaN
freq      2   NaN         NaN
mean    NaN  18.0  174.000000
std     NaN   1.0    6.082763
min     NaN  17.0  167.000000
25%     NaN  17.5  172.000000
50%     NaN  18.0  177.000000
75%     NaN  18.5  177.500000
max     NaN  19.0  178.000000


- DataFrame.count(axis=0) 返回指定轴的非缺失值的数量
- DataFrame.max(axis=0) 返回指定轴的最大值
- DataFrame.min(axis=0) 返回指定轴的最小值
- DataFrame.mean(axis=0) 返回指定轴的平均值
- DataFrame.var(axis=0) 返回指定轴的方差
- DataFrame.std(axis=0) 返回指定轴的标准差

In [21]:
# Seeded 3x5 frame of random integers drawn from [0, 10).
np.random.seed(3)
arr = np.random.randint(0, 10, (3, 5))
df = pd.DataFrame(data=arr, columns=['D', 'H', 'A', 'T', 'C'])
print(df,
      '*' * 30,
      df.count(),          # non-missing values per column
      '*' * 30,
      df.count(axis=1),    # non-missing values per row
      sep='\n')

   D  H  A  T  C
0  8  9  3  8  8
1  0  5  3  9  9
2  5  7  6  0  4
******************************
D    3
H    3
A    3
T    3
C    3
dtype: int64
******************************
0    5
1    5
2    5
dtype: int64


In [22]:
print(df.max(axis=0),   # maximum of every column
      '*' * 30,
      df.max(axis=1),   # maximum of every row
      sep='\n')

D    8
H    9
A    6
T    9
C    9
dtype: int32
******************************
0    9
1    9
2    7
dtype: int32


In [23]:
print(df.min(),         # minimum of every column
      '*' * 30,
      df.min(axis=1),   # minimum of every row
      sep='\n')

D    0
H    5
A    3
T    0
C    4
dtype: int32
******************************
0    3
1    0
2    0
dtype: int32


In [24]:
print(df.mean(),         # mean of every column
      '*' * 30,
      df.mean(axis=1),   # mean of every row
      sep='\n')

D    4.333333
H    7.000000
A    4.000000
T    5.666667
C    7.000000
dtype: float64
******************************
0    7.2
1    5.2
2    4.4
dtype: float64


In [25]:
print(df.var(),         # variance of every column
      '*' * 30,
      df.var(axis=1),   # variance of every row
      sep='\n')

D    16.333333
H     4.000000
A     3.000000
T    24.333333
C     7.000000
dtype: float64
******************************
0     5.7
1    15.2
2     7.3
dtype: float64


In [26]:
print(df.std(),         # standard deviation of every column
      '*' * 30,
      df.std(axis=1),   # standard deviation of every row
      sep='\n')

D    4.041452
H    2.000000
A    1.732051
T    4.932883
C    2.645751
dtype: float64
******************************
0    2.387467
1    3.898718
2    2.701851
dtype: float64


#### DataFrame.apply(func, axis=0)
- 沿着 DataFrame 的指定轴，对每一列（或每一行）应用函数 func，并返回应用后的结果
- func：应用于每一个列或行的函数
- axis：0 or 'index'表示函数处理的是每一列；1 or 'columns'表示函数处理的是每一行

In [27]:
# Show the frame that the apply examples below operate on.
print(df)

   D  H  A  T  C
0  8  9  3  8  8
1  0  5  3  9  9
2  5  7  6  0  4


In [28]:
# The built-in median and apply(np.median) give the same per-column result;
# axis=1 hands each row to the function instead.
print(df.median(),
      '*' * 30,
      df.apply(np.median),
      '*' * 30,
      df.apply(np.median, axis=1),
      sep='\n')

D    5.0
H    7.0
A    3.0
T    8.0
C    8.0
dtype: float64
******************************
D    5.0
H    7.0
A    3.0
T    8.0
C    8.0
dtype: float64
******************************
0    8.0
1    5.0
2    5.0
dtype: float64


In [29]:
def f(seq):
    """Return a list containing the square of every element of ``seq``."""
    squares = []
    for item in seq:
        squares.append(item ** 2)
    return squares


# apply passes each column to f (axis=0 is the default), so every value is squared.
print(df.apply(f))

    D   H   A   T   C
0  64  81   9  64  64
1   0  25   9  81  81
2  25  49  36   0  16


#### DataFrame.sample(n=None, frac=None, replace=False,random_state=None, axis=None)
- 从指定的轴返回随机样本
- n：默认为1，表示要采样的行数或列数，不能和 frac 参数一起使用
- frac：表示要采用的比例，不能和 n 参数一起使用
- replace：表示是否有放回采样
- random_state：随机数种子
- axis：表示采样的方向，默认为0，行采样

In [30]:
# Four labelled rows used by the sampling examples.
df = pd.DataFrame(
    {'name': ['Tom', 'Bob', 'Jack', 'Linda'],
     'age': [18, 19, 17, 21],
     'height': [167, 177, 178, 188]},
    index=['n%d' % k for k in (1, 2, 3, 4)],
)
print(df)

     name  age  height
n1    Tom   18     167
n2    Bob   19     177
n3   Jack   17     178
n4  Linda   21     188


In [31]:
# With the default n=1, draw one random row.
print(df.sample())
print('*' * 30)
# With the default n=1, draw one random column.
print(df.sample(axis=1))
print('*' * 30)
# Draw a random 75% of the rows, i.e. 3 of the 4 rows in this frame.
# (The fraction previously passed was 0.70, which did not match the stated 75%.)
print(df.sample(frac=0.75))

     name  age  height
n4  Linda   21     188
******************************
    age
n1   18
n2   19
n3   17
n4   21
******************************
     name  age  height
n4  Linda   21     188
n1    Tom   18     167
n2    Bob   19     177


In [32]:
print(
    # two random columns
    df.sample(n=2, axis=1),
    '*' * 30,
    # two random rows, drawn without replacement
    df.sample(n=2, replace=False),
    '*' * 30,
    # two random rows, drawn with replacement (the same row may appear twice)
    df.sample(n=2, replace=True),
    sep='\n',
)

    age   name
n1   18    Tom
n2   19    Bob
n3   17   Jack
n4   21  Linda
******************************
     name  age  height
n4  Linda   21     188
n2    Bob   19     177
******************************
     name  age  height
n4  Linda   21     188
n3   Jack   17     178


In [33]:
# Fixing random_state (seed 3) makes the draw repeatable; on this 4-row
# frame n=2 and frac=0.5 both ask for two rows.
print(df.sample(n=2, random_state=3), df.sample(frac=0.5, random_state=3), sep='\n')

     name  age  height
n4  Linda   21     188
n2    Bob   19     177
     name  age  height
n4  Linda   21     188
n2    Bob   19     177


#### DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)
- 返回去重（删除重复行）之后的 DataFrame
- subset：表示要进行去重的列名，默认为 None，表示所有列
- keep：保留哪些副本。'first'表示只保留第一次出现的重复项，删除其余重复项；'last'表示只保留最后一次出现的重复项；False 则表示删除所有重复项
- inplace：False 表示删除重复项后返回一个副本；True 表示直接在原数据上删除重复项

In [34]:
# Rows 0 and 3 of this frame are identical.
d = dict(A=[1, 3, 3, 1],
         B=[0, 2, 5, 0],
         C=[4, 0, 4, 4],
         D=[1, 0, 0, 1])
df = pd.DataFrame(d)
print(df)

   A  B  C  D
0  1  0  4  1
1  3  2  0  0
2  3  5  4  0
3  1  0  4  1


In [35]:
# The first and fourth rows (labels 0 and 3) are duplicates of each other:
# keep='first' (the default) drops the fourth row, keep='last' drops the
# first row, and keep=False drops both.
print(df.drop_duplicates(),
      '*' * 30,
      df.drop_duplicates(keep='last'),
      '*' * 30,
      df.drop_duplicates(keep=False),
      sep='\n')

   A  B  C  D
0  1  0  4  1
1  3  2  0  0
2  3  5  4  0
******************************
   A  B  C  D
1  3  2  0  0
2  3  5  4  0
3  1  0  4  1
******************************
   A  B  C  D
1  3  2  0  0
2  3  5  4  0


In [36]:
# De-duplicate on columns 'A' and 'D' only. Under that subset the first and
# fourth rows match, and so do the second and third rows.
# keep='first' retains the first row of each pair (labels 0 and 1).
print(df.drop_duplicates(subset=['A', 'D'], keep='first'), '*' * 30, sep='\n')
# keep='last' retains the last row of each pair (labels 2 and 3), and
# inplace=True removes the duplicates from df itself.
df.drop_duplicates(subset=['A', 'D'], keep='last', inplace=True)
print(df)

   A  B  C  D
0  1  0  4  1
1  3  2  0  0
******************************
   A  B  C  D
2  3  5  4  0
3  1  0  4  1


#### DataFrame.sort_values(by, axis=0, ascending=True, inplace=False,na_position='last')
- by：要排序的名称或名称列表
- 如果 axis=0 或 'index'，by可指定列标签
- 如果 axis=1 或 'columns'，by可指定行标签
- axis：要排序的轴，可选择 0 或 'index', 1 或 'columns'
- ascending：False 则为降序；如果这是一个 bool 列表，则必须匹配 by 的长度
- inplace：是否原地操作
- na_position：设置缺失值的排序位置，'first'表示开头，'last'表示结尾

In [37]:
# 'col1' has a missing value and 'col4' mixes strings with the integer 1.
df = pd.DataFrame(dict(col1=[4, 1, 2, np.nan, 5, 2],
                       col2=[2, 1, 9, 8, 7, 6],
                       col3=[0, 1, 9, 4, 2, 3],
                       col4=['a', 'B', 'c', 'D', 'e', 1]))
print(df)

   col1  col2  col3 col4
0   4.0     2     0    a
1   1.0     1     1    B
2   2.0     9     9    c
3   NaN     8     4    D
4   5.0     7     2    e
5   2.0     6     3    1


In [38]:
print(df.sort_values(by=['col1']),                  # ascending by 'col1'
      '*' * 30,
      df.sort_values(by='col1', ascending=False),   # descending by 'col1'
      sep='\n')

   col1  col2  col3 col4
1   1.0     1     1    B
2   2.0     9     9    c
5   2.0     6     3    1
0   4.0     2     0    a
4   5.0     7     2    e
3   NaN     8     4    D
******************************
   col1  col2  col3 col4
4   5.0     7     2    e
0   4.0     2     0    a
2   2.0     9     9    c
5   2.0     6     3    1
1   1.0     1     1    B
3   NaN     8     4    D


In [39]:
print(
    # put the row whose 'col1' is missing first instead of last
    df.sort_values(by='col1', na_position='first'),
    '*' * 30,
    # sort by 'col1', breaking ties with 'col2'
    df.sort_values(by=['col1', 'col2']),
    sep='\n',
)

   col1  col2  col3 col4
3   NaN     8     4    D
1   1.0     1     1    B
2   2.0     9     9    c
5   2.0     6     3    1
0   4.0     2     0    a
4   5.0     7     2    e
******************************
   col1  col2  col3 col4
1   1.0     1     1    B
5   2.0     6     3    1
2   2.0     9     9    c
0   4.0     2     0    a
4   5.0     7     2    e
3   NaN     8     4    D


In [40]:
print(
    # axis=1 reorders the columns according to the values in the row labelled 5
    df.sort_values(by=5, axis=1),
    '*' * 30,
    # one direction per key: 'col1' ascending, ties broken by 'col2' descending
    df.sort_values(by=['col1', 'col2'], ascending=[True, False]),
    sep='\n',
)

  col4  col1  col3  col2
0    a   4.0     0     2
1    B   1.0     1     1
2    c   2.0     9     9
3    D   NaN     4     8
4    e   5.0     2     7
5    1   2.0     3     6
******************************
   col1  col2  col3 col4
1   1.0     1     1    B
2   2.0     9     9    c
5   2.0     6     3    1
0   4.0     2     0    a
4   5.0     7     2    e
3   NaN     8     4    D


In [41]:
# inplace=True sorts df itself instead of returning a sorted copy.
df.sort_values(by='col1', inplace=True)
print(df)

   col1  col2  col3 col4
1   1.0     1     1    B
2   2.0     9     9    c
5   2.0     6     3    1
0   4.0     2     0    a
4   5.0     7     2    e
3   NaN     8     4    D


#### DataFrame.groupby(by=None, as_index=True, sort=True, dropna=True)
- 返回一个包含分组信息的 DataFrameGroupBy 对象
- by：指定根据哪个或者哪些标签分组
- as_index：对于聚合操作的输出结果，默认将分组列的值作为索引，如果将 as_index 设置为 False，可以重置索引（0, 1, 2...）
- sort：结果按分组标签的值升序排列，设置为False则不排序
- dropna：默认为 True 时，分组标签那列的 NaN 在分组结果中不保留，设置为 False，可以保留 NaN 分组

In [42]:
# Eight employees across three companies; one salary is missing.
d = dict(company=['A', 'B', 'A', 'C', 'C', 'B', 'C', 'A'],
         salary=[8, 15, 10, 15, np.nan, 28, 30, 15],
         age=[26, 29, 26, 30, 50, 30, 30, 35])
df = pd.DataFrame(d)
print(df)

  company  salary  age
0       A     8.0   26
1       B    15.0   29
2       A    10.0   26
3       C    15.0   30
4       C     NaN   50
5       B    28.0   30
6       C    30.0   30
7       A    15.0   35


In [43]:
# Group the rows by 'age'; printing the groupby object only shows its repr.
df_gb = df.groupby(by='age')
print(df_gb)
# Iterating yields one (group key, sub-frame) tuple per group.
for i in df_gb:
    print(i)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021AD132D4F0>
(26,   company  salary  age
0       A     8.0   26
2       A    10.0   26)
(29,   company  salary  age
1       B    15.0   29)
(30,   company  salary  age
3       C    15.0   30
5       B    28.0   30
6       C    30.0   30)
(35,   company  salary  age
7       A    15.0   35)
(50,   company  salary  age
4       C     NaN   50)


In [44]:
# sort=False leaves the groups unsorted; here they come out in order of
# first appearance (50 before 35) rather than in ascending key order.
df_gb = df.groupby('age', sort=False)
print(df_gb)
for i in df_gb:
    print(i)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021AD13EB520>
(26,   company  salary  age
0       A     8.0   26
2       A    10.0   26)
(29,   company  salary  age
1       B    15.0   29)
(30,   company  salary  age
3       C    15.0   30
5       B    28.0   30
6       C    30.0   30)
(50,   company  salary  age
4       C     NaN   50)
(35,   company  salary  age
7       A    15.0   35)


In [45]:
# Group by the 'salary' column; dropna=False keeps the rows whose salary is
# NaN as a group of their own.
# The key is passed as a scalar label rather than a one-element list so that
# each group key `g` is a plain value (e.g. 8.0) on every pandas version;
# pandas 2.x yields 1-tuples such as (8.0,) when the grouper is a length-1 list.
df_gb = df.groupby(by='salary', dropna=False, as_index=False)
for g, data in df_gb:
    print(g)
    print(data)

8.0
  company  salary  age
0       A     8.0   26
10.0
  company  salary  age
2       A    10.0   26
15.0
  company  salary  age
1       B    15.0   29
3       C    15.0   30
7       A    15.0   35
28.0
  company  salary  age
5       B    28.0   30
30.0
  company  salary  age
6       C    30.0   30
nan
  company  salary  age
4       C     NaN   50


In [46]:
# Group the rows by the ('age', 'company') pair; groupby returns a DataFrameGroupBy object.
df_gb = df.groupby(by=['age', 'company'])
# That object is iterable: each iteration yields a group key and the rows of that group.
for g, data in df_gb:
    print(g)
    print(data)

(26, 'A')
  company  salary  age
0       A     8.0   26
2       A    10.0   26
(29, 'B')
  company  salary  age
1       B    15.0   29
(30, 'B')
  company  salary  age
5       B    28.0   30
(30, 'C')
  company  salary  age
3       C    15.0   30
6       C    30.0   30
(35, 'A')
  company  salary  age
7       A    15.0   35
(50, 'C')
  company  salary  age
4       C     NaN   50


In [47]:
# Same two keys in the opposite order: the key tuples become (company, age).
df_gb = df.groupby(by=['company', 'age'])
for g, data in df_gb:
    print(g)
    print(data)

('A', 26)
  company  salary  age
0       A     8.0   26
2       A    10.0   26
('A', 35)
  company  salary  age
7       A    15.0   35
('B', 29)
  company  salary  age
1       B    15.0   29
('B', 30)
  company  salary  age
5       B    28.0   30
('C', 30)
  company  salary  age
3       C    15.0   30
6       C    30.0   30
('C', 50)
  company  salary  age
4       C     NaN   50


#### DataFrameGroupBy相关属性

In [48]:
# Group the rows by company. The key is passed as a scalar label rather than
# a one-element list so that group keys are plain strings ('A', 'B', 'C') on
# every pandas version; with a length-1 list pandas 2.x yields 1-tuples such
# as ('A',) when iterating, which also affects key-based lookups such as
# get_group('A').
df_gb = df.groupby(by='company')
for g, data in df_gb:
    print(g)
    print(data)

A
  company  salary  age
0       A     8.0   26
2       A    10.0   26
7       A    15.0   35
B
  company  salary  age
1       B    15.0   29
5       B    28.0   30
C
  company  salary  age
3       C    15.0   30
4       C     NaN   50
6       C    30.0   30


In [49]:
print(df_gb.ngroups,   # how many groups were formed
      '*' * 30,
      df_gb.groups,    # group key -> index labels of the rows in that group
      '*' * 30,
      df_gb.indices,   # group key -> NumPy integer array locating the same rows
      sep='\n')

3
******************************
{'A': [0, 2, 7], 'B': [1, 5], 'C': [3, 4, 6]}
******************************
{'A': array([0, 2, 7], dtype=int64), 'B': array([1, 5], dtype=int64), 'C': array([3, 4, 6], dtype=int64)}


In [50]:
# Three equivalent ways of fetching the rows that belong to group 'A'.
print(df.loc[[0, 2, 7]],             # explicit row labels
      '*' * 30,
      df.loc[df_gb.groups['A']],     # labels looked up through .groups
      '*' * 30,
      df_gb.get_group('A'),          # helper on the groupby object
      sep='\n')
# Groups 'B' and 'C' can be fetched the same way with get_group.

  company  salary  age
0       A     8.0   26
2       A    10.0   26
7       A    15.0   35
******************************
  company  salary  age
0       A     8.0   26
2       A    10.0   26
7       A    15.0   35
******************************
  company  salary  age
0       A     8.0   26
2       A    10.0   26
7       A    15.0   35


In [51]:
# Aggregation: every group is reduced separately. The reducer may be given
# by name ('mean') or as a callable (np.mean).
print(df_gb.agg('mean'), '*' * 30, df_gb.agg(np.mean), sep='\n')

         salary        age
company                   
A          11.0  29.000000
B          21.5  29.500000
C          22.5  36.666667
******************************
         salary        age
company                   
A          11.0  29.000000
B          21.5  29.500000
C          22.5  36.666667


In [52]:
# Per-group maximum, then per-group minimum.
print(df_gb.agg('max'), '*' * 30, df_gb.agg('min'), sep='\n')

         salary  age
company             
A          15.0   35
B          28.0   30
C          30.0   50
******************************
         salary  age
company             
A           8.0   26
B          15.0   29
C          15.0   30


In [53]:
# Per-group median, then per-group sum.
print(df_gb.agg('median'), '*' * 30, df_gb.agg('sum'), sep='\n')

         salary   age
company              
A          10.0  26.0
B          21.5  29.5
C          22.5  30.0
******************************
         salary  age
company             
A          33.0   87
B          43.0   59
C          45.0  110


In [54]:
# Per-group standard deviation, then per-group variance.
print(df_gb.agg('std'), '*' * 30, df_gb.agg('var'), sep='\n')

            salary        age
company                      
A         3.605551   5.196152
B         9.192388   0.707107
C        10.606602  11.547005
******************************
         salary         age
company                    
A          13.0   27.000000
B          84.5    0.500000
C         112.5  133.333333


In [55]:
# Number of non-missing values per group and column.
print(df_gb.agg('count'))

         salary  age
company             
A             3    3
B             2    2
C             2    3


In [56]:
# Transformation: compute the per-group aggregate, then broadcast each value
# back to the position its row had before grouping.
print(df_gb.transform('mean'), df_gb.transform(np.mean), sep='\n')

   salary        age
0    11.0  29.000000
1    21.5  29.500000
2    11.0  29.000000
3    22.5  36.666667
4    22.5  36.666667
5    21.5  29.500000
6    22.5  36.666667
7    11.0  29.000000
   salary        age
0    11.0  29.000000
1    21.5  29.500000
2    11.0  29.000000
3    22.5  36.666667
4    22.5  36.666667
5    21.5  29.500000
6    22.5  36.666667
7    11.0  29.000000


In [57]:
# Store the mean age of each row's group next to the row itself.
df['avg_age'] = df_gb['age'].transform('mean')
print(df)

  company  salary  age    avg_age
0       A     8.0   26  29.000000
1       B    15.0   29  29.500000
2       A    10.0   26  29.000000
3       C    15.0   30  36.666667
4       C     NaN   50  36.666667
5       B    28.0   30  29.500000
6       C    30.0   30  36.666667
7       A    15.0   35  29.000000


In [58]:
# Add (or overwrite) two columns holding the per-group means.
# The source columns are selected explicitly so that the pairing
# 'avg_salary' <- 'salary' and 'avg_age' <- 'age' does not depend on the
# column order of the transform result, nor on columns that were added to
# df after df_gb was created.
df[['avg_salary', 'avg_age']] = df_gb[['salary', 'age']].transform('mean')
print(df)

  company  salary  age    avg_age  avg_salary
0       A     8.0   26  29.000000        11.0
1       B    15.0   29  29.500000        21.5
2       A    10.0   26  29.000000        11.0
3       C    15.0   30  36.666667        22.5
4       C     NaN   50  36.666667        22.5
5       B    28.0   30  29.500000        21.5
6       C    30.0   30  36.666667        22.5
7       A    15.0   35  29.000000        11.0


In [59]:
lst = [['Tom', 18, 188, 75],
       ['Bob', 19, 179, 68],
       ['Linda', 17, 177, 62]]
df = pd.DataFrame(data=lst, columns=list('ABCD'))
print(df, '*' * 30, sep='\n')
# Name both axes; the names appear in the header of the printed frame.
df.index.name, df.columns.name = '行', '列'
print(df)

       A   B    C   D
0    Tom  18  188  75
1    Bob  19  179  68
2  Linda  17  177  62
******************************
列      A   B    C   D
行                    
0    Tom  18  188  75
1    Bob  19  179  68
2  Linda  17  177  62


In [60]:
# Replace both axis indexes with freshly built, named Index objects.
# The row labels use pd.Index with dtype='int64': the dedicated
# pd.Int64Index class was removed in pandas 2.0, whereas this spelling
# works on both older and newer releases.
df.index = pd.Index([0, 1, 2], dtype='int64', name='行')
df.columns = pd.Index(list('ABCD'), dtype=object, name='列')
print(df)

列      A   B    C   D
行                    
0    Tom  18  188  75
1    Bob  19  179  68
2  Linda  17  177  62
