# 第8章 分类数据

In [12]:
import pandas as pd
import numpy as np
# Load the sample table used in this chapter and preview its first rows.
df = pd.read_csv('data/table.csv')
df.head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


## 一、category的创建及其性质
### 1. 分类变量的创建
#### （a）用Series创建

In [13]:
# Build a plain Series first, then cast it to the categorical dtype.
# NOTE: this rebinds `df`, which until now held the DataFrame read from data/table.csv.
df = pd.Series(["a", "b", "c", "a"]).astype("category")
df

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [14]:
# Inspect the dtype: a CategoricalDtype that records the categories and the ordered flag.
df.dtype

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

#### （b）对DataFrame指定类型创建

In [15]:
# Build both columns from plain lists, then convert only column A to categorical;
# column B keeps its default (non-categorical) dtype.
temp_df = pd.DataFrame(
    {'A': list("abca"), 'B': list("abcd")}
).astype({'A': "category"})
temp_df.dtypes

A    category
B      object
dtype: object

#### （c）利用内置Categorical类型创建

In [30]:
# Create an unordered (nominal) categorical,
# e.g. groups such as vegetables, fruit, furniture.
cat = pd.Categorical(
    values=list("abca"),
    categories=list("abc"),
    ordered=False,
)
pd.Series(cat)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [48]:
# Create an ordered categorical
# Intended order: small --> middle --> big
# NOTE(review): the `categories` list below actually orders the levels as
# 'middle' < 'bigger' < 'smalle', which does not match the small --> middle --> big
# order stated above, and 'smalle' looks like a typo for 'small' -- confirm intent.
person = pd.Categorical(["bigger", "middle", "smalle"],
                        ordered=True, categories=["middle", "bigger", "smalle"])
person

['bigger', 'middle', 'smalle']
Categories (3, object): ['middle' < 'bigger' < 'smalle']

In [47]:
# The category labels, listed in the order in which they were declared.
person.categories

Index(['middle', 'bigger', 'smalle'], dtype='object')

In [34]:
# Whether the categorical carries an order (it was created with ordered=True).
person.ordered

True

In [41]:
# Replace the set of valid categories; a value that is not among the new
# categories (here 'smalle') shows up as NaN in the result.
person.set_categories(["middle", "bigger", "A"])

['bigger', 'middle', NaN]
Categories (3, object): ['middle' < 'bigger' < 'A']

#### （d）利用cut函数创建

#### 默认使用区间类型为标签

In [20]:
# Bin five random integers into the right-closed intervals (0, 10], (10, 30], (30, 60].
# The sample is drawn from 1..60 (randint's upper bound is exclusive): with
# right-closed bins a drawn 0 lies outside (0, 10] and would come back as NaN.
pd.cut(np.random.randint(1, 61, 5), [0, 10, 30, 60])

[(10, 30], (10, 30], (30, 60], (30, 60], (0, 10]]
Categories (3, interval[int64]): [(0, 10] < (10, 30] < (30, 60]]

#### 可指定字符为标签

In [52]:
# Same binning, but with left-closed bins [0, 10), [10, 30), [30, 60) and a
# string label per bin instead of the interval itself.
df = pd.cut(
    x=np.random.randint(low=0, high=60, size=5),
    bins=[0, 10, 30, 60],
    labels=['0-10', '10-30', '30-60'],
    right=False,
)
df

['30-60', '30-60', '30-60', '10-30', '10-30']
Categories (3, object): ['0-10' < '10-30' < '30-60']

In [53]:
# The categorical returned by cut is ordered by default.
df.ordered

True

### 2. 分类变量的结构
#### 一个分类变量包括三个部分，元素值（values）、分类类别（categories）、是否有序（ordered）
#### 从上面可以看出，使用cut函数创建的分类变量默认为有序分类变量
#### 下面介绍如何获取或修改这些属性
#### （a）describe方法
#### 该方法描述了一个分类序列的情况，包括非缺失值个数、元素值类别数（不是分类类别数）、最多次出现的元素及其频数

In [62]:
# Categorical Series with one missing value and one declared-but-unused category ('d').
s = pd.Series(
    ["a", "b", "c", "a", np.nan],
    dtype=pd.CategoricalDtype(categories=list("abcd")),
)
s

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [63]:
# Summary of the categorical Series: count, unique, top and freq.
s.describe()

count     4
unique    3
top       a
freq      2
dtype: object

#### （b）categories和ordered属性
#### 查看分类类别和是否排序

In [50]:
# The declared categories, accessed through the .cat accessor.
s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [50]:
# Whether the Series' categories are ordered.
s.cat.ordered

False

### 3. 类别的修改

#### （a）利用set_categories修改
#### 修改分类，但本身值不会变化

In [56]:
# Rebuild the example Series: values a/b/c/a/NaN, categories a-d.
s = pd.Series(["a", "b", "c", "a", np.nan]).astype(
    pd.CategoricalDtype(["a", "b", "c", "d"])
)
s

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [59]:
# Set the valid categories. The result is a new Series and is only displayed here;
# `s` is not reassigned.
s.cat.set_categories(['new_a', 'c'])

0    NaN
1    NaN
2      c
3    NaN
4    NaN
dtype: category
Categories (2, object): ['new_a', 'c']

In [62]:
# Check the dtype of `s` itself after the call above.
s.dtype

CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=False)

In [42]:
# s holds its data in a Series; by setting the categories we can keep only the
# categories of interest, and every other value becomes NaN.
s.cat.set_categories(["缺失值", "b", "c"])

0    NaN
1      b
2      c
3    NaN
4    NaN
dtype: category
Categories (3, object): ['缺失值', 'b', 'c']

#### （b）利用rename_categories修改
#### 需要注意的是该方法会把值和分类同时修改

In [66]:
# Rebuild the example Series: values a/b/c/a/NaN, categories a-d.
s = pd.Series(
    pd.Categorical(
        values=["a", "b", "c", "a", np.nan],
        categories=["a", "b", "c", "d"],
        ordered=False,
    )
)
s

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [68]:
# Rename the categories positionally: each existing label gets a 'new_' prefix.
s.cat.rename_categories([f'new_{label}' for label in s.cat.categories])

0    new_a
1    new_b
2    new_c
3    new_a
4      NaN
dtype: category
Categories (4, object): ['new_a', 'new_b', 'new_c', 'new_d']

In [88]:
# Rename all four categories positionally with an explicit list of new labels.
s.cat.rename_categories(["1", "2", "3", "4"])

0      1
1      2
2      3
3      1
4    NaN
dtype: category
Categories (4, object): ['1', '2', '3', '4']

#### 利用字典修改值

In [69]:
# Rename only the categories listed in the mapping; the others keep their labels.
s.cat.rename_categories({'a': 'new_a', 'b': 'new_b'})

0    new_a
1    new_b
2        c
3    new_a
4      NaN
dtype: category
Categories (4, object): ['new_a', 'new_b', 'c', 'd']

#### （c）利用add_categories添加

In [53]:
# Rebuild the example Series, then append a new (unused) category 'e'.
s = pd.Series(
    list("abca") + [np.nan],
    dtype=pd.CategoricalDtype(list("abcd")),
)
s.cat.add_categories(['e'])

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

#### （d）利用remove_categories移除

In [54]:
# Rebuild the example Series, then drop the category 'd' from the declared set.
s = pd.Series(["a", "b", "c", "a", np.nan]).astype(
    pd.CategoricalDtype(categories=list("abcd"), ordered=False)
)
s.cat.remove_categories(['d'])

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (3, object): ['a', 'b', 'c']

#### （e）删除元素值未出现的分类类型

In [89]:
# Drop every declared category that no element actually uses.
s = pd.Series(
    ["a", "b", "c", "a", np.nan],
    dtype=pd.CategoricalDtype(["a", "b", "c", "d"]),
)
s.cat.remove_unused_categories()

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (3, object): ['a', 'b', 'c']

## 二、分类变量的排序
#### 前面提到，分类数据类型被分为有序和无序，这非常好理解，例如分数区间的高低是有序变量，考试科目的类别一般看做无序变量

### 1. 序的建立

#### （a）一般来说会将一个序列转为有序变量，可以利用as_ordered方法

In [91]:
# Infer the categories from the values and mark them as ordered in one step.
s = pd.Series(pd.Categorical(["a", "d", "c", "a"], ordered=True))
s

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): ['a' < 'c' < 'd']

#### 退化为无序变量，只需要使用as_unordered

In [17]:
# Return the same data with the ordering dropped.
s.cat.as_unordered()

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a, c, d]

#### （b）利用set_categories方法中的ordered参数

In [92]:
# Declare the categories and their order directly through the dtype.
pd.Series(
    ["a", "d", "c", "a"],
    dtype=pd.CategoricalDtype(['a', 'c', 'd'], ordered=True),
)

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): ['a' < 'c' < 'd']

#### （c）利用reorder_categories方法
#### 这个方法的特点在于，新设置的分类必须与原分类为同一集合

In [94]:
# The original (unordered) categorical Series.
s = pd.Series(["a", "d", "c", "a"], dtype="category")
s

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): ['a', 'c', 'd']

In [19]:
# Reorder the existing categories and mark them as ordered; the new list
# contains exactly the same labels as the current categories.
s.cat.reorder_categories(['a', 'c', 'd'], ordered=True)

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a < c < d]

In [20]:
# Both calls below are left commented out because they raise an error: the new
# list must contain exactly the current categories, no fewer and no more.
# s.cat.reorder_categories(['a','c'],ordered=True) # raises an error
# s.cat.reorder_categories(['a','c','d','e'],ordered=True) # raises an error

### 2. 排序

#### 先前在第1章介绍的值排序和索引排序都是适用的

In [102]:
# No categories are given, so they are inferred from the unique values drawn.
s = pd.Series(
    np.random.choice(a=['perfect', 'good', 'fair', 'bad', 'awful'], size=50),
    dtype='category',
)
s[:5]

0    awful
1      bad
2    awful
3    awful
4     fair
dtype: category
Categories (5, object): ['awful', 'bad', 'fair', 'good', 'perfect']

In [103]:
# Declare the categories from worst to best (the list is reversed) and mark them ordered.
# The result is only displayed; it is not assigned back to `s`.
s.cat.set_categories(['perfect', 'good', 'fair', 'bad',
                      'awful'][::-1], ordered=True).head()

0    awful
1      bad
2    awful
3    awful
4     fair
dtype: category
Categories (5, object): ['awful' < 'bad' < 'fair' < 'good' < 'perfect']

In [104]:
# Show what the reversed list of levels evaluates to.
['perfect', 'good', 'fair', 'bad', 'awful'][::-1]

['awful', 'bad', 'fair', 'good', 'perfect']

In [100]:
# Once an order relation exists, values can be sorted directly by that order.
# Sort the categorical values in descending order.
# NOTE(review): `s` itself is still unordered at this point -- the
# set_categories(..., ordered=True) result above was displayed but never assigned
# back -- so this sort follows the inferred category order, not a declared one.
s.sort_values(ascending=False).head(10)

25    perfect
31    perfect
2     perfect
27    perfect
19    perfect
29    perfect
7     perfect
41    perfect
9     perfect
10    perfect
dtype: category
Categories (5, object): ['awful', 'bad', 'fair', 'good', 'perfect']

In [114]:
# Move the categorical values into the index (keeping the original positions as a
# column) so that index sorting can be demonstrated.
df_sort = pd.DataFrame(
    {'cat': s.values, 'index': s.index}).set_index('cat')
df_sort.head()

Unnamed: 0_level_0,index
cat,Unnamed: 1_level_1
awful,0
bad,1
awful,2
awful,3
fair,4


In [115]:
# Sort the rows by the categorical index.
df_sort.sort_index().head()

Unnamed: 0_level_0,index
cat,Unnamed: 1_level_1
awful,0
awful,40
awful,37
awful,27
awful,25


## 三、分类变量的比较操作

### 1. 与标量或等长序列的比较

#### （a）标量比较

In [116]:
# Element-wise test of which entries equal the scalar 'a'.
s = pd.Series(pd.Categorical(["a", "d", "c", "a"]))
s == 'a'

0     True
1    False
2    False
3     True
dtype: bool

#### （b）等长序列比较

In [118]:
# Element-wise equality against a list of the same length.
s == list('abcd')

0     True
1    False
2     True
3    False
dtype: bool

### 2. 与另一分类变量的比较

#### （a）等式判别（包含等号和不等号）
#### 两个分类变量的等式判别需要满足分类完全相同

In [119]:
# Comparing a categorical Series with itself: the categories trivially match.
s = pd.Series(list("adca"), dtype="category")
s == s

0    True
1    True
2    True
3    True
dtype: bool

In [120]:
# The inequality operator (!=) is supported under the same condition.
s != s

0    False
1    False
2    False
3    False
dtype: bool

In [121]:
# A copy of `s` with a different category set; comparing the two raises an error
# because their categories are not identical (call left commented out).
s_new = s.cat.set_categories(['a', 'd', 'e'])
# s == s_new # raises an error

#### （b）不等式判别（包含>=,<=,<,>）
#### 两个分类变量的不等式判别需要满足两个条件：① 分类完全相同 ② 排序完全相同

In [74]:
# An unordered categorical: an inequality comparison such as >= raises an error
# (call left commented out).
s = pd.Series(["a", "d", "c", "a"]).astype('category')
# s >= s # raises an error

In [31]:
# With ordered categories, inequality comparisons are allowed.
s = pd.Series(
    pd.Categorical(["a", "d", "c", "a"], categories=['a', 'c', 'd'], ordered=True)
)
s >= s

0    True
1    True
2    True
3    True
dtype: bool

## 四、问题与练习

#### 【问题一】 如何使用union_categoricals方法？它的作用是什么？

In [142]:
# union_categoricals combines categoricals whose categories differ; the result's
# categories are the union of the inputs' categories.
blood_type1 = pd.Categorical(["A", "AB"])
blood_type2 = pd.Categorical(["B", "O"])
# End the cell with the expression itself (no print) so the notebook displays it.
pd.Series(pd.api.types.union_categoricals([blood_type1, blood_type2]))

0     A
1    AB
2     B
3     O
dtype: category
Categories (4, object): ['A', 'AB', 'B', 'O']


#### 【问题二】 利用concat方法将两个序列纵向拼接，它的结果一定是分类变量吗？什么情况下不是？

In [140]:
# When the two inputs have different categories, the concatenated result is no
# longer categorical.
blood_type1 = pd.Categorical(["A", "AB"])
blood_type2 = pd.Categorical(["B", "O"])
# concat works on Series objects, so each Categorical is wrapped in a Series first.
# End the cell with the expression itself (no print) so the notebook displays it.
pd.concat([pd.Series(blood_type1), pd.Series(blood_type2)])

0     A
1    AB
0     B
1     O
dtype: object


In [139]:
# When both inputs share the same categories, the concatenated result keeps the
# categorical dtype.
blood_type1 = pd.Categorical(["A", "B"])
blood_type2 = pd.Categorical(["B", "A"])
# concat works on Series objects, so each Categorical is wrapped in a Series first.
# End the cell with the expression itself (no print) so the notebook displays it.
pd.concat([pd.Series(blood_type1), pd.Series(blood_type2)])

0    A
1    B
0    B
1    A
dtype: category
Categories (2, object): ['A', 'B']


#### 【问题三】 当使用groupby方法或者value_counts方法时，分类变量的统计结果和普通变量有什么区别？

In [165]:
# Reload the table (earlier cells rebound `df`) and compute the mean Math score
# per Class while Class is still a plain, non-categorical column.
df = pd.read_csv('data/table.csv')
df.groupby(by="Class")["Math"].mean()

Class
C_1    61.17
C_2    63.55
C_3    63.11
C_4    53.80
Name: Math, dtype: float64

In [166]:
# Row counts per Class, with Class still a plain column.
df.value_counts("Class")

Class
C_1    10
C_2    10
C_3    10
C_4     5
dtype: int64

In [167]:
# Convert Class to the categorical dtype and confirm the change via info().
df["Class"] = df["Class"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   School   35 non-null     object  
 1   Class    35 non-null     category
 2   ID       35 non-null     int64   
 3   Gender   35 non-null     object  
 4   Address  35 non-null     object  
 5   Height   35 non-null     int64   
 6   Weight   35 non-null     int64   
 7   Math     35 non-null     float64 
 8   Physics  35 non-null     object  
dtypes: category(1), float64(1), int64(3), object(4)
memory usage: 2.5+ KB


In [168]:
# Same aggregation with Class now categorical. observed=False is spelled out: it
# keeps every declared category in the result (even one with no rows) and avoids
# the FutureWarning newer pandas releases emit when `observed` is left implicit
# for a categorical grouping key.
df.groupby(by="Class", observed=False)["Math"].mean()

Class
C_1    61.17
C_2    63.55
C_3    63.11
C_4    53.80
Name: Math, dtype: float64

In [169]:
# Row counts per Class again, now that Class is categorical.
df.value_counts("Class")

Class
C_1    10
C_2    10
C_3    10
C_4     5
dtype: int64

#### 【问题四】 下面的代码说明了Series创建分类变量的什么“缺陷”？如何避免？（提示：使用Series中的copy参数）

In [173]:
# Build a categorical and wrap it in a Series, then display the categorical.
# NOTE(review): the exercise hint points at the `copy` parameter of pd.Series
# (e.g. copy=True) as the way to decouple `s` from `cat` -- not exercised here.
cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
s = pd.Series(cat, name="cat")
cat

[1, 2, 3, 10]
Categories (5, int64): [1, 2, 3, 4, 10]

In [174]:
# Display the Series built from `cat`.
s

0     1
1     2
2     3
3    10
Name: cat, dtype: category
Categories (5, int64): [1, 2, 3, 4, 10]

In [177]:
# Overwrite the first two values of the Series, then display the original
# categorical: per the recorded output it shows 10 in those positions too, i.e.
# `s` and `cat` share their data. The list of categories is unchanged.
s.iloc[0:2] = 10
cat

[10, 10, 3, 10]
Categories (5, int64): [1, 2, 3, 4, 10]

#### 【练习一】 现继续使用第四章中的地震数据集，请解决以下问题：
#### （a）现在将深度分为七个等级：[0,5,10,15,20,30,50,np.inf]，请以深度等级Ⅰ,Ⅱ,Ⅲ,Ⅳ,Ⅴ,Ⅵ,Ⅶ为索引并按照由浅到深的顺序进行排序。
#### （b）在（a）的基础上，将烈度分为4个等级：[0,3,4,5,np.inf]，依次对南部地区的深度和烈度等级建立多级索引排序。

In [188]:
# Load the earthquake data set used by the exercise and preview its first rows.
df = pd.read_csv('data/Earthquake.csv')
df.head()

Unnamed: 0,日期,时间,维度,经度,方向,距离,深度,烈度
0,2003.05.20,12:17:44 AM,39.04,40.38,west,0.1,10.0,0.0
1,2007.08.01,12:03:08 AM,40.79,30.09,west,0.1,5.2,4.0
2,1978.05.07,12:41:37 AM,38.58,27.61,south_west,0.1,0.0,0.0
3,1997.03.22,12:31:45 AM,39.47,36.44,south_west,0.1,10.0,0.0
4,2000.04.02,12:57:38 AM,40.8,30.24,south_west,0.1,7.0,0.0


In [189]:
# Bin depth into seven ordered grades using the edges from the exercise statement.
# include_lowest=True closes the first bin on the left so that a depth of exactly 0
# is graded Ⅰ, rather than shifting the first edge to -0.1 to get the same effect.
depth = pd.cut(df["深度"], [0, 5, 10, 15, 20, 30, 50, np.inf],
               labels=["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ"], include_lowest=True)

In [190]:
# Attach the grades as a new column named after the Series. assign() is used
# instead of join() so the cell is idempotent: re-running it overwrites the column
# rather than failing on an overlapping column name.
depth.name = "深度等级"
df = df.assign(**{depth.name: depth})

In [199]:
# The grade column is an ordered categorical.
df["深度等级"].cat.ordered

True

In [203]:
# Use the depth grade as the index and sort the rows by grade order.
df.set_index("深度等级").sort_index()

Unnamed: 0_level_0,日期,时间,维度,经度,方向,距离,深度,烈度
深度等级,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ⅰ,2009.09.09,12:54:13 AM,42.42,43.03,north_east,95.4,5.0,0.0
Ⅰ,1997.06.16,12:18:04 AM,37.92,29.17,north_east,3.2,0.0,0.0
Ⅰ,2011.10.25,12:29:45 AM,38.96,43.64,south_east,1.6,5.0,3.9
Ⅰ,1995.07.23,12:05:04 AM,37.61,29.29,north_east,3.2,0.0,0.0
Ⅰ,2013.06.10,12:39:19 AM,38.53,43.85,south_east,1.6,1.0,3.7
...,...,...,...,...,...,...,...,...
Ⅶ,1969.07.23,12:54:11 AM,38.90,41.00,north_east,2.5,169.0,4.2
Ⅶ,1976.12.04,12:10:37 AM,39.31,43.66,north_west,7.6,53.0,0.0
Ⅶ,1981.09.04,12:52:19 AM,38.90,37.00,north,3.7,84.0,0.0
Ⅶ,1976.11.24,12:11:08 AM,39.00,44.19,south_east,4.9,62.0,0.0


In [187]:
# Grade intensity into four ordered levels A-D using the edges from the exercise
# statement; include_lowest=True keeps an intensity of exactly 0 in grade A
# (instead of shifting the first edge to -1.0).
# assign() names the new column explicitly and keeps the cell idempotent: it does
# not depend on whether the column already exists.
df = df.assign(**{"烈度等级": pd.cut(df["烈度"], [0, 3, 4, 5, np.inf],
                                  labels=["A", "B", "C", "D"],
                                  include_lowest=True)})
df

Unnamed: 0,日期,时间,维度,经度,方向,距离,深度,烈度,深度等级,烈度等级
0,2003.05.20,12:17:44 AM,39.04,40.38,west,0.1,10.0,0.0,Ⅱ,A
1,2007.08.01,12:03:08 AM,40.79,30.09,west,0.1,5.2,4.0,Ⅱ,B
2,1978.05.07,12:41:37 AM,38.58,27.61,south_west,0.1,0.0,0.0,Ⅰ,A
3,1997.03.22,12:31:45 AM,39.47,36.44,south_west,0.1,10.0,0.0,Ⅱ,A
4,2000.04.02,12:57:38 AM,40.80,30.24,south_west,0.1,7.0,0.0,Ⅱ,A
...,...,...,...,...,...,...,...,...,...,...
10057,2015.11.18,12:17:48 AM,42.31,42.94,north,81.6,5.0,3.8,Ⅰ,B
10058,1990.01.28,12:22:43 AM,42.70,26.20,north_west,89.5,2.0,0.0,Ⅰ,A
10059,2001.08.09,12:58:14 AM,42.77,26.47,north,90.6,5.0,0.0,Ⅰ,A
10060,1994.06.05,12:20:03 AM,42.41,43.06,north_east,94.3,33.0,0.0,Ⅵ,A


In [143]:
# Use both grade columns as a two-level index and sort by depth grade, then by
# intensity grade.
# NOTE(review): the exercise asks for the southern region only, but no filter on
# the direction column (方向) is applied here -- confirm whether one is intended.
df.set_index(["深度等级", "烈度等级"]).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,日期,时间,维度,经度,方向,距离,深度,烈度
深度等级,烈度等级,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ⅰ,A,1978.05.07,12:41:37 AM,38.58,27.61,south_west,0.1,0.0,0.0
Ⅰ,A,2000.02.07,12:11:45 AM,40.05,34.07,south_east,0.1,1.0,0.0
Ⅰ,A,1971.05.20,12:08:46 AM,37.72,30.00,north_east,0.1,5.0,0.0
Ⅰ,A,1985.01.28,12:20:56 AM,38.85,29.06,north_east,0.1,4.0,0.0
Ⅰ,A,1990.07.05,12:43:04 AM,37.87,29.18,east,0.1,5.0,0.0
...,...,...,...,...,...,...,...,...,...
Ⅶ,D,1925.09.01,12:16:30 AM,37.56,29.17,north,5.4,130.0,5.3
Ⅶ,D,1965.03.26,12:29:23 AM,36.82,30.94,south_east,6.6,111.0,5.1
Ⅶ,D,1966.08.19,12:15:14 AM,39.41,41.30,south_east,7.0,62.0,5.2
Ⅶ,D,1958.05.09,12:40:57 AM,36.61,27.60,south_east,10.5,67.0,5.1


#### 【练习二】 对于分类变量而言，调用第4章中的变形函数会出现一个BUG（目前的版本下还未修复）：例如对于crosstab函数，按照[官方文档的说法](https://pandas.pydata.org/pandas-docs/version/1.0.0/user_guide/reshaping.html#cross-tabulations)，即使没有出现的变量也会在变形后的汇总结果中出现，但事实上并不是这样，比如下面的例子就缺少了原本应该出现的行'c'和列'f'。基于这一问题，请尝试设计my_crosstab函数，在功能上能够返回正确的结果。

In [144]:
# Two categoricals that each declare one category no element uses ('c' and 'f');
# the exercise is about those unused categories missing from the cross-tabulation.
foo = pd.Categorical(list('ab'), categories=list('abc'))
bar = pd.Categorical(list('de'), categories=list('def'))
pd.crosstab(index=foo, columns=bar)

col_0,d,e
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,0
b,0,1
