# Chapter 3  索引
参考DataWhale：https://datawhalechina.github.io/joyful-pandas/build/html/%E7%9B%AE%E5%BD%95/ch3.html#id15

In [1]:
import numpy as np
import pandas as pd

## 1. 索引器
### 1.1 表的列索引
列索引可以直接df['colnames'].

In [2]:
df=pd.read_csv('/Users/jie/Documents/Python/joyful-pandas-master/data/learn_pandas.csv',
              usecols=['School','Grade','Name','Gender','Weight','Transfer'])
df['Gender'] #return to a Series. Or df.Gender

0      Female
1        Male
2        Male
3      Female
4        Male
        ...  
195    Female
196    Female
197    Female
198      Male
199      Male
Name: Gender, Length: 200, dtype: object

In [3]:
df[['School','Grade']] #return to a DataFrame

Unnamed: 0,School,Grade
0,Shanghai Jiao Tong University,Freshman
1,Peking University,Freshman
2,Shanghai Jiao Tong University,Senior
3,Fudan University,Sophomore
4,Fudan University,Sophomore
...,...,...
195,Fudan University,Junior
196,Tsinghua University,Senior
197,Shanghai Jiao Tong University,Senior
198,Shanghai Jiao Tong University,Senior


### 1.2 序列的行索引

In [4]:
s=pd.Series([1,2,3,4,5,6], index=['a','b','a','a','a','c'])
s['b']

2

In [5]:
s[['c','b']]

c    6
b    2
dtype: int64

In [6]:
s['b':'c':1] #endpoints must be unique, included

b    2
a    3
a    4
a    5
c    6
dtype: int64

In [7]:
s=pd.Series(['a','b','c','d','e','f'], index=[1,2,3,4,5,6])
s[1:3:1] #position!! Right endpoint is excluded

2    b
3    c
dtype: object

### 1.3 loc索引器
基于元素索引。loc[row, col].

In [8]:
df_demo=df.set_index('Name')
df_demo.loc['Qiang Sun','Gender']

Name
Qiang Sun    Female
Qiang Sun    Female
Qiang Sun    Female
Name: Gender, dtype: object

In [9]:
df_demo.loc[['Qiang Sun','Quan Zhao'], ['School','Gender']]

Unnamed: 0_level_0,School,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Qiang Sun,Tsinghua University,Female
Qiang Sun,Tsinghua University,Female
Qiang Sun,Shanghai Jiao Tong University,Female
Quan Zhao,Shanghai Jiao Tong University,Female


In [10]:
df_demo.loc['Gaojuan You':'Gaoqiang Qian', 'School':'Gender'] #slice

Unnamed: 0_level_0,School,Grade,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gaojuan You,Fudan University,Sophomore,Male
Xiaoli Qian,Tsinghua University,Freshman,Female
Qiang Chu,Shanghai Jiao Tong University,Freshman,Female
Gaoqiang Qian,Tsinghua University,Junior,Female


In [11]:
df_demo.loc[df_demo.Weight>70].head() #condition adding

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mei Sun,Shanghai Jiao Tong University,Senior,Male,89.0,N
Gaojuan You,Fudan University,Sophomore,Male,74.0,N
Xiaopeng Zhou,Shanghai Jiao Tong University,Freshman,Male,74.0,N
Xiaofeng Sun,Tsinghua University,Senior,Male,71.0,N
Qiang Zheng,Shanghai Jiao Tong University,Senior,Male,87.0,N


In [12]:
df_demo.loc[df_demo.Grade.isin(['Freshman','Senior'])].head() #condition 'isin'

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,Shanghai Jiao Tong University,Freshman,Female,46.0,N
Changqiang You,Peking University,Freshman,Male,70.0,N
Mei Sun,Shanghai Jiao Tong University,Senior,Male,89.0,N
Xiaoli Qian,Tsinghua University,Freshman,Female,51.0,N
Qiang Chu,Shanghai Jiao Tong University,Freshman,Female,52.0,N


复合条件用 ｜（或）、&（且）、～（取反）来实现。

In [13]:
#定义一个函数的形式
def condition(x):
    condition_1_1=x.School=='Fudan University'
    condition_1_2=x.Grade=='Senior'
    condition_1_3=x.Weight>70
    condition_1=condition_1_1 & condition_1_2 & condition_1_3
    condition_2_1=x.School=='Peking University'
    condition_2_2=x.Grade=='Senior'
    condition_2_3=x.Weight>80
    condition_2=condition_2_1 & (~condition_2_2) & condition_2_3
    result=condition_1 | condition_2
    return result

df_demo.loc[condition]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Qiang Han,Peking University,Freshman,Male,87.0,N
Chengpeng Zhou,Fudan University,Senior,Male,81.0,N
Changpeng Zhao,Peking University,Freshman,Male,83.0,N
Chengpeng Qian,Fudan University,Senior,Male,73.0,Y


In [14]:
#用lambdaa实现
df_demo.loc[lambda x:'Quan Zhao', lambda x:'Gender']

'Female'

In [15]:
#用slice结合start：end：step的形式提取
df_demo.loc[lambda x: slice('Gaojuan You','Gaoqiang Qian')]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaojuan You,Fudan University,Sophomore,Male,74.0,N
Xiaoli Qian,Tsinghua University,Freshman,Female,51.0,N
Qiang Chu,Shanghai Jiao Tong University,Freshman,Female,52.0,N
Gaoqiang Qian,Tsinghua University,Junior,Female,50.0,N


In [16]:
#记住不要用链式赋值（这点与R极其不同！！）而是应该先用索引器再赋值
df_chain=pd.DataFrame([[0,0],[1,0],[-1,0]], columns=list('AB'))
df_chain[df_chain.A!=0].B=1 #这样只会更改临时返回的副本上，而不会真正的修改表中元素值

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [17]:
#正确的赋值方法
df_chain.loc[df_chain.A!=0,'B']=1

### 1.4 iloc索引器
基于位置索引。

In [18]:
df_demo.iloc[[0,1],[0,1]] #first two rows and cols

Unnamed: 0_level_0,School,Grade
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Gaopeng Yang,Shanghai Jiao Tong University,Freshman
Changqiang You,Peking University,Freshman


In [19]:
df_demo.iloc[1:4,2:4]

Unnamed: 0_level_0,Gender,Weight
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Changqiang You,Male,70.0
Mei Sun,Male,89.0
Xiaojuan Sun,Female,41.0


In [20]:
df_demo.iloc[lambda x: slice(1,4)]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,Peking University,Freshman,Male,70.0,N
Mei Sun,Shanghai Jiao Tong University,Senior,Male,89.0,N
Xiaojuan Sun,Fudan University,Sophomore,Female,41.0,N


In [21]:
df_demo.iloc[(df_demo.Weight>80).values].head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mei Sun,Shanghai Jiao Tong University,Senior,Male,89.0,N
Qiang Zheng,Shanghai Jiao Tong University,Senior,Male,87.0,N
Qiang Han,Peking University,Freshman,Male,87.0,N
Chengpeng Zhou,Fudan University,Senior,Male,81.0,N
Feng Han,Shanghai Jiao Tong University,Sophomore,Male,82.0,N


### 1.5 query方法
用query查询数据，df.query(' ')

In [22]:
df.query('Weight>Weight.mean()').head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
1,Peking University,Freshman,Changqiang You,Male,70.0,N
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,89.0,N
4,Fudan University,Sophomore,Gaojuan You,Male,74.0,N
10,Shanghai Jiao Tong University,Freshman,Xiaopeng Zhou,Male,74.0,N
14,Tsinghua University,Senior,Xiaomei Zhou,Female,57.0,N


In [23]:
#or, and, is in/==, not in/!=
df.query('Grade not in ["Freshman","Sophomore"] and Gender=="Male"').head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,89.0,N
16,Tsinghua University,Junior,Xiaoqiang Qin,Male,68.0,N
17,Tsinghua University,Junior,Peng Wang,Male,65.0,N
18,Tsinghua University,Senior,Xiaofeng Sun,Male,71.0,N
21,Shanghai Jiao Tong University,Senior,Xiaopeng Shen,Male,62.0,


In [24]:
#引用外部变量
low, high=70, 80
df.query('Weight.between(@low, @high)', engine='python').head()
#需要加上engine=‘python'才可以

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
1,Peking University,Freshman,Changqiang You,Male,70.0,N
4,Fudan University,Sophomore,Gaojuan You,Male,74.0,N
10,Shanghai Jiao Tong University,Freshman,Xiaopeng Zhou,Male,74.0,N
18,Tsinghua University,Senior,Xiaofeng Sun,Male,71.0,N
35,Peking University,Freshman,Gaoli Zhao,Male,78.0,N


### 3.6 随机抽样
使用sample函数，主要参数为n、axis、frac、replace、weights

In [25]:
df_sample=pd.DataFrame({'id':list('abcde'),'value':[1,2,3,4,90]})
df_sample.sample(3,replace=True, weights=df_sample.value)

Unnamed: 0,id,value
2,c,3
4,e,90
3,d,4


## 2. 多级索引
### 2.1 多级索引及其表的结构

In [26]:
np.random.seed(0)
multi_index=pd.MultiIndex.from_product([list('ABCD'), df.Gender.unique()], names=('School','Gender'))
multi_column = pd.MultiIndex.from_product([['Height','Weight'], df.Grade.unique()], names=('Indicator','Grade'))
df_multi = pd.DataFrame(np.c_[(np.random.randn(8,4)*5 + 163).tolist(), (np.random.randn(8,4)*5 + 65).tolist()],
                        index = multi_index, columns = multi_column).round(1)

In [27]:
df_multi.index.values

array([('A', 'Female'), ('A', 'Male'), ('B', 'Female'), ('B', 'Male'),
       ('C', 'Female'), ('C', 'Male'), ('D', 'Female'), ('D', 'Male')],
      dtype=object)

In [28]:
df_multi.columns.values

array([('Height', 'Freshman'), ('Height', 'Senior'),
       ('Height', 'Sophomore'), ('Height', 'Junior'),
       ('Weight', 'Freshman'), ('Weight', 'Senior'),
       ('Weight', 'Sophomore'), ('Weight', 'Junior')], dtype=object)

In [29]:
#get_level_values
df_multi.index.get_level_values(0)

Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')

### 2.2 多层索引中的loc索引器

In [30]:
df_multi=df.set_index(['School','Grade'])

In [31]:
#在索引前对MultiIndex进行排序
df_multi=df_multi.sort_index()
df_multi.loc[('Fudan University','Junior')].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fudan University,Junior,Yanli You,Female,48.0,N
Fudan University,Junior,Chunqiang Chu,Male,72.0,N
Fudan University,Junior,Changfeng Lv,Male,76.0,N
Fudan University,Junior,Yanjuan Lv,Female,49.0,
Fudan University,Junior,Gaoqiang Zhou,Female,43.0,N


In [32]:
df_multi.loc[df_multi.Weight>70].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fudan University,Freshman,Feng Wang,Male,74.0,N
Fudan University,Junior,Chunqiang Chu,Male,72.0,N
Fudan University,Junior,Changfeng Lv,Male,76.0,N
Fudan University,Senior,Chengpeng Zhou,Male,81.0,N
Fudan University,Senior,Chengpeng Qian,Male,73.0,Y


In [33]:
df_multi.loc[lambda x:('Fudan University','Junior')].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fudan University,Junior,Yanli You,Female,48.0,N
Fudan University,Junior,Chunqiang Chu,Male,72.0,N
Fudan University,Junior,Changfeng Lv,Male,76.0,N
Fudan University,Junior,Yanjuan Lv,Female,49.0,
Fudan University,Junior,Gaoqiang Zhou,Female,43.0,N


In [34]:
#练一练：如果存在重复元素，则不能使用切片，请去除重复索引后给出一个元素切片的例子
df_multi.drop_duplicates().loc[('Fudan University','Freshman'):('Fudan University','Senior')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fudan University,Freshman,Changqiang Yang,Female,49.0,N
Fudan University,Freshman,Gaoqiang Qin,Female,63.0,N
Fudan University,Freshman,Gaofeng Zhao,Female,43.0,N
Fudan University,Freshman,Yanquan Wang,Female,55.0,N
Fudan University,Freshman,Feng Wang,Male,74.0,N
Fudan University,Freshman,Qiang Shi,Female,52.0,N
Fudan University,Freshman,Yanqiang Xu,Female,38.0,N
Fudan University,Freshman,Xiaoli Lv,Female,45.0,N
Fudan University,Freshman,Yanjuan Zhao,Female,53.0,N
Fudan University,Junior,Yanli You,Female,48.0,N


In [35]:
#多层元素交叉组合索引
df_multi.loc[(['Peking University','Fudan University'],['Sophomore','Junior']),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Peking University,Sophomore,Changmei Xu,Female,43.0,N
Peking University,Sophomore,Xiaopeng Qin,Male,,N
Peking University,Sophomore,Mei Xu,Female,39.0,N
Peking University,Sophomore,Xiaoli Zhou,Female,55.0,N
Peking University,Sophomore,Peng Han,Female,34.0,
Peking University,Junior,Juan Xu,Female,,N
Peking University,Junior,Changjuan You,Female,47.0,N
Peking University,Junior,Gaoli Xu,Female,48.0,N
Peking University,Junior,Gaoquan Zhou,Male,70.0,N
Peking University,Junior,Qiang You,Female,56.0,N


### 2.3 IndexSlice对象
对每层进行切片。两种形式：  
loc[idx[*,*];   
loc[idx[*,*],idx[*,*]];

In [36]:
np.random.seed(0)
L1,L2=['A','B','C'],['a','b','c']
mul_index1=pd.MultiIndex.from_product([L1,L2],names=('Upper','Lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big', 'Small'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(9,9)), index=mul_index1, columns=mul_index2)
df_ex

Unnamed: 0_level_0,Big,D,D,D,E,E,E,F,F,F
Unnamed: 0_level_1,Small,d,e,f,d,e,f,d,e,f
Upper,Lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
A,a,3,6,-9,-6,-6,-2,0,9,-5
A,b,-3,3,-8,-3,-2,5,8,-4,4
A,c,-1,0,7,-4,6,6,-9,9,-6
B,a,8,5,-2,-9,-8,0,-9,1,-6
B,b,2,9,-7,-9,-9,-5,-4,-3,-1
B,c,8,6,-5,0,1,-8,-8,-2,0
C,a,-6,-3,2,5,9,-9,5,-6,3
C,b,1,2,-5,-3,-5,6,-6,3,-5
C,c,-1,5,6,-6,6,4,7,8,-4


In [37]:
#先定义函数
idx=pd.IndexSlice

In [38]:
df_ex.loc[idx["C":,('D','f'):]] #找到C之后，且D_f之后的元素

Unnamed: 0_level_0,Big,D,E,E,E,F,F,F
Unnamed: 0_level_1,Small,f,d,e,f,d,e,f
Upper,Lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
C,a,2,5,9,-9,5,-6,3
C,b,-5,-3,-5,6,-6,3,-5
C,c,6,-6,6,4,7,8,-4


In [39]:
df_ex.loc[idx[:'A', lambda x:x.sum()>0]] #找到A之前，且列和>0的元素

Unnamed: 0_level_0,Big,D,D,F
Unnamed: 0_level_1,Small,d,e,e
Upper,Lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,a,3,6,9
A,b,-3,3,-4
A,c,-1,0,9


In [40]:
#用两个idx，第一个是行索引，第二个是列索引
df_ex.loc[idx[:'A','b':], idx['E':,'e':]]

Unnamed: 0_level_0,Big,E,E,F,F
Unnamed: 0_level_1,Small,e,f,e,f
Upper,Lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A,b,-2,5,-4,4
A,c,6,6,9,-6


### 2.4 多级索引的构造
除了set_index以外的构造所及索引的方法：  
from_tuples;  
from_arrays;     
from_produce;

In [41]:
my_tuple=[('a','cat'),('a','dog'),('b','cat'),('b','dog')]
pd.MultiIndex.from_tuples(my_tuple,names=['First','Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

In [42]:
my_array=[list('aabb'),['cat','dog']*2]
pd.MultiIndex.from_arrays(my_array,names=['First','Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

In [43]:
my_list1=['a','b']
my_list2=['cat','dog']
pd.MultiIndex.from_product([my_list1,my_list2], names=['First','Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

## 3. 索引的常用方法
### 3.1 索引层的交换和删除

In [44]:
np.random.seed(0)
L1,L2,L3 = ['A','B'],['a','b'],['alpha','beta']
mul_index1 = pd.MultiIndex.from_product([L1,L2,L3], names=('Upper', 'Lower','Extra'))
L4,L5,L6 = ['C','D'],['c','d'],['cat','dog']
mul_index2 = pd.MultiIndex.from_product([L4,L5,L6], names=('Big', 'Small', 'Other'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)), index=mul_index1, columns=mul_index2)
df_ex

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,alpha,3,6,-9,-6,-6,-2,0,9
A,a,beta,-5,-3,3,-8,-3,-2,5,8
A,b,alpha,-4,4,-1,0,7,-4,6,6
A,b,beta,-9,9,-6,8,5,-2,-9,-8
B,a,alpha,0,-9,1,-6,2,9,-7,-9
B,a,beta,-9,-5,-4,-3,-1,8,6,-5
B,b,alpha,0,1,-8,-8,-2,0,-6,-3
B,b,beta,2,5,9,-9,5,-6,3,1


In [45]:
#交换索引层：swaplevel（交换两个层）和reorder_level（交换任意层）
#axis=0或axis=1
df_ex.swaplevel(0,2,axis=1).head() #列索引的第一层和第三层交换

Unnamed: 0_level_0,Unnamed: 1_level_0,Other,cat,dog,cat,dog,cat,dog,cat,dog
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Big,C,C,C,C,D,D,D,D
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,alpha,3,6,-9,-6,-6,-2,0,9
A,a,beta,-5,-3,3,-8,-3,-2,5,8
A,b,alpha,-4,4,-1,0,7,-4,6,6
A,b,beta,-9,9,-6,8,5,-2,-9,-8
B,a,alpha,0,-9,1,-6,2,9,-7,-9


In [46]:
df_ex.reorder_levels([2,0,1],axis=0).head() #指定交换层的顺序

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
Extra,Upper,Lower,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
alpha,A,a,3,6,-9,-6,-6,-2,0,9
beta,A,a,-5,-3,3,-8,-3,-2,5,8
alpha,A,b,-4,4,-1,0,7,-4,6,6
beta,A,b,-9,9,-6,8,5,-2,-9,-8
alpha,B,a,0,-9,1,-6,2,9,-7,-9


In [47]:
#用droplevel来删除某一层的索引
df_ex.droplevel(1,axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Other,cat,dog,cat,dog,cat,dog,cat,dog
Upper,Lower,Extra,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
A,a,alpha,3,6,-9,-6,-6,-2,0,9
A,a,beta,-5,-3,3,-8,-3,-2,5,8
A,b,alpha,-4,4,-1,0,7,-4,6,6
A,b,beta,-9,9,-6,8,5,-2,-9,-8
B,a,alpha,0,-9,1,-6,2,9,-7,-9


### 3.2 索引属性的修改

In [48]:
#通过rename_axis修改索引层名字 'old':'new'
df_ex.rename_axis(index={'Upper':'Changed_row'}, columns={'Other':'Changed_col'}).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Changed_col,cat,dog,cat,dog,cat,dog,cat,dog
Changed_row,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,alpha,3,6,-9,-6,-6,-2,0,9
A,a,beta,-5,-3,3,-8,-3,-2,5,8
A,b,alpha,-4,4,-1,0,7,-4,6,6
A,b,beta,-9,9,-6,8,5,-2,-9,-8
B,a,alpha,0,-9,1,-6,2,9,-7,-9


In [49]:
#用rename改索引值，指定层号level
df_ex.rename(columns={'cat':'not-cat'}, level=2).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,not-cat,dog,not-cat,dog,not-cat,dog,not-cat,dog
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,alpha,3,6,-9,-6,-6,-2,0,9
A,a,beta,-5,-3,3,-8,-3,-2,5,8
A,b,alpha,-4,4,-1,0,7,-4,6,6
A,b,beta,-9,9,-6,8,5,-2,-9,-8
B,a,alpha,0,-9,1,-6,2,9,-7,-9


In [50]:
df_ex.rename(index=lambda x:str.upper(x), level=2).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,ALPHA,3,6,-9,-6,-6,-2,0,9
A,a,BETA,-5,-3,3,-8,-3,-2,5,8
A,b,ALPHA,-4,4,-1,0,7,-4,6,6
A,b,BETA,-9,9,-6,8,5,-2,-9,-8
B,a,ALPHA,0,-9,1,-6,2,9,-7,-9


In [51]:
#练一练：在rename_axis中用函数完成一样的功能
df_ex.rename_axis({x:y for x,y in zip(Upper,Other)})

NameError: name 'Upper' is not defined

In [52]:
#替换整个索引的元素，用迭代
new_values=iter(list('abcdefgh'))
df_ex.rename(index=lambda x:next(new_values), level=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,a,3,6,-9,-6,-6,-2,0,9
A,a,b,-5,-3,3,-8,-3,-2,5,8
A,b,c,-4,4,-1,0,7,-4,6,6
A,b,d,-9,9,-6,8,5,-2,-9,-8
B,a,e,0,-9,1,-6,2,9,-7,-9
B,a,f,-9,-5,-4,-3,-1,8,6,-5
B,b,g,0,1,-8,-8,-2,0,-6,-3
B,b,h,2,5,9,-9,5,-6,3,1


In [53]:
#定义在Index上的map函数
df_temp=df_ex.copy()
new_idx=df_temp.index.map(lambda x: (x[0], x[1], str.upper(x[2])))
df_temp.index=new_idx
df_temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,ALPHA,3,6,-9,-6,-6,-2,0,9
A,a,BETA,-5,-3,3,-8,-3,-2,5,8
A,b,ALPHA,-4,4,-1,0,7,-4,6,6
A,b,BETA,-9,9,-6,8,5,-2,-9,-8
B,a,ALPHA,0,-9,1,-6,2,9,-7,-9


### 2.3 索引的设置和充值

In [54]:
df_new = pd.DataFrame({'A':list('aacd'), 'B':list('PQRT'), 'C':[1,2,3,4]})

In [55]:
#用set_index来设置索引
df_new.set_index('A', append=True) #append=True 保留原来的索引，默认为False。

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C
Unnamed: 0_level_1,A,Unnamed: 2_level_1,Unnamed: 3_level_1
0,a,P,1
1,a,Q,2
2,c,R,3
3,d,T,4


In [56]:
#新增一列索引列
my_index=pd.Series(list('WXYZ'),name='D')
df_new=df_new.set_index(['A',my_index])
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C
A,D,Unnamed: 2_level_1,Unnamed: 3_level_1
a,W,P,1
a,X,Q,2
c,Y,R,3
d,Z,T,4


In [57]:
#去掉索引列，是set_index的逆函数
df_new.reset_index(['D'], drop=True) #drop=True，不仅不是索引列，且把该列删去，默认为False。

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
a,P,1
a,Q,2
c,R,3
d,T,4


In [58]:
#重置所有的索引，则会重新生成一个默认索引
df_new.reset_index()

Unnamed: 0,A,D,B,C
0,a,W,P,1
1,a,X,Q,2
2,c,Y,R,3
3,d,Z,T,4


### 3.4 索引的变形
df_reindex().

In [59]:
df_reindex=pd.DataFrame({"Weight":[60,70,80], "Height":[176,180,179]}, index=['1001','1003','1002'])
df_reindex

Unnamed: 0,Weight,Height
1001,60,176
1003,70,180
1002,80,179


In [60]:
df_reindex.reindex(index=['1001','1002','1003','1004'], columns=['Weight','Gender']) #自动根据index补全

Unnamed: 0,Weight,Gender
1001,60.0,
1002,80.0,
1003,70.0,
1004,,


In [61]:
#reindex_like()根据另一个已知的df格式进行相同的变形
df_existed=pd.DataFrame(index=['1001','1002','1003','1004'], columns=['Weight','Gender'])
df_reindex.reindex_like(df_existed)

Unnamed: 0,Weight,Gender
1001,60.0,
1002,80.0,
1003,70.0,
1004,,


## 4. 索引运算
### 4.1 集合的运算法则
intersection(交）; union（并）; difference（差）; symmetric_difference（非交）   
例：A.difference(B)=A-B

### 4.2 一般的索引运算

In [62]:
df_set_1 = pd.DataFrame([[0,1],[1,2],[3,4]], index = pd.Index(['a','b','a'], name='id1'))
df_set_2 = pd.DataFrame([[4,5],[2,6],[7,1]], index = pd.Index(['b','b','c'], name='id2'))
id1, id2 = df_set_1.index.unique(), df_set_2.index.unique()

In [63]:
id1.intersection(id2) #&

Index(['b'], dtype='object')

In [64]:
id1.union(id2) #|

Index(['a', 'b', 'c'], dtype='object')

In [65]:
id1.difference(id2) #^&

Index(['a'], dtype='object')

In [66]:
id1.symmetric_difference(id2) #^

Index(['a', 'c'], dtype='object')

## 5. 练习
### Ex1：公司员工数据集

In [67]:
df1=pd.read_csv('/Users/jie/Documents/Python/joyful-pandas-master/data/company.csv')
df1.head(3)

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
0,1318,1/3/1954,61,Vancouver,Executive,CEO,M
1,1319,1/3/1957,58,Vancouver,Executive,VP Stores,F
2,1320,1/2/1955,60,Vancouver,Executive,Legal Counsel,F


In [68]:
#1.
#loc
df1.loc[(df1.age<=40) & (df1.department.isin(['Bakery','Dairy'])) & (df1.gender=='M')]

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
3611,5791,1/14/1975,40,Kelowna,Dairy,Dairy Person,M
3613,5793,1/22/1975,40,Richmond,Bakery,Baker,M
3615,5795,1/30/1975,40,Nanaimo,Dairy,Dairy Person,M
3617,5797,2/3/1975,40,Nanaimo,Dairy,Dairy Person,M
3618,5798,2/4/1975,40,Surrey,Dairy,Dairy Person,M
...,...,...,...,...,...,...,...
6108,8307,10/20/1994,21,Burnaby,Dairy,Dairy Person,M
6113,8312,11/12/1994,21,Burnaby,Dairy,Dairy Person,M
6137,8336,12/31/1994,21,Vancouver,Dairy,Dairy Person,M
6270,6312,5/14/1979,36,Grand Forks,Dairy,Dairy Person,M


In [69]:
#query
df1.query('age <= 40 and department in ["Dairy","Bakery"] and gender == "M"')

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
3611,5791,1/14/1975,40,Kelowna,Dairy,Dairy Person,M
3613,5793,1/22/1975,40,Richmond,Bakery,Baker,M
3615,5795,1/30/1975,40,Nanaimo,Dairy,Dairy Person,M
3617,5797,2/3/1975,40,Nanaimo,Dairy,Dairy Person,M
3618,5798,2/4/1975,40,Surrey,Dairy,Dairy Person,M
...,...,...,...,...,...,...,...
6108,8307,10/20/1994,21,Burnaby,Dairy,Dairy Person,M
6113,8312,11/12/1994,21,Burnaby,Dairy,Dairy Person,M
6137,8336,12/31/1994,21,Vancouver,Dairy,Dairy Person,M
6270,6312,5/14/1979,36,Grand Forks,Dairy,Dairy Person,M


In [70]:
#2.
df1.iloc[(df1.EmployeeID % 2==1).values,[0,2,-2]]
#用位置索引

Unnamed: 0,EmployeeID,age,job_title
1,1319,58,VP Stores
3,1321,56,VP Human Resources
5,1323,53,"Exec Assistant, VP Stores"
6,1325,51,"Exec Assistant, Legal Counsel"
8,1329,48,Store Manager
...,...,...,...
6276,7659,26,Cashier
6277,7741,25,Cashier
6278,7801,25,Dairy Person
6280,8181,22,Cashier


In [71]:
#3.
#Step 1
df1_3=df1.set_index(['department','job_title','gender']).swaplevel(0,2)
df1_3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EmployeeID,birthdate_key,age,city_name
gender,job_title,department,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M,CEO,Executive,1318,1/3/1954,61,Vancouver
F,VP Stores,Executive,1319,1/3/1957,58,Vancouver
F,Legal Counsel,Executive,1320,1/2/1955,60,Vancouver
M,VP Human Resources,Executive,1321,1/2/1959,56,Vancouver
M,VP Finance,Executive,1322,1/9/1958,57,Vancouver


In [72]:
#Step 2
df1_3=df1_3.reset_index([1])
df1_3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title,EmployeeID,birthdate_key,age,city_name
gender,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M,Executive,CEO,1318,1/3/1954,61,Vancouver
F,Executive,VP Stores,1319,1/3/1957,58,Vancouver
F,Executive,Legal Counsel,1320,1/2/1955,60,Vancouver
M,Executive,VP Human Resources,1321,1/2/1959,56,Vancouver
M,Executive,VP Finance,1322,1/9/1958,57,Vancouver


In [73]:
#Step 3
df1_3=df1_3.rename_axis(index={'gender':'Gender'})
df1_3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title,EmployeeID,birthdate_key,age,city_name
Gender,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M,Executive,CEO,1318,1/3/1954,61,Vancouver
F,Executive,VP Stores,1319,1/3/1957,58,Vancouver
F,Executive,Legal Counsel,1320,1/2/1955,60,Vancouver
M,Executive,VP Human Resources,1321,1/2/1959,56,Vancouver
M,Executive,VP Finance,1322,1/9/1958,57,Vancouver


In [74]:
#Step 4
df1_3.index=df1_3.index.map(lambda x: x[0]+'_'+x[1])
df1_3.head()

Unnamed: 0,job_title,EmployeeID,birthdate_key,age,city_name
M_Executive,CEO,1318,1/3/1954,61,Vancouver
F_Executive,VP Stores,1319,1/3/1957,58,Vancouver
F_Executive,Legal Counsel,1320,1/2/1955,60,Vancouver
M_Executive,VP Human Resources,1321,1/2/1959,56,Vancouver
M_Executive,VP Finance,1322,1/9/1958,57,Vancouver


In [75]:
#Step 5
df1_3.index=df1_3.index.map(lambda x: tuple(x.split('_')))
df1_3.head()

Unnamed: 0,Unnamed: 1,job_title,EmployeeID,birthdate_key,age,city_name
M,Executive,CEO,1318,1/3/1954,61,Vancouver
F,Executive,VP Stores,1319,1/3/1957,58,Vancouver
F,Executive,Legal Counsel,1320,1/2/1955,60,Vancouver
M,Executive,VP Human Resources,1321,1/2/1959,56,Vancouver
M,Executive,VP Finance,1322,1/9/1958,57,Vancouver


In [76]:
#Step 6
df1_3=df1_3.rename_axis(index={'gender','department'})
df1_3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title,EmployeeID,birthdate_key,age,city_name
gender,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M,Executive,CEO,1318,1/3/1954,61,Vancouver
F,Executive,VP Stores,1319,1/3/1957,58,Vancouver
F,Executive,Legal Counsel,1320,1/2/1955,60,Vancouver
M,Executive,VP Human Resources,1321,1/2/1959,56,Vancouver
M,Executive,VP Finance,1322,1/9/1958,57,Vancouver


In [77]:
#Step 7
df1_3=df1_3.reset_index().reindex(df1.columns,axis=1)
df1_3.head()
#yong reindex()恢复列顺序！！

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
0,1318,1/3/1954,61,Vancouver,Executive,CEO,M
1,1319,1/3/1957,58,Vancouver,Executive,VP Stores,F
2,1320,1/2/1955,60,Vancouver,Executive,Legal Counsel,F
3,1321,1/2/1959,56,Vancouver,Executive,VP Human Resources,M
4,1322,1/9/1958,57,Vancouver,Executive,VP Finance,M


### Ex2: 巧克力数据集

In [78]:
df2=pd.read_csv('/Users/jie/Documents/Python/joyful-pandas-master/data/chocolate.csv')
df2.head(3)

Unnamed: 0,Company,Review\nDate,Cocoa\nPercent,Company\nLocation,Rating
0,A. Morin,2016,63%,France,3.75
1,A. Morin,2015,70%,France,2.75
2,A. Morin,2015,70%,France,3.0


In [79]:
#1.
df2_1=df2.copy()
df2_1.columns=[' '.join(x.split('\n')) for x in df2_1.columns]
df2_1.head()

Unnamed: 0,Company,Review Date,Cocoa Percent,Company Location,Rating
0,A. Morin,2016,63%,France,3.75
1,A. Morin,2015,70%,France,2.75
2,A. Morin,2015,70%,France,3.0
3,A. Morin,2015,70%,France,3.5
4,A. Morin,2015,70%,France,3.5


In [80]:
#2.
df2_2=df2_1.copy()
df2_2['Cocoa Percent']=df2_2['Cocoa Percent'].apply(lambda x: float(x.strip('%'))/100)
condition1=df2_2.Rating<=2.75
condition2=df2_2['Cocoa Percent']>df2_2['Cocoa Percent'].median()
df2_2.loc[condition1 & condition2]

Unnamed: 0,Company,Review Date,Cocoa Percent,Company Location,Rating
33,Akesson's (Pralus),2010,0.75,Switzerland,2.75
34,Akesson's (Pralus),2010,0.75,Switzerland,2.75
36,Alain Ducasse,2014,0.75,France,2.75
38,Alain Ducasse,2013,0.75,France,2.50
39,Alain Ducasse,2013,0.75,France,2.50
...,...,...,...,...,...
1736,Wilkie's Organic,2013,0.89,Ireland,2.75
1738,Wilkie's Organic,2013,0.75,Ireland,2.75
1741,Willie's Cacao,2013,1.00,U.K.,2.25
1769,Zart Pralinen,2016,0.85,Austria,2.75


In [81]:
#3.
df2_3=df2_1.copy()
df2_3=df2_3.set_index(['Review Date','Company Location'])
df2_3=df2_3.sort_index()
idx=pd.IndexSlice ##这步不要忘！！

test=df2_3.loc[idx[2012:,['France', 'Canada', 'Amsterdam', 'Belgium']],:]
test1=df2_3.loc[idx[2012:,:],:]
id_test, id_test1=test.index.unique(),test1.index.unique()
index2_3=id_test1.difference(id_test)

df2_3.loc[index2_3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Company,Cocoa Percent,Rating
Review Date,Company Location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,Australia,Bahen & Co.,70%,3.00
2012,Australia,Bahen & Co.,70%,2.50
2012,Australia,Bahen & Co.,70%,2.50
2012,Australia,Cravve,75%,3.25
2012,Australia,Cravve,65%,3.25
...,...,...,...,...
2017,U.S.A.,Spencer,70%,3.75
2017,U.S.A.,Spencer,70%,3.50
2017,U.S.A.,Spencer,70%,2.75
2017,U.S.A.,Xocolla,70%,2.75


In [82]:
#Answer
df2_3.loc[idx[2012:,~df2_3.index.get_level_values(1).isin(['France', 'Canada', 'Amsterdam', 'Belgium'])],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Company,Cocoa Percent,Rating
Review Date,Company Location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,Australia,Bahen & Co.,70%,3.00
2012,Australia,Bahen & Co.,70%,2.50
2012,Australia,Bahen & Co.,70%,2.50
2012,Australia,Cravve,75%,3.25
2012,Australia,Cravve,65%,3.25
...,...,...,...,...
2017,U.S.A.,Spencer,70%,3.75
2017,U.S.A.,Spencer,70%,3.50
2017,U.S.A.,Spencer,70%,2.75
2017,U.S.A.,Xocolla,70%,2.75
