In [1]:
import numpy as np
import pandas as pd

## A Multiply indexed Series

### The bad way
(use plain Python tuples as the index keys)

In [2]:
# Index keys are (state, year) tuples, state-major, matching the order of the
# population figures below.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,   # California 2000, 2010
    18976457, 19378102,   # New York   2000, 2010
    20851820, 25145561,   # Texas      2000, 2010
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [3]:
# Label-based slice between two tuple keys; as the output shows, both
# endpoints are included.
pop[('New York',2000):('Texas', 2000)]

(New York, 2000)    18976457
(New York, 2010)    19378102
(Texas, 2000)       20851820
dtype: int64

If we need to select all values from 2010, we have to loop over every tuple key and test its second element.  
This is clumsy and inflexible.

In [4]:
# Keep only the entries whose tuple key has 2010 as its second element.
pop[[key for key in pop.index if key[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

### The better way: Pandas MultiIndex

In [5]:
# we can create a multi-index from the tuples as follows

In [6]:
# Print the plain list of tuples, convert it to a MultiIndex (rebinding
# `index`), then print the result to compare the two representations.
print('index1 :', index)
index = pd.MultiIndex.from_tuples(index)
print('index2 :', index)

index1 : [('California', 2000), ('California', 2010), ('New York', 2000), ('New York', 2010), ('Texas', 2000), ('Texas', 2010)]
index2 : MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])


In [7]:
# Re-index the series with the MultiIndex so the tuple keys are displayed as
# a two-level (state, year) hierarchy.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [8]:
# Second element by position. Use .iloc explicitly: treating an integer key
# in Series[...] as a position is deprecated since pandas 2.1 (later versions
# treat it as a label), so pop[1] is not a reliable positional lookup here.
pop.iloc[1]

37253956

Some entries are missing in the first column: in this multi-index representation, any blank entry indicates the same value as the line above it 

In [9]:
# Empty slice on the first level, 2010 on the second: one value per state.
pop[:,2010]


California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [10]:
# Partial indexing on the first level returns the per-year values for that state.
pop['California']

2000    33871648
2010    37253956
dtype: int64

### MultiIndex as extra dimension

The `unstack()` method will quickly convert a multiply indexed `Series` into a conventionally indexed `DataFrame`

In [11]:
# Pivot the inner index level (year) into columns, giving a state-by-year table.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


Naturally, the `stack()` method provides the opposite operation

In [12]:
# Fold the columns back into an inner index level; the output matches the
# multiply indexed series shown earlier.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

Using a multi-index gives us much more flexibility in the types of data we can represent.  
As multi-index can be used to represent two-dimensional data within a one-dimensional `Series`, we can use it to represent data of three or more dimensions in a `Series` or `DataFrame`

In [13]:
# Add a second column alongside the total population; the list is in the same
# (state, year) order as `pop`.
pop_df = pd.DataFrame(
    data={
        'total': pop,
        'under18': [
            9267089, 9284094,
            4687374, 4318033,
            5906301, 6879014,
        ],
    }
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


The ufuncs and other functionality also work with a hierarchical index

In [14]:
# Fraction of the population under 18 for each (state, year) entry, shown as
# a state-by-year table.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


## Creating MultiIndex

The most straightforward way to construct a multiply indexed `Series` or `DataFrame` is to simply pass a list of two or more index arrays to the constructor

In [15]:
# Passing a list of two parallel arrays as `index` yields a two-level
# MultiIndex; note that the second level holds the *strings* '0' and '1'.
df = pd.DataFrame(
    data=np.random.rand(4, 2),
    columns=['data1', 'data2'],
    index=[list('aabb'), list('0101')],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,0,0.219944,0.230192
a,1,0.363897,0.251871
b,0,0.070257,0.222828
b,1,0.448896,0.833094


We can also pass a dictionary with appropriate tuples as keys; Pandas will automatically recognize this and use a `MultiIndex` by default

In [16]:
# Dictionary keyed by (state, year) tuples; insertion order is kept, so Texas
# comes before New York in the resulting series.
data = dict([
    (('California', 2000), 33871648),
    (('California', 2010), 37253956),
    (('Texas', 2000), 20851820),
    (('Texas', 2010), 25145561),
    (('New York', 2000), 18976457),
    (('New York', 2010), 19378102),
])
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

### Explicit MultiIndex constructors

In [17]:
# From a list of parallel arrays: one array per level, one position per entry.
pd.MultiIndex.from_arrays([['a','a','b','b'], ['0','1','0','1']])


MultiIndex(levels=[['a', 'b'], ['0', '1']],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [18]:
# From a list of tuples: each tuple gives the labels of one entry across all levels.
pd.MultiIndex.from_tuples([('a',0),('a',1),('b',0),('b',1)])

MultiIndex(levels=[['a', 'b'], [0, 1]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [19]:
# From the Cartesian product of per-level label lists.
pd.MultiIndex.from_product([['a','b'],[0,1]])

MultiIndex(levels=[['a', 'b'], [0, 1]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [20]:
# Low-level constructor: `levels` holds the unique labels of each level and
# `codes` gives, for every entry, the position of its label within that level.
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

### MultiIndex level names

In [21]:
# Name the two index levels; the names then appear as a header in the display.
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex for columns

Rows and columns are completely symmetric, so the columns can have multiple levels as well

In [22]:
# Hierarchical row index (year x visit) and column index (subject x type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock measurements: standard-normal noise rounded to one decimal, with every
# other column (the HR columns) scaled by 10, then everything shifted by 37.
data = np.random.randn(4, 6).round(1)
data[:, ::2] *= 10
data += 37
data

array([[49. , 37. , 37. , 37.5, 32. , 37.5],
       [32. , 39. , 42. , 36.2, 36. , 37. ],
       [30. , 38.3, 33. , 37.2, 26. , 35.8],
       [43. , 36.3, 33. , 36.1, 31. , 37.4]])

In [23]:
# Assemble the frame from the mock values and the two hierarchical indices.
health_data = pd.DataFrame(
    data=data,
    index=index,
    columns=columns,
)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.0,37.0,37.5,32.0,37.5
2013,2,32.0,39.0,42.0,36.2,36.0,37.0
2014,1,30.0,38.3,33.0,37.2,26.0,35.8
2014,2,43.0,36.3,33.0,36.1,31.0,37.4


In [24]:
# Every second row, and the column groups from 'Guido' through 'Sue'
# (both endpoints appear in the output).
health_data.loc[::2,'Guido':'Sue']

Unnamed: 0_level_0,subject,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013,1,37.0,37.5,32.0,37.5
2014,1,33.0,37.2,26.0,35.8


## Indexing and Slicing a MultiIndex


### Multiply indexed Series

In [25]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [26]:
# Indexing with one label per level returns a single element.
pop['California',2010]

37253956

In [27]:
# Partial indexing: give only the first-level label.
# The result is another `Series` that keeps the lower-level index (year).
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [28]:
# Slice the state level by label and fix the year level to 2000.
pop.loc['California':'New York', 2000]

state       year
California  2000    33871648
New York    2000    18976457
dtype: int64

In [29]:
# Positional slice: the first two entries, regardless of their labels.
pop.iloc[0:2]

state       year
California  2000    33871648
            2010    37253956
dtype: int64

In [30]:
# Select on the lower level only: an empty slice for state, 2010 for year.
pop[:,2010]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [31]:
# Boolean mask: keep the entries with a population above 22 million.
pop[pop>22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [32]:
# Fancy indexing with a list of first-level labels.
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

### Multiply indexed DataFrames

In [33]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.0,37.0,37.5,32.0,37.5
2013,2,32.0,39.0,42.0,36.2,36.0,37.0
2014,1,30.0,38.3,33.0,37.2,26.0,35.8
2014,2,43.0,36.3,33.0,36.1,31.0,37.4


Remember that columns are **primary** in a `DataFrame`, and the syntax used for multiply indexed `Series` applies to the columns

In [34]:
# One label per column level selects a single column, returned as a Series.
health_data['Guido', 'HR']

year  visit
2013  1        37.0
      2        42.0
2014  1        33.0
      2        33.0
Name: (Guido, HR), dtype: float64

In [35]:
# iloc takes positional [rows, columns]: first two rows, first three columns.
health_data.iloc[:2, :3]

Unnamed: 0_level_0,subject,Bob,Bob,Guido
Unnamed: 0_level_1,type,HR,Temp,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,49.0,37.0,37.0
2013,2,32.0,39.0,42.0


These indexers provide an array-like view of the underlying two-dimensional data, but each individual index in `loc` or `iloc` can be passed a tuple of multiple indices

In [36]:
# All rows; a single (subject, type) column given as a tuple.
health_data.loc[:,('Guido', 'HR')]

year  visit
2013  1        37.0
      2        42.0
2014  1        33.0
      2        33.0
Name: (Guido, HR), dtype: float64

In [37]:
# A tuple on each axis picks out a single value.
health_data.loc[(2013, 1),('Guido', 'HR')]

37.0

In [38]:
# Rows from year 2013 onward; the HR column of Guido and Bob. The output keeps
# the frame's own column order (Bob before Guido), not the order of the list.
health_data.loc[2013:,(['Guido','Bob'], 'HR')]

Unnamed: 0_level_0,subject,Bob,Guido
Unnamed: 0_level_1,type,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,49.0,37.0
2013,2,32.0,42.0
2014,1,30.0,33.0
2014,2,43.0,33.0


Working with slices within these index tuples is not especially convenient; trying to create a slice within a tuple will lead to a syntax error

In [39]:
# Slice syntax (`:`) is only legal directly inside [...]; written inside a
# tuple literal it is a SyntaxError. The offending expression is compiled from
# a string so the error can be displayed without aborting a Restart & Run All
# (a SyntaxError in the cell's own source cannot be caught by try/except).
try:
    compile("health_data.loc[(:,1),(:, 'HR')]", '<demo>', 'eval')
except SyntaxError as err:
    print(f'{type(err).__name__}: {err}')

SyntaxError: invalid syntax (<ipython-input-39-186aa6537cbc>, line 1)

We can get around this problem by using pandas' `IndexSlice` object

In [42]:
# IndexSlice lets slice syntax be written for each level of a tuple index.
# Rows: any year, visit level sliced with step 2 (only visit 1 in the output);
# columns: any subject, type 'HR'.
idx = pd.IndexSlice
health_data.loc[idx[:,::2], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,49.0,37.0,32.0
2014,1,30.0,33.0,26.0


## Rearranging Multi-Indices

### Sorted and unsorted indices

Many of the `MultiIndex` slicing operations will `fail` if the index is not sorted

Let's start by creating some simple multiply indexed data where the indices are **not lexicographically** sorted

If we try to take a partial slice of this index, it will result in an `error`

In [43]:
# The first level is given as 'a', 'c', 'b', so the resulting index is
# deliberately not in lexicographic order.
index = pd.MultiIndex.from_product(iterables=[list('acb'), [1, 2]])
data = pd.Series(data=np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.309068
      2      0.389323
c     1      0.362703
      2      0.521367
b     1      0.528231
      2      0.649981
dtype: float64

In [45]:
# Partial slicing on the unsorted index fails; catch the error and show its
# type and message instead of letting the cell abort.
try:
    data['a':'b']
except KeyError as err:
    print(type(err), err, sep='\n')


<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


For various reasons, partial slices and other similar operations require the levels in the `MultiIndex` to be **in sorted order**.  
Pandas provides convenience routines to perform this type of sorting,  
e.g. the `sort_index()` method of `Series` and `DataFrame`  
(the older `sortlevel()` method has been removed from recent pandas releases).

In [47]:
# Rebind `data` to a copy sorted by index; the output now lists a, b, c in order.
data = data.sort_index()
data

char  int
a     1      0.309068
      2      0.389323
b     1      0.528231
      2      0.649981
c     1      0.362703
      2      0.521367
dtype: float64

In [48]:
# With the index sorted, the same partial slice now succeeds.
data['a':'b']

char  int
a     1      0.309068
      2      0.389323
b     1      0.528231
      2      0.649981
dtype: float64

### Stacking and unstacking indices

In [50]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

There are some options specifying the level.


In [52]:
# Move index level 0 (state) into the columns; years remain as rows.
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [53]:
# Move index level 1 (year) into the columns; states remain as rows.
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


### Index setting and resetting

In [55]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [58]:
# Turn the index levels into ordinary columns; `name` labels the value column.
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [59]:
# Rebuild a MultiIndex from the 'state' and 'year' columns.
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


## Data Aggregations on Multi-Indices

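We've previously seen that Pandas has built-in data aggregation methods, such as ``mean()``, ``sum()``, and ``max()``.
For hierarchically indexed data, older pandas releases let these take a ``level`` parameter that controls which subset of the data the aggregate is computed on; that parameter was deprecated in pandas 1.3 and removed in 2.0, where the equivalent is ``groupby(level=...)`` followed by the aggregation.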

For example, let's return to our health data:

In [60]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.0,37.0,37.5,32.0,37.5
2013,2,32.0,39.0,42.0,36.2,36.0,37.0
2014,1,30.0,38.3,33.0,37.2,26.0,35.8
2014,2,43.0,36.3,33.0,36.1,31.0,37.4


Suppose we'd like to average out the measurements from the two visits in each year. We can do this by **naming the index level** we want to keep

In [61]:
# Average the two visits within each year. The `level=` argument of mean()
# was removed in pandas 2.0; grouping by the index level is the equivalent.
data_mean = health_data.groupby(level='year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,40.5,38.0,39.5,36.85,34.0,37.25
2014,36.5,37.3,33.0,36.65,28.5,36.6


In [None]:
# Average across years for each visit number. The `level=` argument of mean()
# was removed in pandas 2.0; grouping by the index level is the equivalent.
health_data.groupby(level='visit').mean()

By further making use of the `axis` keyword (or, in pandas 2.0 and later, by grouping the transposed frame), we can aggregate among levels on the columns as well

In [None]:
# Sum across subjects within each 'type' column level. The `level=` argument
# of sum() was removed in pandas 2.0; group the transposed frame by the column
# level and transpose back so the rows stay (year, visit).
health_data.T.groupby(level='type').sum().T

In [None]:
# Sum HR and Temp within each 'subject' column level. The `level=` argument
# of sum() was removed in pandas 2.0; group the transposed frame by the column
# level and transpose back so the rows stay (year, visit).
health_data.T.groupby(level='subject').sum().T

## 补充

### 创建多级索引

In [105]:
# Create two-level indices: rows are name x month, columns are subject x exam.
index = pd.MultiIndex.from_product(
    iterables=[
        ['小明', '小华', '小刚','里斯'],
        ['一月', '二月', '三月', '四月'],
    ],
    names=['name', 'month'],
)

columns = pd.MultiIndex.from_product(
    iterables=[
        ['python', 'math', 'english'],
        ['期中考', '期末考'],
    ],
    names=['subject', '考试'],
)

In [106]:
# Random integer scores in [0, 150]: one row per (name, month) entry and one
# column per (subject, exam) pair.
df = pd.DataFrame(
    data=np.random.randint(0, 151, size=(16, 6)),
    index=index,
    columns=columns,
)
df

Unnamed: 0_level_0,subject,python,python,math,math,english,english
Unnamed: 0_level_1,考试,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
小明,一月,66,84,39,15,62,131
小明,二月,19,148,36,127,85,93
小明,三月,126,45,95,119,11,40
小明,四月,121,74,120,38,86,114
小华,一月,40,9,51,66,88,52
小华,二月,82,54,136,125,18,119
小华,三月,131,51,45,61,22,106
小华,四月,58,7,130,132,128,5
小刚,一月,69,9,35,112,71,61
小刚,二月,93,20,149,34,68,63


In [12]:
# Create a three-level column index: subject x address x exam.
columns = pd.MultiIndex.from_product(
    iterables=[
        ['python', 'math', 'english'],
        ['广东', '上海'],
        ['期中考', '期末考'],
    ],
    names=['subject', 'address', '考试'],
)
# Random integer scores in [0, 150], reusing the (name, month) row index.
df2 = pd.DataFrame(
    data=np.random.randint(0, 151, size=(16, 12)),
    index=index,
    columns=columns,
)
df2

Unnamed: 0_level_0,subject,python,python,python,python,math,math,math,math,english,english,english,english
Unnamed: 0_level_1,address,广东,广东,上海,上海,广东,广东,上海,上海,广东,广东,上海,上海
Unnamed: 0_level_2,考试,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
小明,一月,99,4,52,139,111,10,28,60,5,18,79,114
小明,二月,109,65,6,99,132,0,39,43,26,72,47,56
小明,三月,29,28,144,100,150,94,113,1,16,0,7,89
小明,四月,8,42,73,100,12,75,68,85,56,108,1,15
小华,一月,81,7,118,125,84,145,78,8,128,29,4,94
小华,二月,29,1,112,4,11,21,81,32,61,42,106,98
小华,三月,101,90,45,12,77,46,69,71,121,23,119,117
小华,四月,146,135,125,18,31,84,49,64,21,74,30,17
小刚,一月,28,53,102,5,0,23,14,116,21,73,64,96
小刚,二月,94,49,36,11,20,49,76,147,65,114,97,111


### 索引与切片


In [27]:
# Sort the rows by their index labels; the label-based slicing in the
# following cells is applied to this sorted copy.
df3 = df2.sort_index()
df3

Unnamed: 0_level_0,subject,python,python,python,python,math,math,math,math,english,english,english,english
Unnamed: 0_level_1,address,广东,广东,上海,上海,广东,广东,上海,上海,广东,广东,上海,上海
Unnamed: 0_level_2,考试,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
小刚,一月,28,53,102,5,0,23,14,116,21,73,64,96
小刚,三月,98,147,100,50,100,141,55,102,86,73,146,78
小刚,二月,94,49,36,11,20,49,76,147,65,114,97,111
小刚,四月,43,43,41,113,133,53,125,47,11,23,138,107
小华,一月,81,7,118,125,84,145,78,8,128,29,4,94
小华,三月,101,90,45,12,77,46,69,71,121,23,119,117
小华,二月,29,1,112,4,11,21,81,32,61,42,106,98
小华,四月,146,135,125,18,31,84,49,64,21,74,30,17
小明,一月,99,4,52,139,111,10,28,60,5,18,79,114
小明,三月,29,28,144,100,150,94,113,1,16,0,7,89


In [32]:
# Rows from '小刚' through '小明' on the name level; every column under the
# 'python' subject.
df3.loc['小刚':'小明','python']

Unnamed: 0_level_0,address,广东,广东,上海,上海
Unnamed: 0_level_1,考试,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
小刚,一月,28,53,102,5
小刚,三月,98,147,100,50
小刚,二月,94,49,36,11
小刚,四月,43,43,41,113
小华,一月,81,7,118,125
小华,三月,101,90,45,12
小华,二月,29,1,112,4
小华,四月,146,135,125,18
小明,一月,99,4,52,139
小明,三月,29,28,144,100


In [44]:
# First five rows and the first column, selected by position.
df3.iloc[0:5, 0:1]

Unnamed: 0_level_0,subject,python
Unnamed: 0_level_1,address,广东
Unnamed: 0_level_2,考试,期中考
name,month,Unnamed: 2_level_3
小刚,一月,28
小刚,三月,98
小刚,二月,94
小刚,四月,43
小华,一月,81


In [70]:
# Sort both axes on all of their levels. Rebinding the result instead of using
# inplace=True avoids silently mutating a frame that earlier cells displayed,
# and keeps the cell safe to re-run.
df3 = (
    df3
    .sort_index(axis=0, level=[0, 1])
    .sort_index(axis=1, level=[0, 1, 2])
)
df3.head(5)

Unnamed: 0_level_0,subject,english,english,english,english,math,math,math,math,python,python,python,python
Unnamed: 0_level_1,address,上海,上海,广东,广东,上海,上海,广东,广东,上海,上海,广东,广东
Unnamed: 0_level_2,考试,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
小刚,一月,64,96,21,73,14,116,0,23,102,5,28,53
小刚,三月,146,78,86,73,55,102,100,141,100,50,98,147
小刚,二月,97,111,65,114,76,147,20,49,36,11,94,49
小刚,四月,138,107,11,23,125,47,133,53,41,113,43,43
小华,一月,4,94,128,29,78,8,84,145,118,125,81,7


In [75]:
# Rows: names '小华'..'小明' and months '一月'..'三月'; columns: subjects
# 'english'..'math', address '广东', exam '期中考'.
# The labels are in sorted order rather than calendar order, so the month
# slice returns only 一月 and 三月 (二月 sorts after 三月, as the output shows).
idx = pd.IndexSlice
df3.loc[idx['小华':'小明', '一月':'三月'],
        idx['english':'math','广东','期中考']]

Unnamed: 0_level_0,subject,english,math
Unnamed: 0_level_1,address,广东,广东
Unnamed: 0_level_2,考试,期中考,期中考
name,month,Unnamed: 2_level_3,Unnamed: 3_level_3
小华,一月,128,84
小华,三月,121,77
小明,一月,5,111
小明,三月,16,150


### 聚合操作
`axis`：指定要操作的轴  
`level`：指定要操作的层级，指定哪一个保留哪一个

In [77]:
df3

Unnamed: 0_level_0,subject,english,english,english,english,math,math,math,math,python,python,python,python
Unnamed: 0_level_1,address,上海,上海,广东,广东,上海,上海,广东,广东,上海,上海,广东,广东
Unnamed: 0_level_2,考试,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
小刚,一月,64,96,21,73,14,116,0,23,102,5,28,53
小刚,三月,146,78,86,73,55,102,100,141,100,50,98,147
小刚,二月,97,111,65,114,76,147,20,49,36,11,94,49
小刚,四月,138,107,11,23,125,47,133,53,41,113,43,43
小华,一月,4,94,128,29,78,8,84,145,118,125,81,7
小华,三月,119,117,121,23,69,71,77,46,45,12,101,90
小华,二月,106,98,61,42,81,32,11,21,112,4,29,1
小华,四月,30,17,21,74,49,64,31,84,125,18,146,135
小明,一月,79,114,5,18,28,60,111,10,52,139,99,4
小明,三月,7,89,16,0,113,1,150,94,144,100,29,28


In [81]:
# Mean over all rows: one value per (subject, address, exam) column.
df3.mean(axis=0)

subject  address  考试 
english  上海       期中考    70.6875
                  期末考    88.4375
         广东       期中考    61.6250
                  期末考    68.3750
math     上海       期中考    68.2500
                  期末考    70.8125
         广东       期中考    80.3125
                  期末考    65.1875
python   上海       期中考    74.2500
                  期末考    65.6250
         广东       期中考    72.9375
                  期末考    60.0625
dtype: float64

In [85]:
# Mean over the months for each name (row level 0). The `level=` argument of
# mean() was removed in pandas 2.0; grouping by the index level is equivalent.
df3.groupby(level=0).mean()

subject,english,english,english,english,math,math,math,math,python,python,python,python
address,上海,上海,广东,广东,上海,上海,广东,广东,上海,上海,广东,广东
考试,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考,期中考,期末考
name,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
小刚,111.25,98.0,45.75,70.75,67.5,103.0,63.25,66.5,69.75,44.75,65.75,73.0
小华,64.75,81.5,82.75,42.0,69.25,43.75,50.75,74.0,100.0,39.75,89.25,58.25
小明,33.5,68.5,25.75,49.5,62.0,47.25,101.25,44.75,68.75,109.5,61.25,34.75
里斯,73.25,105.75,92.25,111.25,74.25,89.25,106.0,75.5,58.5,68.5,75.5,74.25


In [89]:
# Mean across columns for each address (column level 1). The `level=` argument
# of mean() was removed in pandas 2.0; group the transposed frame by the column
# level and transpose back so the rows stay (name, month).
df3.T.groupby(level=1).mean().T

Unnamed: 0_level_0,address,上海,广东
name,month,Unnamed: 2_level_1,Unnamed: 3_level_1
小刚,一月,66.166667,33.0
小刚,三月,88.5,107.5
小刚,二月,79.666667,65.166667
小刚,四月,95.166667,51.0
小华,一月,71.166667,79.0
小华,三月,72.166667,76.333333
小华,二月,72.166667,27.5
小华,四月,50.5,81.833333
小明,一月,78.666667,41.166667
小明,三月,75.666667,52.833333


### 堆
+ `stack()`: 列变行
+ `unstack()`: 行变列

In [107]:
df

Unnamed: 0_level_0,subject,python,python,math,math,english,english
Unnamed: 0_level_1,考试,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
小明,一月,66,84,39,15,62,131
小明,二月,19,148,36,127,85,93
小明,三月,126,45,95,119,11,40
小明,四月,121,74,120,38,86,114
小华,一月,40,9,51,66,88,52
小华,二月,82,54,136,125,18,119
小华,三月,131,51,45,61,22,106
小华,四月,58,7,130,132,128,5
小刚,一月,69,9,35,112,71,61
小刚,二月,93,20,149,34,68,63


In [108]:
# Take the first eight rows by position (the 小明 and 小华 entries in the output).
df4 = df.iloc[0:8]
df4

Unnamed: 0_level_0,subject,python,python,math,math,english,english
Unnamed: 0_level_1,考试,期中考,期末考,期中考,期末考,期中考,期末考
name,month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
小明,一月,66,84,39,15,62,131
小明,二月,19,148,36,127,85,93
小明,三月,126,45,95,119,11,40
小明,四月,121,74,120,38,86,114
小华,一月,40,9,51,66,88,52
小华,二月,82,54,136,125,18,119
小华,三月,131,51,45,61,22,106
小华,四月,58,7,130,132,128,5


In [109]:
# With no argument, unstack() moves the innermost row level (month) into the columns.
df4.unstack()

subject,python,python,python,python,python,python,python,python,math,math,math,math,math,english,english,english,english,english,english,english,english
考试,期中考,期中考,期中考,期中考,期末考,期末考,期末考,期末考,期中考,期中考,...,期末考,期末考,期中考,期中考,期中考,期中考,期末考,期末考,期末考,期末考
month,一月,三月,二月,四月,一月,三月,二月,四月,一月,三月,...,二月,四月,一月,三月,二月,四月,一月,三月,二月,四月
name,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
小明,66,126,19,121,84,45,148,74,39,95,...,127,38,62,11,85,86,131,40,93,114
小华,40,131,82,58,9,51,54,7,51,45,...,125,132,88,22,18,128,52,106,119,5


In [116]:
# Move row level 0 (name) into the columns instead; months remain as rows.
df4.unstack(level=0)


subject,python,python,python,python,math,math,math,math,english,english,english,english
考试,期中考,期中考,期末考,期末考,期中考,期中考,期末考,期末考,期中考,期中考,期末考,期末考
name,小明,小华,小明,小华,小明,小华,小明,小华,小明,小华,小明,小华
month,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
一月,66,40,84,9,39,51,15,66,62,88,131,52
三月,126,131,45,51,95,45,119,61,11,22,40,106
二月,19,82,148,54,36,136,127,125,85,18,93,119
四月,121,58,74,7,120,130,38,132,86,128,114,5


In [119]:
# Random integer scores in [0, 150] with flat (single-level) columns and the
# (name, month) row index.
df5 = pd.DataFrame(
    data=np.random.randint(0, 151, size=(16, 2)),
    index=index,
    columns=['python', 'math'],
)
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,python,math
name,month,Unnamed: 2_level_1,Unnamed: 3_level_1
小明,一月,46,104
小明,二月,123,0
小明,三月,99,9
小明,四月,112,127
小华,一月,82,130
小华,二月,133,76
小华,三月,63,134
小华,四月,9,21
小刚,一月,42,95
小刚,二月,3,43


In [130]:
# stack() moves the column labels into a new innermost index level, giving a
# Series indexed by (name, month, subject).
df6 = df5.stack()
df6

name  month        
小明    一月     python     46
             math      104
      二月     python    123
             math        0
      三月     python     99
             math        9
      四月     python    112
             math      127
小华    一月     python     82
             math      130
      二月     python    133
             math       76
      三月     python     63
             math      134
      四月     python      9
             math       21
小刚    一月     python     42
             math       95
      二月     python      3
             math       43
      三月     python     47
             math       83
      四月     python     29
             math       66
里斯    一月     python    125
             math       32
      二月     python     91
             math        8
      三月     python     66
             math       70
      四月     python     17
             math       82
dtype: int32

In [132]:
# unstack() moves the innermost level (the subject labels) back into the columns.
df6.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,python,math
name,month,Unnamed: 2_level_1,Unnamed: 3_level_1
小刚,一月,42,95
小刚,三月,47,83
小刚,二月,3,43
小刚,四月,29,66
小华,一月,82,130
小华,三月,63,134
小华,二月,133,76
小华,四月,9,21
小明,一月,46,104
小明,三月,99,9


In [134]:
# Unstack level 1 (month) instead: months become the columns, and rows are
# indexed by name and subject.
df6.unstack(level=1)

Unnamed: 0_level_0,month,一月,三月,二月,四月
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
小刚,python,42,47,3,29
小刚,math,95,83,43,66
小华,python,82,63,133,9
小华,math,130,134,76,21
小明,python,46,99,123,112
小明,math,104,9,0,127
里斯,python,125,66,91,17
里斯,math,32,70,8,82
