### 层次化索引

In [2]:
from pandas import Series, DataFrame
import numpy as np
import pandas as pd

In [4]:
data = Series(np.random.randn(10),
             index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                   [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

In [5]:
data

a  1    0.831371
   2    0.190507
   3    0.453282
b  1   -0.174731
   2    0.523766
   3    0.530994
c  1   -0.310161
   2    0.932921
d  2    0.181883
   3   -0.419640
dtype: float64

In [6]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [7]:
data['b']

1   -0.174731
2    0.523766
3    0.530994
dtype: float64

In [8]:
data['b': 'c']

b  1   -0.174731
   2    0.523766
   3    0.530994
c  1   -0.310161
   2    0.932921
dtype: float64

In [10]:
data.ix[['b', 'd']]

b  1   -0.174731
   2    0.523766
   3    0.530994
d  2    0.181883
   3   -0.419640
dtype: float64

In [11]:
data[:, 2]

a    0.190507
b    0.523766
c    0.932921
d    0.181883
dtype: float64

In [13]:
data.unstack()

Unnamed: 0,1,2,3
a,0.831371,0.190507,0.453282
b,-0.174731,0.523766,0.530994
c,-0.310161,0.932921,
d,,0.181883,-0.41964


In [15]:
data.unstack().stack()

a  1    0.831371
   2    0.190507
   3    0.453282
b  1   -0.174731
   2    0.523766
   3    0.530994
c  1   -0.310161
   2    0.932921
d  2    0.181883
   3   -0.419640
dtype: float64

In [18]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                    columns=[['Ohio', 'Ohio', 'Colorado'],
                            ['Green', 'Red', 'Green']])

In [19]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [20]:
frame.index.names = ['key1', 'key2']

In [21]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [22]:
frame.columns.names = ['state', 'color']

In [23]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [24]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [28]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],['Green', 'Red', 'Green']],
                      names=['state', 'color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

#### 重排分级顺序

In [31]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [29]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [30]:
frame.sortlevel(1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [32]:
frame.swaplevel(0, 1).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


#### 根据级别汇总统计

In [33]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [34]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


#### 使用 DataFrame 的列

In [35]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                  'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                  'd': [0, 1, 2, 0, 1, 2, 3]})

In [36]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [37]:
frame2 = frame.set_index(['c', 'd'])

In [38]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [39]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [40]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [41]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [45]:
import tushare as ts
data = ts.get_k_data('000001', start='2017-01-01', end='2017-06-30')[['date', 'close', 'volume']]

In [46]:
data

Unnamed: 0,date,close,volume
0,2017-01-03,9.16,459840.0
1,2017-01-04,9.16,449329.0
2,2017-01-05,9.17,344372.0
3,2017-01-06,9.13,358154.0
4,2017-01-09,9.15,361081.0
5,2017-01-10,9.15,241053.0
6,2017-01-11,9.14,303430.0
7,2017-01-12,9.15,428006.0
8,2017-01-13,9.16,434301.0
9,2017-01-16,9.14,683165.0


In [47]:
data.set_index(['date'])

Unnamed: 0_level_0,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-03,9.16,459840.0
2017-01-04,9.16,449329.0
2017-01-05,9.17,344372.0
2017-01-06,9.13,358154.0
2017-01-09,9.15,361081.0
2017-01-10,9.15,241053.0
2017-01-11,9.14,303430.0
2017-01-12,9.15,428006.0
2017-01-13,9.16,434301.0
2017-01-16,9.14,683165.0


In [48]:
data

Unnamed: 0,date,close,volume
0,2017-01-03,9.16,459840.0
1,2017-01-04,9.16,449329.0
2,2017-01-05,9.17,344372.0
3,2017-01-06,9.13,358154.0
4,2017-01-09,9.15,361081.0
5,2017-01-10,9.15,241053.0
6,2017-01-11,9.14,303430.0
7,2017-01-12,9.15,428006.0
8,2017-01-13,9.16,434301.0
9,2017-01-16,9.14,683165.0


## 其他有关 pandas 的话题

### 整数索引

In [49]:
ser = Series(np.arange(3.))

In [50]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [51]:
ser[-1]

KeyError: -1

In [52]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])

In [53]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [54]:
ser2[-1]

2.0

In [55]:
ser.ix[:1]

0    0.0
1    1.0
dtype: float64

In [56]:
ser3 = Series(range(3), index=[-5, 1, 3])

In [58]:
ser3

-5    0
 1    1
 3    2
dtype: int64

In [81]:
ser3.iloc[2]

2

In [66]:
help(Series.iloc)

Help on property:

    Purely integer-location based indexing for selection by position.
    
    ``.iloc[]`` is primarily integer position based (from ``0`` to
    ``length-1`` of the axis), but may also be used with a boolean
    array.
    
    Allowed inputs are:
    
    - An integer, e.g. ``5``.
    - A list or array of integers, e.g. ``[4, 3, 0]``.
    - A slice object with ints, e.g. ``1:7``.
    - A boolean array.
    - A ``callable`` function with one argument (the calling Series, DataFrame
      or Panel) and that returns valid output for indexing (one of the above)
    
    ``.iloc`` will raise ``IndexError`` if a requested indexer is
    out-of-bounds, except *slice* indexers which allow out-of-bounds
    indexing (this conforms with python/numpy *slice* semantics).
    
    See more at :ref:`Selection by Position <indexing.integer>`



In [71]:
frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])

In [72]:
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


In [75]:
frame.iloc[0]

0    0
1    1
Name: 2, dtype: int64

#### 面板数据

In [90]:
pdata = pd.Panel(dict((stk, ts.get_k_data(stk, start='2017-01-01', end='2017-06-30')[['date', 'close', 'volume']].set_index(['date'])
                     for stk in ['000001', '000002', '000004', '000005']))

SyntaxError: invalid syntax (<ipython-input-90-0d54712c78ec>, line 2)

In [86]:
pdata

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 119 (major_axis) x 3 (minor_axis)
Items axis: 000001 to 000005
Major_axis axis: 0 to 118
Minor_axis axis: date to volume

In [87]:
pdata = pdata.swapaxes('items', 'minor')

In [92]:
pdata['close']

Unnamed: 0,000001,000002,000004,000005
0,9.16,20.73,44.45,6.83
1,9.16,20.85,44.7,7.01
2,9.17,20.93,44.44,6.97
3,9.13,20.64,43.96,7.03
4,9.15,20.66,43.01,7.07
5,9.15,20.58,43.25,7.01
6,9.14,20.4,42.45,6.86
7,9.15,21.81,42.05,6.79
8,9.16,21,41,6.68
9,9.14,20.8,38.26,6.44


In [93]:
pdata[:, 10, :]

Unnamed: 0,date,close,volume
1,2017-01-17,9.15,545552
2,2017-01-18,20.92,324772
4,2017-01-17,37.37,21613
5,2017-01-17,6.48,53327


In [99]:
pdata.ix['close', 115:, :]

Unnamed: 0,000001,000002,000004,000005
115,9.36,,,5.28
116,9.43,,,5.23
117,9.43,,,5.25
118,9.39,,,5.24


In [97]:
stacked = pdata.ix[:, 115:, :].to_frame()

In [98]:
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,date,close,volume
major,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
115,1,2017-06-27,9.36,546016.0
115,5,2017-06-27,5.28,42828.0
116,1,2017-06-28,9.43,1168800.0
116,5,2017-06-28,5.23,39284.0
117,1,2017-06-29,9.43,488804.0
117,5,2017-06-29,5.25,29376.0
118,1,2017-06-30,9.39,499633.0
118,5,2017-06-30,5.24,25352.0
