<a href="https://colab.research.google.com/github/JakeOh/202007_itw_bd18/blob/master/lab_python/python55_multi_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# 단일 계층 인덱스

In [72]:
s = pd.Series(data=np.random.randn(6),
              index=['a', 'b', 'c'] * 2)
s

a    0.879414
b   -0.695447
c   -0.901145
a   -0.419428
b    0.496285
c   -0.889477
dtype: float64

In [3]:
s.index  # pandas.Series의 index 속성

Index(['a', 'b', 'c', 'a', 'b', 'c'], dtype='object')

In [4]:
s.index.nlevels  # 인덱스의 계층 개수

1

In [73]:
s

a    0.879414
b   -0.695447
c   -0.901145
a   -0.419428
b    0.496285
c   -0.889477
dtype: float64

In [74]:
s.loc['a']

a    0.879414
a   -0.419428
dtype: float64

In [78]:
s = pd.Series(data=np.arange(11, 17),
              index=['a', 'b', 'c', 'd', 'e', 'f'])
s

a    11
b    12
c    13
d    14
e    15
f    16
dtype: int64

In [79]:
s.loc['b':'d']  # slicing

b    12
c    13
d    14
dtype: int64

# 계층적 색인(hierarchical indexing)
여러개의 level을 갖는 인덱스

In [80]:
s = pd.Series(data=np.random.randn(6),
              index=[['a', 'a', 'b', 'b', 'c', 'c'],
                     [1, 2, 3, 1, 2, 3]])
s

a  1   -0.240720
   2    0.196588
b  3    1.262857
   1    0.901552
c  2    1.978623
   3    0.417179
dtype: float64

In [6]:
s.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 3),
            ('b', 1),
            ('c', 2),
            ('c', 3)],
           )

In [7]:
s.index.nlevels

2

In [9]:
# iloc을 사용한 인덱싱
s.iloc[1]

-0.7758430666902969

In [11]:
# loc을 사용한 인덱싱
s.loc['b']

3   -2.627836
1   -2.116532
dtype: float64

In [46]:
# s.loc[1]  # 두번째 레벨의 인덱스만으로는 인덱싱되지 않음

In [17]:
s.loc[('a', 2)]

-0.7758430666902969

In [82]:
# slicing
s.loc['b':'c']

b  3    1.262857
   1    0.901552
c  2    1.978623
   3    0.417179
dtype: float64

In [83]:
# multi-index(계층적 색인)에서 계층(level)의 순서를 변경.
s.swaplevel()

1  a   -0.240720
2  a    0.196588
3  b    1.262857
1  b    0.901552
2  c    1.978623
3  c    0.417179
dtype: float64

In [85]:
s.swaplevel().loc[1]

a   -0.240720
b    0.901552
dtype: float64

In [88]:
s.swaplevel().sort_index().loc[1:2]

1  a   -0.240720
   b    0.901552
2  a    0.196588
   c    1.978623
dtype: float64

# DataFrame에서 multi-index

In [96]:
# row에 multi-index가 있는 경우
df = pd.DataFrame(data=np.arange(18).reshape((6, 3)),
                  columns=['a', 'b', 'c'],
                  index=[['A', 'A', 'B', 'B', 'C', 'C'],
                         ['a1', 'a2', 'a1', 'a2', 'a1', 'a2']])
df

Unnamed: 0,Unnamed: 1,a,b,c
A,a1,0,1,2
A,a2,3,4,5
B,a1,6,7,8
B,a2,9,10,11
C,a1,12,13,14
C,a2,15,16,17


In [97]:
df.index

MultiIndex([('A', 'a1'),
            ('A', 'a2'),
            ('B', 'a1'),
            ('B', 'a2'),
            ('C', 'a1'),
            ('C', 'a2')],
           )

In [98]:
df.index.nlevels

2

In [99]:
df.loc['B']  # 첫번째 level의 인덱스로 row 선택.

Unnamed: 0,a,b,c
a1,6,7,8
a2,9,10,11


In [100]:
df.loc['A':'B']  # 첫번째 level의 인덱스로 slicing.

Unnamed: 0,Unnamed: 1,a,b,c
A,a1,0,1,2
A,a2,3,4,5
B,a1,6,7,8
B,a2,9,10,11


In [104]:
# df.loc['a1']  # KeyError
df.swaplevel().loc['a1']

Unnamed: 0,a,b,c
A,0,1,2
B,6,7,8
C,12,13,14


In [112]:
# df.loc['a1':'a2']
# df.swaplevel().loc['a1':'a2']
df.swaplevel().sort_index().loc['a1':'a2']

Unnamed: 0,Unnamed: 1,a,b,c
a1,A,0,1,2
a1,B,6,7,8
a1,C,12,13,14
a2,A,3,4,5
a2,B,9,10,11
a2,C,15,16,17


In [113]:
# row와 column 모두 multi-index를 갖는 경우
df = pd.DataFrame(data=np.arange(18).reshape((6, 3)),
                  columns=[['a', 'a', 'b'], 
                           ['b1', 'b2', 'b1']],
                  index=[['A', 'A', 'B', 'B', 'C', 'C'],
                         ['a1', 'a2', 'a1', 'a2', 'a1', 'a2']])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b1,b2,b1
A,a1,0,1,2
A,a2,3,4,5
B,a1,6,7,8
B,a2,9,10,11
C,a1,12,13,14
C,a2,15,16,17


In [114]:
df.columns

MultiIndex([('a', 'b1'),
            ('a', 'b2'),
            ('b', 'b1')],
           )

In [116]:
df.columns.nlevels

2

데이터 프레임의 특정 컬럼을 인덱스로 만들기

In [117]:
df = pd.DataFrame(data={'class': [1, 1, 1, 2, 2, 2],
                        'stu_id': [1, 2, 3, 1, 2, 3],
                        'kor': np.random.randint(0, 101, size=6),
                        'eng': np.random.randint(0, 101, size=6)})
df

Unnamed: 0,class,stu_id,kor,eng
0,1,1,97,48
1,1,2,98,0
2,1,3,94,74
3,2,1,8,2
4,2,2,94,50
5,2,3,74,98


In [119]:
df.shape  #> (6, 4)

(6, 4)

In [120]:
df2 = df.set_index(keys='class')
df2

Unnamed: 0_level_0,stu_id,kor,eng
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,97,48
1,2,98,0
1,3,94,74
2,1,8,2
2,2,94,50
2,3,74,98


In [121]:
df2.shape  #> (6, 3)

(6, 3)

In [122]:
df3 = df.set_index(keys=['class', 'stu_id'])
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,kor,eng
class,stu_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,97,48
1,2,98,0
1,3,94,74
2,1,8,2
2,2,94,50
2,3,74,98


In [123]:
df3.shape  #> (6, 2)

(6, 2)

In [146]:
df3.loc[1]  # 0번째 level의 인덱스로 row 선택.

Unnamed: 0_level_0,kor,eng
stu_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,97,48
2,98,0
3,94,74


In [147]:
df3.loc[1:2]  # 0번째 level의 인덱스로 slicing.

Unnamed: 0_level_0,Unnamed: 1_level_0,kor,eng
class,stu_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,97,48
1,2,98,0
1,3,94,74
2,1,8,2
2,2,94,50
2,3,74,98


In [149]:
df3.swaplevel().sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,kor,eng
stu_id,class,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,97,48
1,2,8,2
2,1,98,0
2,2,94,50
3,1,94,74
3,2,74,98


In [125]:
df2.reset_index()  # 인덱스를 컬럼으로 만듦.

Unnamed: 0,class,stu_id,kor,eng
0,1,1,97,48
1,1,2,98,0
2,1,3,94,74
3,2,1,8,2
4,2,2,94,50
5,2,3,74,98


In [127]:
df3.reset_index()

Unnamed: 0,class,stu_id,kor,eng
0,1,1,97,48
1,1,2,98,0
2,1,3,94,74
3,2,1,8,2
4,2,2,94,50
5,2,3,74,98


In [130]:
# multi-index인 경우에는 특정 레벨의 인덱스만 컬럼으로 전환.
df3.reset_index(level=1)  # level: 0, 1, 2, ...

Unnamed: 0_level_0,class,kor,eng
stu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,97,48
2,1,98,0
3,1,94,74
1,2,8,2
2,2,94,50
3,2,74,98


In [134]:
df3.reset_index(level='stu_id')

Unnamed: 0_level_0,stu_id,kor,eng
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,97,48
1,2,98,0
1,3,94,74
2,1,8,2
2,2,94,50
2,3,74,98


In [136]:
df3.reset_index(level=['class', 'stu_id'])

Unnamed: 0,class,stu_id,kor,eng
0,1,1,97,48
1,1,2,98,0
2,1,3,94,74
3,2,1,8,2
4,2,2,94,50
5,2,3,74,98


https://github.com/pandas-dev/pandas/tree/master/doc/data/ 의 데이터 파일들 중에서 air_quality_no2.csv 이용

In [139]:
no2_data = 'https://github.com/pandas-dev/pandas/raw/master/doc/data/air_quality_no2.csv'

# no2_df = pd.read_csv(no2_data)
# no2_df.set_index(keys='datetime')

# set_index를 read_csv를 하면서 동시에 수행.
no2_df = pd.read_csv(no2_data, index_col='datetime')
no2_df

Unnamed: 0_level_0,station_antwerp,station_paris,station_london
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-07 02:00:00,,,23.0
2019-05-07 03:00:00,50.5,25.0,19.0
2019-05-07 04:00:00,45.0,27.7,19.0
2019-05-07 05:00:00,,50.4,16.0
2019-05-07 06:00:00,,61.9,
...,...,...,...
2019-06-20 22:00:00,,21.4,
2019-06-20 23:00:00,,24.9,
2019-06-21 00:00:00,,26.5,
2019-06-21 01:00:00,,21.8,
