<a href="https://colab.research.google.com/github/JakeOh/202105_itw_bd26/blob/main/lab_da/da12_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Series 단일 계층 인덱스

In [2]:
# Series holding the integers 1..5; no index is passed, so pandas supplies one.
s = pd.Series(np.arange(1, 6))
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# When a Series is created without an explicit index, a RangeIndex is created automatically.
s.index

RangeIndex(start=0, stop=5, step=1)

In [4]:
# Series of 5 standard-normal values labelled 'a'..'e'.
# The generator is seeded (fixed, arbitrary seed) so that Restart & Run All
# reproduces the same numbers instead of new ones on every run.
rng = np.random.default_rng(seed=1)
s = pd.Series(data=rng.standard_normal(5),
              index=['a', 'b', 'c', 'd', 'e'])
s

a    0.934757
b    0.832580
c    0.126667
d   -2.100427
e    0.716984
dtype: float64

In [5]:
# With explicit string labels the index is a plain Index of those labels (see output).
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [6]:
# Index objects expose an nlevels attribute (number of index levels).
s.index.nlevels

1

In [8]:
# The index is what .loc uses to look values up in a Series:
#   s.loc['a']      -> a single value
#   s.loc['a':'c']  -> label slice, a subset of the Series
print(s.loc['a'], s.loc['a':'c'], sep='\n')

0.9347567975208791
a    0.934757
b    0.832580
c    0.126667
dtype: float64


# Series 계층적 인덱스(Hierarchical Index)

* Multi-level index

In [9]:
# Series with a two-level (hierarchical) index: the first list supplies the
# outer labels, the second list the inner labels.
# Values come from a seeded generator (fixed, arbitrary seed) so that
# Restart & Run All reproduces the same numbers.
rng = np.random.default_rng(seed=2)
s = pd.Series(data=rng.standard_normal(6),
              index=[['a', 'a', 'b', 'b', 'c', 'c'],
                     [1, 2, 3, 1, 2, 3]])
s

a  1    0.951530
   2    1.477701
b  3    0.289296
   1   -1.616832
c  2   -0.425163
   3    0.324009
dtype: float64

In [10]:
# Passing two label lists produces a MultiIndex of (outer, inner) tuples (see output).
s.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 3),
            ('b', 1),
            ('c', 2),
            ('c', 3)],
           )

In [11]:
# Number of levels in the MultiIndex.
s.index.nlevels

2

nlevels가 2 이상인 MultiIndex를 사용해서 loc를 사용하는 방법:

* 첫번째 레벨의 인덱스만 가지고 indexing, slicing을 할 수 있음.
* 두번째 레벨의 인덱스만 가지고는 인덱싱을 할 수 없음!
* 튜플 형태의 인덱스로 인덱싱을 할 수 있음.

In [13]:
# Indexing with a first-level label; the result is indexed by the second-level labels (see output).
s.loc['a']

1    0.951530
2    1.477701
dtype: float64

In [16]:
# Slicing with first-level labels; rows for both 'a' and 'b' appear in the output.
s.loc['a':'b']

a  1    0.951530
   2    1.477701
b  3    0.289296
   1   -1.616832
dtype: float64

In [None]:
# A second-level label on its own cannot be used with .loc: the first-level
# labels are 'a', 'b', 'c', so looking up 1 raises KeyError.
# The error is caught and printed so the demonstration does not abort Run All.
try:
    s.loc[1]
except KeyError as err:
    print(f'KeyError: {err}')

In [15]:
s.loc[('a', 1)]  # indexing with a full (first level, second level) tuple works

0.9515304112118564

In [None]:
# Slicing between two full tuples is not possible on this index because its
# second level is not sorted: pandas raises UnsortedIndexError.
# The error is caught and printed so the demonstration does not abort Run All.
try:
    s.loc[('a', 1):('b', 3)]
except pd.errors.UnsortedIndexError as err:
    print(f'UnsortedIndexError: {err}')

`pd.Series.swaplevel()`: 인덱스의 레벨을 바꿔줌.

In [20]:
# Swap the two index levels; the rows keep their original order (see output).
s.swaplevel()

1  a    0.951530
2  a    1.477701
3  b    0.289296
1  b   -1.616832
2  c   -0.425163
3  c    0.324009
dtype: float64

두번째 레벨의 인덱스 값만 `loc`에 전달해서는 indexing과 slicing을 할 수 없으므로, `swaplevel()`로 첫번째와 두번째 레벨 인덱스의 위치를 서로 바꾼 후 첫번째 레벨 인덱스로 indexing과 slicing을 하면 됨.

In [22]:
# After swapping levels, the former second-level label 1 is a first-level label and can be used with .loc.
s.swaplevel().loc[1]

a    0.951530
b   -1.616832
dtype: float64

* `pd.Series.sort_index()`: Series 객체의 index를 정렬.
* `pd.Series.sort_values()`: Series 객체의 values를 정렬.

In [25]:
# Slicing is not possible on an unsorted index, so the swapped index is
# sorted first and only then sliced by label.
(
    s.swaplevel()
     .sort_index()
     .loc[1:2]
)

1  a    0.951530
   b   -1.616832
2  a    1.477701
   c   -0.425163
dtype: float64

# DataFrame의 계층적 인덱스

In [28]:
# 6x3 DataFrame with a two-level row index: day ('Fri'/'Sat'/'Sun') on the
# outer level and meal ('Lunch'/'Dinner') on the inner level.
# Values come from a seeded generator (fixed, arbitrary seed) so that
# Restart & Run All reproduces the same numbers.
rng = np.random.default_rng(seed=3)
df = pd.DataFrame(data=rng.standard_normal((6, 3)),
                  columns=['a', 'b', 'c'],
                  index=[['Fri', 'Fri', 'Sat', 'Sat', 'Sun', 'Sun'],
                         ['Lunch', 'Dinner'] * 3])
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,1.249391,0.714065,1.210146
Fri,Dinner,1.367137,-1.153915,1.134513
Sat,Lunch,-1.573767,1.080544,-0.872395
Sat,Dinner,-0.827423,0.643445,-0.218825
Sun,Lunch,-2.595002,1.287724,1.175542
Sun,Dinner,-0.488819,-0.026829,-0.342128


In [29]:
df.loc['Fri']  # indexing with a first-level index label

Unnamed: 0,a,b,c
Lunch,1.249391,0.714065,1.210146
Dinner,1.367137,-1.153915,1.134513


In [30]:
df.loc['Sat':'Sun']  # slicing with first-level index labels

Unnamed: 0,Unnamed: 1,a,b,c
Sat,Lunch,-1.573767,1.080544,-0.872395
Sat,Dinner,-0.827423,0.643445,-0.218825
Sun,Lunch,-2.595002,1.287724,1.175542
Sun,Dinner,-0.488819,-0.026829,-0.342128


In [31]:
# A full (first level, second level) tuple selects one row, returned as a Series over the columns (see output).
df.loc[('Fri', 'Lunch')]

a    1.249391
b    0.714065
c    1.210146
Name: (Fri, Lunch), dtype: float64

In [33]:
# Select only the 'Lunch' rows: swap the index levels so the meal becomes the first level, then index by it.
df.swaplevel().loc['Lunch']

Unnamed: 0,a,b,c
Fri,1.249391,0.714065,1.210146
Sat,-1.573767,1.080544,-0.872395
Sun,-2.595002,1.287724,1.175542


# DataFrame column <--> row index

* `pd.DataFrame.set_index`
    * DataFrame의 컬럼(들)을 row index로 변환한 DataFrame을 리턴.
* `pd.DataFrame.reset_index`
    * DataFrame의 row index(들)을 컬럼으로 변환한 DataFrame을 리턴.

In [34]:
# Show the frame again as the starting point for the reset_index examples.
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,1.249391,0.714065,1.210146
Fri,Dinner,1.367137,-1.153915,1.134513
Sat,Lunch,-1.573767,1.080544,-0.872395
Sat,Dinner,-0.827423,0.643445,-0.218825
Sun,Lunch,-2.595002,1.287724,1.175542
Sun,Dinner,-0.488819,-0.026829,-0.342128


In [36]:
df.reset_index()
# level=None is the default argument and can be omitted: every index level is converted to a column.

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,1.249391,0.714065,1.210146
1,Fri,Dinner,1.367137,-1.153915,1.134513
2,Sat,Lunch,-1.573767,1.080544,-0.872395
3,Sat,Dinner,-0.827423,0.643445,-0.218825
4,Sun,Lunch,-2.595002,1.287724,1.175542
5,Sun,Dinner,-0.488819,-0.026829,-0.342128


In [38]:
df.reset_index(level=1)  # convert only the level-1 index to a column

Unnamed: 0,level_1,a,b,c
Fri,Lunch,1.249391,0.714065,1.210146
Fri,Dinner,1.367137,-1.153915,1.134513
Sat,Lunch,-1.573767,1.080544,-0.872395
Sat,Dinner,-0.827423,0.643445,-0.218825
Sun,Lunch,-2.595002,1.287724,1.175542
Sun,Dinner,-0.488819,-0.026829,-0.342128


In [39]:
# Convert only the level-0 index to a column (shown as level_0); level 1 remains the row index.
df.reset_index(level=0)

Unnamed: 0,level_0,a,b,c
Lunch,Fri,1.249391,0.714065,1.210146
Dinner,Fri,1.367137,-1.153915,1.134513
Lunch,Sat,-1.573767,1.080544,-0.872395
Dinner,Sat,-0.827423,0.643445,-0.218825
Lunch,Sun,-2.595002,1.287724,1.175542
Dinner,Sun,-0.488819,-0.026829,-0.342128


In [40]:
df.reset_index(level=[0, 1])  # convert both the level-0 and level-1 indexes to columns

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,1.249391,0.714065,1.210146
1,Fri,Dinner,1.367137,-1.153915,1.134513
2,Sat,Lunch,-1.573767,1.080544,-0.872395
3,Sat,Dinner,-0.827423,0.643445,-0.218825
4,Sun,Lunch,-2.595002,1.287724,1.175542
5,Sun,Dinner,-0.488819,-0.026829,-0.342128


In [41]:
# Toy exam table: 10 students (ids 1..10) split into two classes of 5, with
# random integer scores in [0, 100] for three subjects.
# Scores come from a seeded generator (fixed, arbitrary seed) so that
# Restart & Run All reproduces the same table.
rng = np.random.default_rng(seed=4)
exam = pd.DataFrame({
    'class': [1] * 5 + [2] * 5,
    'id': np.arange(1, 11),
    # integers() excludes the upper bound, so 101 yields scores 0..100
    'math': rng.integers(0, 101, size=10),
    'science': rng.integers(0, 101, size=10),
    'history': rng.integers(0, 101, size=10)
})
exam

Unnamed: 0,class,id,math,science,history
0,1,1,35,57,38
1,1,2,3,18,3
2,1,3,83,40,83
3,1,4,45,65,64
4,1,5,53,17,68
5,2,6,86,91,80
6,2,7,42,12,1
7,2,8,84,48,69
8,2,9,73,50,26
9,2,10,20,5,3


In [42]:
# Use the 'class' column as the row index of a new DataFrame.
df_1 = exam.set_index('class')
df_1

Unnamed: 0_level_0,id,math,science,history
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,35,57,38
1,2,3,18,3
1,3,83,40,83
1,4,45,65,64
1,5,53,17,68
2,6,86,91,80
2,7,42,12,1
2,8,84,48,69
2,9,73,50,26
2,10,20,5,3


In [43]:
# Turn the 'class' index back into an ordinary column (see output).
df_1.reset_index()

Unnamed: 0,class,id,math,science,history
0,1,1,35,57,38
1,1,2,3,18,3
2,1,3,83,40,83
3,1,4,45,65,64
4,1,5,53,17,68
5,2,6,86,91,80
6,2,7,42,12,1
7,2,8,84,48,69
8,2,9,73,50,26
9,2,10,20,5,3


In [46]:
exam[exam['class'] == 1]  # boolean indexing: keep the rows whose 'class' column equals 1

Unnamed: 0,class,id,math,science,history
0,1,1,35,57,38
1,1,2,3,18,3
2,1,3,83,40,83
3,1,4,45,65,64
4,1,5,53,17,68


In [48]:
df_1.loc[1]  # lookup through the loc attribute, using the 'class' index

Unnamed: 0_level_0,id,math,science,history
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,35,57,38
1,2,3,18,3
1,3,83,40,83
1,4,45,65,64
1,5,53,17,68


In [50]:
# Use the 'class' and 'id' columns together as a two-level row index.
df_2 = exam.set_index(['class', 'id'])
df_2

Unnamed: 0_level_0,Unnamed: 1_level_0,math,science,history
class,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,35,57,38
1,2,3,18,3
1,3,83,40,83
1,4,45,65,64
1,5,53,17,68
2,6,86,91,80
2,7,42,12,1
2,8,84,48,69
2,9,73,50,26
2,10,20,5,3


In [51]:
# Convert only index level 0 ('class') to a column; 'id' remains the row index (see output).
df_2.reset_index(level=0)

Unnamed: 0_level_0,class,math,science,history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,35,57,38
2,1,3,18,3
3,1,83,40,83
4,1,45,65,64
5,1,53,17,68
6,2,86,91,80
7,2,42,12,1
8,2,84,48,69
9,2,73,50,26
10,2,20,5,3


In [52]:
df_2.reset_index(level='class')
# When an index level has a name, that name can be passed as the argument to reset_index.

Unnamed: 0_level_0,class,math,science,history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,35,57,38
2,1,3,18,3
3,1,83,40,83
4,1,45,65,64
5,1,53,17,68
6,2,86,91,80
7,2,42,12,1
8,2,84,48,69
9,2,73,50,26
10,2,20,5,3


In [53]:
# Convert both named index levels to columns (see output).
df_2.reset_index(level=['class', 'id'])

Unnamed: 0,class,id,math,science,history
0,1,1,35,57,38
1,1,2,3,18,3
2,1,3,83,40,83
3,1,4,45,65,64
4,1,5,53,17,68
5,2,6,86,91,80
6,2,7,42,12,1
7,2,8,84,48,69
8,2,9,73,50,26
9,2,10,20,5,3
