<a href="https://colab.research.google.com/github/JakeOh/202505_BD50/blob/main/lab_da/da11_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Series 인덱스

## 단일 계층 인덱스

In [89]:
# No `index` argument is given, so pandas attaches a default RangeIndex.
s = pd.Series(np.random.rand(5))
s

Unnamed: 0,0
0,0.405884
1,0.225382
2,0.827754
3,0.737158
4,0.413404


In [90]:
s.index  # the index: row labels of the Series

RangeIndex(start=0, stop=5, step=1)

In [91]:
s.values  # the values, as an np.ndarray

array([0.40588447, 0.22538171, 0.82775425, 0.73715818, 0.41340447])

In [92]:
# The labels passed as `index` become the Index object of the Series.
s = pd.Series(np.random.rand(5), index=list('abcde'))
s

Unnamed: 0,0
a,0.41979
b,0.805269
c,0.91727
d,0.938694
e,0.133039


In [93]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [94]:
s.index.nlevels  # Index.nlevels property: the number of index levels.

1

## 계층적 인덱스(hierarchical index), Multi-level index

In [95]:
# Passing one list of labels per level as `index` gives the Series a
# two-level (hierarchical) index.
s = pd.Series(
    np.random.randn(6),
    index=[
        list('mmffuu'),
        [1, 2, 3] * 2,
    ],
)
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.348652
m,2,0.556732
f,3,0.497226
f,1,-0.887355
u,2,-1.465051
u,3,0.452824


In [96]:
s.values

array([-0.34865221,  0.55673197,  0.49722573, -0.88735548, -1.46505097,
        0.45282355])

In [97]:
s.index  #> MultiIndex: an array whose elements are tuples.

MultiIndex([('m', 1),
            ('m', 2),
            ('f', 3),
            ('f', 1),
            ('u', 2),
            ('u', 3)],
           )

In [98]:
s.index.nlevels

2

## Indexing, Slicing

nlevels(인덱스의 계층 개수)가 2 이상인 MultiIndex를 사용해서 loc 속성을 이용할 때,
*   첫번째 레벨의 인덱스만 가지고 indexing, slicing을 할 수 있음.
*   두번째 이상 레벨의 인덱스만 가지고는 indexing, slicing을 할 수 없음!
*   튜플 형태의 인덱스로 indexing, slicing이 가능.
*   단, slicing은 인덱스가 정렬되어 있어야 가능함. 정렬되지 않은 인덱스에서 slicing을 하면 UnsortedIndexError가 발생.

In [99]:
# Indexing with a first-level label returns the entries under that label.
s.loc['m']  # shorthand alternative: s['m']

Unnamed: 0,0
1,-0.348652
2,0.556732


In [100]:
# Slicing by first-level labels needs a sorted MultiIndex. `s` is still in its
# original (unsorted) order here, so the slice raises UnsortedIndexError.
# The error is the point of this cell, so catch it and display the message
# rather than leaving the statement commented out.
try:
    s.loc['m':'f']
except pd.errors.UnsortedIndexError as err:
    print(f'{type(err).__name__}: {err}')

In [101]:
# A second-level label on its own is not a valid key: `1` is looked up in the
# first level ('m', 'f', 'u'), is not found, and KeyError is raised.
# Show the error instead of hiding the failing statement in a comment.
try:
    s.loc[1]
except KeyError as err:
    print(f'{type(err).__name__}: {err}')

s.loc[('m', 1)]  # a tuple key (one label per level) works

np.float64(-0.34865221049124395)

In [102]:
s.loc[[('m', 1), ('f', 1)]]

Unnamed: 0,Unnamed: 1,0
m,1,-0.348652
f,1,-0.887355


In [103]:
# Slicing between tuple keys also needs a sorted MultiIndex; on the unsorted
# `s` it raises UnsortedIndexError. Catch it and display the message so the
# demonstration actually runs.
try:
    s.loc[('m', 1):('f', 3)]
except pd.errors.UnsortedIndexError as err:
    print(f'{type(err).__name__}: {err}')

*   `pd.Series.sort_values()`: 값들을 정렬
*   `pd.Series.sort_index()`: 인덱스들을 정렬.

In [104]:
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.348652
m,2,0.556732
f,3,0.497226
f,1,-0.887355
u,2,-1.465051
u,3,0.452824


In [105]:
# sort_index() returns a new Series ordered by its index labels; `s` itself is
# left as it was.
s_idx_sort = s.sort_index()
s_idx_sort

Unnamed: 0,Unnamed: 1,0
f,1,-0.887355
f,3,0.497226
m,1,-0.348652
m,2,0.556732
u,2,-1.465051
u,3,0.452824


In [106]:
# Once the index is sorted, slicing by first-level labels works. Use .loc so the
# slice is explicitly label-based, like the other lookups in this notebook.
s_idx_sort.loc['f':'m']

Unnamed: 0,Unnamed: 1,0
f,1,-0.887355
f,3,0.497226
m,1,-0.348652
m,2,0.556732


In [107]:
# Tuple keys can be used as slice bounds on the sorted index; both ends are
# included. Use .loc so the slice is explicitly label-based.
s_idx_sort.loc[('f', 3):('m', 2)]

Unnamed: 0,Unnamed: 1,0
f,3,0.497226
m,1,-0.348652
m,2,0.556732


## index swapping
인덱스 레벨들의 위치(순서)를 서로 맞바꾸는 것.

In [108]:
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.348652
m,2,0.556732
f,3,0.497226
f,1,-0.887355
u,2,-1.465051
u,3,0.452824


In [109]:
# Explicit form of the same swap for a two-level index: s.swaplevel(i=0, j=1)
# Swap the two index levels, then sort so the result can be sliced by label.
s_swap = s.swaplevel().sort_index()
s_swap

Unnamed: 0,Unnamed: 1,0
1,f,-0.887355
1,m,-0.348652
2,m,0.556732
2,u,-1.465051
3,f,0.497226
3,u,0.452824


In [110]:
s_swap.loc[1]

Unnamed: 0,0
f,-0.887355
m,-0.348652


In [111]:
s_swap.loc[1:2]

Unnamed: 0,Unnamed: 1,0
1,f,-0.887355
1,m,-0.348652
2,m,0.556732
2,u,-1.465051


In [112]:
# A Series whose index has three levels (nlevels=3): one array of labels per level.
s = pd.Series(
    np.random.rand(6),
    index=[
        np.arange(1, 7),
        list('AAABBB'),
        ['aa', 'bb'] * 3,
    ],
)
s

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,A,aa,0.440891
2,A,bb,0.803977
3,A,aa,0.875274
4,B,bb,0.748408
5,B,aa,0.157381
6,B,bb,0.825842


In [113]:
s.index

MultiIndex([(1, 'A', 'aa'),
            (2, 'A', 'bb'),
            (3, 'A', 'aa'),
            (4, 'B', 'bb'),
            (5, 'B', 'aa'),
            (6, 'B', 'bb')],
           )

In [114]:
s.index.nlevels

3

In [115]:
s.swaplevel()  # defaults i=-2, j=-1: swaps the last level with the second-to-last level.

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,aa,A,0.440891
2,bb,A,0.803977
3,aa,A,0.875274
4,bb,B,0.748408
5,aa,B,0.157381
6,bb,B,0.825842


In [116]:
s.swaplevel(i=0, j=1)  # swap the first (0) and second (1) levels; the third level stays in place.

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
A,1,aa,0.440891
A,2,bb,0.803977
A,3,aa,0.875274
B,4,bb,0.748408
B,5,aa,0.157381
B,6,bb,0.825842


# DataFrame 계층적 인덱스

In [117]:
# A 6x3 DataFrame of random values whose row index has two levels.
df = pd.DataFrame(
    np.random.rand(6, 3),
    index=[
        ['Fri'] * 2 + ['Sat'] * 2 + ['Sun'] * 2,
        ['Lunch', 'Dinner', 'Lunch', 'Dinner', 'Lunch', 'Dinner'],
    ],
    columns=list('abc'),
)
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.957121,0.928777,0.607646
Fri,Dinner,0.525212,0.859363,0.256992
Sat,Lunch,0.078604,0.311936,0.171406
Sat,Dinner,0.666166,0.200506,0.621618
Sun,Lunch,0.450406,0.88406,0.816396
Sun,Dinner,0.438943,0.013823,0.666989


In [118]:
df.values  #> 2-D ndarray made of the DataFrame's values

array([[0.95712106, 0.92877655, 0.60764607],
       [0.52521224, 0.85936294, 0.25699197],
       [0.07860371, 0.31193648, 0.17140606],
       [0.66616627, 0.20050601, 0.62161799],
       [0.45040607, 0.88406042, 0.81639583],
       [0.43894333, 0.01382344, 0.66698899]])

In [119]:
df.index  #> the row index is a MultiIndex

MultiIndex([('Fri',  'Lunch'),
            ('Fri', 'Dinner'),
            ('Sat',  'Lunch'),
            ('Sat', 'Dinner'),
            ('Sun',  'Lunch'),
            ('Sun', 'Dinner')],
           )

In [120]:
df.index.nlevels

2

In [121]:
df.loc['Fri']  #> indexing with a first-level label

Unnamed: 0,a,b,c
Lunch,0.957121,0.928777,0.607646
Dinner,0.525212,0.859363,0.256992


In [144]:
df.loc[[('Fri', 'Lunch'), ('Sat', 'Lunch')]]

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.957121,0.928777,0.607646
Sat,Lunch,0.078604,0.311936,0.171406


In [122]:
df.loc['Fri':'Sat']  #> slicing with first-level labels (both ends included)

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.957121,0.928777,0.607646
Fri,Dinner,0.525212,0.859363,0.256992
Sat,Lunch,0.078604,0.311936,0.171406
Sat,Dinner,0.666166,0.200506,0.621618


In [123]:
# A second-level label cannot be used on its own for indexing: 'Lunch' is looked
# up in the first level ('Fri', 'Sat', 'Sun') and KeyError is raised.
# Catch and display the error so the demonstration actually runs.
try:
    df.loc['Lunch']
except KeyError as err:
    print(f'{type(err).__name__}: {err}')

In [124]:
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.957121,0.928777,0.607646
Fri,Dinner,0.525212,0.859363,0.256992
Sat,Lunch,0.078604,0.311936,0.171406
Sat,Dinner,0.666166,0.200506,0.621618
Sun,Lunch,0.450406,0.88406,0.816396
Sun,Dinner,0.438943,0.013823,0.666989


In [125]:
# Swap the two row-index levels so that Lunch/Dinner becomes the first level
# and can be used directly with .loc.
df_swap = df.swaplevel()
df_swap

Unnamed: 0,Unnamed: 1,a,b,c
Lunch,Fri,0.957121,0.928777,0.607646
Dinner,Fri,0.525212,0.859363,0.256992
Lunch,Sat,0.078604,0.311936,0.171406
Dinner,Sat,0.666166,0.200506,0.621618
Lunch,Sun,0.450406,0.88406,0.816396
Dinner,Sun,0.438943,0.013823,0.666989


In [126]:
df_swap.loc['Lunch']

Unnamed: 0,a,b,c
Fri,0.957121,0.928777,0.607646
Sat,0.078604,0.311936,0.171406
Sun,0.450406,0.88406,0.816396


# DataFrame 컬럼 <--> Row 레이블

*   `pd.DataFrame.set_index()`: 데이터프레임의 컬럼(들)을 인덱스(row 레이블)로 변환한 데이터프레임을 리턴.
*   `pd.DataFrame.reset_index()`: 데이터프레임의 인덱스(들)를 컬럼으로 변환한 데이터프레임을 리턴.

## `reset_index`

In [127]:
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.957121,0.928777,0.607646
Fri,Dinner,0.525212,0.859363,0.256992
Sat,Lunch,0.078604,0.311936,0.171406
Sat,Dinner,0.666166,0.200506,0.621618
Sun,Lunch,0.450406,0.88406,0.816396
Sun,Dinner,0.438943,0.013823,0.666989


In [128]:
df.reset_index()  #> default level=None: every index level is converted to a column.

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.957121,0.928777,0.607646
1,Fri,Dinner,0.525212,0.859363,0.256992
2,Sat,Lunch,0.078604,0.311936,0.171406
3,Sat,Dinner,0.666166,0.200506,0.621618
4,Sun,Lunch,0.450406,0.88406,0.816396
5,Sun,Dinner,0.438943,0.013823,0.666989


In [129]:
df.reset_index(names=['day', 'time'])  #> set the column names used for the converted index levels.

Unnamed: 0,day,time,a,b,c
0,Fri,Lunch,0.957121,0.928777,0.607646
1,Fri,Dinner,0.525212,0.859363,0.256992
2,Sat,Lunch,0.078604,0.311936,0.171406
3,Sat,Dinner,0.666166,0.200506,0.621618
4,Sun,Lunch,0.450406,0.88406,0.816396
5,Sun,Dinner,0.438943,0.013823,0.666989


In [130]:
df.reset_index(level=0)  # convert only index level 0 to a column.

Unnamed: 0,level_0,a,b,c
Lunch,Fri,0.957121,0.928777,0.607646
Dinner,Fri,0.525212,0.859363,0.256992
Lunch,Sat,0.078604,0.311936,0.171406
Dinner,Sat,0.666166,0.200506,0.621618
Lunch,Sun,0.450406,0.88406,0.816396
Dinner,Sun,0.438943,0.013823,0.666989


In [131]:
df.reset_index(level=1)  # convert only index level 1 to a column.

Unnamed: 0,level_1,a,b,c
Fri,Lunch,0.957121,0.928777,0.607646
Fri,Dinner,0.525212,0.859363,0.256992
Sat,Lunch,0.078604,0.311936,0.171406
Sat,Dinner,0.666166,0.200506,0.621618
Sun,Lunch,0.450406,0.88406,0.816396
Sun,Dinner,0.438943,0.013823,0.666989


In [132]:
df.reset_index(level=[0, 1])

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.957121,0.928777,0.607646
1,Fri,Dinner,0.525212,0.859363,0.256992
2,Sat,Lunch,0.078604,0.311936,0.171406
3,Sat,Dinner,0.666166,0.200506,0.621618
4,Sun,Lunch,0.450406,0.88406,0.816396
5,Sun,Dinner,0.438943,0.013823,0.666989


## `set_index`

In [133]:
# Sample exam table with 10 rows: class 1 for the first five rows and class 2
# for the rest, ids 1-10, and random integer scores in [0, 100] per subject.
exam = pd.DataFrame({
    'class': [1] * 5 + [2] * 5,
    'id': np.arange(1, 11),
    'math': np.random.randint(0, 101, size=10),
    'science': np.random.randint(0, 101, size=10),
})
exam

Unnamed: 0,class,id,math,science
0,1,1,2,60
1,1,2,87,44
2,1,3,18,53
3,1,4,52,54
4,1,5,39,63
5,2,6,56,23
6,2,7,0,75
7,2,8,1,69
8,2,9,14,53
9,2,10,20,79


In [134]:
# Rows where class == 1, selected with a boolean mask.
exam.loc[exam['class'].eq(1)]

Unnamed: 0,class,id,math,science
0,1,1,2,60
1,1,2,87,44
2,1,3,18,53
3,1,4,52,54
4,1,5,39,63


In [135]:
# set_index returns a new DataFrame in which the 'class' column has become the
# row index (the labels are not unique: five rows per class).
exam_class = exam.set_index(keys='class')
exam_class

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2,60
1,2,87,44
1,3,18,53
1,4,52,54
1,5,39,63
2,6,56,23
2,7,0,75
2,8,1,69
2,9,14,53
2,10,20,79


In [136]:
exam_class.loc[1]

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2,60
1,2,87,44
1,3,18,53
1,4,52,54
1,5,39,63


In [137]:
# Passing a list of column names builds a two-level row index (class, id);
# the index levels keep the column names.
exam_class_id = exam.set_index(keys=['class', 'id'])
exam_class_id

Unnamed: 0_level_0,Unnamed: 1_level_0,math,science
class,id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2,60
1,2,87,44
1,3,18,53
1,4,52,54
1,5,39,63
2,6,56,23
2,7,0,75
2,8,1,69
2,9,14,53
2,10,20,79


In [138]:
exam_class_id.reset_index(level='class')
#> When the index levels have names, the `level` parameter of reset_index also accepts a name (or a list of names).

Unnamed: 0_level_0,class,math,science
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2,60
2,1,87,44
3,1,18,53
4,1,52,54
5,1,39,63
6,2,56,23
7,2,0,75
8,2,1,69
9,2,14,53
10,2,20,79


`reset_index` 활용 예: `groupby` 집계 결과의 인덱스(그룹 키)를 컬럼으로 변환.

In [139]:
exam

Unnamed: 0,class,id,math,science
0,1,1,2,60
1,1,2,87,44
2,1,3,18,53
3,1,4,52,54
4,1,5,39,63
5,2,6,56,23
6,2,7,0,75
7,2,8,1,69
8,2,9,14,53
9,2,10,20,79


In [140]:
# Per-class mean of each subject in the exam DataFrame.
exam_by_class = (
    exam
    .groupby('class')[['math', 'science']]
    .mean()
)
exam_by_class

Unnamed: 0_level_0,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,39.6,54.8
2,18.2,59.8


In [141]:
exam_by_class.reset_index()

Unnamed: 0,class,math,science
0,1,39.6,54.8
1,2,18.2,59.8
