<a href="https://colab.research.google.com/github/JakeOh/202505_BD50/blob/main/lab_da/da11_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Series 인덱스

## 단일 계층 인덱스

In [2]:
# No index is passed to the constructor, so pandas attaches a RangeIndex automatically.
s = pd.Series(np.random.rand(5))
s

Unnamed: 0,0
0,0.443666
1,0.312218
2,0.290392
3,0.498375
4,0.480959


In [3]:
s.index  # index: the row labels

RangeIndex(start=0, stop=5, step=1)

In [4]:
s.values  # values -> np.ndarray

array([0.4436661 , 0.31221846, 0.29039216, 0.49837544, 0.48095943])

In [5]:
# The labels given to the constructor become the Index of the Series.
s = pd.Series(np.random.rand(5), index=list('abcde'))
s

Unnamed: 0,0
a,0.60179
b,0.38659
c,0.740568
d,0.666003
e,0.193522


In [6]:
# The index now holds the labels that were passed to the constructor.
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [7]:
s.index.nlevels  # Index.nlevels property: the number of levels in the index.

1

## 계층적 인덱스(hierarchical index), Multi-level index

In [9]:
# Two parallel label arrays (one per level) form a 2-level MultiIndex.
s = pd.Series(
    np.random.randn(6),
    index=pd.MultiIndex.from_arrays([
        ['m', 'm', 'f', 'f', 'u', 'u'],
        [1, 2, 3, 1, 2, 3],
    ]),
)
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.874732
m,2,0.512407
f,3,0.573985
f,1,0.20686
u,2,-1.803297
u,3,2.60749


In [10]:
# The values are still a 1-D ndarray with one entry per row.
s.values

array([-0.87473225,  0.51240749,  0.57398479,  0.20686037, -1.8032972 ,
        2.60749006])

In [11]:
s.index  #> MultiIndex: an array whose elements are tuples.

MultiIndex([('m', 1),
            ('m', 2),
            ('f', 3),
            ('f', 1),
            ('u', 2),
            ('u', 3)],
           )

In [12]:
# Two label arrays were supplied, so the index has two levels.
s.index.nlevels

2

## Indexing, Slicing

nlevels(인덱스의 계층 개수)가 2 이상인 MultiIndex를 사용해서 loc 속성을 이용할 때,
*   첫번째 레벨의 인덱스만 가지고 indexing, slicing을 할 수 있음.
*   두번째 이상의 레벨의 인덱스만 가지고는 indexing, slicing을 할 수 없음! (필요하면 `swaplevel()`로 레벨 순서를 바꾸거나 `xs(key, level=...)`를 사용.)
*   튜플 형태의 인덱스로 indexing, slicing이 가능.

In [17]:
# indexing with a first-level label: returns the rows under 'm', keyed by the remaining level
s.loc['m']  # shorthand: s['m']

Unnamed: 0,0
1,-0.874732
2,0.512407


In [None]:
# slicing
# s.loc['m':'f']  #> raises UnsortedIndexError (the index has not been sorted)

In [25]:
# s.loc[1]  #> raises KeyError
s.loc[('m', 1)] # a tuple-typed key can be used

np.float64(-0.8747322503300471)

In [23]:
# A list of full tuple keys selects several rows at once; the result keeps both index levels.
s.loc[[('m', 1), ('f', 1)]]

Unnamed: 0,Unnamed: 1,0
m,1,-0.874732
f,1,0.20686


In [None]:
# s.loc[('m', 1):('f', 3)]  #> raises UnsortedIndexError

*   `pd.Series.sort_values()`: 값들을 정렬
*   `pd.Series.sort_index()`: 인덱스들을 정렬.

In [29]:
# The original Series, shown for comparison with the index-sorted version below.
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.874732
m,2,0.512407
f,3,0.573985
f,1,0.20686
u,2,-1.803297
u,3,2.60749


In [30]:
# sort_index() returns a new Series ordered by its index labels; s itself keeps its original order.
s_idx_sort = s.sort_index()
s_idx_sort

Unnamed: 0,Unnamed: 1,0
f,1,0.20686
f,3,0.573985
m,1,-0.874732
m,2,0.512407
u,2,-1.803297
u,3,2.60749


In [31]:
# After the index has been sorted, label slicing works (same as s_idx_sort['f':'m']).
s_idx_sort.loc['f':'m']

Unnamed: 0,Unnamed: 1,0
f,1,0.20686
f,3,0.573985
m,1,-0.874732
m,2,0.512407


In [38]:
# Slice between two full tuple keys; both endpoints are included
# (same as s_idx_sort[('f', 3):('m', 2)]).
s_idx_sort.loc[('f', 3):('m', 2)]

Unnamed: 0,Unnamed: 1,0
f,3,0.573985
m,1,-0.874732
m,2,0.512407


## index swapping
인덱스의 레벨을 바꾸는 것.

In [42]:
# The original Series again: level 0 holds 'm'/'f'/'u', level 1 holds the integers.
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.874732
m,2,0.512407
f,3,0.573985
f,1,0.20686
u,2,-1.803297
u,3,2.60749


In [44]:
# Exchange the two index levels, then sort by the new index.
# For a 2-level index, s.swaplevel() with its defaults does the same swap.
s_swap = s.swaplevel(i=0, j=1).sort_index()
s_swap

Unnamed: 0,Unnamed: 1,0
1,f,0.20686
1,m,-0.874732
2,m,0.512407
2,u,-1.803297
3,f,0.573985
3,u,2.60749


In [45]:
# The integers are now the first level, so an integer label can be used directly with .loc.
s_swap.loc[1]

Unnamed: 0,0
f,0.20686
m,-0.874732


In [46]:
# Label-based slice on the first level; both endpoints (1 and 2) are included.
s_swap.loc[1:2]

Unnamed: 0,Unnamed: 1,0
1,f,0.20686
1,m,-0.874732
2,m,0.512407
2,u,-1.803297


In [47]:
# A Series whose index has three levels (nlevels == 3)
s = pd.Series(
    np.random.rand(6),
    index=pd.MultiIndex.from_arrays([
        np.arange(1, 7),
        ['A'] * 3 + ['B'] * 3,
        ['aa', 'bb'] * 3,
    ]),
)
s

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,A,aa,0.555856
2,A,bb,0.959563
3,A,aa,0.994806
4,B,bb,0.074927
5,B,aa,0.998682
6,B,bb,0.661611


In [48]:
# Each element of the MultiIndex is now a 3-tuple, one entry per level.
s.index

MultiIndex([(1, 'A', 'aa'),
            (2, 'A', 'bb'),
            (3, 'A', 'aa'),
            (4, 'B', 'bb'),
            (5, 'B', 'aa'),
            (6, 'B', 'bb')],
           )

In [49]:
# Three label arrays were supplied, so nlevels is 3.
s.index.nlevels

3

In [50]:
s.swaplevel()  # defaults i=-2, j=-1: swaps the last level with the second-to-last level.

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,aa,A,0.555856
2,bb,A,0.959563
3,aa,A,0.994806
4,bb,B,0.074927
5,aa,B,0.998682
6,bb,B,0.661611


In [51]:
# Swap the first (0) and second (1) levels; the third level stays where it is.
s.swaplevel(i=0, j=1)

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
A,1,aa,0.555856
A,2,bb,0.959563
A,3,aa,0.994806
B,4,bb,0.074927
B,5,aa,0.998682
B,6,bb,0.661611


# DataFrame 계층적 인덱스

In [54]:
# 6x3 frame of uniform random values whose rows are labelled by (day, meal) pairs.
df = pd.DataFrame(
    np.random.rand(6, 3),
    index=pd.MultiIndex.from_arrays([
        ['Fri', 'Fri', 'Sat', 'Sat', 'Sun', 'Sun'],
        ['Lunch', 'Dinner'] * 3,
    ]),
    columns=list('abc'),
)
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.332263,0.485679,0.324542
Fri,Dinner,0.130839,0.534757,0.64134
Sat,Lunch,0.270904,0.670978,0.938408
Sat,Dinner,0.381807,0.121791,0.400426
Sun,Lunch,0.96243,0.190232,0.696949
Sun,Dinner,0.41065,0.864198,0.717739


In [55]:
df.values  #> 2-D ndarray made up of the DataFrame's values

array([[0.33226265, 0.48567861, 0.32454232],
       [0.13083878, 0.53475667, 0.64134036],
       [0.27090416, 0.67097798, 0.93840814],
       [0.38180725, 0.1217909 , 0.40042574],
       [0.96243009, 0.19023238, 0.69694892],
       [0.41064964, 0.86419757, 0.71773943]])

In [56]:
df.index  #> MultiIndex of (level-0, level-1) label tuples

MultiIndex([('Fri',  'Lunch'),
            ('Fri', 'Dinner'),
            ('Sat',  'Lunch'),
            ('Sat', 'Dinner'),
            ('Sun',  'Lunch'),
            ('Sun', 'Dinner')],
           )

In [57]:
# The row index was built from two label arrays, so it has two levels.
df.index.nlevels

2

In [58]:
df.loc['Fri']  #> indexing: the rows under the first-level label 'Fri'

Unnamed: 0,a,b,c
Lunch,0.332263,0.485679,0.324542
Dinner,0.130839,0.534757,0.64134


In [59]:
df.loc['Fri':'Sat']  #> slicing on the first level (both endpoints are included)

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.332263,0.485679,0.324542
Fri,Dinner,0.130839,0.534757,0.64134
Sat,Lunch,0.270904,0.670978,0.938408
Sat,Dinner,0.381807,0.121791,0.400426


In [61]:
# A second-level label ('Lunch') cannot be used on its own with .loc on this
# MultiIndex: the lookup raises KeyError. The call is wrapped in try/except so
# the error is displayed without stopping a "Restart & Run All".
try:
    df.loc['Lunch']
except KeyError as err:
    print(f'{type(err).__name__}: {err}')

KeyError: 'Lunch'

In [62]:
# The original frame again: day is level 0 and meal time is level 1 of the row index.
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.332263,0.485679,0.324542
Fri,Dinner,0.130839,0.534757,0.64134
Sat,Lunch,0.270904,0.670978,0.938408
Sat,Dinner,0.381807,0.121791,0.400426
Sun,Lunch,0.96243,0.190232,0.696949
Sun,Dinner,0.41065,0.864198,0.717739


In [63]:
# Exchange the two row-index levels so that meal time becomes level 0.
# For a 2-level index this is what swaplevel() does with its defaults.
df_swap = df.swaplevel(i=0, j=1)
df_swap

Unnamed: 0,Unnamed: 1,a,b,c
Lunch,Fri,0.332263,0.485679,0.324542
Dinner,Fri,0.130839,0.534757,0.64134
Lunch,Sat,0.270904,0.670978,0.938408
Dinner,Sat,0.381807,0.121791,0.400426
Lunch,Sun,0.96243,0.190232,0.696949
Dinner,Sun,0.41065,0.864198,0.717739


In [64]:
# 'Lunch' is now a first-level label, so .loc can select it directly.
df_swap.loc['Lunch']

Unnamed: 0,a,b,c
Fri,0.332263,0.485679,0.324542
Sat,0.270904,0.670978,0.938408
Sun,0.96243,0.190232,0.696949


# DataFrame 컬럼 <--> Row 레이블

*   `pd.DataFrame.set_index()`: 데이터프레임의 컬럼(들)을 인덱스(row 레이블)로 변환한 데이터프레임을 리턴.
*   `pd.DataFrame.reset_index()`: 데이터프레임의 인덱스(들)을 컬럼으로 변환한 데이터프레임을 리턴.

## `reset_index`

In [65]:
# Starting point for reset_index: df still has its 2-level row index.
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.332263,0.485679,0.324542
Fri,Dinner,0.130839,0.534757,0.64134
Sat,Lunch,0.270904,0.670978,0.938408
Sat,Dinner,0.381807,0.121791,0.400426
Sun,Lunch,0.96243,0.190232,0.696949
Sun,Dinner,0.41065,0.864198,0.717739


In [66]:
df.reset_index()  #> default level=None: every index level is converted to a column.

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.332263,0.485679,0.324542
1,Fri,Dinner,0.130839,0.534757,0.64134
2,Sat,Lunch,0.270904,0.670978,0.938408
3,Sat,Dinner,0.381807,0.121791,0.400426
4,Sun,Lunch,0.96243,0.190232,0.696949
5,Sun,Dinner,0.41065,0.864198,0.717739


In [67]:
df.reset_index(names=['day', 'time'])  #> sets the names of the columns created from the index levels.

Unnamed: 0,day,time,a,b,c
0,Fri,Lunch,0.332263,0.485679,0.324542
1,Fri,Dinner,0.130839,0.534757,0.64134
2,Sat,Lunch,0.270904,0.670978,0.938408
3,Sat,Dinner,0.381807,0.121791,0.400426
4,Sun,Lunch,0.96243,0.190232,0.696949
5,Sun,Dinner,0.41065,0.864198,0.717739


In [69]:
df.reset_index(level=0)  # only index level 0 is converted to a column.

Unnamed: 0,level_0,a,b,c
Lunch,Fri,0.332263,0.485679,0.324542
Dinner,Fri,0.130839,0.534757,0.64134
Lunch,Sat,0.270904,0.670978,0.938408
Dinner,Sat,0.381807,0.121791,0.400426
Lunch,Sun,0.96243,0.190232,0.696949
Dinner,Sun,0.41065,0.864198,0.717739


In [70]:
df.reset_index(level=1)  # only index level 1 is converted to a column.

Unnamed: 0,level_1,a,b,c
Fri,Lunch,0.332263,0.485679,0.324542
Fri,Dinner,0.130839,0.534757,0.64134
Sat,Lunch,0.270904,0.670978,0.938408
Sat,Dinner,0.381807,0.121791,0.400426
Sun,Lunch,0.96243,0.190232,0.696949
Sun,Dinner,0.41065,0.864198,0.717739


In [73]:
# Listing both levels gives the same result as reset_index() with no arguments.
df.reset_index(level=[0, 1])

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.332263,0.485679,0.324542
1,Fri,Dinner,0.130839,0.534757,0.64134
2,Sat,Lunch,0.270904,0.670978,0.938408
3,Sat,Dinner,0.381807,0.121791,0.400426
4,Sun,Lunch,0.96243,0.190232,0.696949
5,Sun,Dinner,0.41065,0.864198,0.717739


## `set_index`

In [74]:
# Toy exam table: two classes of five students each (ids 1-10), with random
# integer scores between 0 and 100 for math and science.
exam = pd.DataFrame({
    'class': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'id': np.arange(1, 11),
    'math': np.random.randint(0, 101, size=10),
    'science': np.random.randint(0, 101, size=10),
})
exam

Unnamed: 0,class,id,math,science
0,1,1,84,21
1,1,2,51,42
2,1,3,0,52
3,1,4,13,46
4,1,5,29,35
5,2,6,50,80
6,2,7,68,54
7,2,8,12,71
8,2,9,50,90
9,2,10,12,74


In [75]:
# Rows that belong to class 1 (boolean-mask selection)
exam.loc[exam['class'].eq(1)]

Unnamed: 0,class,id,math,science
0,1,1,84,21
1,1,2,51,42
2,1,3,0,52
3,1,4,13,46
4,1,5,29,35


In [77]:
# Promote the 'class' column to the row index; set_index returns a new frame.
exam_class = exam.set_index('class')
exam_class

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,84,21
1,2,51,42
1,3,0,52
1,4,13,46
1,5,29,35
2,6,50,80
2,7,68,54
2,8,12,71
2,9,50,90
2,10,12,74


In [78]:
# With 'class' as the index, the class-1 rows can be selected by label.
exam_class.loc[1]

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,84,21
1,2,51,42
1,3,0,52
1,4,13,46
1,5,29,35


In [80]:
# Use two columns as the row index: the result has a 2-level index named ('class', 'id').
exam_class_id = exam.set_index(['class', 'id'])
exam_class_id

Unnamed: 0_level_0,Unnamed: 1_level_0,math,science
class,id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,84,21
1,2,51,42
1,3,0,52
1,4,13,46
1,5,29,35
2,6,50,80
2,7,68,54
2,8,12,71
2,9,50,90
2,10,12,74


In [82]:
exam_class_id.reset_index(level='class')
#> When the index levels have names, the level parameter of reset_index can also be given a string (or a list of strings).

Unnamed: 0_level_0,class,math,science
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,84,21
2,1,51,42
3,1,0,52
4,1,13,46
5,1,29,35
6,2,50,80
7,2,68,54
8,2,12,71
9,2,50,90
10,2,12,74


reset_index 예

In [83]:
# The original exam frame, used as the input for the group-by example below.
exam

Unnamed: 0,class,id,math,science
0,1,1,84,21
1,1,2,51,42
2,1,3,0,52
3,1,4,13,46
4,1,5,29,35
5,2,6,50,80
6,2,7,68,54
7,2,8,12,71
8,2,9,50,90
9,2,10,12,74


In [86]:
# Per-class mean of each subject in the exam frame
exam_by_class = exam.groupby('class')[['math', 'science']].mean()
exam_by_class

Unnamed: 0_level_0,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,35.4,39.2
2,38.4,73.8


In [87]:
# The group key 'class' is the index of the aggregated frame; reset_index() turns it back into a column.
exam_by_class.reset_index()

Unnamed: 0,class,math,science
0,1,35.4,39.2
1,2,38.4,73.8
