### 2.6 인덱스 조작

In [2]:
import numpy as np
import pandas as pd

- 인덱스 설정 및 제거

In [2]:
np.random.seed(0)
df1 = pd.DataFrame(np.vstack([list('ABCDE'),
                              np.round(np.random.rand(3, 5), 2)]).T,
                   columns=["C1", "C2", "C3", "C4"])
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [3]:
df2 = df1.set_index('C1')
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


In [5]:
print(df1.index)
print(df2.index)

RangeIndex(start=0, stop=5, step=1)
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='C1')


In [6]:
df2.reset_index() # inplace=True가 되는 순간 값이 바뀐다.

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [10]:
df1.to_csv('data/sample11.csv', index=False)
pd.read_csv('data/sample11.csv')

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [11]:
df2.to_csv('data/sample12.csv')
pd.read_csv('data/sample12.csv',index_col='C1')

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


- 다중 인덱스

In [12]:
np.random.seed(0)
df3 = pd.DataFrame(np.round(np.random.randn(5, 4), 2),
                   columns=[["A", "A", "B", "B"],
                            ["C1", "C2", "C1", "C2"]])
df3

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


In [13]:
# 0.41
df3['A','C2'][2]

0.41

In [14]:
df3['A']

Unnamed: 0,C1,C2
0,1.76,0.4
1,1.87,-0.98
2,-0.1,0.41
3,0.76,0.12
4,1.49,-0.21


In [15]:
df3.columns.names = ['Cidx1','Cidx2']
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


In [16]:
np.random.seed(0)
df4 = pd.DataFrame(np.round(np.random.randn(6, 4), 2),
                   columns=[["A", "A", "B", "B"],
                            ["C", "D", "C", "D"]],
                   index=[["M", "M", "M", "F", "F", "F"],
                          ["id_" + str(i + 1) for i in range(3)] * 2])
df4.columns.names = ["Cidx1", "Cidx2"]
df4.index.names = ["Ridx1", "Ridx2"]
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [17]:
# 0.41
df4['A','D']['M','id_3']

0.41

In [18]:
# iloc 인덱서를 사용하면 다중 인덱스 무시하고 인덱싱하면 됨
df4.iloc[2, 1]

0.41

- 연습 문제 4.5.1

![image.png](attachment:image.png)

In [26]:
np.random.seed(0)
df_score1 = pd.DataFrame(np.vstack([list('ABCDE'),
                              np.round(np.random.rand(3, 5), 2)]).T,
                   columns=["과목","국어", "영어", "수학"])
df_score1

Unnamed: 0,과목,국어,영어,수학
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [27]:
df_score2 = df_score1.set_index("과목")
df_score2

Unnamed: 0_level_0,국어,영어,수학
과목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


In [28]:
df_score2.reset_index()

Unnamed: 0,과목,국어,영어,수학
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


### CCTV 문제

In [29]:
df5 = pd.read_csv('data/서울시 자치구 연도별 방범용 CCTV 운영 현황_230630기준.csv', encoding='EUC-KR',index_col=0)
df5

Unnamed: 0,구분,2015년,2016년,2017년,2018년,2019년,2020년,2021년,2022년,2023년
1,종로구,935,1066,1225,1322,1338,1526,1573,1812,1808
2,중구,363,565,838,1174,1234,1482,1911,2026,2026
3,용산구,1398,1689,1831,1888,1986,2131,2321,2531,2647
4,성동구,1089,1328,2103,2390,2697,3162,3519,3627,3643
5,광진구,638,657,1112,1586,2233,2375,3111,3370,3359
6,동대문구,1202,1425,1535,1775,1969,2142,2471,2592,2592
7,중랑구,751,898,1047,1203,2250,3165,3592,3856,3856
8,성북구,1035,1534,1940,2542,2895,3081,3815,4014,4129
9,강북구,608,840,841,1159,1656,2337,2960,3184,3184
10,도봉구,345,443,542,679,835,979,1684,1994,2120


In [None]:
# 구별 합계
df5 = pd.read_csv('data/서울시 자치구 연도별 방범용 CCTV 운영 현황_230630기준.csv', encoding='EUC-KR',index_col=0)
df5.set_index('구분', inplace=True)
df5 = df5.astype({'2015년':'int','2016년':'int','2017년':'int','2018년':'int','2019년':'int','2020년':'int','2021년':'int','2022년':'int','2023년':'int'})
#df5['합계'] = (df5.sum(axis=1))
df5.dtypes

In [33]:
df1 = df5.astype({'col1':'int32'})

ValueError: invalid literal for int() with base 10: '용산구'