# 온보딩 (7) : Pandas

In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## 행 단위 인덱싱

In [2]:
# Sample data: yearly ridership figures per KTX line, one list entry per year.
# '동해선 KTX' has no figures for the first four years, hence the NaN entries.
KTX_data = {
    '경부선 KTX': [39060, 39896, 42005, 43621, 41702, 41266, 32427],
    '호남선 KTX': [7313, 6967, 6873, 6626, 8675, 10622, 9228],
    '경전선 KTX': [3627, 4168, 4088, 4424, 4606, 4984, 5570],
    '전라선 KTX': [309, 1771, 1954, 2244, 3146, 3945, 5766],
    '동해선 KTX': [np.nan] * 4 + [2395, 3786, 6667],
}

# Column order follows the insertion order of the dict keys.
col_list = list(KTX_data)
# Row labels are the years 2011-2017 as strings (not integers).
index_list = [str(year) for year in range(2011, 2018)]

df_KTX = pd.DataFrame(KTX_data, index=index_list, columns=col_list)
df_KTX

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [3]:
# Show the row labels, the column labels and the raw values, followed by
# the first three and the last two rows -- one item after another.
print(
    df_KTX.index,
    df_KTX.columns,
    df_KTX.values,
    df_KTX.head(3),
    df_KTX.tail(2),
    sep="\n",
)

Index(['2011', '2012', '2013', '2014', '2015', '2016', '2017'], dtype='object')
Index(['경부선 KTX', '호남선 KTX', '경전선 KTX', '전라선 KTX', '동해선 KTX'], dtype='object')
[[39060.  7313.  3627.   309.    nan]
 [39896.  6967.  4168.  1771.    nan]
 [42005.  6873.  4088.  1954.    nan]
 [43621.  6626.  4424.  2244.    nan]
 [41702.  8675.  4606.  3146.  2395.]
 [41266. 10622.  4984.  3945.  3786.]
 [32427.  9228.  5570.  5766.  6667.]]
      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN
2012    39896     6967     4168     1771      NaN
2013    42005     6873     4088     1954      NaN
      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2016    41266    10622     4984     3945   3786.0
2017    32427     9228     5570     5766   6667.0


In [4]:
# Row indexing by integer position.
# .iloc states the positional intent explicitly; the end of a slice is excluded.

print(df_KTX.iloc[0:1])  # position 0 only, i.e. the first row
print("\n")
print(df_KTX.iloc[2:5])  # positions 2, 3 and 4, i.e. the 3rd through 5th rows

      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN


      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2013    42005     6873     4088     1954      NaN
2014    43621     6626     4424     2244      NaN
2015    41702     8675     4606     3146   2395.0


In [5]:
# Row indexing with .loc (label-based): a single label returns a Series, a label slice returns a DataFrame

print(df_KTX.loc['2011'])
print("\n")

print(df_KTX.loc['2013':'2016'])
# The line below raises an error if uncommented
# (presumably a KeyError: plain [] with a single label selects a column, not a row)
# print(df_KTX['2011'])

경부선 KTX    39060.0
호남선 KTX     7313.0
경전선 KTX     3627.0
전라선 KTX      309.0
동해선 KTX        NaN
Name: 2011, dtype: float64


      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2013    42005     6873     4088     1954      NaN
2014    43621     6626     4424     2244      NaN
2015    41702     8675     4606     3146   2395.0
2016    41266    10622     4984     3945   3786.0


## 열 단위 인덱싱

In [6]:
# Plain [] with a column name selects that single column as a Series
df_KTX['경부선 KTX']

2011    39060
2012    39896
2013    42005
2014    43621
2015    41702
2016    41266
2017    32427
Name: 경부선 KTX, dtype: int64

In [7]:
# Positional selection with .iloc: the first four rows and the first three columns
df_KTX.iloc[:4, :3]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX
2011,39060,7313,3627
2012,39896,6967,4168
2013,42005,6873,4088
2014,43621,6626,4424


## 복합 인덱싱
열 선택 후 index 범위 지정

In [8]:
# Compound indexing: one column restricted to a range of rows.
# A single .loc / .iloc call is used instead of chaining [] twice, so it is
# explicit whether the row range is label-based or positional.

# Label slice: both endpoints ('2012' and '2014') are included.
print(df_KTX.loc['2012':'2014', '경부선 KTX'])
print("\n")
# Positional slice: positions 1, 2 and 3 (the end position 4 is excluded).
print(df_KTX['경부선 KTX'].iloc[1:4])

2012    39896
2013    42005
2014    43621
Name: 경부선 KTX, dtype: int64


2012    39896
2013    42005
2014    43621
Name: 경부선 KTX, dtype: int64


In [9]:
# A single label gives a Series, whereas a slice gives a DataFrame.
print(type(df_KTX.loc['2011']), type(df_KTX[0:1]), sep="\n")

# Wrapping the label in a list keeps the result a one-row DataFrame.
print(df_KTX.loc[['2011']], type(df_KTX.loc[['2011']]), sep="\n")

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN
<class 'pandas.core.frame.DataFrame'>


In [10]:
# All rows, columns at positions 2 through 4 (the end position 5 is excluded)
df_KTX.iloc[:, 2:5]

Unnamed: 0,경전선 KTX,전라선 KTX,동해선 KTX
2011,3627,309,
2012,4168,1771,
2013,4088,1954,
2014,4424,2244,
2015,4606,3146,2395.0
2016,4984,3945,3786.0
2017,5570,5766,6667.0


In [11]:
# All rows, every column from '호남선 KTX' through the last one
df_KTX.loc[:, '호남선 KTX':]

Unnamed: 0,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,7313,3627,309,
2012,6967,4168,1771,
2013,6873,4088,1954,
2014,6626,4424,2244,
2015,8675,4606,3146,2395.0
2016,10622,4984,3945,3786.0
2017,9228,5570,5766,6667.0


In [12]:
# Rows labelled '2012' through '2014' (both included), columns from '경전선 KTX' onward
df_KTX.loc['2012':'2014', '경전선 KTX':]

Unnamed: 0,경전선 KTX,전라선 KTX,동해선 KTX
2012,4168,1771,
2013,4088,1954,
2014,4424,2244,


In [13]:
# Select a single value: the '호남선 KTX' ridership figure for the row labelled '2016'
df_KTX.loc['2016', '호남선 KTX']

10622

In [14]:
# Passing a list of column names selects those columns in the listed order
df_KTX[['동해선 KTX', '전라선 KTX', '경전선 KTX']]

Unnamed: 0,동해선 KTX,전라선 KTX,경전선 KTX
2011,,309,3627
2012,,1771,4168
2013,,1954,4088
2014,,2244,4424
2015,2395.0,3146,4606
2016,3786.0,3945,4984
2017,6667.0,5766,5570


In [15]:
# Convert each part of the frame into a plain Python list.
arr1 = df_KTX.columns.tolist()        # column labels
arr2 = df_KTX.to_numpy().tolist()     # cell values, one inner list per row
arr3 = df_KTX.index.tolist()          # row labels
arr4 = df_KTX['경부선 KTX'].tolist()   # a single column
arr5 = df_KTX.loc['2011'].tolist()    # a single row

print(arr1, arr2, arr3, arr4, arr5, sep="\n")

['경부선 KTX', '호남선 KTX', '경전선 KTX', '전라선 KTX', '동해선 KTX']
[[39060.0, 7313.0, 3627.0, 309.0, nan], [39896.0, 6967.0, 4168.0, 1771.0, nan], [42005.0, 6873.0, 4088.0, 1954.0, nan], [43621.0, 6626.0, 4424.0, 2244.0, nan], [41702.0, 8675.0, 4606.0, 3146.0, 2395.0], [41266.0, 10622.0, 4984.0, 3945.0, 3786.0], [32427.0, 9228.0, 5570.0, 5766.0, 6667.0]]
['2011', '2012', '2013', '2014', '2015', '2016', '2017']
[39060, 39896, 42005, 43621, 41702, 41266, 32427]
[39060.0, 7313.0, 3627.0, 309.0, nan]


## 조건 기반 인덱싱

In [16]:
# Boolean-mask selection: keep the rows whose label is '2016' or later.
# The labels are strings, so this is a string comparison.
df_KTX.loc[df_KTX.index >= '2016']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [17]:
# Boolean-mask selection: keep the rows whose label sorts before '2016'.
# The labels are strings, so this is a string comparison.
df_KTX.loc[df_KTX.index < '2016']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0


In [18]:
# Boolean-mask selection: keep only the row whose label equals '2016'.
# The result is still a DataFrame (one row), not a Series.
df_KTX.loc[df_KTX.index == '2016']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2016,41266,10622,4984,3945,3786.0


## loc vs iloc

In [19]:
# .loc with a single label: the row comes back as a Series
print(df_KTX.loc['2011'])

경부선 KTX    39060.0
호남선 KTX     7313.0
경전선 KTX     3627.0
전라선 KTX      309.0
동해선 KTX        NaN
Name: 2011, dtype: float64


In [20]:
# .loc with a label slice: both endpoints ('2011' and '2013') are included
print(df_KTX.loc['2011':'2013'])

      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN
2012    39896     6967     4168     1771      NaN
2013    42005     6873     4088     1954      NaN


In [21]:
# .loc with a list of labels: only the listed rows are returned
print(df_KTX.loc[['2011','2013']])

      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN
2013    42005     6873     4088     1954      NaN


In [22]:
# .loc with a one-element list: a one-row DataFrame rather than a Series
print(df_KTX.loc[['2011']])

      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN


In [23]:
# .iloc with a single position: the first row comes back as a Series
print(df_KTX.iloc[0])

경부선 KTX    39060.0
호남선 KTX     7313.0
경전선 KTX     3627.0
전라선 KTX      309.0
동해선 KTX        NaN
Name: 2011, dtype: float64


In [24]:
# .iloc with a position slice: positions 0, 1 and 2 (the end position 3 is excluded)
print(df_KTX.iloc[0:3])

      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN
2012    39896     6967     4168     1771      NaN
2013    42005     6873     4088     1954      NaN


In [25]:
# .iloc with a one-element list: the first row as a one-row DataFrame
print(df_KTX.iloc[[0]])

      경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
2011    39060     7313     3627      309      NaN


In [26]:
# Selecting the same two rows by label (.loc) and by position (.iloc).
# A cell only displays its last expression, so a bare first expression would
# be computed and silently thrown away; the equivalence is checked explicitly.
rows_by_label = df_KTX.loc[['2011', '2013']]
rows_by_position = df_KTX.iloc[[0, 2]]

# Raises AssertionError if the two selections differ in any way.
pd.testing.assert_frame_equal(rows_by_label, rows_by_position)

rows_by_position

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2013,42005,6873,4088,1954,


In [27]:
# First three values (positions 0-2) of the first column, as a plain Python list
KTX_2 = df_KTX.iloc[:3, 0].tolist()
KTX_2

[39060, 39896, 42005]