## pandas 기초

- pandas는 R의 핵심 데이터 시리즈와 프레임을 파이썬의 추가한것이다
- numpy를 기반으로 구현되었고, numpy 대비 기능을 더 확장하여 재구현한것
- Python Data Analysis Library
- https://pandas.pydata.org

In [31]:
import numpy as np
import pandas as pd

- 파이썬의 자료구조
> 수치형, 문자열, 리스트, 딕셔너리, 튜플, 집합, 블린  
- numpy의 자료구조
> ndarray(배열) : 배열의 데이터는 모든 같은 타입이다.
- pandas의 자료구조
> Series(시리즈), DataFrame(데이터프레임)  
> DataFrame의 인덱싱 -> Series의 인덱싱 -> 값(스칼라), 수치, 문자, 블린, NaN 이 등장  > Series: 인덱스와 데이터만 존재하는, 컬럼이 없는 자료구조  
> DataFrame: 인덱스와 컬럼이 존재하는 자료구조
> NaN : 데이터가 없다. (난, 넌), Not a Number => np.nan

In [32]:
# Series
# 데이터를 정수로 넣었으나 기본형으로 float64가 반영되었다
a = pd.Series( [1,3,5,np.nan, 6,8] )
a

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [33]:
# 속성-타입
a.dtype

dtype('float64')

In [34]:
# 속성-크기
a.shape
# 1차원데이터로 총6개의 데이터가 존재

(6,)

In [35]:
# DataFrame
# 인덱스와 컬럼이 존재하는 자료구조
cols   = list('ABCD') # ['A','B','C','D']
indexs = pd.date_range('20190812', periods=7)
# 컬럼 4개, 인덱스 7개
cols, indexs

(['A', 'B', 'C', 'D'],
 DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
                '2019-08-16', '2019-08-17', '2019-08-18'],
               dtype='datetime64[ns]', freq='D'))

In [36]:
# 데이터는  shape => (7,4)
datas = np.random.randn(7,4)
datas, datas.shape

(array([[-0.98623167,  0.21079312,  0.24845099,  0.51831836],
        [ 0.12702371, -0.39684833, -1.13595582, -0.01893462],
        [ 0.12789857, -1.37906398, -0.96180707,  0.43154078],
        [ 0.2538674 , -0.14841347,  1.1000304 ,  1.07753015],
        [-0.40305449, -1.27607336,  0.38673681, -0.15429143],
        [-0.19129383, -0.17136055,  0.07618368, -0.14870524],
        [-1.06113039, -0.07903367,  0.43644983, -0.9517162 ]]), (7, 4))

In [37]:
# df 생성
df = pd.DataFrame( datas, index=indexs, columns=cols )
df

Unnamed: 0,A,B,C,D
2019-08-12,-0.986232,0.210793,0.248451,0.518318
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-14,0.127899,-1.379064,-0.961807,0.431541
2019-08-15,0.253867,-0.148413,1.10003,1.07753
2019-08-16,-0.403054,-1.276073,0.386737,-0.154291
2019-08-17,-0.191294,-0.171361,0.076184,-0.148705
2019-08-18,-1.06113,-0.079034,0.43645,-0.951716


- **데이터가 로드된후 DataFrame을 만든후 점검할 사항**

In [38]:
df.head(3)

Unnamed: 0,A,B,C,D
2019-08-12,-0.986232,0.210793,0.248451,0.518318
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-14,0.127899,-1.379064,-0.961807,0.431541


In [39]:
df.tail(2)

Unnamed: 0,A,B,C,D
2019-08-17,-0.191294,-0.171361,0.076184,-0.148705
2019-08-18,-1.06113,-0.079034,0.43645,-0.951716


In [40]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [41]:
df.index

DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
               '2019-08-16', '2019-08-17', '2019-08-18'],
              dtype='datetime64[ns]', freq='D')

In [42]:
df.values

array([[-0.98623167,  0.21079312,  0.24845099,  0.51831836],
       [ 0.12702371, -0.39684833, -1.13595582, -0.01893462],
       [ 0.12789857, -1.37906398, -0.96180707,  0.43154078],
       [ 0.2538674 , -0.14841347,  1.1000304 ,  1.07753015],
       [-0.40305449, -1.27607336,  0.38673681, -0.15429143],
       [-0.19129383, -0.17136055,  0.07618368, -0.14870524],
       [-1.06113039, -0.07903367,  0.43644983, -0.9517162 ]])

In [43]:
type(df.values)

numpy.ndarray

In [44]:
df.shape

(7, 4)

In [45]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [46]:
# df의 개요
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2019-08-12 to 2019-08-18
Freq: D
Data columns (total 4 columns):
A    7 non-null float64
B    7 non-null float64
C    7 non-null float64
D    7 non-null float64
dtypes: float64(4)
memory usage: 280.0 bytes


In [47]:
# 통계요약 : 개수, 평균, 표준편차, 최소, 25%, 50%, 75%, 최대
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.304703,-0.462857,0.021441,0.107677
std,0.539248,0.617803,0.798948,0.643795
min,-1.06113,-1.379064,-1.135956,-0.951716
25%,-0.694643,-0.836461,-0.442812,-0.151498
50%,-0.191294,-0.171361,0.248451,-0.018935
75%,0.127461,-0.113724,0.411593,0.47493
max,0.253867,0.210793,1.10003,1.07753


In [48]:
# B열 기준 데이터를 정렬, 내림차순
df.sort_values( by='B', ascending=False )

Unnamed: 0,A,B,C,D
2019-08-12,-0.986232,0.210793,0.248451,0.518318
2019-08-18,-1.06113,-0.079034,0.43645,-0.951716
2019-08-15,0.253867,-0.148413,1.10003,1.07753
2019-08-17,-0.191294,-0.171361,0.076184,-0.148705
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-16,-0.403054,-1.276073,0.386737,-0.154291
2019-08-14,0.127899,-1.379064,-0.961807,0.431541


In [49]:
# 특정 컬럼의 데이터만 보기 => 인덱싱 => 차원축소
df['C'], type(df['C'])

(2019-08-12    0.248451
 2019-08-13   -1.135956
 2019-08-14   -0.961807
 2019-08-15    1.100030
 2019-08-16    0.386737
 2019-08-17    0.076184
 2019-08-18    0.436450
 Freq: D, Name: C, dtype: float64, pandas.core.series.Series)

In [50]:
# 슬라이싱 : 차원유지
df[:] # 카피동일

Unnamed: 0,A,B,C,D
2019-08-12,-0.986232,0.210793,0.248451,0.518318
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-14,0.127899,-1.379064,-0.961807,0.431541
2019-08-15,0.253867,-0.148413,1.10003,1.07753
2019-08-16,-0.403054,-1.276073,0.386737,-0.154291
2019-08-17,-0.191294,-0.171361,0.076184,-0.148705
2019-08-18,-1.06113,-0.079034,0.43645,-0.951716


In [51]:
# 데이터가 슬라이싱 되서 나온다 => 차원을 유지해야 하니까
# a <= x <b
df[1:3] 

Unnamed: 0,A,B,C,D
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-14,0.127899,-1.379064,-0.961807,0.431541


In [52]:
# 슬라이싱을 하는데 인덱스값이 아닌, 실제값으로 자르기
# a <= x <=b
df[ '2019-08-13' : '2019-08-15' ]

Unnamed: 0,A,B,C,D
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-14,0.127899,-1.379064,-0.961807,0.431541
2019-08-15,0.253867,-0.148413,1.10003,1.07753


- 전통적인 인덱싱와 슬라이싱을 진행하면 표현의 한계점에 도달
- 이를 극복하기 위해서 pandas 만의 데이터 추출법이 추가가 되었다 
- loc, iloc <= 2개를 주로 사용한다
- 연속데이터에 대한 추출 <-> 비연속 데이터들의 추출법 (펜시인덱싱, 쿼리수행등등)

### loc

In [53]:
# loc : location 정보를 옵션으로 하여 슬라이싱 지원
# loc을 통한 데이어 추출
# df.loc[ 인덱스명 ]
df.loc[ '2019-08-12' ], type(df.loc[ '2019-08-12' ])

(A   -0.986232
 B    0.210793
 C    0.248451
 D    0.518318
 Name: 2019-08-12 00:00:00, dtype: float64, pandas.core.series.Series)

In [54]:
# 원본카피
df.loc[ : ]

Unnamed: 0,A,B,C,D
2019-08-12,-0.986232,0.210793,0.248451,0.518318
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935
2019-08-14,0.127899,-1.379064,-0.961807,0.431541
2019-08-15,0.253867,-0.148413,1.10003,1.07753
2019-08-16,-0.403054,-1.276073,0.386737,-0.154291
2019-08-17,-0.191294,-0.171361,0.076184,-0.148705
2019-08-18,-1.06113,-0.079034,0.43645,-0.951716


In [55]:
# 인덱스쪽은 전부 포함시키고(1차원), 컬럼은 A,C만 포함(2차원)
df.loc[ : , ['A','C'] ]

Unnamed: 0,A,C
2019-08-12,-0.986232,0.248451
2019-08-13,0.127024,-1.135956
2019-08-14,0.127899,-0.961807
2019-08-15,0.253867,1.10003
2019-08-16,-0.403054,0.386737
2019-08-17,-0.191294,0.076184
2019-08-18,-1.06113,0.43645


In [56]:
# 차원축소
df.loc[ : , 'A' ]

2019-08-12   -0.986232
2019-08-13    0.127024
2019-08-14    0.127899
2019-08-15    0.253867
2019-08-16   -0.403054
2019-08-17   -0.191294
2019-08-18   -1.061130
Freq: D, Name: A, dtype: float64

In [57]:
# 차원유지
df.loc[ : , ['A'] ]

Unnamed: 0,A
2019-08-12,-0.986232
2019-08-13,0.127024
2019-08-14,0.127899
2019-08-15,0.253867
2019-08-16,-0.403054
2019-08-17,-0.191294
2019-08-18,-1.06113


In [58]:
df.loc[ '2019-08-13':'2019-08-15' , ['A','C'] ]

Unnamed: 0,A,C
2019-08-13,0.127024,-1.135956
2019-08-14,0.127899,-0.961807
2019-08-15,0.253867,1.10003


In [59]:
# 차원축소 -> 인덱스를 한개만 지정
df.loc[ '2019-08-13' , ['A','C'] ]

A    0.127024
C   -1.135956
Name: 2019-08-13 00:00:00, dtype: float64

In [60]:
# error
#df.loc[ ['2019-08-13'] , ['A','C'] ]
# error
#df.loc[ ['2019-08-13':'2019-08-13'] , ['A','C'] ]

In [61]:
df.loc[ '2019-08-13':'2019-08-13' , ['A','C'] ]

Unnamed: 0,A,C
2019-08-13,0.127024,-1.135956


In [62]:
# 차원축소가 2회 진행 -> 스칼라(값)
df.loc[ '2019-08-13' , 'A' ]

0.12702370935382853

### iloc

- 펜시인덱싱과 유사 하다
- 행과 열의 번호를 이용하여 데이터를 접근하는 방식
- i->index

In [63]:
# 2019-08-13자 데이터 추출
# 1 => 인덱스값이 1인 데이터
df.iloc[ 1 ]

A    0.127024
B   -0.396848
C   -1.135956
D   -0.018935
Name: 2019-08-13 00:00:00, dtype: float64

In [64]:
# iloc 슬라이싱
# a <= index < b,   c <= column < d
df.iloc[ 1:3, 1:3 ]

Unnamed: 0,B,C
2019-08-13,-0.396848,-1.135956
2019-08-14,-1.379064,-0.961807


In [65]:
# iloc + 펜시인덱싱 기법 사용 (인덱스, 컬럼을 비연속적 위치를 나열)
df.iloc[ [1,4,2] , [0,2] ]

Unnamed: 0,A,C
2019-08-13,0.127024,-1.135956
2019-08-16,-0.403054,0.386737
2019-08-14,0.127899,-0.961807


In [66]:
# 특정 조건에 만족하는 데이터만 추출
# 데이터프레임이 생성되면 컬러명은 맴버 변수로 자동생성됨
# C컬럼에 존재하는 데이터중에 양수만(양수면 True, 0이하면 False)
# 조건을 부여하여 블리언 데이터를 만들어 참만 포함시키는 방식 : 블리언 인덱싱
# [ T, F, F, F, T, T, T ] 데이터를  and하면 참만 살아남아서 아래과 같은 결과를 발생
# df에 식을 치면 => 전체 구성원에 전부다 연산이 진행된다 
# 행렬 (연산) 값 => 각 구성원에 일일이 다 연산하는것과 동일
df[ df.C > 0 ]

Unnamed: 0,A,B,C,D
2019-08-12,-0.986232,0.210793,0.248451,0.518318
2019-08-15,0.253867,-0.148413,1.10003,1.07753
2019-08-16,-0.403054,-1.276073,0.386737,-0.154291
2019-08-17,-0.191294,-0.171361,0.076184,-0.148705
2019-08-18,-1.06113,-0.079034,0.43645,-0.951716


In [67]:
# 데이터 전체를 기준으로 0보다 큰가? 0보다 같거나 작은 데이터들은? NaN 대체
df[ df > 0  ]

Unnamed: 0,A,B,C,D
2019-08-12,,0.210793,0.248451,0.518318
2019-08-13,0.127024,,,
2019-08-14,0.127899,,,0.431541
2019-08-15,0.253867,,1.10003,1.07753
2019-08-16,,,0.386737,
2019-08-17,,,0.076184,
2019-08-18,,,0.43645,


In [68]:
# 복사 
df.copy(), df[:]

(                   A         B         C         D
 2019-08-12 -0.986232  0.210793  0.248451  0.518318
 2019-08-13  0.127024 -0.396848 -1.135956 -0.018935
 2019-08-14  0.127899 -1.379064 -0.961807  0.431541
 2019-08-15  0.253867 -0.148413  1.100030  1.077530
 2019-08-16 -0.403054 -1.276073  0.386737 -0.154291
 2019-08-17 -0.191294 -0.171361  0.076184 -0.148705
 2019-08-18 -1.061130 -0.079034  0.436450 -0.951716,
                    A         B         C         D
 2019-08-12 -0.986232  0.210793  0.248451  0.518318
 2019-08-13  0.127024 -0.396848 -1.135956 -0.018935
 2019-08-14  0.127899 -1.379064 -0.961807  0.431541
 2019-08-15  0.253867 -0.148413  1.100030  1.077530
 2019-08-16 -0.403054 -1.276073  0.386737 -0.154291
 2019-08-17 -0.191294 -0.171361  0.076184 -0.148705
 2019-08-18 -1.061130 -0.079034  0.436450 -0.951716)

In [69]:
# 기존 데이터 df에 새로운 컬럼을 추가한다 !! (아주 중요) => 파생변수
# 기존  df의 1차원과 동수의 데이터가 존재해야 한다
# 데이터는 리스트 ok, Series도 ok
new_data = ['one','one','two','three','four','three','five']
# 데이터 추가, 대상[ 신규컬럼명 ] = 데이터
df['E'] = new_data
df.head(2)

Unnamed: 0,A,B,C,D,E
2019-08-12,-0.986232,0.210793,0.248451,0.518318,one
2019-08-13,0.127024,-0.396848,-1.135956,-0.018935,one


In [70]:
# 데이터 조사
# 안에 그런값이 잇는가?
df['E'].isin( ['two','four'] )

2019-08-12    False
2019-08-13    False
2019-08-14     True
2019-08-15    False
2019-08-16     True
2019-08-17    False
2019-08-18    False
Freq: D, Name: E, dtype: bool

In [71]:
df[ df['E'].isin( ['two','four'] )  ]

Unnamed: 0,A,B,C,D,E
2019-08-14,0.127899,-1.379064,-0.961807,0.431541,two
2019-08-16,-0.403054,-1.276073,0.386737,-0.154291,four


In [72]:
# 누적합
# apply( 함수를 표현 ) => 맴버들을 다 건드린다
df.apply( np.cumsum )

Unnamed: 0,A,B,C,D,E
2019-08-12,-0.986232,0.210793,0.248451,0.518318,one
2019-08-13,-0.859208,-0.186055,-0.887505,0.499384,oneone
2019-08-14,-0.731309,-1.565119,-1.849312,0.930925,oneonetwo
2019-08-15,-0.477442,-1.713533,-0.749281,2.008455,oneonetwothree
2019-08-16,-0.880496,-2.989606,-0.362545,1.854163,oneonetwothreefour
2019-08-17,-1.07179,-3.160967,-0.286361,1.705458,oneonetwothreefourthree
2019-08-18,-2.132921,-3.24,0.150089,0.753742,oneonetwothreefourthreefive


In [75]:
try:    
    # 제거
    df.drop( ['E'], inplace=True, axis=1 )
    df
except Exception as e:
    pass

KeyError: "labels ['E'] not contained in axis"