## pandas 기초
- pandas는 R의 데이터프레임과 같은 핵심 자료구조(시리즈, 데이터프레임)를 파이썬에 추가한 것이다
- numpy를 기반으로 구현되었고, numpy 대비 기능을 더 확장하여 재구현한 것
- Python Data Analysis Library
- [https://pandas.pydata.org/](https://pandas.pydata.org/)

In [2]:
import numpy as np
import pandas as pd

- 파이썬의 자료구조
> 수치형, 문자열, 리스트, 딕셔너리, 튜플, 집합, 불리언
- numpy의 자료구조
> ndarray(배열) : 배열의 데이터는 모두 같은 타입이다.
- pandas의 자료구조
> Series(시리즈), DataFrame(데이터프레임)
> DataFrame을 인덱싱하면 Series가, Series를 인덱싱하면 값(스칼라: 수치, 문자, 불리언, NaN(결측치))이 나온다
> Series : 인덱스와 데이터만 존재하는, 컬럼이 없는 자료구조
> DataFrame : 인덱스와 컬럼이 존재하는 자료구조
> NaN : 데이터가 없다, Not a Number -> np.nan

In [3]:
# Series
# 데이터를 정수로 넣었지만 np.nan이 float이므로 Series 전체가 float64로 변환(upcast)되었다
a = pd.Series( [1,3,5,np.nan,6,8] )
a

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [48]:
a = pd.Series( [1,3,5, np.nan, 6, 8] )
a

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# 속성 - 타입
a.dtype # float64: 소수점 사용(64비트)

dtype('float64')

In [49]:
a.dtype 

dtype('float64')

In [5]:
# 속성 - 크기
a.shape 
# 데이터는 1차원이고 총 6개의 데이터가 존재한다는 뜻

(6,)

In [50]:
a.shape

(6,)

In [6]:
# DataFrame
# 인덱스와 컬럼이 존재하는 자료구조
cols = list('ABCD') # ['A','B','C','D']
indexs = pd.date_range( '20190812', periods=7 )
# 리스트화 방법1 . cols = 'ABCD' -> cols = list('ABCD') -> ['A','B','C','D']
# 컬럼4개, 인덱스7개
cols, indexs

(['A', 'B', 'C', 'D'],
 DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
                '2019-08-16', '2019-08-17', '2019-08-18'],
               dtype='datetime64[ns]', freq='D'))

In [51]:
cols = list('ABCD')
indexs = pd.date_range('20190812', periods=7)
cols, indexs

(['A', 'B', 'C', 'D'],
 DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
                '2019-08-16', '2019-08-17', '2019-08-18'],
               dtype='datetime64[ns]', freq='D'))

In [7]:
# 데이터는 shape => (7,4)
datas = np.random.randn(7,4) # 2차원 데이터 형태로 난수를 발생시킴
datas, datas.shape

(array([[ 0.3285959 ,  1.56755206,  0.18451751,  0.33825372],
        [ 0.02063608, -0.09052592, -1.77295848,  0.86369106],
        [-0.59054156,  1.46671762,  1.67073887,  0.18112947],
        [-0.12596677,  0.62459003, -1.65409809,  1.40846267],
        [ 0.28560944, -1.98837932, -0.07278004, -0.02213298],
        [ 0.47827202, -0.71834425, -0.77069922,  0.67026419],
        [ 0.01121641,  1.63336167,  0.25195699,  0.73367378]]), (7, 4))

In [52]:
datas = np.random.randn(7,4)
datas, datas.shape

(array([[ 0.90758089,  1.16077216, -1.52652386, -0.29442157],
        [ 0.88229785,  0.02107795, -0.55832362, -0.93381873],
        [ 0.27811392, -0.82657218,  1.58407307,  0.42921015],
        [ 0.0371962 , -1.44920457,  0.19009528, -0.32968768],
        [ 0.65571379,  0.09801279,  1.20025475,  0.1850579 ],
        [-0.58061546,  0.54285379,  0.90698779,  0.74882298],
        [ 0.41892268,  0.29930723,  1.58302521, -0.73046024]]), (7, 4))

In [8]:
# df 생성
df = pd.DataFrame( datas, index=indexs, columns=cols ) # index= : 기본값 부여 파라미터.

- **데이터를 로드하여 DataFrame을 만든 후 점검할 사항**

In [9]:
df.head(3)

Unnamed: 0,A,B,C,D
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-14,-0.590542,1.466718,1.670739,0.181129


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D
2019-08-16,0.285609,-1.988379,-0.07278,-0.022133
2019-08-17,0.478272,-0.718344,-0.770699,0.670264
2019-08-18,0.011216,1.633362,0.251957,0.733674


In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.values

array([[ 0.3285959 ,  1.56755206,  0.18451751,  0.33825372],
       [ 0.02063608, -0.09052592, -1.77295848,  0.86369106],
       [-0.59054156,  1.46671762,  1.67073887,  0.18112947],
       [-0.12596677,  0.62459003, -1.65409809,  1.40846267],
       [ 0.28560944, -1.98837932, -0.07278004, -0.02213298],
       [ 0.47827202, -0.71834425, -0.77069922,  0.67026419],
       [ 0.01121641,  1.63336167,  0.25195699,  0.73367378]])

In [13]:
type(df.values)

numpy.ndarray

In [14]:
df.shape

(7, 4)

In [15]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [16]:
# df의 개요
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2019-08-12 to 2019-08-18
Freq: D
Data columns (total 4 columns):
A    7 non-null float64
B    7 non-null float64
C    7 non-null float64
D    7 non-null float64
dtypes: float64(4)
memory usage: 280.0 bytes


In [17]:
# 통계요약: 개수, 평균, 표준편차, 최소, 25%, 50%, 75%, 최대
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,0.05826,0.356425,-0.309046,0.596192
std,0.35583,1.369614,1.204092,0.479038
min,-0.590542,-1.988379,-1.772958,-0.022133
25%,-0.057375,-0.404435,-1.212399,0.259692
50%,0.020636,0.62459,-0.07278,0.670264
75%,0.307103,1.517135,0.218237,0.798682
max,0.478272,1.633362,1.670739,1.408463


In [18]:
# B열 기준 데이터를 정렬, 내림차순
df.sort_values( by='B', ascending=False ) # sort : 정렬, ascending : 오름차순

Unnamed: 0,A,B,C,D
2019-08-18,0.011216,1.633362,0.251957,0.733674
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-14,-0.590542,1.466718,1.670739,0.181129
2019-08-15,-0.125967,0.62459,-1.654098,1.408463
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-17,0.478272,-0.718344,-0.770699,0.670264
2019-08-16,0.285609,-1.988379,-0.07278,-0.022133


In [19]:
# 특정 컬럼의 데이터만 보기 => 인덱싱 => 차원축소
df['C'], type(df['C'])

(2019-08-12    0.184518
 2019-08-13   -1.772958
 2019-08-14    1.670739
 2019-08-15   -1.654098
 2019-08-16   -0.072780
 2019-08-17   -0.770699
 2019-08-18    0.251957
 Freq: D, Name: C, dtype: float64, pandas.core.series.Series)

In [20]:
# 슬라이싱 : 차원유지
df[:] # 카피동일, 처음부터 끝까지 차원카피

Unnamed: 0,A,B,C,D
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-14,-0.590542,1.466718,1.670739,0.181129
2019-08-15,-0.125967,0.62459,-1.654098,1.408463
2019-08-16,0.285609,-1.988379,-0.07278,-0.022133
2019-08-17,0.478272,-0.718344,-0.770699,0.670264
2019-08-18,0.011216,1.633362,0.251957,0.733674


In [21]:
# 슬라이싱을 하면 차원이 유지된 채로 데이터가 잘려 나온다
# a <= x < b (정수 위치 기준, 끝 위치는 제외)
df[1:3] # 인덱스 3을 제외하고 1,2 가 들어온다

Unnamed: 0,A,B,C,D
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-14,-0.590542,1.466718,1.670739,0.181129


In [22]:
# 정수 위치가 아닌 인덱스 레이블(날짜 값)로 슬라이싱하기
# a <= x <= b (레이블 슬라이싱은 끝 값도 포함된다)
df[ '2019-08-13':'2019-08-15' ]

Unnamed: 0,A,B,C,D
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-14,-0.590542,1.466718,1.670739,0.181129
2019-08-15,-0.125967,0.62459,-1.654098,1.408463


- 전통적인 인덱싱과 슬라이싱만으로는 표현에 한계가 있다
- 이를 극복하기 위해 pandas만의 데이터 추출법이 추가되었다
- loc, iloc <= 이 2개를 주로 사용한다
- 연속 데이터에 대한 추출 <-> 비연속 데이터에 대한 추출 (팬시 인덱싱, 쿼리 수행 등)

### loc

In [23]:
# 지금까지 DataFrame의 구조를 살펴보았으니, 이제 데이터를 추출하는 방법을 알아본다
# loc : 레이블(인덱스명, 컬럼명)을 기준으로 인덱싱/슬라이싱을 지원
# loc를 통한 데이터 추출
# df.loc[ '인덱스명' ]
df.loc[ '2019-08-12' ], type(df.loc[ '2019-08-12' ]) # []로 진행된다 : loc은 함수라기보단 맴버변수에 가깝다는 뜻
# 전체 데이터 중에서 8월 12일에 해당하는 행만 추출. df['...'] 형태의 인덱싱은 컬럼을 선택하므로 특정 행 하나를 지정할 수 없다

(A    0.328596
 B    1.567552
 C    0.184518
 D    0.338254
 Name: 2019-08-12 00:00:00, dtype: float64, pandas.core.series.Series)

In [24]:
# 전체 슬라이스 -> 원본과 같은 내용이 그대로 나온다 (독립적인 복사본이 필요하면 df.copy()를 사용)
df.loc[ : ] 

Unnamed: 0,A,B,C,D
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-14,-0.590542,1.466718,1.670739,0.181129
2019-08-15,-0.125967,0.62459,-1.654098,1.408463
2019-08-16,0.285609,-1.988379,-0.07278,-0.022133
2019-08-17,0.478272,-0.718344,-0.770699,0.670264
2019-08-18,0.011216,1.633362,0.251957,0.733674


In [25]:
# 행(첫 번째 축)은 전부 포함시키고, 열(두 번째 축)은 A,C만 포함
# 행 조건과 열 조건을 콤마로 구분하여 각각 지정한다는 것이 중요하다
df.loc[ : , ['A','C'] ] 

Unnamed: 0,A,C
2019-08-12,0.328596,0.184518
2019-08-13,0.020636,-1.772958
2019-08-14,-0.590542,1.670739
2019-08-15,-0.125967,-1.654098
2019-08-16,0.285609,-0.07278
2019-08-17,0.478272,-0.770699
2019-08-18,0.011216,0.251957


In [26]:
# 컬럼을 스칼라('A')로 지정 -> 차원이 축소되어 Series가 된다
df.loc[ : , 'A' ]

2019-08-12    0.328596
2019-08-13    0.020636
2019-08-14   -0.590542
2019-08-15   -0.125967
2019-08-16    0.285609
2019-08-17    0.478272
2019-08-18    0.011216
Freq: D, Name: A, dtype: float64

In [27]:
# 컬럼을 리스트(['A'])로 지정 -> 차원이 유지되어 DataFrame이 된다
df.loc[ : , ['A'] ]

Unnamed: 0,A
2019-08-12,0.328596
2019-08-13,0.020636
2019-08-14,-0.590542
2019-08-15,-0.125967
2019-08-16,0.285609
2019-08-17,0.478272
2019-08-18,0.011216


In [28]:
# 차원유지
df.loc[ '2019-08-13':'2019-08-15' , ['A','C'] ]

Unnamed: 0,A,C
2019-08-13,0.020636,-1.772958
2019-08-14,-0.590542,1.670739
2019-08-15,-0.125967,-1.654098


In [29]:
# 차원축소 -> 인덱스를 한개만 지정
df.loc[ '2019-08-13' , ['A','C'] ]

A    0.020636
C   -1.772958
Name: 2019-08-13 00:00:00, dtype: float64

In [30]:
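# 인덱스를 문자열 리스트로 지정 : pandas 버전에 따라 KeyError가 발생할 수 있다 (TODO: 사용 중인 버전에서 확인)
#df.loc[ ['2019-08-13'] , ['A','C'] ]
# error : 리스트 리터럴 안에는 슬라이스(a:b)를 쓸 수 없다 -> SyntaxError
#df.loc[ ['2019-08-13':'2019-08-13'] , ['A','C'] ]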

In [31]:
df.loc[ '2019-08-13':'2019-08-13' , ['A','C'] ]

Unnamed: 0,A,C
2019-08-13,0.020636,-1.772958


In [32]:
# 차원축소가 2회 진행하는 것이다. -> 스칼라(값)
df.loc[ '2019-08-13' , 'A' ]

0.02063608244495575

### iloc
- 팬시 인덱싱과 유사하다
- 행과 열의 번호(정수 위치)를 이용하여 데이터에 접근하는 방식
- i -> integer (정수 위치 기반, integer-location)

In [33]:
df

Unnamed: 0,A,B,C,D
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-13,0.020636,-0.090526,-1.772958,0.863691
2019-08-14,-0.590542,1.466718,1.670739,0.181129
2019-08-15,-0.125967,0.62459,-1.654098,1.408463
2019-08-16,0.285609,-1.988379,-0.07278,-0.022133
2019-08-17,0.478272,-0.718344,-0.770699,0.670264
2019-08-18,0.011216,1.633362,0.251957,0.733674


In [34]:
# 2019-08-13일자 데이터 추출
# 1 => 위치(0부터 시작)가 1인 행, 즉 두 번째 행
df.iloc[ 1 ]

A    0.020636
B   -0.090526
C   -1.772958
D    0.863691
Name: 2019-08-13 00:00:00, dtype: float64

In [35]:
# iloc 슬라이싱
# a <= index < b, c <= column < d
df.iloc[ 1:3, 1:3 ]

Unnamed: 0,B,C
2019-08-13,-0.090526,-1.772958
2019-08-14,1.466718,1.670739


In [36]:
# iloc + 펜시인덱싱 기법 사용 ( 인덱스, 컬럼을 비연속적 위치를 나열 )
df.iloc[ [1,4,2] , [0,2] ]

Unnamed: 0,A,C
2019-08-13,0.020636,-1.772958
2019-08-16,0.285609,-0.07278
2019-08-14,-0.590542,1.670739


In [37]:
# 특정 조건을 만족하는 데이터만 추출
# 데이터프레임이 생성되면 컬럼명으로 속성처럼 접근할 수 있다 : df.(자동완성 목록에 A,B,C,D가 나옴)
# C컬럼에 존재하는 데이터 중에 양수만(양수면: True, 그 외: False)
# 조건을 부여하여 불리언 데이터를 만들어 참인 행만 포함시키는 방식 : 불리언(참, 거짓 두 값만 갖는 데이터) 인덱싱
# 슬라이싱이 아니라 불리언 마스크로 행을 선택하는 기법이다
# 특정 조건을 만족하는 행만 남기고 싶으면 조건식을 쓰면 된다
# [ T, F, T, F, F, F, T ] 마스크를 적용하면 True인 행만 남아서 아래와 같은 결과가 나온다
# df에 식을 쓰면 => 전체 구성원에 대해 연산이 진행된다
# 행렬 (연산) 값 => 각 구성원에 일일이 연산하는 것과 동일
df[ df.C > 0 ] # -> 벡터에 스칼라 기법이기때문에 부등호가 들어간 수식작성 가능

Unnamed: 0,A,B,C,D
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-14,-0.590542,1.466718,1.670739,0.181129
2019-08-18,0.011216,1.633362,0.251957,0.733674


In [38]:
# 데이터 전체를 기준으로 0보다 큰가? -> 조건을 만족하는 값은 유지되고 나머지는 NaN으로 대체(결과확인)
#                                    -> 0보다 작거나 같은 데이터들은 NaN으로 대체
df[ df > 0 ]

Unnamed: 0,A,B,C,D
2019-08-12,0.328596,1.567552,0.184518,0.338254
2019-08-13,0.020636,,,0.863691
2019-08-14,,1.466718,1.670739,0.181129
2019-08-15,,0.62459,,1.408463
2019-08-16,0.285609,,,
2019-08-17,0.478272,,,0.670264
2019-08-18,0.011216,1.633362,0.251957,0.733674


In [39]:
# 복사 : df.copy()는 독립적인 복사본, df[:]는 전체 슬라이스(원본과 데이터를 공유하는 뷰일 수 있으므로 독립 복사본으로 쓰지 말 것)
df.copy(), df[:]

(                   A         B         C         D
 2019-08-12  0.328596  1.567552  0.184518  0.338254
 2019-08-13  0.020636 -0.090526 -1.772958  0.863691
 2019-08-14 -0.590542  1.466718  1.670739  0.181129
 2019-08-15 -0.125967  0.624590 -1.654098  1.408463
 2019-08-16  0.285609 -1.988379 -0.072780 -0.022133
 2019-08-17  0.478272 -0.718344 -0.770699  0.670264
 2019-08-18  0.011216  1.633362  0.251957  0.733674,
                    A         B         C         D
 2019-08-12  0.328596  1.567552  0.184518  0.338254
 2019-08-13  0.020636 -0.090526 -1.772958  0.863691
 2019-08-14 -0.590542  1.466718  1.670739  0.181129
 2019-08-15 -0.125967  0.624590 -1.654098  1.408463
 2019-08-16  0.285609 -1.988379 -0.072780 -0.022133
 2019-08-17  0.478272 -0.718344 -0.770699  0.670264
 2019-08-18  0.011216  1.633362  0.251957  0.733674)

In [40]:
# 기존 데이터 df에 새로운 컬럼을 추가한다 !! ( 아주 중요 ) => 파생변수, 뒤로 갈수록 줄줄이 나올 것으로 예상됨
# 기존 df의 1차원과 동수의 데이터가 존재해야 한다
# 데이터는 리스트 ok, Series도 ok
new_data = ['one','one','two','three','four','three','five']
# 데이터 추가 : 대상[ 신규컬럼명 ] = 데이터
df['E'] = new_data 
df.head(2)

Unnamed: 0,A,B,C,D,E
2019-08-12,0.328596,1.567552,0.184518,0.338254,one
2019-08-13,0.020636,-0.090526,-1.772958,0.863691,one


In [41]:
# 데이터 조사
# 안에 그런값이 있는가? 
df['E'].isin(['two','four'])

2019-08-12    False
2019-08-13    False
2019-08-14     True
2019-08-15    False
2019-08-16     True
2019-08-17    False
2019-08-18    False
Freq: D, Name: E, dtype: bool

In [42]:
df[df['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2019-08-14,-0.590542,1.466718,1.670739,0.181129,two
2019-08-16,0.285609,-1.988379,-0.07278,-0.022133,four


In [43]:
# 누적합
# apply( 함수 ) => 기본적으로 각 컬럼(Series) 단위로 함수를 적용한다 (axis=0)
# 문자열 컬럼 E는 덧셈이 문자열 연결로 동작하므로 누적 연결 결과가 된다
df.apply( np.cumsum )

Unnamed: 0,A,B,C,D,E
2019-08-12,0.328596,1.567552,0.184518,0.338254,one
2019-08-13,0.349232,1.477026,-1.588441,1.201945,oneone
2019-08-14,-0.24131,2.943744,0.082298,1.383074,oneonetwo
2019-08-15,-0.367276,3.568334,-1.5718,2.791537,oneonetwothree
2019-08-16,-0.081667,1.579954,-1.64458,2.769404,oneonetwothreefour
2019-08-17,0.396605,0.86161,-2.415279,3.439668,oneonetwothreefourthree
2019-08-18,0.407822,2.494972,-2.163322,4.173342,oneonetwothreefourthreefive


In [44]:
# Remove the helper column 'E' in place so that the numeric reductions in the
# following cells only see columns A-D.
# errors='ignore' keeps this cell safe to re-run after 'E' has already been
# dropped, without hiding unrelated failures the way a blanket
# `except Exception: pass` would.
df.drop( columns=['E'], inplace=True, errors='ignore' )

In [45]:
# 각 컬럼의 최대값에서 최소값을 뺀 값 => 거리:distance
df.apply( lambda x:x.max()-x.min() )

A    1.068814
B    3.621741
C    3.443697
D    1.430596
dtype: float64

In [46]:
df.max(), df.min() # max, min 값만 봤을때: 차원축소가 되어서 Series가 된다.

(A    0.478272
 B    1.633362
 C    1.670739
 D    1.408463
 dtype: float64, A   -0.590542
 B   -1.988379
 C   -1.772958
 D   -0.022133
 dtype: float64)