# [Pandas 10분 완성](https://dataitgirls2.github.io/10minutes2pandas/)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Object Creation (객체 생성)
* [데이터 구조 소개 섹션](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html)을 참조
* 값을 가진 리스트를 전달하여 Series를 만들면, Pandas가 정수로 된 인덱스를 기본값으로 생성합니다.

In [4]:
# Series built from a plain list; np.nan marks the missing entry and no
# index is passed, so the default integer index is used.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

* datetime 인덱스와 레이블이 있는 열을 지정하고, numpy 배열을 전달하여 데이터프레임을 만듭니다.

In [5]:
# Six consecutive timestamps starting at 2013-01-01; used as the row index
# of the DataFrame built in the next cell.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# 6x4 frame of random normal draws: one row per entry of `dates`,
# columns labelled A through D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.354597,-1.177537,1.044465,-0.827949
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-03,1.611715,0.570023,-0.695172,0.700136
2013-01-04,-0.520892,1.541614,-0.775921,-1.201509
2013-01-05,-2.102299,0.431392,0.185523,2.351049
2013-01-06,-0.071998,-1.685459,-0.482766,0.735897


* Series와 같은 것으로 변환될 수 있는 객체들의 dict로 구성된 데이터프레임을 만듭니다.

In [16]:
# Frame built from a dict of differently-typed column specs. The scalar
# entries (A, B, F) are repeated for every row of the length-4 columns.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [25]:
# Inspect the dtype of each df2 column; the columns were built from
# different kinds of objects, so the dtypes differ.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# 2. Viewing Data (데이터 확인하기)
* [Basic Section](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html)을 참조

In [26]:
# Preview the top of the frame; called without an argument, the output
# shows the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.354597,-1.177537,1.044465,-0.827949
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-03,1.611715,0.570023,-0.695172,0.700136
2013-01-04,-0.520892,1.541614,-0.775921,-1.201509
2013-01-05,-2.102299,0.431392,0.185523,2.351049


In [27]:
# Preview the bottom of the frame; called without an argument, the output
# shows the last five rows.
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-03,1.611715,0.570023,-0.695172,0.700136
2013-01-04,-0.520892,1.541614,-0.775921,-1.201509
2013-01-05,-2.102299,0.431392,0.185523,2.351049
2013-01-06,-0.071998,-1.685459,-0.482766,0.735897


In [28]:
# Row labels of df: the DatetimeIndex that was passed as `index=dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [29]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [30]:
# NumPy representation of the underlying data; the row and column labels
# are not part of the returned array. to_numpy() is the accessor the pandas
# documentation recommends in place of the older .values attribute.
df.to_numpy()

array([[ 0.35459675, -1.17753657,  1.04446507, -0.82794887],
       [ 0.04446469, -0.19637628,  1.70462824, -0.15575855],
       [ 1.6117154 ,  0.57002289, -0.69517247,  0.70013554],
       [-0.5208921 ,  1.5416138 , -0.77592064, -1.2015088 ],
       [-2.10229879,  0.4313924 ,  0.18552308,  2.35104924],
       [-0.07199841, -1.68545871, -0.48276597,  0.73589686]])

In [31]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.114069,-0.086057,0.16346,0.266978
std,1.211895,1.1924,1.018739,1.287236
min,-2.102299,-1.685459,-0.775921,-1.201509
25%,-0.408669,-0.932247,-0.642071,-0.659901
50%,-0.013767,0.117508,-0.148621,0.272188
75%,0.277064,0.535365,0.82973,0.726957
max,1.611715,1.541614,1.704628,2.351049


* 데이터를 전치합니다 (행과 열을 서로 바꿉니다).

In [32]:
# Transpose: the dates become the columns and A-D become the rows.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.354597,0.044465,1.611715,-0.520892,-2.102299,-0.071998
B,-1.177537,-0.196376,0.570023,1.541614,0.431392,-1.685459
C,1.044465,1.704628,-0.695172,-0.775921,0.185523,-0.482766
D,-0.827949,-0.155759,0.700136,-1.201509,2.351049,0.735897


* 축 별 정렬

In [47]:
# Sort along axis=1, i.e. by column label, in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.827949,1.044465,-1.177537,0.354597
2013-01-02,-0.155759,1.704628,-0.196376,0.044465
2013-01-03,0.700136,-0.695172,0.570023,1.611715
2013-01-04,-1.201509,-0.775921,1.541614,-0.520892
2013-01-05,2.351049,0.185523,0.431392,-2.102299
2013-01-06,0.735897,-0.482766,-1.685459,-0.071998


* 값 별 정렬

In [46]:
# Reorder the rows by the values in column B (smallest first in the output).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-06,-0.071998,-1.685459,-0.482766,0.735897
2013-01-01,0.354597,-1.177537,1.044465,-0.827949
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-05,-2.102299,0.431392,0.185523,2.351049
2013-01-03,1.611715,0.570023,-0.695172,0.700136
2013-01-04,-0.520892,1.541614,-0.775921,-1.201509


# 3. Selection (선택)
* 주석 (Note) : 선택과 설정을 위한 Python / Numpy의 표준 표현식은 직관적이고 대화형(interactive) 작업에 유용하지만, 실제 서비스용(production) 코드에서는 Pandas에 최적화된 데이터 접근 방법인 .at, .iat, .loc 및 .iloc 을 추천합니다.

* [데이터 인덱싱 및 선택](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html) 문서와 [다중 인덱싱 / 심화 인덱싱 문서](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)를 참조하세요.

### Getting (데이터 얻기)
* 단일 열을 선택하면 Series가 반환되며, 이는 df.A 와 동일합니다.

In [48]:
# Select a single column by label; the result is a Series named A.
df['A']

2013-01-01    0.354597
2013-01-02    0.044465
2013-01-03    1.611715
2013-01-04   -0.520892
2013-01-05   -2.102299
2013-01-06   -0.071998
Freq: D, Name: A, dtype: float64

* 행을 분할하는 [ ]를 통해 선택합니다.

In [49]:
# Plain [] with an integer slice selects rows by position: rows 0, 1 and 2.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.354597,-1.177537,1.044465,-0.827949
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-03,1.611715,0.570023,-0.695172,0.700136


In [50]:
# Plain [] with a slice of date strings selects rows by label; both
# endpoints appear in the output.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-03,1.611715,0.570023,-0.695172,0.700136
2013-01-04,-0.520892,1.541614,-0.775921,-1.201509


### Selection by Label (Label 을 통한 선택)
* [Label을 통한 선택](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-label)에서 더 많은 내용을 확인하세요.

* 라벨을 사용하여 횡단면을 얻습니다.

In [51]:
# Cross-section by label: the row for the first date, returned as a Series.
df.loc[dates[0]]

A    0.354597
B   -1.177537
C    1.044465
D   -0.827949
Name: 2013-01-01 00:00:00, dtype: float64

* 라벨을 사용하여 여러 축(의 데이터)를 얻습니다.

In [52]:
# Every row (:), restricted to the columns labelled A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.354597,-1.177537
2013-01-02,0.044465,-0.196376
2013-01-03,1.611715,0.570023
2013-01-04,-0.520892,1.541614
2013-01-05,-2.102299,0.431392
2013-01-06,-0.071998,-1.685459


* 양쪽 종단점을 포함한 라벨 슬라이싱을 봅니다. (아래 코드 두 줄은 같은 결과를 출력합니다)

In [54]:
# Label slice on the rows (both endpoints included) with a list of columns.
df.loc['20130102' : '20130104', ['A', 'B']]
# Equivalent form using a label slice for the columns as well:
#   df.loc['20130102' : '20130104', 'A' : 'B']

Unnamed: 0,A,B
2013-01-02,0.044465,-0.196376
2013-01-03,1.611715,0.570023
2013-01-04,-0.520892,1.541614


* 반환되는 객체의 차원을 줄입니다.

In [55]:
# A single row label with a column list yields a Series rather than a
# DataFrame.
df.loc['20130102',['A','B']]

A    0.044465
B   -0.196376
Name: 2013-01-02 00:00:00, dtype: float64

* 스칼라 값을 얻습니다.

In [56]:
# One row label and one column label give a single scalar value.
df.loc[dates[0],'A']

0.35459675136496255

* 스칼라 값을 더 빠르게 구하는 방법입니다 (앞선 메소드와 동일합니다).

In [57]:
# Same scalar lookup through the .at accessor; it returns the same value
# as df.loc[dates[0], 'A'].
df.at[dates[0], 'A']

0.35459675136496255

### Selection by Position (위치로 선택하기)
* 자세한 내용은 [위치로 선택하기](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-integer)를 참고

* 넘겨받은 정수의 위치를 기준으로 선택

In [58]:
# Row at integer position 3, i.e. the fourth row (2013-01-04).
df.iloc[3]

A   -0.520892
B    1.541614
C   -0.775921
D   -1.201509
Name: 2013-01-04 00:00:00, dtype: float64

* 정수로 표기된 슬라이스들을 통해, numpy / python과 유사하게 작동합니다.

In [60]:
# Integer slices: rows 3-4 and columns 0-1; the end positions are excluded.
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.520892,1.541614
2013-01-05,-2.102299,0.431392


* 정수로 표기된 위치값의 리스트들을 통해, numpy / python의 스타일과 유사해집니다.

In [61]:
# Lists of integer positions: rows 1, 2, 4 and columns 0, 2 (A and C).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.044465,1.704628
2013-01-03,1.611715,-0.695172
2013-01-05,-2.102299,0.185523


* 명시적으로 행을 나누고자 하는 경우입니다.

In [62]:
# Slice rows 1-2 by position and keep every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,0.044465,-0.196376,1.704628,-0.155759
2013-01-03,1.611715,0.570023,-0.695172,0.700136


* 명시적으로 열을 나누고자 하는 경우입니다.

In [63]:
# Keep every row and slice columns 1-2 (B and C) by position.
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-1.177537,1.044465
2013-01-02,-0.196376,1.704628
2013-01-03,0.570023,-0.695172
2013-01-04,1.541614,-0.775921
2013-01-05,0.431392,0.185523
2013-01-06,-1.685459,-0.482766


* 명시적으로 (특정한) 값을 얻고자 하는 경우입니다. 예: `df.iloc[1, 1]`

* 스칼라 값을 빠르게 얻는 방법입니다 (위의 방식과 동일합니다). 예: `df.iat[1, 1]`

### Boolean Indexing
* 데이터를 선택하기 위해 단일 열의 값을 사용합니다.