In [1]:
import pandas as pd
import numpy as np

### pandas 의 버전 확인

참고자료:

`10 minutes to pandas`, `askdjango`, `구글링`

In [2]:
pd.__version__

'0.23.4'

# 10 Minutes to pandas 튜토리얼

### EP1: Object creation(객체 생성)

In [3]:
s = pd.Series([1,3,5,np.nan,6,8]) #NaN = Not a Number
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### date_range 를 사용하여 날짜 생성

In [4]:
dates = pd.date_range('20180304', periods=20)
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array) 를 생성

In [5]:
np.random.randn(6,4) # 6행 4열의 2차원 랜덤한 array 를 생성

array([[ 4.59081389e-01, -1.12869306e+00,  2.69274099e+00,
        -3.64585904e-01],
       [-6.43209530e-01,  6.00650804e-04,  1.53822323e+00,
        -1.83938279e+00],
       [ 1.70232215e+00, -8.97962860e-01, -2.62646415e-01,
        -1.31935032e+00],
       [ 7.62793510e-01,  5.49635018e-01,  9.10210516e-01,
        -1.73475364e+00],
       [ 3.62775517e-01,  3.59156288e-01, -1.11084570e+00,
        -7.75852926e-01],
       [-5.10352726e-01,  2.07149780e+00, -9.20242296e-01,
        -2.55541664e-01]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
# 2차원 데이터 구조인 DataFrame 으로 생성

pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2', '인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,-1.19004,-0.650851,1.526844,0.397755
인덱스1,-0.298046,-0.625153,0.439832,0.61852
인덱스2,0.117274,-2.376662,-0.361141,-0.276977
인덱스3,0.459043,1.116591,0.466648,-1.750471
인덱스4,0.10815,0.64304,-2.005338,-0.684018
인덱스5,0.029832,-1.758634,-0.353506,0.548303


# 만약 위에서 생성한 dates를 index 로 넣게 된다면?

행이 맞지 않기 때문에 `ValueError: Shape of passed values is (4, 6), indices imply (4, 20)` 오류가 발생


### 꼭 values 의 길이를 일치해야 에러없이 데이터 테이블이 생성된다

#### + 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head 를 넣어주자

In [7]:
df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=['A','B','C','D'])

print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,-0.703106,0.65374,-1.257482,0.965541
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493
2018-03-07,1.090962,1.989445,-0.57396,-0.715516
2018-03-08,-1.013661,0.404164,-0.365994,-2.269701


### 직렬로 변환할 수 있는 object 를 전달하여 DataFrame 생성하기

In [8]:
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes # 스마트하게 각 인덱스의 데이터타입을 출력해준다

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EP2: Viewing Data

----------------------

Data의 값을 찾고 해당 데이터의 변경을 해본다

```
|        | columns1 | columns2 | columns3 | columns4 |
|--------|----------|----------|----------|----------|
| index1 |          |          |          |          |
| index2 |          |          |          |          |
| index3 |          |          |          |          |
```

### index 값 찾기

In [10]:
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns 의 값 찾기/변경하기

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.columns = ['A', 'B', 'C', 'D']

### DataFrame의 요약 정보를 알아보기

In [13]:
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,-0.263461,-0.358772,0.052354,-0.08548
std,1.037438,0.975795,0.891763,1.059816
min,-2.317784,-1.834366,-1.287916,-2.269701
25%,-0.79574,-1.057945,-0.600179,-0.725772
50%,-0.175319,-0.423575,-0.114012,-0.072497
75%,0.387284,0.109467,0.867335,0.361035
max,1.834355,1.989445,1.552706,2.759436


### index 와 column 을 바꿔주는 `.T`

In [14]:
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
A,-0.703106,0.547757,-0.130877,1.090962,-1.013661,0.118995,-0.219761,-0.333685,0.656796,0.179418,-0.015846,0.333792,-0.793051,-1.365017,0.647727,1.834355,-2.317784,-0.803808,-2.302691,-0.679732
B,0.65374,-0.519178,-0.170858,1.989445,0.404164,-0.461176,-1.574852,-0.948865,-0.385973,-0.625696,-0.063043,-1.834366,-0.778356,0.365965,-0.00362,-1.464695,1.111655,0.023968,-1.385183,-1.508508
C,-1.257482,-0.392001,0.614137,-0.57396,-0.365994,-0.268378,0.839809,0.949915,0.110937,-0.790553,-0.678836,1.103465,1.552706,0.0016,-1.172329,1.34325,0.99079,-1.287916,0.557555,-0.229623
D,0.965541,-1.193207,-1.084493,-0.715516,-2.269701,-0.75654,0.167952,0.701913,0.077588,0.480335,0.321268,-0.56434,-0.030697,2.759436,0.255908,-1.320738,-0.223556,1.010108,-0.114297,-0.176565


### 원하는 index 기준으로 다시 정렬하기

In [15]:
df.sort_index(ascending=False).head()

Unnamed: 0,A,B,C,D
2018-03-23,-0.679732,-1.508508,-0.229623,-0.176565
2018-03-22,-2.302691,-1.385183,0.557555,-0.114297
2018-03-21,-0.803808,0.023968,-1.287916,1.010108
2018-03-20,-2.317784,1.111655,0.99079,-0.223556
2018-03-19,1.834355,-1.464695,1.34325,-1.320738


### 특정 column을 기준으로 정렬하기

df.sort_values('<기준이 되는 column>')

In [16]:
df.sort_values('C').head()

Unnamed: 0,A,B,C,D
2018-03-21,-0.803808,0.023968,-1.287916,1.010108
2018-03-04,-0.703106,0.65374,-1.257482,0.965541
2018-03-18,0.647727,-0.00362,-1.172329,0.255908
2018-03-13,0.179418,-0.625696,-0.790553,0.480335
2018-03-14,-0.015846,-0.063043,-0.678836,0.321268


## EP3: Selection

특정 Column 을 선택해서 데이터를 확인

2가지 방법은 모두 동일한 결과값을 가지지만 `Column명이 "D-D" ` 와 같은 경우에는 마이너스(-) _연산자로 인식하여 (데이터 - 데이터)로 계산을 한다_

In [17]:
df['D'].head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04    0.965541
2018-03-05   -1.193207
2018-03-06   -1.084493
Freq: D, Name: D, dtype: float64

In [18]:
df.D.head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04    0.965541
2018-03-05   -1.193207
2018-03-06   -1.084493
Freq: D, Name: D, dtype: float64

### 슬라이싱하기

#### DataFrame 에서 `0이상~3미만` 값을 출력하기

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-03-04,-0.703106,0.65374,-1.257482,0.965541
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493


#### DataFrame 에서 `A이상 F이하`의 값을 출력하기

In [20]:
df['2018-03-04': '2018-03-06']

Unnamed: 0,A,B,C,D
2018-03-04,-0.703106,0.65374,-1.257482,0.965541
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493


#### 위에서 선언한 변수에 0번째 index 들의 값 출력하기

`dates = pd.date_range('20180304', periods=20)`


In [21]:
df.loc[dates[0]]

A   -0.703106
B    0.653740
C   -1.257482
D    0.965541
Name: 2018-03-04 00:00:00, dtype: float64

In [22]:
df.loc[:, ['A', 'D']].head() # 특정 index 의 값을 찾기

Unnamed: 0,A,D
2018-03-04,-0.703106,0.965541
2018-03-05,0.547757,-1.193207
2018-03-06,-0.130877,-1.084493
2018-03-07,1.090962,-0.715516
2018-03-08,-1.013661,-2.269701


In [23]:
df.loc['20180311': '20180316', ['A', 'C']]

Unnamed: 0,A,C
2018-03-11,-0.333685,0.949915
2018-03-12,0.656796,0.110937
2018-03-13,0.179418,-0.790553
2018-03-14,-0.015846,-0.678836
2018-03-15,0.333792,1.103465
2018-03-16,-0.793051,1.552706


In [24]:
# 1개의 행만 지정했기 때문에 Series 로 출력
df.loc['20180311', ['A', 'C']]

A   -0.333685
C    0.949915
Name: 2018-03-11 00:00:00, dtype: float64

In [25]:
# 행이던 열이던 한줄의 데이터 범위를 선택하면 Series 이다.
df.loc['20180311':'20180319', 'A']

2018-03-11   -0.333685
2018-03-12    0.656796
2018-03-13    0.179418
2018-03-14   -0.015846
2018-03-15    0.333792
2018-03-16   -0.793051
2018-03-17   -1.365017
2018-03-18    0.647727
2018-03-19    1.834355
Freq: D, Name: A, dtype: float64

### 세번째 행을 Selection 하기

In [26]:
df.head()

Unnamed: 0,A,B,C,D
2018-03-04,-0.703106,0.65374,-1.257482,0.965541
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493
2018-03-07,1.090962,1.989445,-0.57396,-0.715516
2018-03-08,-1.013661,0.404164,-0.365994,-2.269701


In [27]:
df.iloc[3] # index 3번째의 행 값을 출력

A    1.090962
B    1.989445
C   -0.573960
D   -0.715516
Name: 2018-03-07 00:00:00, dtype: float64

In [28]:
df.iloc[3:5, 0:2] # 3월 7일부터 3월 8일 // AA, BB 인덱스의 값 출력

Unnamed: 0,A,B
2018-03-07,1.090962,1.989445
2018-03-08,-1.013661,0.404164


In [29]:
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,A,C
2018-03-05,0.547757,-0.392001
2018-03-06,-0.130877,0.614137
2018-03-08,-1.013661,-0.365994


### *명시적으로 모든 행을 출력한다고 해줄 때는 범위가 없더라도 : 를 넣어준다

In [30]:
df.iloc[[1, 3], :]

Unnamed: 0,A,B,C,D
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-07,1.090962,1.989445,-0.57396,-0.715516


### Boolean Indexing

DataFrame 에서 0 이상의 값을 찾아보기

In [31]:
df['A'] > 0

2018-03-04    False
2018-03-05     True
2018-03-06    False
2018-03-07     True
2018-03-08    False
2018-03-09     True
2018-03-10    False
2018-03-11    False
2018-03-12     True
2018-03-13     True
2018-03-14    False
2018-03-15     True
2018-03-16    False
2018-03-17    False
2018-03-18     True
2018-03-19     True
2018-03-20    False
2018-03-21    False
2018-03-22    False
2018-03-23    False
Freq: D, Name: A, dtype: bool

### AA 인덱스에서 True 값만 가져오기

In [32]:
mask = df['A'] > 0
df[mask]

Unnamed: 0,A,B,C,D
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-07,1.090962,1.989445,-0.57396,-0.715516
2018-03-09,0.118995,-0.461176,-0.268378,-0.75654
2018-03-12,0.656796,-0.385973,0.110937,0.077588
2018-03-13,0.179418,-0.625696,-0.790553,0.480335
2018-03-15,0.333792,-1.834366,1.103465,-0.56434
2018-03-18,0.647727,-0.00362,-1.172329,0.255908
2018-03-19,1.834355,-1.464695,1.34325,-1.320738


### df2 로 복사하여 새로운 튜토리얼을 진행

In [33]:
df2 = df.copy()

In [34]:
df2['E'] = [
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
]
df2.head()

Unnamed: 0,A,B,C,D,E
2018-03-04,-0.703106,0.65374,-1.257482,0.965541,one
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,two
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,three
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,four
2018-03-08,-1.013661,0.404164,-0.365994,-2.269701,five


### 'one', 'two' 에 해당하는 값만 가져오기

In [35]:
mask = df2.E.isin(['one', 'two'])
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,-0.703106,0.65374,-1.257482,0.965541
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207
2018-03-09,0.118995,-0.461176,-0.268378,-0.75654
2018-03-10,-0.219761,-1.574852,0.839809,0.167952
2018-03-14,-0.015846,-0.063043,-0.678836,0.321268
2018-03-15,0.333792,-1.834366,1.103465,-0.56434
2018-03-19,1.834355,-1.464695,1.34325,-1.320738
2018-03-20,-2.317784,1.111655,0.99079,-0.223556


### 새로운 Series 를 정의하기

In [36]:
s1 = pd.Series(range(1, 21),
               index=pd.date_range(
                   '20180305', periods=20)
              ) # 의도적으로 3월 5일부터 작성
df['F'] = s1

In [37]:
df # 데이터프레임 F 를 생성했고, 값이 없으면 NaN 으로 출력됨을 확인

Unnamed: 0,A,B,C,D,F
2018-03-04,-0.703106,0.65374,-1.257482,0.965541,
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,1.0
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,2.0
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,3.0
2018-03-08,-1.013661,0.404164,-0.365994,-2.269701,4.0
2018-03-09,0.118995,-0.461176,-0.268378,-0.75654,5.0
2018-03-10,-0.219761,-1.574852,0.839809,0.167952,6.0
2018-03-11,-0.333685,-0.948865,0.949915,0.701913,7.0
2018-03-12,0.656796,-0.385973,0.110937,0.077588,8.0
2018-03-13,0.179418,-0.625696,-0.790553,0.480335,9.0


### 데이터프레임의 길이만큼 EE 인덱스의 값을 13으로 생성하시오

In [38]:
df.loc[:, 'E'] = [13] * len(df)
df

Unnamed: 0,A,B,C,D,F,E
2018-03-04,-0.703106,0.65374,-1.257482,0.965541,,13
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,1.0,13
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,2.0,13
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,3.0,13
2018-03-08,-1.013661,0.404164,-0.365994,-2.269701,4.0,13
2018-03-09,0.118995,-0.461176,-0.268378,-0.75654,5.0,13
2018-03-10,-0.219761,-1.574852,0.839809,0.167952,6.0,13
2018-03-11,-0.333685,-0.948865,0.949915,0.701913,7.0,13
2018-03-12,0.656796,-0.385973,0.110937,0.077588,8.0,13
2018-03-13,0.179418,-0.625696,-0.790553,0.480335,9.0,13


### 데이터 프레임의 모든 수를 음수로 전환하기

In [39]:
df2 = df.copy()

In [40]:
df2[df2 > 0] = -df2 # 모든 수를 음수로 전환

In [41]:
df2.head()

Unnamed: 0,A,B,C,D,F,E
2018-03-04,-0.703106,-0.65374,-1.257482,-0.965541,,-13
2018-03-05,-0.547757,-0.519178,-0.392001,-1.193207,-1.0,-13
2018-03-06,-0.130877,-0.170858,-0.614137,-1.084493,-2.0,-13
2018-03-07,-1.090962,-1.989445,-0.57396,-0.715516,-3.0,-13
2018-03-08,-1.013661,-0.404164,-0.365994,-2.269701,-4.0,-13


## Missing Data

### reindex 를 사용하여 지정된 축의 index 를 변경/추가/삭제

In [42]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [43]:
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,-0.703106,0.65374,-1.257482,0.965541,,13,13
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,1.0,13,13
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,2.0,13,13
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,3.0,13,13


### index[0], [1] 에 해당하는 E Column 의 값을 1로 변경하기

In [44]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,-0.703106,0.65374,-1.257482,0.965541,,1,1
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,1.0,1,1
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,2.0,13,13
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,3.0,13,13


### NaN 에 해당하는 행을 모두 지우기

In [45]:
df1.dropna()

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,1.0,1,1
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,2.0,13,13
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,3.0,13,13


### NaN 에 해당하는 값을 5로 대체하기

In [46]:
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,-0.703106,0.65374,-1.257482,0.965541,5.0,1,1
2018-03-05,0.547757,-0.519178,-0.392001,-1.193207,1.0,1,1
2018-03-06,-0.130877,-0.170858,0.614137,-1.084493,2.0,13,13
2018-03-07,1.090962,1.989445,-0.57396,-0.715516,3.0,13,13


### NaN 에 해당하는 값을 Boolean 값으로 표시하기

In [47]:
pd.isna(df)

Unnamed: 0,A,B,C,D,F,E
2018-03-04,False,False,False,False,True,False
2018-03-05,False,False,False,False,False,False
2018-03-06,False,False,False,False,False,False
2018-03-07,False,False,False,False,False,False
2018-03-08,False,False,False,False,False,False
2018-03-09,False,False,False,False,False,False
2018-03-10,False,False,False,False,False,False
2018-03-11,False,False,False,False,False,False
2018-03-12,False,False,False,False,False,False
2018-03-13,False,False,False,False,False,False
