In [1]:
import pandas as pd
import numpy as np

### pandas 의 버전 확인

참고자료:

`10 minutes to pandas`, `askdjango`, `구글링`

In [2]:
pd.__version__

'0.23.4'

# 10 Minutes to pandas 튜토리얼

### EP1: Object creation(객체 생성)

In [3]:
s = pd.Series([1,3,5,np.nan,6,8]) #NaN = Not a Number
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### date_range 를 사용하여 날짜 생성

In [4]:
dates = pd.date_range('20180304', periods=20)
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array) 를 생성

In [5]:
np.random.randn(6,4) # 6행 4열의 2차원 랜덤한 array 를 생성

array([[ 0.76835082, -0.94027806,  1.03812382, -1.51161557],
       [-0.53448072,  0.40780034, -0.22879399, -0.68522619],
       [ 0.20679524, -1.0499555 ,  2.19835926, -0.63592652],
       [-0.43799402, -1.68816798,  0.41004628, -0.99703253],
       [-1.20667477,  0.82709562,  0.27293776,  0.03174209],
       [ 0.82537751, -1.02862859, -0.37994225, -0.0060564 ]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
# 2차원 데이터 구조인 DataFrame 으로 생성

pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2', '인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,-0.660495,-0.139967,-1.273437,0.769692
인덱스1,-1.059665,-1.114148,-1.804457,-1.635893
인덱스2,0.453444,-1.348903,-1.145283,0.646543
인덱스3,-0.06451,0.846399,1.313728,-0.34448
인덱스4,0.516938,-0.604556,0.003495,0.358959
인덱스5,-0.711918,2.397363,-0.075824,-0.436567


# 만약 위에서 생성한 dates를 index 로 넣게 된다면?

행이 맞지 않기 때문에 `ValueError: Shape of passed values is (4, 6), indices imply (4, 20)` 오류가 발생


### 꼭 values 의 길이를 일치해야 에러없이 데이터 테이블이 생성된다

#### + 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head 를 넣어주자

In [7]:
df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=['A','B','C','D'])

print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,-0.252957,0.439122,0.385989,-0.871614
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-06,0.111545,-1.212578,0.70976,-0.406191
2018-03-07,0.587586,0.611694,-0.949295,0.047066
2018-03-08,-1.770163,0.558087,-0.979803,-0.577235


### 직렬로 변환할 수 있는 object 를 전달하여 DataFrame 생성하기

In [8]:
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes # 스마트하게 각 인덱스의 데이터타입을 출력해준다

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EP2: Viewing Data

----------------------

Data의 값을 찾고 해당 데이터의 변경을 해본다

```
|        | columns1 | columns2 | columns3 | columns4 |
|--------|----------|----------|----------|----------|
| index1 |          |          |          |          |
| index2 |          |          |          |          |
| index3 |          |          |          |          |
```

### index 값 찾기

In [10]:
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns 의 값 찾기/변경하기

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.columns = ['A', 'B', 'C', 'D']

### DataFrame의 요약 정보를 알아보기

In [13]:
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,-0.100435,0.279111,0.178873,0.160783
std,0.911732,0.889238,1.094696,0.751327
min,-2.047036,-1.212578,-2.410551,-1.003263
25%,-0.46621,-0.353288,-0.44537,-0.448952
50%,0.112502,0.241839,0.252628,0.164433
75%,0.348855,0.767826,0.892447,0.642413
max,1.823522,1.934884,2.535563,1.870247


### index 와 column 을 바꿔주는 `.T`

In [14]:
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
A,-0.252957,0.326848,0.111545,0.587586,-1.770163,-0.176269,0.270744,-0.619015,0.113459,0.414876,-0.321636,-0.847386,-1.277966,-0.415275,1.010141,0.188207,0.638815,-2.047036,0.233262,1.823522
B,0.439122,-0.025573,-1.212578,0.611694,0.558087,0.827106,-0.226251,0.748065,-0.439769,1.934884,1.849691,1.643273,-0.724695,-0.977803,0.24655,-0.329698,0.986284,-0.139238,-0.424058,0.237128
C,0.385989,0.119267,0.70976,-0.949295,-0.979803,1.047592,-0.270154,-0.183903,2.535563,-0.915241,1.680028,-0.090823,-0.422134,1.039589,-2.410551,0.482883,0.404751,-0.515078,0.8434,1.065618
D,-0.871614,0.801228,-0.406191,0.047066,-0.577235,-0.245867,0.791146,0.679426,0.009409,0.15957,-0.786624,0.51576,1.303832,0.169296,-1.003263,1.870247,0.469043,0.630075,-0.602422,0.26277


### 원하는 index 기준으로 다시 정렬하기

In [15]:
df.sort_index(ascending=False).head()

Unnamed: 0,A,B,C,D
2018-03-23,1.823522,0.237128,1.065618,0.26277
2018-03-22,0.233262,-0.424058,0.8434,-0.602422
2018-03-21,-2.047036,-0.139238,-0.515078,0.630075
2018-03-20,0.638815,0.986284,0.404751,0.469043
2018-03-19,0.188207,-0.329698,0.482883,1.870247


### 특정 column을 기준으로 정렬하기

df.sort_values('<기준이 되는 column>')

In [16]:
df.sort_values('C').head()

Unnamed: 0,A,B,C,D
2018-03-18,1.010141,0.24655,-2.410551,-1.003263
2018-03-08,-1.770163,0.558087,-0.979803,-0.577235
2018-03-07,0.587586,0.611694,-0.949295,0.047066
2018-03-13,0.414876,1.934884,-0.915241,0.15957
2018-03-21,-2.047036,-0.139238,-0.515078,0.630075


## EP3: Selection

특정 Column 을 선택해서 데이터를 확인

2가지 방법은 모두 동일한 결과값을 가지지만 `Column명이 "D-D" ` 와 같은 경우에는 마이너스(-) _연산자로 인식하여 (데이터 - 데이터)로 계산을 한다_

In [17]:
df['D'].head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04   -0.871614
2018-03-05    0.801228
2018-03-06   -0.406191
Freq: D, Name: D, dtype: float64

In [18]:
df.D.head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04   -0.871614
2018-03-05    0.801228
2018-03-06   -0.406191
Freq: D, Name: D, dtype: float64

### 슬라이싱하기

#### DataFrame 에서 `0이상~3미만` 값을 출력하기

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-03-04,-0.252957,0.439122,0.385989,-0.871614
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-06,0.111545,-1.212578,0.70976,-0.406191


#### DataFrame 에서 `A이상 F이하`의 값을 출력하기

In [20]:
df['2018-03-04': '2018-03-06']

Unnamed: 0,A,B,C,D
2018-03-04,-0.252957,0.439122,0.385989,-0.871614
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-06,0.111545,-1.212578,0.70976,-0.406191


#### 위에서 선언한 변수에 0번째 index 들의 값 출력하기

`dates = pd.date_range('20180304', periods=20)`


In [21]:
df.loc[dates[0]]

A   -0.252957
B    0.439122
C    0.385989
D   -0.871614
Name: 2018-03-04 00:00:00, dtype: float64

In [22]:
df.loc[:, ['A', 'D']].head() # 특정 index 의 값을 찾기

Unnamed: 0,A,D
2018-03-04,-0.252957,-0.871614
2018-03-05,0.326848,0.801228
2018-03-06,0.111545,-0.406191
2018-03-07,0.587586,0.047066
2018-03-08,-1.770163,-0.577235


In [23]:
df.loc['20180311': '20180316', ['A', 'C']]

Unnamed: 0,A,C
2018-03-11,-0.619015,-0.183903
2018-03-12,0.113459,2.535563
2018-03-13,0.414876,-0.915241
2018-03-14,-0.321636,1.680028
2018-03-15,-0.847386,-0.090823
2018-03-16,-1.277966,-0.422134


In [24]:
# 1개의 행만 지정했기 때문에 Series 로 출력
df.loc['20180311', ['A', 'C']]

A   -0.619015
C   -0.183903
Name: 2018-03-11 00:00:00, dtype: float64

In [25]:
# 행이던 열이던 한줄의 데이터 범위를 선택하면 Series 이다.
df.loc['20180311':'20180319', 'A']

2018-03-11   -0.619015
2018-03-12    0.113459
2018-03-13    0.414876
2018-03-14   -0.321636
2018-03-15   -0.847386
2018-03-16   -1.277966
2018-03-17   -0.415275
2018-03-18    1.010141
2018-03-19    0.188207
Freq: D, Name: A, dtype: float64

### 세번째 행을 Selection 하기

In [26]:
df.head()

Unnamed: 0,A,B,C,D
2018-03-04,-0.252957,0.439122,0.385989,-0.871614
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-06,0.111545,-1.212578,0.70976,-0.406191
2018-03-07,0.587586,0.611694,-0.949295,0.047066
2018-03-08,-1.770163,0.558087,-0.979803,-0.577235


In [27]:
df.iloc[3] # index 3번째의 행 값을 출력

A    0.587586
B    0.611694
C   -0.949295
D    0.047066
Name: 2018-03-07 00:00:00, dtype: float64

In [28]:
df.iloc[3:5, 0:2] # 3월 7일부터 3월 8일 // AA, BB 인덱스의 값 출력

Unnamed: 0,A,B
2018-03-07,0.587586,0.611694
2018-03-08,-1.770163,0.558087


In [29]:
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,A,C
2018-03-05,0.326848,0.119267
2018-03-06,0.111545,0.70976
2018-03-08,-1.770163,-0.979803


### *명시적으로 모든 행을 출력한다고 해줄 때는 범위가 없더라도 : 를 넣어준다

In [30]:
df.iloc[[1, 3], :]

Unnamed: 0,A,B,C,D
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-07,0.587586,0.611694,-0.949295,0.047066


### Boolean Indexing

DataFrame 에서 0 이상의 값을 찾아보기

In [31]:
df['A'] > 0

2018-03-04    False
2018-03-05     True
2018-03-06     True
2018-03-07     True
2018-03-08    False
2018-03-09    False
2018-03-10     True
2018-03-11    False
2018-03-12     True
2018-03-13     True
2018-03-14    False
2018-03-15    False
2018-03-16    False
2018-03-17    False
2018-03-18     True
2018-03-19     True
2018-03-20     True
2018-03-21    False
2018-03-22     True
2018-03-23     True
Freq: D, Name: A, dtype: bool

### AA 인덱스에서 True 값만 가져오기

In [32]:
mask = df['A'] > 0
df[mask]

Unnamed: 0,A,B,C,D
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-06,0.111545,-1.212578,0.70976,-0.406191
2018-03-07,0.587586,0.611694,-0.949295,0.047066
2018-03-10,0.270744,-0.226251,-0.270154,0.791146
2018-03-12,0.113459,-0.439769,2.535563,0.009409
2018-03-13,0.414876,1.934884,-0.915241,0.15957
2018-03-18,1.010141,0.24655,-2.410551,-1.003263
2018-03-19,0.188207,-0.329698,0.482883,1.870247
2018-03-20,0.638815,0.986284,0.404751,0.469043
2018-03-22,0.233262,-0.424058,0.8434,-0.602422


### df2 로 복사하여 새로운 튜토리얼을 진행

In [33]:
df2 = df.copy()

In [34]:
df2['E'] = [
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
]
df2.head()

Unnamed: 0,A,B,C,D,E
2018-03-04,-0.252957,0.439122,0.385989,-0.871614,one
2018-03-05,0.326848,-0.025573,0.119267,0.801228,two
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,three
2018-03-07,0.587586,0.611694,-0.949295,0.047066,four
2018-03-08,-1.770163,0.558087,-0.979803,-0.577235,five


### 'one', 'two' 에 해당하는 값만 가져오기

In [35]:
mask = df2.E.isin(['one', 'two'])
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,-0.252957,0.439122,0.385989,-0.871614
2018-03-05,0.326848,-0.025573,0.119267,0.801228
2018-03-09,-0.176269,0.827106,1.047592,-0.245867
2018-03-10,0.270744,-0.226251,-0.270154,0.791146
2018-03-14,-0.321636,1.849691,1.680028,-0.786624
2018-03-15,-0.847386,1.643273,-0.090823,0.51576
2018-03-19,0.188207,-0.329698,0.482883,1.870247
2018-03-20,0.638815,0.986284,0.404751,0.469043


### 새로운 Series 를 정의하기

In [36]:
s1 = pd.Series(range(1, 21),
               index=pd.date_range(
                   '20180305', periods=20)
              ) # 의도적으로 3월 5일부터 작성
df['F'] = s1

In [37]:
df # 데이터프레임 F 를 생성했고, 값이 없으면 NaN 으로 출력됨을 확인

Unnamed: 0,A,B,C,D,F
2018-03-04,-0.252957,0.439122,0.385989,-0.871614,
2018-03-05,0.326848,-0.025573,0.119267,0.801228,1.0
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,2.0
2018-03-07,0.587586,0.611694,-0.949295,0.047066,3.0
2018-03-08,-1.770163,0.558087,-0.979803,-0.577235,4.0
2018-03-09,-0.176269,0.827106,1.047592,-0.245867,5.0
2018-03-10,0.270744,-0.226251,-0.270154,0.791146,6.0
2018-03-11,-0.619015,0.748065,-0.183903,0.679426,7.0
2018-03-12,0.113459,-0.439769,2.535563,0.009409,8.0
2018-03-13,0.414876,1.934884,-0.915241,0.15957,9.0


### 데이터프레임의 길이만큼 EE 인덱스의 값을 13으로 생성하시오

In [38]:
df.loc[:, 'E'] = [13] * len(df)
df

Unnamed: 0,A,B,C,D,F,E
2018-03-04,-0.252957,0.439122,0.385989,-0.871614,,13
2018-03-05,0.326848,-0.025573,0.119267,0.801228,1.0,13
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,2.0,13
2018-03-07,0.587586,0.611694,-0.949295,0.047066,3.0,13
2018-03-08,-1.770163,0.558087,-0.979803,-0.577235,4.0,13
2018-03-09,-0.176269,0.827106,1.047592,-0.245867,5.0,13
2018-03-10,0.270744,-0.226251,-0.270154,0.791146,6.0,13
2018-03-11,-0.619015,0.748065,-0.183903,0.679426,7.0,13
2018-03-12,0.113459,-0.439769,2.535563,0.009409,8.0,13
2018-03-13,0.414876,1.934884,-0.915241,0.15957,9.0,13


### 데이터 프레임의 모든 수를 음수로 전환하기

In [39]:
df2 = df.copy()

In [40]:
df2[df2 > 0] = -df2 # 모든 수를 음수로 전환

In [41]:
df2.head()

Unnamed: 0,A,B,C,D,F,E
2018-03-04,-0.252957,-0.439122,-0.385989,-0.871614,,-13
2018-03-05,-0.326848,-0.025573,-0.119267,-0.801228,-1.0,-13
2018-03-06,-0.111545,-1.212578,-0.70976,-0.406191,-2.0,-13
2018-03-07,-0.587586,-0.611694,-0.949295,-0.047066,-3.0,-13
2018-03-08,-1.770163,-0.558087,-0.979803,-0.577235,-4.0,-13


## Missing Data

### reindex 를 사용하여 지정된 축의 index 를 변경/추가/삭제

In [42]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [43]:
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,-0.252957,0.439122,0.385989,-0.871614,,13,13
2018-03-05,0.326848,-0.025573,0.119267,0.801228,1.0,13,13
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,2.0,13,13
2018-03-07,0.587586,0.611694,-0.949295,0.047066,3.0,13,13


### index[0], [1] 에 해당하는 E Column 의 값을 1로 변경하기

In [44]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,-0.252957,0.439122,0.385989,-0.871614,,1,1
2018-03-05,0.326848,-0.025573,0.119267,0.801228,1.0,1,1
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,2.0,13,13
2018-03-07,0.587586,0.611694,-0.949295,0.047066,3.0,13,13


### NaN 에 해당하는 행을 모두 지우기

In [45]:
df1.dropna()

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-05,0.326848,-0.025573,0.119267,0.801228,1.0,1,1
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,2.0,13,13
2018-03-07,0.587586,0.611694,-0.949295,0.047066,3.0,13,13


### NaN 에 해당하는 값을 5로 대체하기

In [46]:
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,-0.252957,0.439122,0.385989,-0.871614,5.0,1,1
2018-03-05,0.326848,-0.025573,0.119267,0.801228,1.0,1,1
2018-03-06,0.111545,-1.212578,0.70976,-0.406191,2.0,13,13
2018-03-07,0.587586,0.611694,-0.949295,0.047066,3.0,13,13


### NaN 에 해당하는 값을 Boolean 값으로 표시하기

In [47]:
pd.isna(df)

Unnamed: 0,A,B,C,D,F,E
2018-03-04,False,False,False,False,True,False
2018-03-05,False,False,False,False,False,False
2018-03-06,False,False,False,False,False,False
2018-03-07,False,False,False,False,False,False
2018-03-08,False,False,False,False,False,False
2018-03-09,False,False,False,False,False,False
2018-03-10,False,False,False,False,False,False
2018-03-11,False,False,False,False,False,False
2018-03-12,False,False,False,False,False,False
2018-03-13,False,False,False,False,False,False


## Operations

In [49]:
df.mean()

A    -0.100435
B     0.279111
C     0.178873
D     0.160783
F    10.000000
E    13.000000
dtype: float64

In [52]:
df.mean(1)

2018-03-04    2.540108
2018-03-05    2.536962
2018-03-06    2.367089
2018-03-07    2.716175
2018-03-08    2.371814
2018-03-09    3.242094
2018-03-10    3.260914
2018-03-11    3.437429
2018-03-12    3.869777
2018-03-13    3.932348
2018-03-14    4.236910
2018-03-15    4.203471
2018-03-16    3.979840
2018-03-17    4.302635
2018-03-18    4.140479
2018-03-19    5.035273
2018-03-20    5.249815
2018-03-21    4.654787
2018-03-22    5.175030
2018-03-23    5.898173
Freq: D, dtype: float64

In [55]:
pd.Series([1, 3, 5, np.nan, 6, 8], index=dates[:6])

2018-03-04    1.0
2018-03-05    3.0
2018-03-06    5.0
2018-03-07    NaN
2018-03-08    6.0
2018-03-09    8.0
Freq: D, dtype: float64

In [57]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates[:6].shift(2))
s

2018-03-06    1.0
2018-03-07    3.0
2018-03-08    5.0
2018-03-09    NaN
2018-03-10    6.0
2018-03-11    8.0
Freq: D, dtype: float64

In [60]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F,E
2018-03-04,,,,,,
2018-03-05,,,,,,
2018-03-06,-0.888455,-2.212578,-0.29024,-1.406191,1.0,12.0
2018-03-07,-2.412414,-2.388306,-3.949295,-2.952934,0.0,10.0
2018-03-08,-6.770163,-4.441913,-5.979803,-5.577235,-1.0,8.0
2018-03-09,,,,,,
2018-03-10,-5.729256,-6.226251,-6.270154,-5.208854,0.0,7.0
2018-03-11,-8.619015,-7.251935,-8.183903,-7.320574,-1.0,5.0
2018-03-12,,,,,,
2018-03-13,,,,,,
