In [1]:
import pandas as pd
import numpy as np

### pandas 의 버전 확인

참고자료:

`10 minutes to pandas`, `askdjango`, `구글링`

In [2]:
# Show the installed pandas version.
pd.__version__

'0.23.4'

# 10 Minutes to pandas 튜토리얼

### EP1: Object creation(객체 생성)

In [3]:
# NaN ("Not a Number") marks a missing value inside the Series.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### date_range 를 사용하여 날짜 생성

In [4]:
# 20 consecutive calendar days starting at 2018-03-04 (daily frequency).
dates = pd.date_range(start='20180304', periods=20, freq='D')
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array) 를 생성

In [5]:
np.random.randn(6,4) # Generate a random 2-D array with 6 rows and 4 columns

array([[-0.54524116,  1.27108154,  0.10397068,  2.11915519],
       [ 0.6546215 , -1.10031777,  0.47330821, -0.10602901],
       [-0.49365796,  0.10059236,  0.80886691, -0.18806568],
       [-1.30293856,  0.13010594,  0.33089507, -1.6488051 ],
       [-0.24668392, -0.84773237, -1.49251561,  0.33178638],
       [-1.11067223, -0.95766677,  0.78459024,  0.8669871 ]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
# Build a DataFrame (a 2-D data structure) from a random 6x4 array, with custom string row labels

pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2', '인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,-1.457796,1.535562,-1.39373,-1.652597
인덱스1,0.37933,-0.329306,-0.241058,1.759611
인덱스2,1.337315,-0.431995,-0.766556,1.535007
인덱스3,-0.508428,-1.064694,-0.442117,0.594102
인덱스4,-0.952506,-0.289919,0.583938,-0.885277
인덱스5,0.790502,0.287908,0.91731,-0.706523


# 만약 위에서 생성한 dates를 index 로 넣게 된다면?

`np.random.randn(6, 4)` 로 만든 데이터는 6행인데 `dates` 의 길이는 20이라서 행의 수가 맞지 않기 때문에 `ValueError: Shape of passed values is (4, 6), indices imply (4, 20)` 오류가 발생한다.


### values 의 행 수와 index 의 길이가 꼭 일치해야 에러 없이 데이터 테이블이 생성된다

#### + 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head 를 넣어주자

In [7]:
# Take the row count from `dates` instead of hard-coding 20, so the data always
# has exactly as many rows as the index. A mismatch between the two raises
# "ValueError: Shape of passed values ...".
df = pd.DataFrame(
    np.random.randn(len(dates), 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

# Print the shape and preview the first rows so readers can see the table's
# size and layout at a glance.
print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,-0.111416,0.98267,1.099774,-1.971252
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-06,0.964254,0.167364,-0.332012,0.696122
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353
2018-03-08,0.46882,-1.599092,1.657712,-1.276103


### Series 로 변환할 수 있는 object 들로 이루어진 dict 를 전달하여 DataFrame 생성하기

In [8]:
# Build a DataFrame from a dict of column name -> value. The Series, array and
# Categorical entries each hold 4 items; the scalar entries ('A', 'B', 'F') are
# repeated on every row.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes # Show the data type of each column

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EP2: Viewing Data

----------------------

Data의 값을 찾고 해당 데이터의 변경을 해본다

```
|        | columns1 | columns2 | columns3 | columns4 |
|--------|----------|----------|----------|----------|
| index1 |          |          |          |          |
| index2 |          |          |          |          |
| index3 |          |          |          |          |
```

### index 값 찾기

In [10]:
# Row labels of df.
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns 의 값 찾기/변경하기

In [11]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# Rename all four columns by assigning a new list of labels (this modifies df itself).
df.columns = ['AA', 'BB', 'CC', 'DD']

### DataFrame의 요약 정보를 알아보기

In [13]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,AA,BB,CC,DD
count,20.0,20.0,20.0,20.0
mean,0.01676,0.038429,-0.029962,0.070371
std,0.756162,1.063686,1.116231,1.188271
min,-1.450708,-1.911083,-2.228759,-1.971252
25%,-0.41266,-0.555892,-0.879199,-0.989334
50%,-0.131387,0.273926,0.155387,0.162989
75%,0.426251,0.511562,0.804518,1.068991
max,1.640194,1.920155,1.657712,1.969327


### index 와 column 을 바꿔주는 `.T`

In [14]:
# Transpose: the index becomes the columns and the columns become the index.
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
AA,-0.111416,0.710864,0.964254,0.32456,0.46882,-0.189765,-0.684227,1.640194,-0.322137,-0.08636,-0.191816,-0.173712,1.342993,-0.151358,0.036859,0.412061,-0.708502,-0.777987,-1.450708,-0.717418
BB,0.98267,0.720855,0.167364,-0.544726,-1.599092,0.312118,-1.863847,0.044395,-1.246787,0.081511,1.028748,0.301855,0.318543,-1.911083,-0.589389,0.441797,0.249315,1.920155,1.655633,0.298537
CC,1.099774,-1.294882,-0.332012,-1.065665,1.657712,0.526358,0.795735,0.701436,-0.633855,0.167596,0.830868,-0.817043,-1.647794,1.376948,-1.479406,0.03055,0.237221,0.143178,-2.228759,1.3328
DD,-1.971252,-1.117494,0.696122,-0.467353,-1.276103,-1.783141,1.02979,0.298313,0.259672,0.00488,0.670017,1.186591,-0.593016,-0.946632,1.27497,1.55715,0.066305,-1.117439,1.969327,1.666714


### 원하는 index 기준으로 다시 정렬하기

In [15]:
# Sort the rows by index in descending order (latest date first) and preview the first rows.
df.sort_index(ascending=False).head()

Unnamed: 0,AA,BB,CC,DD
2018-03-23,-0.717418,0.298537,1.3328,1.666714
2018-03-22,-1.450708,1.655633,-2.228759,1.969327
2018-03-21,-0.777987,1.920155,0.143178,-1.117439
2018-03-20,-0.708502,0.249315,0.237221,0.066305
2018-03-19,0.412061,0.441797,0.03055,1.55715


### 특정 column을 기준으로 정렬하기

`df.sort_values('<기준이 되는 column>')`

In [16]:
# Sort the rows by the values in column 'CC' (smallest first) and preview the first rows.
df.sort_values('CC').head()

Unnamed: 0,AA,BB,CC,DD
2018-03-22,-1.450708,1.655633,-2.228759,1.969327
2018-03-16,1.342993,0.318543,-1.647794,-0.593016
2018-03-18,0.036859,-0.589389,-1.479406,1.27497
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353


## EP3: Selection

특정 Column 을 선택해서 데이터를 확인

`df['DD']` 와 `df.DD` 2가지 방법은 모두 동일한 결과값을 가지지만, Column명이 `"D-D"` 와 같은 경우 `df.D-D` 는 마이너스(-)를 _연산자로 인식하여 (데이터 - 데이터)로 계산_ 하므로 `df['D-D']` 처럼 대괄호 방식을 사용해야 한다.

In [17]:
df['DD'].head(3) # Get the data of column 'DD' (method 1: bracket indexing)

2018-03-04   -1.971252
2018-03-05   -1.117494
2018-03-06    0.696122
Freq: D, Name: DD, dtype: float64

In [18]:
df.DD.head(3) # Get the data of column 'DD' (method 2: attribute access)

2018-03-04   -1.971252
2018-03-05   -1.117494
2018-03-06    0.696122
Freq: D, Name: DD, dtype: float64

### 슬라이싱하기

#### DataFrame 에서 위치(position)가 `0이상~3미만` 인 행을 출력하기

In [19]:
# Positional slice on rows: positions 0, 1 and 2 (the end position is excluded).
df[0:3]

Unnamed: 0,AA,BB,CC,DD
2018-03-04,-0.111416,0.98267,1.099774,-1.971252
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-06,0.964254,0.167364,-0.332012,0.696122


#### DataFrame 에서 index label 이 `2018-03-04 이상 2018-03-06 이하`인 행을 출력하기 (label 슬라이싱은 끝 값을 포함)

In [20]:
# Label slice on the date index: both endpoints are included in the result.
df['2018-03-04': '2018-03-06']

Unnamed: 0,AA,BB,CC,DD
2018-03-04,-0.111416,0.98267,1.099774,-1.971252
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-06,0.964254,0.167364,-0.332012,0.696122


#### 위에서 선언한 `dates` 변수의 0번째 값을 label 로 사용하여 해당 행의 값 출력하기

`dates = pd.date_range('20180304', periods=20)`


In [21]:
# Select the row whose index label equals the first entry of `dates`.
df.loc[dates[0]]

AA   -0.111416
BB    0.982670
CC    1.099774
DD   -1.971252
Name: 2018-03-04 00:00:00, dtype: float64

In [22]:
df.loc[:, ['AA', 'DD']].head() # All rows, only columns 'AA' and 'DD'

Unnamed: 0,AA,DD
2018-03-04,-0.111416,-1.971252
2018-03-05,0.710864,-1.117494
2018-03-06,0.964254,0.696122
2018-03-07,0.32456,-0.467353
2018-03-08,0.46882,-1.276103


In [23]:
# Rows labelled 2018-03-11 through 2018-03-16 (inclusive), columns 'AA' and 'CC'.
df.loc['20180311': '20180316', ['AA', 'CC']]

Unnamed: 0,AA,CC
2018-03-11,1.640194,0.701436
2018-03-12,-0.322137,-0.633855
2018-03-13,-0.08636,0.167596
2018-03-14,-0.191816,0.830868
2018-03-15,-0.173712,-0.817043
2018-03-16,1.342993,-1.647794


In [24]:
# Only a single row label is given, so the result is a Series
df.loc['20180311', ['AA', 'CC']]

AA    1.640194
CC    0.701436
Name: 2018-03-11 00:00:00, dtype: float64

In [25]:
# Selecting a single line of data, whether one row or one column, yields a Series.
df.loc['20180311':'20180319', 'AA']

2018-03-11    1.640194
2018-03-12   -0.322137
2018-03-13   -0.086360
2018-03-14   -0.191816
2018-03-15   -0.173712
2018-03-16    1.342993
2018-03-17   -0.151358
2018-03-18    0.036859
2018-03-19    0.412061
Freq: D, Name: AA, dtype: float64

### 위치(정수) 기준으로 행을 Selection 하기 — `iloc[3]` 은 0부터 세므로 네번째 행

In [26]:
# Preview the first rows for reference.
df.head()

Unnamed: 0,AA,BB,CC,DD
2018-03-04,-0.111416,0.98267,1.099774,-1.971252
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-06,0.964254,0.167364,-0.332012,0.696122
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353
2018-03-08,0.46882,-1.599092,1.657712,-1.276103


In [27]:
df.iloc[3] # Row at position 3 (zero-based), i.e. the fourth row

AA    0.324560
BB   -0.544726
CC   -1.065665
DD   -0.467353
Name: 2018-03-07 00:00:00, dtype: float64

In [28]:
df.iloc[3:5, 0:2] # Rows at positions 3-4 (2018-03-07 and 2018-03-08), columns 'AA' and 'BB'

Unnamed: 0,AA,BB
2018-03-07,0.32456,-0.544726
2018-03-08,0.46882,-1.599092


In [29]:
# Pick the rows at positions 1, 2, 4 and the columns at positions 0, 2.
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,AA,CC
2018-03-05,0.710864,-1.294882
2018-03-06,0.964254,-0.332012
2018-03-08,0.46882,1.657712


### *명시적으로 모든 열을 출력한다고 해줄 때는 범위가 없더라도 : 를 넣어준다

In [30]:
# Rows at positions 1 and 3; the explicit ':' selects every column.
df.iloc[[1, 3], :]

Unnamed: 0,AA,BB,CC,DD
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353


### Boolean Indexing

DataFrame 의 `AA` 컬럼에서 0 보다 큰(0 초과) 값을 찾아보기

In [31]:
# Boolean Series: True where the value in column 'AA' is strictly greater than 0.
df['AA'] > 0

2018-03-04    False
2018-03-05     True
2018-03-06     True
2018-03-07     True
2018-03-08     True
2018-03-09    False
2018-03-10    False
2018-03-11     True
2018-03-12    False
2018-03-13    False
2018-03-14    False
2018-03-15    False
2018-03-16     True
2018-03-17    False
2018-03-18     True
2018-03-19     True
2018-03-20    False
2018-03-21    False
2018-03-22    False
2018-03-23    False
Freq: D, Name: AA, dtype: bool

### AA 컬럼의 조건이 True 인 행만 가져오기

In [32]:
# Keep only the rows whose 'AA' value is strictly greater than 0.
mask = df['AA'].gt(0)
df.loc[mask]

Unnamed: 0,AA,BB,CC,DD
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-06,0.964254,0.167364,-0.332012,0.696122
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353
2018-03-08,0.46882,-1.599092,1.657712,-1.276103
2018-03-11,1.640194,0.044395,0.701436,0.298313
2018-03-16,1.342993,0.318543,-1.647794,-0.593016
2018-03-18,0.036859,-0.589389,-1.479406,1.27497
2018-03-19,0.412061,0.441797,0.03055,1.55715


### df2 로 복사하여 새로운 튜토리얼을 진행

In [33]:
# Rebind df2 to a copy of df; the next steps modify this copy rather than df.
df2 = df.copy()

In [34]:
# Add column 'EE': the labels 'one'..'five' repeated four times (20 values in total).
df2['EE'] = ['one', 'two', 'three', 'four', 'five'] * 4
df2.head()

Unnamed: 0,AA,BB,CC,DD,EE
2018-03-04,-0.111416,0.98267,1.099774,-1.971252,one
2018-03-05,0.710864,0.720855,-1.294882,-1.117494,two
2018-03-06,0.964254,0.167364,-0.332012,0.696122,three
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353,four
2018-03-08,0.46882,-1.599092,1.657712,-1.276103,five


### 'one', 'two' 에 해당하는 값만 가져오기

In [35]:
# Boolean mask: True for rows whose 'EE' label is 'one' or 'two'.
mask = df2.EE.isin(['one', 'two'])
# NOTE(review): the mask is built from df2 but applied to df, so the result
# does not show the 'EE' column used for filtering. Confirm whether df2[mask]
# was intended.
df[mask]

Unnamed: 0,AA,BB,CC,DD
2018-03-04,-0.111416,0.98267,1.099774,-1.971252
2018-03-05,0.710864,0.720855,-1.294882,-1.117494
2018-03-09,-0.189765,0.312118,0.526358,-1.783141
2018-03-10,-0.684227,-1.863847,0.795735,1.02979
2018-03-14,-0.191816,1.028748,0.830868,0.670017
2018-03-15,-0.173712,0.301855,-0.817043,1.186591
2018-03-19,0.412061,0.441797,0.03055,1.55715
2018-03-20,-0.708502,0.249315,0.237221,0.066305


### 새로운 Series 를 정의하기

In [36]:
# The Series deliberately starts on 2018-03-05, one day after df's first row.
s1 = pd.Series(
    data=range(1, 21),
    index=pd.date_range(start='20180305', periods=20),
)
# The values are matched to df by date; a row with no matching date gets NaN.
df['F'] = s1

In [37]:
df # Column 'F' has been added; rows with no matching value show NaN

Unnamed: 0,AA,BB,CC,DD,F
2018-03-04,-0.111416,0.98267,1.099774,-1.971252,
2018-03-05,0.710864,0.720855,-1.294882,-1.117494,1.0
2018-03-06,0.964254,0.167364,-0.332012,0.696122,2.0
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353,3.0
2018-03-08,0.46882,-1.599092,1.657712,-1.276103,4.0
2018-03-09,-0.189765,0.312118,0.526358,-1.783141,5.0
2018-03-10,-0.684227,-1.863847,0.795735,1.02979,6.0
2018-03-11,1.640194,0.044395,0.701436,0.298313,7.0
2018-03-12,-0.322137,-1.246787,-0.633855,0.259672,8.0
2018-03-13,-0.08636,0.081511,0.167596,0.00488,9.0


### 데이터프레임의 길이만큼 EE 컬럼의 값을 13으로 생성하시오

In [38]:
# Create column 'EE' from a list of 13s that has one entry per row of df.
df.loc[:, 'EE'] = [13] * len(df)
df

Unnamed: 0,AA,BB,CC,DD,F,EE
2018-03-04,-0.111416,0.98267,1.099774,-1.971252,,13
2018-03-05,0.710864,0.720855,-1.294882,-1.117494,1.0,13
2018-03-06,0.964254,0.167364,-0.332012,0.696122,2.0,13
2018-03-07,0.32456,-0.544726,-1.065665,-0.467353,3.0,13
2018-03-08,0.46882,-1.599092,1.657712,-1.276103,4.0,13
2018-03-09,-0.189765,0.312118,0.526358,-1.783141,5.0,13
2018-03-10,-0.684227,-1.863847,0.795735,1.02979,6.0,13
2018-03-11,1.640194,0.044395,0.701436,0.298313,7.0,13
2018-03-12,-0.322137,-1.246787,-0.633855,0.259672,8.0,13
2018-03-13,-0.08636,0.081511,0.167596,0.00488,9.0,13


### 데이터 프레임의 모든 수를 음수로 전환하기

In [39]:
# Rebind df2 to a fresh copy of df, so the next step modifies the copy rather than df.
df2 = df.copy()

In [40]:
df2[df2 > 0] = -df2 # Negate every positive value so that no positive numbers remain

In [41]:
# Preview the first rows of df2.
df2.head()

Unnamed: 0,AA,BB,CC,DD,F,EE
2018-03-04,-0.111416,-0.98267,-1.099774,-1.971252,,-13
2018-03-05,-0.710864,-0.720855,-1.294882,-1.117494,-1.0,-13
2018-03-06,-0.964254,-0.167364,-0.332012,-0.696122,-2.0,-13
2018-03-07,-0.32456,-0.544726,-1.065665,-0.467353,-3.0,-13
2018-03-08,-0.46882,-1.599092,-1.657712,-1.276103,-4.0,-13
