In [1]:
import pandas as pd
import numpy as np

### pandas 의 버전 확인

참고자료:

`10 minutes to pandas`, `askdjango`, `구글링`

In [61]:
# Show the version string of the installed pandas package.
pd.__version__

'0.23.4'

# 10 Minutes to pandas 튜토리얼

## EP1: Object creation(객체 생성)

In [3]:
# np.nan marks a missing entry (NaN = Not a Number); the resulting Series is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### date_range 를 사용하여 날짜 생성

In [4]:
# 20 consecutive calendar days starting at 2018-03-04 (daily frequency is the default).
dates = pd.date_range(start='20180304', periods=20)
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array) 를 생성

In [5]:
np.random.randn(6,4) # random 2-D array with 6 rows and 4 columns (no seed is set in this cell, so values differ between runs)

array([[ 1.13405398,  0.01515156,  1.63800252,  0.5595408 ],
       [ 0.5687355 ,  1.71579658, -0.04298694,  0.46042791],
       [ 0.41628139,  1.39705157,  0.01886367,  0.66641265],
       [ 0.11857763, -0.71843724,  1.96521793,  0.26828053],
       [ 1.56326053,  0.24834462, -0.42006494,  0.17792266],
       [-0.81969489, -0.52732225, -1.01787975,  2.86325442]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
# Wrap a random 6x4 array in a DataFrame (pandas' 2-D data structure),
# supplying one explicit label per row via `index`.

pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2', '인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,0.056327,2.105542,-0.075597,1.614603
인덱스1,-0.182842,0.335884,0.222483,-1.322793
인덱스2,-1.177208,1.253639,-0.738665,1.369629
인덱스3,0.807751,1.156226,-0.157431,-0.092667
인덱스4,0.836119,-1.344591,0.274686,-0.100899
인덱스5,0.522576,0.977071,-0.065259,1.702514


# 만약 위에서 생성한 dates를 index 로 넣게 된다면?

배열의 행 수(6)와 `dates` 의 길이(20)가 일치하지 않기 때문에 `ValueError: Shape of passed values is (4, 6), indices imply (4, 20)` 오류가 발생한다.


### values 의 행 수와 index 의 길이가 꼭 일치해야 에러 없이 데이터 테이블이 생성된다

#### + 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head 를 넣어주자

In [7]:
# Derive the row count from len(dates) instead of hard-coding 20, so the data
# can never fall out of sync with the index (a length mismatch raises ValueError).
df = pd.DataFrame(np.random.randn(len(dates), 4), index=dates, columns=['A','B','C','D'])

# Print the shape and preview the first rows so readers can see the table layout.
print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,1.193641,-0.866317,0.704511,0.194638
2018-03-05,-0.310299,0.266242,1.514181,0.601382
2018-03-06,0.390148,1.053139,0.418032,0.327367
2018-03-07,-1.065455,-1.012612,-0.208033,-0.160421
2018-03-08,0.268107,0.595058,0.249739,1.575956


### Series 로 변환될 수 있는 객체들의 dict 를 전달하여 DataFrame 생성하기

In [8]:
# Build a frame from a dict of column -> value. The scalar entries
# ('A', 'B', 'F') end up repeated on every one of the 4 rows.
df2 = pd.DataFrame(
    {
        "A": 1.0,                                                      # float scalar
        "B": pd.Timestamp("20130102"),                                 # timestamp scalar
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),      # Series with index 0..3
        "D": np.array([3] * 4, dtype="int32"),                         # 4-element int32 array
        "E": pd.Categorical(["test", "train", "test", "train"]),       # categorical values
        "F": "foo",                                                    # string scalar
    }
)
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes # conveniently reports the data type of each column (not of the index)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EP2: Viewing Data

----------------------

Data의 값을 찾고 해당 데이터의 변경을 해본다

```
|        | columns1 | columns2 | columns3 | columns4 |
|--------|----------|----------|----------|----------|
| index1 |          |          |          |          |
| index2 |          |          |          |          |
| index3 |          |          |          |          |
```

### index 값 찾기

In [30]:
# Row labels of df (the DatetimeIndex that was passed as `index=dates`).
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns 의 값 찾기/변경하기

In [22]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [29]:
# Relabel all four columns in place by assigning a new list of names.
# Later cells refer to the new labels (e.g. 'CC'), so this cell must run first.
df.columns = ['AA', 'BB', 'CC', 'DD']

### DataFrame의 요약 정보를 알아보기

In [62]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,AA,BB,CC,DD
count,20.0,20.0,20.0,20.0
mean,0.099352,-0.191888,0.196547,0.197103
std,1.250673,1.052196,0.693453,0.890054
min,-1.924294,-1.974297,-0.936026,-1.762854
25%,-0.576164,-0.997636,-0.233765,-0.124372
50%,0.213953,-0.13647,0.129529,0.267516
75%,0.452035,0.605491,0.682473,0.514969
max,2.407345,1.790636,1.568093,2.370924


### index 와 column 을 바꿔주는 `.T`

In [39]:
# Transposed view: the dates become the columns and the column labels become the rows.
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
AA,1.193641,-0.310299,0.390148,-1.065455,0.268107,2.157081,2.079838,0.637694,0.254848,-1.575166,2.407345,0.172105,-0.413067,-1.924294,0.111504,0.229966,0.376198,-1.464458,-1.736645,0.19794
BB,-0.866317,0.266242,1.053139,-1.012612,0.595058,0.510585,-1.823653,0.448549,-0.611264,-0.992644,0.668096,0.845658,-1.317658,0.48994,-1.974297,-0.573919,-0.539181,-1.43091,1.790636,0.636791
CC,0.704511,1.514181,0.418032,-0.208033,0.249739,-0.45057,1.568093,-0.705435,0.036443,0.190253,0.518632,0.027084,-0.936026,0.69904,0.816261,-0.882123,-0.310961,0.068805,-0.063941,0.676951
DD,0.194638,0.601382,0.327367,-0.160421,1.575956,0.499234,0.505551,-0.112356,2.370924,-0.941751,0.68214,0.489282,0.34513,-0.221236,0.055865,-0.048146,-1.762854,0.543223,0.207664,-1.209525


### index 를 기준으로 내림차순 정렬하기

In [63]:
# Sort rows by index in descending order (latest date first) and preview the first rows.
df.sort_index(ascending=False).head()

Unnamed: 0,AA,BB,CC,DD
2018-03-23,0.19794,0.636791,0.676951,-1.209525
2018-03-22,-1.736645,1.790636,-0.063941,0.207664
2018-03-21,-1.464458,-1.43091,0.068805,0.543223
2018-03-20,0.376198,-0.539181,-0.310961,-1.762854
2018-03-19,0.229966,-0.573919,-0.882123,-0.048146


### 특정 column을 기준으로 정렬하기

`df.sort_values('<기준이 되는 column>')`

In [64]:
# Sort rows by the values in column 'CC' (smallest first) and preview the first rows.
df.sort_values('CC').head()

Unnamed: 0,AA,BB,CC,DD
2018-03-16,-0.413067,-1.317658,-0.936026,0.34513
2018-03-19,0.229966,-0.573919,-0.882123,-0.048146
2018-03-11,0.637694,0.448549,-0.705435,-0.112356
2018-03-09,2.157081,0.510585,-0.45057,0.499234
2018-03-20,0.376198,-0.539181,-0.310961,-1.762854
