In [1]:
import pandas as pd
import numpy as np

### pandas 의 버전 확인

참고자료:

`10 minutes to pandas`, `askdjango`, `구글링`

In [2]:
pd.__version__

'0.23.4'

# 10 Minutes to pandas 튜토리얼

### EP1: Object creation(객체 생성)

In [3]:
s = pd.Series([1,3,5,np.nan,6,8]) #NaN = Not a Number
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### date_range 를 사용하여 날짜 생성

In [4]:
dates = pd.date_range('20180304', periods=20)
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array) 를 생성

In [5]:
np.random.randn(6,4) # 6행 4열의 2차원 랜덤한 array 를 생성

array([[ 0.58603911, -1.0664075 ,  0.60811036, -1.51008851],
       [-1.45098795,  0.82155706, -1.44056967,  0.83297935],
       [-0.37004964, -0.09227712,  0.72155021,  1.43281879],
       [-1.33639877, -2.12862898, -0.69613017,  0.59836684],
       [ 0.37491866,  1.88971009,  1.64639005, -0.05608989],
       [-0.01657686,  0.57668342,  0.01452489,  0.69918845]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
# 2차원 데이터 구조인 DataFrame 으로 생성

pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2', '인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,-0.72066,-0.548797,-0.459733,-1.163588
인덱스1,1.390917,-0.617609,-0.351201,-0.518628
인덱스2,1.712652,0.700141,-1.220917,1.7127
인덱스3,-0.852881,-1.458387,0.851099,1.067275
인덱스4,-1.002256,0.886445,-1.334388,-0.030283
인덱스5,-0.497746,-0.6201,-0.603874,2.668769


# 만약 위에서 생성한 dates를 index 로 넣게 된다면?

행이 맞지 않기 때문에 `ValueError: Shape of passed values is (4, 6), indices imply (4, 20)` 오류가 발생


### 꼭 values 의 길이를 일치해야 에러없이 데이터 테이블이 생성된다

#### + 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head 를 넣어주자

In [7]:
df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=['A','B','C','D'])

print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-06,-0.457921,-0.263354,1.425379,0.912822
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426
2018-03-08,-0.771939,0.925489,-0.148202,-0.220742


### 직렬로 변환할 수 있는 object 를 전달하여 DataFrame 생성하기

In [8]:
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes # 스마트하게 각 인덱스의 데이터타입을 출력해준다

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EP2: Viewing Data

----------------------

Data의 값을 찾고 해당 데이터의 변경을 해본다

```
|        | columns1 | columns2 | columns3 | columns4 |
|--------|----------|----------|----------|----------|
| index1 |          |          |          |          |
| index2 |          |          |          |          |
| index3 |          |          |          |          |
```

### index 값 찾기

In [10]:
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns 의 값 찾기/변경하기

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.columns = ['A', 'B', 'C', 'D']

### DataFrame의 요약 정보를 알아보기

In [13]:
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,-0.073625,0.272341,0.041342,0.42954
std,1.13476,0.767637,0.963458,1.156159
min,-2.581484,-1.191682,-1.868817,-1.168331
25%,-0.833993,-0.002007,-0.498394,-0.229155
50%,0.122175,0.378763,0.058453,0.236062
75%,0.662636,0.694265,0.94617,0.71766
max,1.735567,1.7045,1.425379,3.482426


### index 와 column 을 바꿔주는 `.T`

In [14]:
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
A,0.86855,1.55366,-0.457921,-1.51653,-0.771939,-0.990552,0.310928,-1.359726,1.077967,0.593998,-0.066579,-1.169535,-0.781807,1.217606,-2.581484,1.735567,0.571404,0.336343,-0.392885,0.350437
B,0.361971,0.180912,-0.263354,-0.303393,0.925489,1.7045,0.810005,0.635321,0.519286,0.427791,0.171686,-1.191682,-1.121264,0.395554,0.233059,1.063777,0.655685,1.214288,0.085108,-1.057915
C,-1.215191,0.374747,1.425379,-1.190135,-0.148202,-1.447922,-0.643031,0.938998,0.099999,0.557566,1.069365,0.169808,-0.084425,1.375085,-0.18588,-0.450182,-1.868817,1.065086,0.016907,0.967686
D,-0.605199,-1.168331,0.912822,3.482426,-0.220742,0.060468,-0.090067,0.135916,0.63873,0.286848,0.822569,0.53488,0.185277,0.682691,-0.973858,2.312442,-0.973287,2.14388,0.677721,-0.254394


### 원하는 index 기준으로 다시 정렬하기

In [15]:
df.sort_index(ascending=False).head()

Unnamed: 0,A,B,C,D
2018-03-23,0.350437,-1.057915,0.967686,-0.254394
2018-03-22,-0.392885,0.085108,0.016907,0.677721
2018-03-21,0.336343,1.214288,1.065086,2.14388
2018-03-20,0.571404,0.655685,-1.868817,-0.973287
2018-03-19,1.735567,1.063777,-0.450182,2.312442


### 특정 column을 기준으로 정렬하기

df.sort_values('<기준이 되는 column>')

In [16]:
df.sort_values('C').head()

Unnamed: 0,A,B,C,D
2018-03-20,0.571404,0.655685,-1.868817,-0.973287
2018-03-09,-0.990552,1.7045,-1.447922,0.060468
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426
2018-03-10,0.310928,0.810005,-0.643031,-0.090067


## EP3: Selection

특정 Column 을 선택해서 데이터를 확인

2가지 방법은 모두 동일한 결과값을 가지지만 `Column명이 "D-D" ` 와 같은 경우에는 마이너스(-) _연산자로 인식하여 (데이터 - 데이터)로 계산을 한다_

In [17]:
df['D'].head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04   -0.605199
2018-03-05   -1.168331
2018-03-06    0.912822
Freq: D, Name: D, dtype: float64

In [18]:
df.D.head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04   -0.605199
2018-03-05   -1.168331
2018-03-06    0.912822
Freq: D, Name: D, dtype: float64

### 슬라이싱하기

#### DataFrame 에서 `0이상~3미만` 값을 출력하기

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-06,-0.457921,-0.263354,1.425379,0.912822


#### DataFrame 에서 `A이상 F이하`의 값을 출력하기

In [20]:
df['2018-03-04': '2018-03-06']

Unnamed: 0,A,B,C,D
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-06,-0.457921,-0.263354,1.425379,0.912822


#### 위에서 선언한 변수에 0번째 index 들의 값 출력하기

`dates = pd.date_range('20180304', periods=20)`


In [21]:
df.loc[dates[0]]

A    0.868550
B    0.361971
C   -1.215191
D   -0.605199
Name: 2018-03-04 00:00:00, dtype: float64

In [22]:
df.loc[:, ['A', 'D']].head() # 특정 index 의 값을 찾기

Unnamed: 0,A,D
2018-03-04,0.86855,-0.605199
2018-03-05,1.55366,-1.168331
2018-03-06,-0.457921,0.912822
2018-03-07,-1.51653,3.482426
2018-03-08,-0.771939,-0.220742


In [23]:
df.loc['20180311': '20180316', ['A', 'C']]

Unnamed: 0,A,C
2018-03-11,-1.359726,0.938998
2018-03-12,1.077967,0.099999
2018-03-13,0.593998,0.557566
2018-03-14,-0.066579,1.069365
2018-03-15,-1.169535,0.169808
2018-03-16,-0.781807,-0.084425


In [24]:
# 1개의 행만 지정했기 때문에 Series 로 출력
df.loc['20180311', ['A', 'C']]

A   -1.359726
C    0.938998
Name: 2018-03-11 00:00:00, dtype: float64

In [25]:
# 행이던 열이던 한줄의 데이터 범위를 선택하면 Series 이다.
df.loc['20180311':'20180319', 'A']

2018-03-11   -1.359726
2018-03-12    1.077967
2018-03-13    0.593998
2018-03-14   -0.066579
2018-03-15   -1.169535
2018-03-16   -0.781807
2018-03-17    1.217606
2018-03-18   -2.581484
2018-03-19    1.735567
Freq: D, Name: A, dtype: float64

### 세번째 행을 Selection 하기

In [26]:
df.head()

Unnamed: 0,A,B,C,D
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-06,-0.457921,-0.263354,1.425379,0.912822
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426
2018-03-08,-0.771939,0.925489,-0.148202,-0.220742


In [27]:
df.iloc[3] # index 3번째의 행 값을 출력

A   -1.516530
B   -0.303393
C   -1.190135
D    3.482426
Name: 2018-03-07 00:00:00, dtype: float64

In [28]:
df.iloc[3:5, 0:2] # 3월 7일부터 3월 8일 // AA, BB 인덱스의 값 출력

Unnamed: 0,A,B
2018-03-07,-1.51653,-0.303393
2018-03-08,-0.771939,0.925489


In [29]:
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,A,C
2018-03-05,1.55366,0.374747
2018-03-06,-0.457921,1.425379
2018-03-08,-0.771939,-0.148202


### *명시적으로 모든 행을 출력한다고 해줄 때는 범위가 없더라도 : 를 넣어준다

In [30]:
df.iloc[[1, 3], :]

Unnamed: 0,A,B,C,D
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426


### Boolean Indexing

DataFrame 에서 0 이상의 값을 찾아보기

In [31]:
df['A'] > 0

2018-03-04     True
2018-03-05     True
2018-03-06    False
2018-03-07    False
2018-03-08    False
2018-03-09    False
2018-03-10     True
2018-03-11    False
2018-03-12     True
2018-03-13     True
2018-03-14    False
2018-03-15    False
2018-03-16    False
2018-03-17     True
2018-03-18    False
2018-03-19     True
2018-03-20     True
2018-03-21     True
2018-03-22    False
2018-03-23     True
Freq: D, Name: A, dtype: bool

### AA 인덱스에서 True 값만 가져오기

In [32]:
mask = df['A'] > 0
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-10,0.310928,0.810005,-0.643031,-0.090067
2018-03-12,1.077967,0.519286,0.099999,0.63873
2018-03-13,0.593998,0.427791,0.557566,0.286848
2018-03-17,1.217606,0.395554,1.375085,0.682691
2018-03-19,1.735567,1.063777,-0.450182,2.312442
2018-03-20,0.571404,0.655685,-1.868817,-0.973287
2018-03-21,0.336343,1.214288,1.065086,2.14388
2018-03-23,0.350437,-1.057915,0.967686,-0.254394


### df2 로 복사하여 새로운 튜토리얼을 진행

In [33]:
df2 = df.copy()

In [34]:
df2['E'] = [
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
]
df2.head()

Unnamed: 0,A,B,C,D,E
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,one
2018-03-05,1.55366,0.180912,0.374747,-1.168331,two
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,three
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,four
2018-03-08,-0.771939,0.925489,-0.148202,-0.220742,five


### 'one', 'two' 에 해당하는 값만 가져오기

In [35]:
mask = df2.E.isin(['one', 'two'])
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,0.86855,0.361971,-1.215191,-0.605199
2018-03-05,1.55366,0.180912,0.374747,-1.168331
2018-03-09,-0.990552,1.7045,-1.447922,0.060468
2018-03-10,0.310928,0.810005,-0.643031,-0.090067
2018-03-14,-0.066579,0.171686,1.069365,0.822569
2018-03-15,-1.169535,-1.191682,0.169808,0.53488
2018-03-19,1.735567,1.063777,-0.450182,2.312442
2018-03-20,0.571404,0.655685,-1.868817,-0.973287


### 새로운 Series 를 정의하기

In [36]:
s1 = pd.Series(range(1, 21),
               index=pd.date_range(
                   '20180305', periods=20)
              ) # 의도적으로 3월 5일부터 작성
df['F'] = s1

In [37]:
df # 데이터프레임 F 를 생성했고, 값이 없으면 NaN 으로 출력됨을 확인

Unnamed: 0,A,B,C,D,F
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0
2018-03-08,-0.771939,0.925489,-0.148202,-0.220742,4.0
2018-03-09,-0.990552,1.7045,-1.447922,0.060468,5.0
2018-03-10,0.310928,0.810005,-0.643031,-0.090067,6.0
2018-03-11,-1.359726,0.635321,0.938998,0.135916,7.0
2018-03-12,1.077967,0.519286,0.099999,0.63873,8.0
2018-03-13,0.593998,0.427791,0.557566,0.286848,9.0


### 데이터프레임의 길이만큼 EE 인덱스의 값을 13으로 생성하시오

In [38]:
df.loc[:, 'E'] = [13] * len(df)
df

Unnamed: 0,A,B,C,D,F,E
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,,13
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0,13
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0,13
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0,13
2018-03-08,-0.771939,0.925489,-0.148202,-0.220742,4.0,13
2018-03-09,-0.990552,1.7045,-1.447922,0.060468,5.0,13
2018-03-10,0.310928,0.810005,-0.643031,-0.090067,6.0,13
2018-03-11,-1.359726,0.635321,0.938998,0.135916,7.0,13
2018-03-12,1.077967,0.519286,0.099999,0.63873,8.0,13
2018-03-13,0.593998,0.427791,0.557566,0.286848,9.0,13


### 데이터 프레임의 모든 수를 음수로 전환하기

In [39]:
df2 = df.copy()

In [40]:
df2[df2 > 0] = -df2 # 모든 수를 음수로 전환

In [41]:
df2.head()

Unnamed: 0,A,B,C,D,F,E
2018-03-04,-0.86855,-0.361971,-1.215191,-0.605199,,-13
2018-03-05,-1.55366,-0.180912,-0.374747,-1.168331,-1.0,-13
2018-03-06,-0.457921,-0.263354,-1.425379,-0.912822,-2.0,-13
2018-03-07,-1.51653,-0.303393,-1.190135,-3.482426,-3.0,-13
2018-03-08,-0.771939,-0.925489,-0.148202,-0.220742,-4.0,-13


## Missing Data

### reindex 를 사용하여 지정된 축의 index 를 변경/추가/삭제

In [42]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [43]:
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,,13,13
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0,13,13
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0,13,13
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0,13,13


### index[0], [1] 에 해당하는 E Column 의 값을 1로 변경하기

In [44]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,,1,1
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0,1,1
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0,13,13
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0,13,13


### NaN 에 해당하는 행을 모두 지우기

In [45]:
df1.dropna()

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0,1,1
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0,13,13
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0,13,13


### NaN 에 해당하는 값을 5로 대체하기

In [46]:
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,5.0,1,1
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0,1,1
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0,13,13
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0,13,13


### NaN 에 해당하는 값을 Boolean 값으로 표시하기

In [47]:
pd.isna(df)

Unnamed: 0,A,B,C,D,F,E
2018-03-04,False,False,False,False,True,False
2018-03-05,False,False,False,False,False,False
2018-03-06,False,False,False,False,False,False
2018-03-07,False,False,False,False,False,False
2018-03-08,False,False,False,False,False,False
2018-03-09,False,False,False,False,False,False
2018-03-10,False,False,False,False,False,False
2018-03-11,False,False,False,False,False,False
2018-03-12,False,False,False,False,False,False
2018-03-13,False,False,False,False,False,False


## Operations

In [48]:
df.mean()

A    -0.073625
B     0.272341
C     0.041342
D     0.429540
F    10.000000
E    13.000000
dtype: float64

In [49]:
df.mean(1)

2018-03-04    2.482026
2018-03-05    2.490165
2018-03-06    2.769488
2018-03-07    2.745395
2018-03-08    2.797434
2018-03-09    2.887749
2018-03-10    3.231306
2018-03-11    3.391752
2018-03-12    3.889330
2018-03-13    3.977700
2018-03-14    4.166174
2018-03-15    3.723912
2018-03-16    3.866297
2018-03-17    4.945156
2018-03-18    3.915306
2018-03-19    5.443601
2018-03-20    4.564164
2018-03-21    5.793266
2018-03-22    5.231142
2018-03-23    5.334302
Freq: D, dtype: float64

In [50]:
pd.Series([1, 3, 5, np.nan, 6, 8], index=dates[:6])

2018-03-04    1.0
2018-03-05    3.0
2018-03-06    5.0
2018-03-07    NaN
2018-03-08    6.0
2018-03-09    8.0
Freq: D, dtype: float64

In [51]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates[:6].shift(2))
s

2018-03-06    1.0
2018-03-07    3.0
2018-03-08    5.0
2018-03-09    NaN
2018-03-10    6.0
2018-03-11    8.0
Freq: D, dtype: float64

In [52]:
df.sub(s, axis='index').dropna()

Unnamed: 0,A,B,C,D,F,E
2018-03-06,-1.457921,-1.263354,0.425379,-0.087178,1.0,12.0
2018-03-07,-4.51653,-3.303393,-4.190135,0.482426,0.0,10.0
2018-03-08,-5.771939,-4.074511,-5.148202,-5.220742,-1.0,8.0
2018-03-10,-5.689072,-5.189995,-6.643031,-6.090067,0.0,7.0
2018-03-11,-9.359726,-7.364679,-7.061002,-7.864084,-1.0,5.0


## Apply

In [53]:
df.head(10)

Unnamed: 0,A,B,C,D,F,E
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,,13
2018-03-05,1.55366,0.180912,0.374747,-1.168331,1.0,13
2018-03-06,-0.457921,-0.263354,1.425379,0.912822,2.0,13
2018-03-07,-1.51653,-0.303393,-1.190135,3.482426,3.0,13
2018-03-08,-0.771939,0.925489,-0.148202,-0.220742,4.0,13
2018-03-09,-0.990552,1.7045,-1.447922,0.060468,5.0,13
2018-03-10,0.310928,0.810005,-0.643031,-0.090067,6.0,13
2018-03-11,-1.359726,0.635321,0.938998,0.135916,7.0,13
2018-03-12,1.077967,0.519286,0.099999,0.63873,8.0,13
2018-03-13,0.593998,0.427791,0.557566,0.286848,9.0,13


In [54]:
df.apply(np.cumsum).head()

Unnamed: 0,A,B,C,D,F,E
2018-03-04,0.86855,0.361971,-1.215191,-0.605199,,13
2018-03-05,2.42221,0.542883,-0.840444,-1.77353,1.0,26
2018-03-06,1.964289,0.279529,0.584935,-0.860708,3.0,39
2018-03-07,0.44776,-0.023864,-0.6052,2.621718,6.0,52
2018-03-08,-0.32418,0.901625,-0.753402,2.400976,10.0,65


In [55]:
def my_cumsum(col):
    return pd.Series([1, 2, 3])

df.apply(my_cumsum)

Unnamed: 0,A,B,C,D,F,E
0,1,1,1,1,1,1
1,2,2,2,2,2,2
2,3,3,3,3,3,3


## Histogramming

value_counts () Series 메서드와 최상위 함수는 1D 값 배열의 막대 그래프 계산

또한 일반 배열에서 함수로 사용

In [57]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [59]:
s

0    5
1    0
2    6
3    3
4    3
5    3
6    4
7    4
8    3
9    2
dtype: int64

In [61]:
s.value_counts()

3    4
4    2
6    1
5    1
2    1
0    1
dtype: int64

## String Methods

In [64]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [65]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [66]:
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [67]:
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    4.0
7    3.0
8    3.0
dtype: float64