In [1]:
import pandas as pd
import numpy as np

### pandas 의 버전 확인

참고자료:

`10 minutes to pandas`, `askdjango`, `구글링`

In [2]:
pd.__version__

'0.23.4'

# 10 Minutes to pandas 튜토리얼

### EP1: Object creation(객체 생성)

In [3]:
s = pd.Series([1,3,5,np.nan,6,8]) #NaN = Not a Number
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### date_range 를 사용하여 날짜 생성

In [4]:
dates = pd.date_range('20180304', periods=20)
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array) 를 생성

In [5]:
np.random.randn(6,4) # 6행 4열의 2차원 랜덤한 array 를 생성

array([[-1.00958463, -1.01327331,  0.65304157, -0.19024967],
       [ 0.24298281,  0.58551333, -1.52570415,  0.00767151],
       [ 0.53416739,  1.73817907,  2.42428627, -0.27299232],
       [-1.02824385, -2.44625257,  2.45087941,  0.31765465],
       [-0.93699251, -0.02778373,  0.6471208 ,  0.66631256],
       [ 1.218634  ,  0.71721159, -0.43344963, -0.40049374]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
# 2차원 데이터 구조인 DataFrame 으로 생성

pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2', '인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,-0.961174,0.857967,-1.231182,0.637483
인덱스1,-0.804291,-0.751086,1.035868,-0.627007
인덱스2,-2.71827,-0.358422,0.052701,-0.015055
인덱스3,0.026807,0.648742,0.169522,0.748975
인덱스4,0.17856,0.630651,-2.267587,1.159425
인덱스5,-0.803274,0.461349,-0.986126,-0.180598


# 만약 위에서 생성한 dates를 index 로 넣게 된다면?

행이 맞지 않기 때문에 `ValueError: Shape of passed values is (4, 6), indices imply (4, 20)` 오류가 발생


### 꼭 values 의 길이를 일치해야 에러없이 데이터 테이블이 생성된다

#### + 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head 를 넣어주자

In [7]:
df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=['A','B','C','D'])

print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-06,-1.12848,-0.083697,0.819332,0.559748
2018-03-07,0.696405,0.216876,0.434205,0.74629
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115


### 직렬로 변환할 수 있는 object 를 전달하여 DataFrame 생성하기

In [8]:
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes # 스마트하게 각 인덱스의 데이터타입을 출력해준다

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EP2: Viewing Data

----------------------

Data의 값을 찾고 해당 데이터의 변경을 해본다

```
|        | columns1 | columns2 | columns3 | columns4 |
|--------|----------|----------|----------|----------|
| index1 |          |          |          |          |
| index2 |          |          |          |          |
| index3 |          |          |          |          |
```

### index 값 찾기

In [10]:
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns 의 값 찾기/변경하기

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.columns = ['A', 'B', 'C', 'D']

### DataFrame의 요약 정보를 알아보기

In [13]:
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,0.175139,0.090448,0.044829,-0.248372
std,0.884672,0.665181,1.078673,1.088915
min,-1.488785,-1.539896,-2.444986,-2.641699
25%,-0.509538,-0.160397,-0.538561,-0.751864
50%,0.086258,0.087223,0.168976,-0.325869
75%,0.867649,0.439049,0.798387,0.606678
max,1.629717,1.680005,1.902692,1.625021


### index 와 column 을 바꿔주는 `.T`

In [14]:
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
A,1.341755,-1.488785,-1.12848,0.696405,-0.683227,0.116714,0.018328,1.196211,-1.067965,-0.030167,0.882335,0.055802,1.629717,0.862753,-0.517246,-0.506969,1.157695,0.377131,-0.086612,0.677391
B,-0.102368,0.859913,-0.083697,0.216876,-0.507309,0.708181,0.613582,0.564046,-0.185638,-0.371161,0.118667,0.057446,-1.539896,-0.151984,1.680005,-0.793175,0.309981,0.397384,-0.09889,0.117001
C,-0.098966,-2.444986,0.819332,0.434205,-0.096254,0.791406,-0.510919,-1.40846,0.65985,0.606243,-0.358656,-1.459501,0.588722,-0.926197,1.902692,1.048097,-0.420172,1.119139,1.272491,-0.621488
D,-1.665442,-0.11219,0.559748,0.74629,-0.550115,-0.383858,-0.194982,-0.608706,1.398305,-2.641699,1.625021,-0.267881,0.62077,0.601981,-0.899572,-0.702628,-1.783902,-1.184511,-0.414775,0.890709


### 원하는 index 기준으로 다시 정렬하기

In [15]:
df.sort_index(ascending=False).head()

Unnamed: 0,A,B,C,D
2018-03-23,0.677391,0.117001,-0.621488,0.890709
2018-03-22,-0.086612,-0.09889,1.272491,-0.414775
2018-03-21,0.377131,0.397384,1.119139,-1.184511
2018-03-20,1.157695,0.309981,-0.420172,-1.783902
2018-03-19,-0.506969,-0.793175,1.048097,-0.702628


### 특정 column을 기준으로 정렬하기

df.sort_values('<기준이 되는 column>')

In [16]:
df.sort_values('C').head()

Unnamed: 0,A,B,C,D
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-15,0.055802,0.057446,-1.459501,-0.267881
2018-03-11,1.196211,0.564046,-1.40846,-0.608706
2018-03-17,0.862753,-0.151984,-0.926197,0.601981
2018-03-23,0.677391,0.117001,-0.621488,0.890709


## EP3: Selection

특정 Column 을 선택해서 데이터를 확인

2가지 방법은 모두 동일한 결과값을 가지지만 `Column명이 "D-D" ` 와 같은 경우에는 마이너스(-) _연산자로 인식하여 (데이터 - 데이터)로 계산을 한다_

In [17]:
df['D'].head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04   -1.665442
2018-03-05   -0.112190
2018-03-06    0.559748
Freq: D, Name: D, dtype: float64

In [18]:
df.D.head(3) # DataFrame 'D'의 데이터 가져오기(1)

2018-03-04   -1.665442
2018-03-05   -0.112190
2018-03-06    0.559748
Freq: D, Name: D, dtype: float64

### 슬라이싱하기

#### DataFrame 에서 `0이상~3미만` 값을 출력하기

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-06,-1.12848,-0.083697,0.819332,0.559748


#### DataFrame 에서 `A이상 F이하`의 값을 출력하기

In [20]:
df['2018-03-04': '2018-03-06']

Unnamed: 0,A,B,C,D
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-06,-1.12848,-0.083697,0.819332,0.559748


#### 위에서 선언한 변수에 0번째 index 들의 값 출력하기

`dates = pd.date_range('20180304', periods=20)`


In [21]:
df.loc[dates[0]]

A    1.341755
B   -0.102368
C   -0.098966
D   -1.665442
Name: 2018-03-04 00:00:00, dtype: float64

In [22]:
df.loc[:, ['A', 'D']].head() # 특정 index 의 값을 찾기

Unnamed: 0,A,D
2018-03-04,1.341755,-1.665442
2018-03-05,-1.488785,-0.11219
2018-03-06,-1.12848,0.559748
2018-03-07,0.696405,0.74629
2018-03-08,-0.683227,-0.550115


In [23]:
df.loc['20180311': '20180316', ['A', 'C']]

Unnamed: 0,A,C
2018-03-11,1.196211,-1.40846
2018-03-12,-1.067965,0.65985
2018-03-13,-0.030167,0.606243
2018-03-14,0.882335,-0.358656
2018-03-15,0.055802,-1.459501
2018-03-16,1.629717,0.588722


In [24]:
# 1개의 행만 지정했기 때문에 Series 로 출력
df.loc['20180311', ['A', 'C']]

A    1.196211
C   -1.408460
Name: 2018-03-11 00:00:00, dtype: float64

In [25]:
# 행이던 열이던 한줄의 데이터 범위를 선택하면 Series 이다.
df.loc['20180311':'20180319', 'A']

2018-03-11    1.196211
2018-03-12   -1.067965
2018-03-13   -0.030167
2018-03-14    0.882335
2018-03-15    0.055802
2018-03-16    1.629717
2018-03-17    0.862753
2018-03-18   -0.517246
2018-03-19   -0.506969
Freq: D, Name: A, dtype: float64

### 세번째 행을 Selection 하기

In [26]:
df.head()

Unnamed: 0,A,B,C,D
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-06,-1.12848,-0.083697,0.819332,0.559748
2018-03-07,0.696405,0.216876,0.434205,0.74629
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115


In [27]:
df.iloc[3] # index 3번째의 행 값을 출력

A    0.696405
B    0.216876
C    0.434205
D    0.746290
Name: 2018-03-07 00:00:00, dtype: float64

In [28]:
df.iloc[3:5, 0:2] # 3월 7일부터 3월 8일 // AA, BB 인덱스의 값 출력

Unnamed: 0,A,B
2018-03-07,0.696405,0.216876
2018-03-08,-0.683227,-0.507309


In [29]:
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,A,C
2018-03-05,-1.488785,-2.444986
2018-03-06,-1.12848,0.819332
2018-03-08,-0.683227,-0.096254


### *명시적으로 모든 행을 출력한다고 해줄 때는 범위가 없더라도 : 를 넣어준다

In [30]:
df.iloc[[1, 3], :]

Unnamed: 0,A,B,C,D
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-07,0.696405,0.216876,0.434205,0.74629


### Boolean Indexing

DataFrame 에서 0 이상의 값을 찾아보기

In [31]:
df['A'] > 0

2018-03-04     True
2018-03-05    False
2018-03-06    False
2018-03-07     True
2018-03-08    False
2018-03-09     True
2018-03-10     True
2018-03-11     True
2018-03-12    False
2018-03-13    False
2018-03-14     True
2018-03-15     True
2018-03-16     True
2018-03-17     True
2018-03-18    False
2018-03-19    False
2018-03-20     True
2018-03-21     True
2018-03-22    False
2018-03-23     True
Freq: D, Name: A, dtype: bool

### AA 인덱스에서 True 값만 가져오기

In [32]:
mask = df['A'] > 0
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442
2018-03-07,0.696405,0.216876,0.434205,0.74629
2018-03-09,0.116714,0.708181,0.791406,-0.383858
2018-03-10,0.018328,0.613582,-0.510919,-0.194982
2018-03-11,1.196211,0.564046,-1.40846,-0.608706
2018-03-14,0.882335,0.118667,-0.358656,1.625021
2018-03-15,0.055802,0.057446,-1.459501,-0.267881
2018-03-16,1.629717,-1.539896,0.588722,0.62077
2018-03-17,0.862753,-0.151984,-0.926197,0.601981
2018-03-20,1.157695,0.309981,-0.420172,-1.783902


### df2 로 복사하여 새로운 튜토리얼을 진행

In [33]:
df2 = df.copy()

In [34]:
df2['E'] = [
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
]
df2.head()

Unnamed: 0,A,B,C,D,E
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,one
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,two
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,three
2018-03-07,0.696405,0.216876,0.434205,0.74629,four
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115,five


### 'one', 'two' 에 해당하는 값만 가져오기

In [35]:
mask = df2.E.isin(['one', 'two'])
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219
2018-03-09,0.116714,0.708181,0.791406,-0.383858
2018-03-10,0.018328,0.613582,-0.510919,-0.194982
2018-03-14,0.882335,0.118667,-0.358656,1.625021
2018-03-15,0.055802,0.057446,-1.459501,-0.267881
2018-03-19,-0.506969,-0.793175,1.048097,-0.702628
2018-03-20,1.157695,0.309981,-0.420172,-1.783902


### 새로운 Series 를 정의하기

In [36]:
s1 = pd.Series(range(1, 21),
               index=pd.date_range(
                   '20180305', periods=20)
              ) # 의도적으로 3월 5일부터 작성
df['F'] = s1

In [37]:
df # 데이터프레임 F 를 생성했고, 값이 없으면 NaN 으로 출력됨을 확인

Unnamed: 0,A,B,C,D,F
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115,4.0
2018-03-09,0.116714,0.708181,0.791406,-0.383858,5.0
2018-03-10,0.018328,0.613582,-0.510919,-0.194982,6.0
2018-03-11,1.196211,0.564046,-1.40846,-0.608706,7.0
2018-03-12,-1.067965,-0.185638,0.65985,1.398305,8.0
2018-03-13,-0.030167,-0.371161,0.606243,-2.641699,9.0


### 데이터프레임의 길이만큼 EE 인덱스의 값을 13으로 생성하시오

In [38]:
df.loc[:, 'E'] = [13] * len(df)
df

Unnamed: 0,A,B,C,D,F,E
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,,13
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0,13
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0,13
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0,13
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115,4.0,13
2018-03-09,0.116714,0.708181,0.791406,-0.383858,5.0,13
2018-03-10,0.018328,0.613582,-0.510919,-0.194982,6.0,13
2018-03-11,1.196211,0.564046,-1.40846,-0.608706,7.0,13
2018-03-12,-1.067965,-0.185638,0.65985,1.398305,8.0,13
2018-03-13,-0.030167,-0.371161,0.606243,-2.641699,9.0,13


### 데이터 프레임의 모든 수를 음수로 전환하기

In [39]:
df2 = df.copy()

In [40]:
df2[df2 > 0] = -df2 # 모든 수를 음수로 전환

In [41]:
df2.head()

Unnamed: 0,A,B,C,D,F,E
2018-03-04,-1.341755,-0.102368,-0.098966,-1.665442,,-13
2018-03-05,-1.488785,-0.859913,-2.444986,-0.11219,-1.0,-13
2018-03-06,-1.12848,-0.083697,-0.819332,-0.559748,-2.0,-13
2018-03-07,-0.696405,-0.216876,-0.434205,-0.74629,-3.0,-13
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115,-4.0,-13


## Missing Data

### reindex 를 사용하여 지정된 축의 index 를 변경/추가/삭제

In [42]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [43]:
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,,13,13
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0,13,13
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0,13,13
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0,13,13


### index[0], [1] 에 해당하는 E Column 의 값을 1로 변경하기

In [44]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,,1,1
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0,1,1
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0,13,13
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0,13,13


### NaN 에 해당하는 행을 모두 지우기

In [45]:
df1.dropna()

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0,1,1
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0,13,13
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0,13,13


### NaN 에 해당하는 값을 5로 대체하기

In [46]:
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E,E.1
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,5.0,1,1
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0,1,1
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0,13,13
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0,13,13


### NaN 에 해당하는 값을 Boolean 값으로 표시하기

In [47]:
pd.isna(df)

Unnamed: 0,A,B,C,D,F,E
2018-03-04,False,False,False,False,True,False
2018-03-05,False,False,False,False,False,False
2018-03-06,False,False,False,False,False,False
2018-03-07,False,False,False,False,False,False
2018-03-08,False,False,False,False,False,False
2018-03-09,False,False,False,False,False,False
2018-03-10,False,False,False,False,False,False
2018-03-11,False,False,False,False,False,False
2018-03-12,False,False,False,False,False,False
2018-03-13,False,False,False,False,False,False


## Operations

In [48]:
df.mean()

A     0.175139
B     0.090448
C     0.044829
D    -0.248372
F    10.000000
E    13.000000
dtype: float64

In [49]:
df.mean(1)

2018-03-04    2.494996
2018-03-05    1.802325
2018-03-06    2.527817
2018-03-07    3.015630
2018-03-08    2.527183
2018-03-09    3.205407
2018-03-10    3.154335
2018-03-11    3.290515
2018-03-12    3.634092
2018-03-13    3.260536
2018-03-14    4.211228
2018-03-15    3.730978
2018-03-16    4.383219
2018-03-17    4.397759
2018-03-18    4.860980
2018-03-19    4.507554
2018-03-20    4.710600
2018-03-21    5.118190
2018-03-22    5.278702
2018-03-23    5.510602
Freq: D, dtype: float64

In [50]:
pd.Series([1, 3, 5, np.nan, 6, 8], index=dates[:6])

2018-03-04    1.0
2018-03-05    3.0
2018-03-06    5.0
2018-03-07    NaN
2018-03-08    6.0
2018-03-09    8.0
Freq: D, dtype: float64

In [51]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates[:6].shift(2))
s

2018-03-06    1.0
2018-03-07    3.0
2018-03-08    5.0
2018-03-09    NaN
2018-03-10    6.0
2018-03-11    8.0
Freq: D, dtype: float64

In [52]:
df.sub(s, axis='index').dropna()

Unnamed: 0,A,B,C,D,F,E
2018-03-06,-2.12848,-1.083697,-0.180668,-0.440252,1.0,12.0
2018-03-07,-2.303595,-2.783124,-2.565795,-2.25371,0.0,10.0
2018-03-08,-5.683227,-5.507309,-5.096254,-5.550115,-1.0,8.0
2018-03-10,-5.981672,-5.386418,-6.510919,-6.194982,0.0,7.0
2018-03-11,-6.803789,-7.435954,-9.40846,-8.608706,-1.0,5.0


## Apply

In [54]:
df.head(10)

Unnamed: 0,A,B,C,D,F,E
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,,13
2018-03-05,-1.488785,0.859913,-2.444986,-0.11219,1.0,13
2018-03-06,-1.12848,-0.083697,0.819332,0.559748,2.0,13
2018-03-07,0.696405,0.216876,0.434205,0.74629,3.0,13
2018-03-08,-0.683227,-0.507309,-0.096254,-0.550115,4.0,13
2018-03-09,0.116714,0.708181,0.791406,-0.383858,5.0,13
2018-03-10,0.018328,0.613582,-0.510919,-0.194982,6.0,13
2018-03-11,1.196211,0.564046,-1.40846,-0.608706,7.0,13
2018-03-12,-1.067965,-0.185638,0.65985,1.398305,8.0,13
2018-03-13,-0.030167,-0.371161,0.606243,-2.641699,9.0,13


In [56]:
df.apply(np.cumsum).head()

Unnamed: 0,A,B,C,D,F,E
2018-03-04,1.341755,-0.102368,-0.098966,-1.665442,,13
2018-03-05,-0.14703,0.757545,-2.543952,-1.777632,1.0,26
2018-03-06,-1.275511,0.673849,-1.72462,-1.217883,3.0,39
2018-03-07,-0.579105,0.890725,-1.290415,-0.471593,6.0,52
2018-03-08,-1.262332,0.383416,-1.386669,-1.021708,10.0,65


In [57]:
def my_cumsum(col):
    return pd.Series([1, 2, 3])

df.apply(my_cumsum)

Unnamed: 0,A,B,C,D,F,E
0,1,1,1,1,1,1
1,2,2,2,2,2,2
2,3,3,3,3,3,3
