# 数据规整方面的话题
## 时间序列以及截面对齐

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [12]:
# Directory holding the sample CSV files; change this one value to run the
# notebook from another machine or checkout.
DATA_DIR = '/Users/wonderful/Desktop'

# Load the raw tables. No index column is specified, so the first (unnamed)
# CSV column arrives as a regular column called 'Unnamed: 0'.
prices = pd.read_csv(DATA_DIR + '/stock_px.csv')
volume = pd.read_csv(DATA_DIR + '/volume.csv')

In [13]:
# Use the values of the unnamed first column as the row labels.
prices.index = pd.Index(prices['Unnamed: 0'])

In [14]:
# Clear the index name inherited from the 'Unnamed: 0' column.
# Assigning None is used instead of `del`: on pandas versions where
# Index.name is a property without a deleter, `del` raises AttributeError.
prices.index.name = None

In [15]:
# Keep four tickers and the rows labelled '2011-09-06' through '2011-09-14'.
# .loc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0.
# NOTE(review): the CSV is read without date parsing, so the labels look like
# text ('2011-09-06 00:00:00' in the displayed output) and the slice bounds
# are compared as strings -- confirm the end bound selects the intended rows.
prices = prices.loc['2011-09-06':'2011-09-14', ['AAPL', 'JNJ', 'SPX', 'XOM']]

In [16]:
# Display the selected price rows and columns.
prices

Unnamed: 0,AAPL,JNJ,SPX,XOM
2011-09-06 00:00:00,379.74,64.64,1165.24,71.15
2011-09-07 00:00:00,383.93,65.43,1198.62,73.65
2011-09-08 00:00:00,384.14,64.95,1185.9,72.82
2011-09-09 00:00:00,377.48,63.64,1154.23,71.01
2011-09-12 00:00:00,379.94,63.59,1162.27,71.84
2011-09-13 00:00:00,384.62,63.61,1172.87,71.65


In [17]:
# Index volume by its unnamed first column, mirroring the prices frame.
volume.index = volume['Unnamed: 0']
# Clear the inherited index name; assigning None works on pandas versions
# where `del volume.index.name` raises AttributeError.
volume.index.name = None
# Keep three tickers (no SPX) and a shorter date window than prices.
# .loc replaces .ix, which was removed in pandas 1.0.
# NOTE(review): labels appear to be strings, so the bounds are compared as
# text -- confirm the end bound selects the intended rows.
volume = volume.loc['2011-09-06':'2011-09-12', ['AAPL', 'JNJ', 'XOM']]
volume

Unnamed: 0,AAPL,JNJ,XOM
2011-09-06 00:00:00,18173500.0,15848300.0,25416300.0
2011-09-07 00:00:00,12492000.0,10759700.0,23108400.0
2011-09-08 00:00:00,14839800.0,15551500.0,22434800.0
2011-09-09 00:00:00,20171900.0,17008200.0,27969100.0


In [18]:
# Element-wise product; the operands are aligned on both row and column
# labels, and any label present in only one frame yields NaN (the SPX column
# and the last two dates in the output).
prices * volume

Unnamed: 0,AAPL,JNJ,SPX,XOM
2011-09-06 00:00:00,6901205000.0,1024434000.0,,1808370000.0
2011-09-07 00:00:00,4796054000.0,704007200.0,,1701934000.0
2011-09-08 00:00:00,5700561000.0,1010070000.0,,1633702000.0
2011-09-09 00:00:00,7614489000.0,1082402000.0,,1986086000.0
2011-09-12 00:00:00,,,,
2011-09-13 00:00:00,,,,


In [20]:
# Volume-weighted average price per ticker:
# summed (price x volume) divided by summed volume.
traded_value = prices * volume
vwap = traded_value.sum() / volume.sum()

In [21]:
# Display the per-ticker result; SPX has no volume column, so it is NaN.
vwap

AAPL    380.837003
JNJ      64.577684
SPX            NaN
XOM      72.073105
dtype: float64

In [22]:
# Drop the NaN entry (SPX) so only tickers with a computed value remain.
vwap.dropna()

AAPL    380.837003
JNJ      64.577684
XOM      72.073105
dtype: float64

In [23]:
# align returns a tuple with both frames reindexed to the same labels;
# join='inner' keeps only the rows and columns the two frames share.
prices.align(volume, join='inner')

(                       AAPL    JNJ    XOM
 2011-09-06 00:00:00  379.74  64.64  71.15
 2011-09-07 00:00:00  383.93  65.43  73.65
 2011-09-08 00:00:00  384.14  64.95  72.82
 2011-09-09 00:00:00  377.48  63.64  71.01,
                            AAPL         JNJ         XOM
 2011-09-06 00:00:00  18173500.0  15848300.0  25416300.0
 2011-09-07 00:00:00  12492000.0  10759700.0  23108400.0
 2011-09-08 00:00:00  14839800.0  15551500.0  22434800.0
 2011-09-09 00:00:00  20171900.0  17008200.0  27969100.0)

In [24]:
# Three small integer Series whose indexes only partly overlap.
s1 = Series([0, 1, 2], index=list('abc'))
s2 = Series([0, 1, 2, 3], index=list('dbce'))
s3 = Series([0, 1, 2], index=list('fac'))

In [27]:
# Building a frame from several Series uses the union of their indexes;
# positions a Series does not cover are filled with NaN.
series_by_column = {'one': s1, 'two': s2, 'three': s3}
DataFrame(series_by_column)

Unnamed: 0,one,three,two
a,0.0,1.0,
b,1.0,,1.0
c,2.0,2.0,2.0
d,,,0.0
e,,,3.0
f,,0.0,


In [28]:
# Passing index= restricts the result to the listed labels, in that order.
DataFrame(
    {'one': s1, 'two': s2, 'three': s3},
    index=['f', 'a', 'c', 'e'],
)

Unnamed: 0,one,three,two
f,,0.0,
a,0.0,1.0,
c,2.0,2.0,2.0
e,,,3.0


## 频率不同的时间序列的运算

In [3]:
# Three random values on consecutive Wednesdays starting 2012-06-13.
# The values are unseeded, so they differ on every run.
weekly_wednesdays = pd.date_range('2012-6-13', periods=3, freq='W-WED')
ts1 = Series(np.random.randn(3), index=weekly_wednesdays)

In [4]:
# Display the weekly (W-WED) series.
ts1

2012-06-13    0.843815
2012-06-20   -0.286122
2012-06-27   -0.948088
Freq: W-WED, dtype: float64

In [6]:
# Upsample to business-day frequency; days without an observation are NaN.
ts1.resample('B').mean()

2012-06-13    0.843815
2012-06-14         NaN
2012-06-15         NaN
2012-06-18         NaN
2012-06-19         NaN
2012-06-20   -0.286122
2012-06-21         NaN
2012-06-22         NaN
2012-06-25         NaN
2012-06-26         NaN
2012-06-27   -0.948088
Freq: B, dtype: float64

In [7]:
# Same upsampling, but carry each observed value forward over the new
# business days until the next observation.
ts1.resample('B').ffill()

2012-06-13    0.843815
2012-06-14    0.843815
2012-06-15    0.843815
2012-06-18    0.843815
2012-06-19    0.843815
2012-06-20   -0.286122
2012-06-21   -0.286122
2012-06-22   -0.286122
2012-06-25   -0.286122
2012-06-26   -0.286122
2012-06-27   -0.948088
Freq: B, dtype: float64

In [9]:
# Irregularly spaced dates that do not line up with ts1's weekly index.
irregular_days = [
    '2012-6-12', '2012-6-17', '2012-6-18',
    '2012-6-21', '2012-6-22', '2012-6-29',
]
dates = pd.DatetimeIndex(irregular_days)

In [10]:
# Six random values, one per irregular date (unseeded, so they vary by run).
random_values = np.random.randn(6)
ts2 = Series(random_values, index=dates)

In [11]:
# Display the irregularly dated series.
ts2

2012-06-12   -0.460654
2012-06-17    0.060953
2012-06-18    0.283837
2012-06-21    1.987989
2012-06-22    0.406685
2012-06-29   -2.261984
dtype: float64

In [13]:
# Display ts1 again for comparison with ts2's dates.
ts1

2012-06-13    0.843815
2012-06-20   -0.286122
2012-06-27   -0.948088
Freq: W-WED, dtype: float64

In [12]:
# Look up ts1 at ts2's dates, carrying the most recent earlier value forward.
# 2012-06-12 precedes ts1's first date, so nothing can be filled there (NaN).
ts1.reindex(ts2.index, method='ffill')

2012-06-12         NaN
2012-06-17    0.843815
2012-06-18    0.843815
2012-06-21   -0.286122
2012-06-22   -0.286122
2012-06-29   -0.948088
dtype: float64

In [14]:
# Add the two series on ts2's dates: first forward-fill ts1 onto ts2's
# index, then sum element-wise.
ts1_on_ts2_dates = ts1.reindex(ts2.index, method='ffill')
ts2 + ts1_on_ts2_dates

2012-06-12         NaN
2012-06-17    0.904768
2012-06-18    1.127652
2012-06-21    1.701867
2012-06-22    0.120563
2012-06-29   -3.210073
dtype: float64

### 使用 Period

In [16]:
# Quarterly values on a Q-SEP period index (seven quarters from 1984Q2).
gdp_quarters = pd.period_range('1984Q2', periods=7, freq='Q-SEP')
gdp = Series(
    [1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
    index=gdp_quarters,
)

In [17]:
# Annual values on an A-DEC period index (four years from 1982).
infl_years = pd.period_range('1982', periods=4, freq='A-DEC')
infl = Series([0.025, 0.045, 0.037, 0.04], index=infl_years)

In [18]:
# Display the quarterly series.
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [19]:
# Display the annual series.
infl

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

In [20]:
# Convert each annual period to the Q-SEP quarter at the end of that year,
# so it can be matched against gdp's quarterly index.
infl_q = infl.asfreq('Q-SEP', how='end')

In [21]:
# Display the converted series: one Q-SEP quarter per original year.
infl_q

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64

In [27]:
# Put the once-a-year values on gdp's quarterly index, carrying each value
# forward until the next one is available.
infl_q.reindex(gdp.index, method='ffill')

1984Q2    0.045
1984Q3    0.045
1984Q4    0.045
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

## 时间和“最当前”数据选取

In [28]:
# One timestamp per minute from 09:30 through 15:59 on 2012-06-01.
# 'min' is used for the minutely frequency because the older 'T' alias is
# deprecated in pandas 2.2+; 'min' is accepted by old and new versions alike.
rng = pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='min')

In [30]:
# Display the index: 390 minutely timestamps for a single day.
rng

DatetimeIndex(['2012-06-01 09:30:00', '2012-06-01 09:31:00',
               '2012-06-01 09:32:00', '2012-06-01 09:33:00',
               '2012-06-01 09:34:00', '2012-06-01 09:35:00',
               '2012-06-01 09:36:00', '2012-06-01 09:37:00',
               '2012-06-01 09:38:00', '2012-06-01 09:39:00',
               ...
               '2012-06-01 15:50:00', '2012-06-01 15:51:00',
               '2012-06-01 15:52:00', '2012-06-01 15:53:00',
               '2012-06-01 15:54:00', '2012-06-01 15:55:00',
               '2012-06-01 15:56:00', '2012-06-01 15:57:00',
               '2012-06-01 15:58:00', '2012-06-01 15:59:00'],
              dtype='datetime64[ns]', length=390, freq='T')

In [31]:
# Repeat the same intraday minutes on each of the next three business days
# and append them to the original day.
# Re-running this cell on its own appends again, because rng is rebound to
# the longer index.
following_days = [rng + pd.offsets.BDay(day_offset) for day_offset in range(1, 4)]
rng = rng.append(following_days)

In [32]:
# Display the extended index; it now runs through 2012-06-06 (1560 entries).
rng

DatetimeIndex(['2012-06-01 09:30:00', '2012-06-01 09:31:00',
               '2012-06-01 09:32:00', '2012-06-01 09:33:00',
               '2012-06-01 09:34:00', '2012-06-01 09:35:00',
               '2012-06-01 09:36:00', '2012-06-01 09:37:00',
               '2012-06-01 09:38:00', '2012-06-01 09:39:00',
               ...
               '2012-06-06 15:50:00', '2012-06-06 15:51:00',
               '2012-06-06 15:52:00', '2012-06-06 15:53:00',
               '2012-06-06 15:54:00', '2012-06-06 15:55:00',
               '2012-06-06 15:56:00', '2012-06-06 15:57:00',
               '2012-06-06 15:58:00', '2012-06-06 15:59:00'],
              dtype='datetime64[ns]', length=1560, freq=None)

In [33]:
# A float series counting 0.0, 1.0, ... with one value per timestamp in rng.
running_count = np.arange(len(rng), dtype=float)
ts = Series(running_count, index=rng)

In [34]:
# Display the series (long output is elided in the middle).
ts

2012-06-01 09:30:00       0.0
2012-06-01 09:31:00       1.0
2012-06-01 09:32:00       2.0
2012-06-01 09:33:00       3.0
2012-06-01 09:34:00       4.0
2012-06-01 09:35:00       5.0
2012-06-01 09:36:00       6.0
2012-06-01 09:37:00       7.0
2012-06-01 09:38:00       8.0
2012-06-01 09:39:00       9.0
2012-06-01 09:40:00      10.0
2012-06-01 09:41:00      11.0
2012-06-01 09:42:00      12.0
2012-06-01 09:43:00      13.0
2012-06-01 09:44:00      14.0
2012-06-01 09:45:00      15.0
2012-06-01 09:46:00      16.0
2012-06-01 09:47:00      17.0
2012-06-01 09:48:00      18.0
2012-06-01 09:49:00      19.0
2012-06-01 09:50:00      20.0
2012-06-01 09:51:00      21.0
2012-06-01 09:52:00      22.0
2012-06-01 09:53:00      23.0
2012-06-01 09:54:00      24.0
2012-06-01 09:55:00      25.0
2012-06-01 09:56:00      26.0
2012-06-01 09:57:00      27.0
2012-06-01 09:58:00      28.0
2012-06-01 09:59:00      29.0
                        ...  
2012-06-06 15:30:00    1530.0
2012-06-06 15:31:00    1531.0
2012-06-06

In [35]:
from datetime import time

In [36]:
# Indexing with a datetime.time picks the rows at that time of day on
# every date in the index.
ts[time(10, 0)]

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [37]:
# at_time gives the same selection as indexing with the time object.
ts.at_time(time(10, 0))

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [40]:
# Same time-of-day lookup for 10:59: one row per date.
ts[time(10, 59)]

2012-06-01 10:59:00      89.0
2012-06-04 10:59:00     479.0
2012-06-05 10:59:00     869.0
2012-06-06 10:59:00    1259.0
dtype: float64

In [41]:
# between_time selects rows whose time of day lies between the two given
# times on every date; both 10:00 and 10:01 appear in the output.
ts.between_time(time(10, 0), time(10, 1))

2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64