In [None]:
'''
时间数据重采样
重采样（resampling）：
将时间序列从一个频率转换到另一个频率的过程
例如：原来是从每天采集 现在变为每周采集
groupby：将相同组的数据进行聚合操作

groupby split  apply combine
1、拆分相同组的数据
2、将数据在函数上操作
3、将数据组合起来

高频率->低频率 downsampling 样本数很多，转换为低频 样本数变少
低频率->高频率 upsampling

'''
'''
pandas中resample方法实现重采样
1、调用resample(freq)生成resample对象（必须指定目标频率freq）
2、在resample对象上调用聚合方法得到结果，例如：

resample(freq).sum()
resample(freq).mean()
'''


In [5]:
import pandas as pd
import numpy as np

# Daily DatetimeIndex: 100 consecutive days beginning 2017-01-01.
date_rng = pd.date_range(start='20170101', periods=100, freq='D')
# One integer per day (0..99) so aggregated results are easy to check by hand.
day_numbers = range(date_rng.size)
ser_obj = pd.Series(day_numbers, index=date_rng)
# Preview the first ten observations.
print(ser_obj.iloc[:10])


2017-01-01     0
2017-01-02     1
2017-01-03     2
2017-01-04     3
2017-01-05     4
2017-01-06     5
2017-01-07     6
2017-01-08     7
2017-01-09     8
2017-01-10     9
2017-01-11    10
2017-01-12    11
2017-01-13    12
2017-01-14    13
2017-01-15    14
2017-01-16    15
2017-01-17    16
2017-01-18    17
2017-01-19    18
2017-01-20    19
2017-01-21    20
2017-01-22    21
2017-01-23    22
2017-01-24    23
2017-01-25    24
2017-01-26    25
2017-01-27    26
2017-01-28    27
2017-01-29    28
2017-01-30    29
              ..
2017-03-12    70
2017-03-13    71
2017-03-14    72
2017-03-15    73
2017-03-16    74
2017-03-17    75
2017-03-18    76
2017-03-19    77
2017-03-20    78
2017-03-21    79
2017-03-22    80
2017-03-23    81
2017-03-24    82
2017-03-25    83
2017-03-26    84
2017-03-27    85
2017-03-28    86
2017-03-29    87
2017-03-30    88
2017-03-31    89
2017-04-01    90
2017-04-02    91
2017-04-03    92
2017-04-04    93
2017-04-05    94
2017-04-06    95
2017-04-07    96
2017-04-08    

In [4]:
# Downsample the daily series to calendar months.
# A MonthEnd offset object is passed instead of the 'M' string alias: the alias
# is deprecated in recent pandas releases (renamed to 'ME'), while the offset
# object is accepted by both older and newer versions.
month_end = pd.offsets.MonthEnd()
# Total of the daily values within each month
resample_month_sum = ser_obj.resample(month_end).sum()
# Average of the daily values within each month
resample_month_mean = ser_obj.resample(month_end).mean()

print('按月求和：',resample_month_sum)
print('按月求平均：',resample_month_mean)


按月求和： 2017-01-31     465
2017-02-28    1246
2017-03-31    2294
2017-04-30     945
Freq: M, dtype: int64
按月求平均： 2017-01-31    15.0
2017-02-28    44.5
2017-03-31    74.0
2017-04-30    94.5
Freq: M, dtype: float64


In [6]:
'''
降采样（downsampling）
1、将数据聚合到规整的低频率
2、OHLC重采样，open，high，low，close
open：开盘价
high：最高价
low：最低价
close：收盘价
3、使用groupby 降采样
'''
# Cell result: the first five rows of the daily series.
ser_obj.head(n=5)

2017-01-01    0
2017-01-02    1
2017-01-03    2
2017-01-04    3
2017-01-05    4
Freq: D, dtype: int64

In [7]:
# Aggregate the daily series into consecutive 5-day bins.
# The resampler is created once and reused for each aggregation.
five_day_bins = ser_obj.resample('5D')
five_day_sum_sample = five_day_bins.sum()
five_day_mean_sample = five_day_bins.mean()
five_day_ohlc_sample = five_day_bins.ohlc()

# Show the per-bin totals under a heading line.
print('降采样，sum', five_day_sum_sample, sep='\n')
'''
2017-01-01 这一行聚合的是 01-01~01-05 的数据：sum 为 0+1+2+3+4=10（mean 为 2）
2017-01-06 这一行聚合的是 01-06~01-10 的数据：sum 为 5+6+7+8+9=35（mean 为 7）
'''

降采样，sum
2017-01-01     2
2017-01-06     7
2017-01-11    12
2017-01-16    17
2017-01-21    22
2017-01-26    27
2017-01-31    32
2017-02-05    37
2017-02-10    42
2017-02-15    47
2017-02-20    52
2017-02-25    57
2017-03-02    62
2017-03-07    67
2017-03-12    72
2017-03-17    77
2017-03-22    82
2017-03-27    87
2017-04-01    92
2017-04-06    97
dtype: int64


In [8]:
# Heading line followed by the open/high/low/close table of the 5-day bins.
print('降采样，five_day_ohlc_sample', five_day_ohlc_sample, sep='\n')

降采样，five_day_ohlc_sample
            open  high  low  close
2017-01-01     0     4    0      4
2017-01-06     5     9    5      9
2017-01-11    10    14   10     14
2017-01-16    15    19   15     19
2017-01-21    20    24   20     24
2017-01-26    25    29   25     29
2017-01-31    30    34   30     34
2017-02-05    35    39   35     39
2017-02-10    40    44   40     44
2017-02-15    45    49   45     49
2017-02-20    50    54   50     54
2017-02-25    55    59   55     59
2017-03-02    60    64   60     64
2017-03-07    65    69   65     69
2017-03-12    70    74   70     74
2017-03-17    75    79   75     79
2017-03-22    80    84   80     84
2017-03-27    85    89   85     89
2017-04-01    90    94   90     94
2017-04-06    95    99   95     99


In [9]:
# Downsample with groupby instead of resample.
# The key function receives index labels (timestamps) and returns the calendar
# month number (1, 2, 3, 4), so observations are bucketed by month.
def _month_of(ts):
    """Return the calendar month number of an index label."""
    return ts.month


monthly_totals = ser_obj.groupby(_month_of).sum()
print(monthly_totals)

1     465
2    1246
3    2294
4     945
dtype: int64


In [12]:
# Downsample with groupby: bucket every observation by its day of the week
# (0 = Monday ... 6 = Sunday) and sum each bucket.
# `dayofweek` is a plain attribute on both a single Timestamp and a
# DatetimeIndex. `Timestamp.weekday` is a method, so `x.weekday` without a call
# yields a bound method rather than an integer whenever the key function is
# applied to individual index labels.
print(ser_obj.groupby(lambda x: x.dayofweek).sum())

0    750
1    665
2    679
3    693
4    707
5    721
6    735
dtype: int64


In [15]:
'''
升采样（upsampling）
对比：降采样是从高频到低频，通过聚合得到结果，不会产生缺失值

升采样是将数据从低频转换到高频，新增的时间点没有原始数据，需要填充或插值，否则为NaN
ffill(limit) 空值取前面的值填充，limit为最多连续填充的个数

'''
# Fix the random seed so the frame (and every fill / interpolation result
# derived from it) is identical on each run of the notebook.
np.random.seed(0)
# Five weekly observations (every Monday, starting 2017-01-02) of three
# standard-normal columns; used as the low-frequency input for upsampling.
df = pd.DataFrame(np.random.randn(5, 3),
                 index=pd.date_range('20170101',periods=5,freq='W-MON'),
                 columns=['S1','S2','S3'])
print(df)
                


                  S1        S2        S3
2017-01-02  2.047325 -0.334279 -2.001961
2017-01-09  1.456002 -0.032195  0.497578
2017-01-16  0.191146 -0.640366  0.973704
2017-01-23  0.376313  0.401136  0.399456
2017-01-30  0.609537 -2.286638  0.353668


In [16]:
# Plain upsampling to daily frequency: the source only has one row per week,
# so the newly created days carry no data and show up as NaN.
daily_with_gaps = df.resample('D').asfreq()
print(daily_with_gaps)

                  S1        S2        S3
2017-01-02  2.047325 -0.334279 -2.001961
2017-01-03       NaN       NaN       NaN
2017-01-04       NaN       NaN       NaN
2017-01-05       NaN       NaN       NaN
2017-01-06       NaN       NaN       NaN
2017-01-07       NaN       NaN       NaN
2017-01-08       NaN       NaN       NaN
2017-01-09  1.456002 -0.032195  0.497578
2017-01-10       NaN       NaN       NaN
2017-01-11       NaN       NaN       NaN
2017-01-12       NaN       NaN       NaN
2017-01-13       NaN       NaN       NaN
2017-01-14       NaN       NaN       NaN
2017-01-15       NaN       NaN       NaN
2017-01-16  0.191146 -0.640366  0.973704
2017-01-17       NaN       NaN       NaN
2017-01-18       NaN       NaN       NaN
2017-01-19       NaN       NaN       NaN
2017-01-20       NaN       NaN       NaN
2017-01-21       NaN       NaN       NaN
2017-01-22       NaN       NaN       NaN
2017-01-23  0.376313  0.401136  0.399456
2017-01-24       NaN       NaN       NaN
2017-01-25      

In [20]:
# Forward fill: propagate each weekly observation onto at most the next
# two missing days; later days in the week remain NaN.
daily_ffill_limited = df.resample('D').ffill(limit=2)
print(daily_ffill_limited)

                  S1        S2        S3
2017-01-02  2.047325 -0.334279 -2.001961
2017-01-03  2.047325 -0.334279 -2.001961
2017-01-04  2.047325 -0.334279 -2.001961
2017-01-05       NaN       NaN       NaN
2017-01-06       NaN       NaN       NaN
2017-01-07       NaN       NaN       NaN
2017-01-08       NaN       NaN       NaN
2017-01-09  1.456002 -0.032195  0.497578
2017-01-10  1.456002 -0.032195  0.497578
2017-01-11  1.456002 -0.032195  0.497578
2017-01-12       NaN       NaN       NaN
2017-01-13       NaN       NaN       NaN
2017-01-14       NaN       NaN       NaN
2017-01-15       NaN       NaN       NaN
2017-01-16  0.191146 -0.640366  0.973704
2017-01-17  0.191146 -0.640366  0.973704
2017-01-18  0.191146 -0.640366  0.973704
2017-01-19       NaN       NaN       NaN
2017-01-20       NaN       NaN       NaN
2017-01-21       NaN       NaN       NaN
2017-01-22       NaN       NaN       NaN
2017-01-23  0.376313  0.401136  0.399456
2017-01-24  0.376313  0.401136  0.399456
2017-01-25  0.37

In [18]:
# Backward fill: every missing day takes the value of the next weekly observation.
daily_bfill = df.resample('D').bfill()
print(daily_bfill)

                  S1        S2        S3
2017-01-02  2.047325 -0.334279 -2.001961
2017-01-03  1.456002 -0.032195  0.497578
2017-01-04  1.456002 -0.032195  0.497578
2017-01-05  1.456002 -0.032195  0.497578
2017-01-06  1.456002 -0.032195  0.497578
2017-01-07  1.456002 -0.032195  0.497578
2017-01-08  1.456002 -0.032195  0.497578
2017-01-09  1.456002 -0.032195  0.497578
2017-01-10  0.191146 -0.640366  0.973704
2017-01-11  0.191146 -0.640366  0.973704
2017-01-12  0.191146 -0.640366  0.973704
2017-01-13  0.191146 -0.640366  0.973704
2017-01-14  0.191146 -0.640366  0.973704
2017-01-15  0.191146 -0.640366  0.973704
2017-01-16  0.191146 -0.640366  0.973704
2017-01-17  0.376313  0.401136  0.399456
2017-01-18  0.376313  0.401136  0.399456
2017-01-19  0.376313  0.401136  0.399456
2017-01-20  0.376313  0.401136  0.399456
2017-01-21  0.376313  0.401136  0.399456
2017-01-22  0.376313  0.401136  0.399456
2017-01-23  0.376313  0.401136  0.399456
2017-01-24  0.609537 -2.286638  0.353668
2017-01-25  0.60

In [21]:
# Forward-fill every missing day with the most recent weekly observation.
# Resampler.ffill() is called directly: Resampler.fillna(method) is deprecated
# in recent pandas releases, and ffill() without a limit fills the same way.
print(df.resample('D').ffill())

                  S1        S2        S3
2017-01-02  2.047325 -0.334279 -2.001961
2017-01-03  2.047325 -0.334279 -2.001961
2017-01-04  2.047325 -0.334279 -2.001961
2017-01-05  2.047325 -0.334279 -2.001961
2017-01-06  2.047325 -0.334279 -2.001961
2017-01-07  2.047325 -0.334279 -2.001961
2017-01-08  2.047325 -0.334279 -2.001961
2017-01-09  1.456002 -0.032195  0.497578
2017-01-10  1.456002 -0.032195  0.497578
2017-01-11  1.456002 -0.032195  0.497578
2017-01-12  1.456002 -0.032195  0.497578
2017-01-13  1.456002 -0.032195  0.497578
2017-01-14  1.456002 -0.032195  0.497578
2017-01-15  1.456002 -0.032195  0.497578
2017-01-16  0.191146 -0.640366  0.973704
2017-01-17  0.191146 -0.640366  0.973704
2017-01-18  0.191146 -0.640366  0.973704
2017-01-19  0.191146 -0.640366  0.973704
2017-01-20  0.191146 -0.640366  0.973704
2017-01-21  0.191146 -0.640366  0.973704
2017-01-22  0.191146 -0.640366  0.973704
2017-01-23  0.376313  0.401136  0.399456
2017-01-24  0.376313  0.401136  0.399456
2017-01-25  0.37

In [22]:
# Fill the missing days by interpolation: 'linear' draws a straight line
# between consecutive known observations and reads the in-between values off it.
# Other interpolation methods can be substituted for 'linear'.
daily_interpolated = df.resample('D').interpolate(method='linear')
print(daily_interpolated)

                  S1        S2        S3
2017-01-02  2.047325 -0.334279 -2.001961
2017-01-03  1.962850 -0.291124 -1.644884
2017-01-04  1.878376 -0.247969 -1.287807
2017-01-05  1.793901 -0.204814 -0.930730
2017-01-06  1.709426 -0.161659 -0.573653
2017-01-07  1.624952 -0.118504 -0.216576
2017-01-08  1.540477 -0.075350  0.140501
2017-01-09  1.456002 -0.032195  0.497578
2017-01-10  1.275308 -0.119076  0.565596
2017-01-11  1.094615 -0.205958  0.633614
2017-01-12  0.913921 -0.292840  0.701632
2017-01-13  0.733227 -0.379721  0.769650
2017-01-14  0.552533 -0.466603  0.837668
2017-01-15  0.371840 -0.553484  0.905686
2017-01-16  0.191146 -0.640366  0.973704
2017-01-17  0.217598 -0.491580  0.891668
2017-01-18  0.244051 -0.342794  0.809633
2017-01-19  0.270503 -0.194008  0.727598
2017-01-20  0.296956 -0.045222  0.645562
2017-01-21  0.323408  0.103564  0.563527
2017-01-22  0.349861  0.252350  0.481491
2017-01-23  0.376313  0.401136  0.399456
2017-01-24  0.409631  0.017168  0.392915
2017-01-25  0.44