In [1]:

import pandas as pd
import numpy as np

# 一、时序的创建
1. 四类时间变量
初次阅读时，可能会对其中的③和④感到困惑，后文会作进一步说明。

In [2]:
# Several loosely formatted spellings of the same date are parsed without an
# explicit format. Only the final expression of the cell is echoed as output.
for date_text in ('2020 1-1', '2020-1 1', '2020-1-1'):
    pd.to_datetime(date_text)
pd.to_datetime('2020/1/1')

Timestamp('2020-01-01 00:00:00')

当字符串的格式无法被自动识别时，可利用format参数指定格式进行强制匹配

In [3]:
# Unusual layouts are parsed by spelling out the strptime pattern explicitly.
year_first_pattern = '%Y.%m %d'
day_first_pattern = '%d %m.%Y'
pd.to_datetime('2020.1 1', format=year_first_pattern)
pd.to_datetime('1 1.2020', format=day_first_pattern)

Timestamp('2020-01-01 00:00:00')

In [4]:

# Parsing a list of strings yields a DatetimeIndex, used here to label a Series.
two_days = pd.to_datetime(['2020/1/1', '2020/1/2'])
pd.Series(range(2), index=two_days)

2020-01-01    0
2020-01-02    1
dtype: int64

In [5]:
# A frame with year / month / day columns is assembled row by row into datetimes.
date_parts = {
    'year': [2020, 2020],
    'month': [1, 1],
    'day': [1, 2],
}
df = pd.DataFrame(date_parts)
pd.to_datetime(df)

0   2020-01-01
1   2020-01-02
dtype: datetime64[ns]

In [6]:
# Timezone-aware timestamp shifted by a fixed one-day Timedelta. The recorded
# result is 02:00 at +03:00 on the next day, not 01:00.
ts = pd.Timestamp('2020-3-29 01:00:00', tz='Europe/Helsinki')
one_day = pd.Timedelta(days=1)
ts + one_day

Timestamp('2020-03-30 02:00:00+0300', tz='Europe/Helsinki')

In [7]:
# Custom business-day offset: advance 3 business days where only Wednesdays
# and Fridays count as business days.
# NOTE(review): `holidays` is a bare 7-character string rather than a list of
# dates, and the recorded output (2020-01-15 for every row) shows no 2020
# holiday being skipped. Confirm the intended holiday date.
three_custom_days = pd.offsets.CDay(3, weekmask='Wed Fri', holidays='2020010')

# `offset.apply(date)` is deprecated (removed in pandas 2.0); adding the offset
# to the timestamp is the supported spelling. The offset is built once above
# instead of once per date.
pd.Series(day + three_custom_days
          for day in pd.date_range('20200105', periods=3, freq='D'))

0   2020-01-15
1   2020-01-15
2   2020-01-15
dtype: datetime64[ns]

# 二、时序的索引及属性
1. 索引切片

In [8]:

# Weekly index between the two year boundaries, labelled with random normal values.
rng = pd.date_range(start='2020', end='2021', freq='W')
weekly_noise = np.random.randn(len(rng))
ts = pd.Series(weekly_noise, index=rng)
ts.head()

2020-01-05    0.853316
2020-01-12   -0.259318
2020-01-19   -0.177313
2020-01-26    0.735828
2020-02-02    0.525392
Freq: W-SUN, dtype: float64

In [9]:
# Month number of every weekly date in the range.
weekly_dates = pd.date_range('2020', '2021', freq='W')
weekly_dates.month

Int64Index([ 1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  4,  4,  4,  4,
             5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,
             8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12,
            12],
           dtype='int64')

In [10]:
# Weekday number of every weekly date; the recorded output is 6 for every entry.
weekly_stamps = pd.date_range('2020', '2021', freq='W')
weekly_stamps.weekday

Int64Index([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            6, 6, 6, 6, 6, 6, 6, 6],
           dtype='int64')

# 三、重采样
所谓重采样，就是指resample函数，它可以看做时序版本的groupby函数
1. resample对象的基本操作
采样频率一般设置为上面提到的offset字符

In [11]:

# 1000 rows of random normal data, one row per second starting 2020-01-01.
# The lowercase 's' alias replaces the uppercase 'S' spelling, which newer
# pandas releases deprecate; both denote a one-second frequency.
per_second_index = pd.date_range('1/1/2020', freq='s', periods=1000)
df_r = pd.DataFrame(np.random.randn(1000, 3), index=per_second_index,
                    columns=['A', 'B', 'C'])

In [12]:

# Group the frame into 3-minute bins. As the echoed repr shows, this only
# creates a DatetimeIndexResampler object; no aggregation has run yet.
r = df_r.resample('3min')
r


<pandas.core.resample.DatetimeIndexResampler object at 0x000001A811ACFC08>

In [13]:
# Column-wise sum within each resampling bin.
r.sum()

Unnamed: 0,A,B,C
2020-01-01 00:00:00,-0.264349,-0.10309,4.25449
2020-01-01 00:03:00,-27.251333,8.950418,-18.398609
2020-01-01 00:06:00,-4.569464,-10.931085,-0.800581
2020-01-01 00:09:00,13.37457,14.632407,-4.053795
2020-01-01 00:12:00,10.158384,3.284072,11.423625
2020-01-01 00:15:00,-10.730553,5.223301,3.603022


In [14]:

# 200 daily rows of random data, resampled with the 'CBMS' offset alias and
# summed per bin.
daily_index = pd.date_range('1/1/2020', freq='D', periods=200)
df_r2 = pd.DataFrame(
    np.random.randn(200, 3),
    index=daily_index,
    columns=['A', 'B', 'C'],
)
r = df_r2.resample('CBMS')
r.sum()

Unnamed: 0,A,B,C
2020-01-01,-2.373739,-10.458195,5.890747
2020-02-03,11.440411,0.903741,0.776797
2020-03-02,-0.742493,-1.601813,2.564723
2020-04-01,-9.557266,-1.037846,-10.169209
2020-05-01,8.459699,-2.058364,-10.711473
2020-06-01,2.824515,-4.226076,2.122741
2020-07-01,5.883331,9.85121,-4.666511


In [15]:
# Rebind `r` to a 3-minute resampler over df_r. '3min' matches the spelling
# used by the earlier resample cell and replaces the '3T' alias, which newer
# pandas releases deprecate; both mean three minutes.
r = df_r.resample('3min')

# Select a single column from the resampler, then aggregate it per bin.
r['A'].mean()

2020-01-01 00:00:00   -0.001469
2020-01-01 00:03:00   -0.151396
2020-01-01 00:06:00   -0.025386
2020-01-01 00:09:00    0.074303
2020-01-01 00:12:00    0.056435
2020-01-01 00:15:00   -0.107306
Freq: 3T, Name: A, dtype: float64

In [16]:
# Several aggregations at once, one output column per name. String names are
# used instead of the NumPy callables, which recent pandas releases warn about
# when passed to agg; the resulting column labels are the same.
r['A'].agg(['sum', 'mean', 'std'])

Unnamed: 0,sum,mean,std
2020-01-01 00:00:00,-0.264349,-0.001469,0.993745
2020-01-01 00:03:00,-27.251333,-0.151396,0.946664
2020-01-01 00:06:00,-4.569464,-0.025386,1.029938
2020-01-01 00:09:00,13.37457,0.074303,0.895711
2020-01-01 00:12:00,10.158384,0.056435,1.05308
2020-01-01 00:15:00,-10.730553,-0.107306,1.103846


In [17]:
# Per-column aggregation: sum for column A, value range (max - min) for column B.
# The Series methods are vectorised and return NaN for an empty bin, where the
# builtin max()/min() would raise.
r.agg({'A': 'sum', 'B': lambda x: x.max() - x.min()})

Unnamed: 0,A,B
2020-01-01 00:00:00,-0.264349,4.832436
2020-01-01 00:03:00,-27.251333,5.156254
2020-01-01 00:06:00,-4.569464,3.775661
2020-01-01 00:09:00,13.37457,6.542526
2020-01-01 00:12:00,10.158384,6.102924
2020-01-01 00:15:00,-10.730553,4.806965


# 3. 采样组的迭代
采样组的迭代和groupby迭代完全类似，对于每一个组都可以分别做相应操作

In [18]:

# Six observations at irregular times; none of them falls in the 02:00 hour.
observation_times = pd.to_datetime([
    '2020-01-01 00:00:00', '2020-01-01 00:30:00',
    '2020-01-01 00:31:00', '2020-01-01 01:00:00',
    '2020-01-01 03:00:00', '2020-01-01 03:05:00',
])
small = pd.Series(range(6), index=observation_times)

# Hourly bins. Lowercase 'h' replaces the uppercase 'H' alias, which newer
# pandas releases deprecate.
resampled = small.resample('h')

# Iterating a resampler yields (bin label, sub-Series) pairs; as the recorded
# output shows, an hour without observations still appears as an empty Series.
for name, group in resampled:
    print("Group: ", name)
    print("-" * 27)
    print(group, end="\n\n")

Group:  2020-01-01 00:00:00
---------------------------
2020-01-01 00:00:00    0
2020-01-01 00:30:00    1
2020-01-01 00:31:00    2
dtype: int64

Group:  2020-01-01 01:00:00
---------------------------
2020-01-01 01:00:00    3
dtype: int64

Group:  2020-01-01 02:00:00
---------------------------
Series([], dtype: int64)

Group:  2020-01-01 03:00:00
---------------------------
2020-01-01 03:00:00    4
2020-01-01 03:05:00    5
dtype: int64



# 四、窗口函数
下面主要介绍pandas中两类主要的窗口(window)函数:rolling/expanding

In [19]:

# 1000 random normal values on consecutive calendar days from 2020-01-01.
thousand_days = pd.date_range('1/1/2020', periods=1000)
s = pd.Series(np.random.randn(1000), index=thousand_days)
s.head()

2020-01-01   -0.060759
2020-01-02    0.212164
2020-01-03    0.932829
2020-01-04    0.814170
2020-01-05    0.868798
Freq: D, dtype: float64

## 1. Rolling
（a）常用聚合
所谓rolling方法，就是规定一个窗口，它和groupby对象一样，本身不会进行操作，需要配合聚合函数才能计算结果

In [20]:
# Define a rolling window of 50 observations. The echoed repr is a Rolling
# object; nothing is computed until an aggregation is called on it.
s.rolling(window=50)

Rolling [window=50,center=False,axis=0]

In [21]:
# Mean over each 50-observation window; the leading rows are NaN in the
# recorded output.
s.rolling(window=50).mean()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2022-09-22    0.146779
2022-09-23    0.162510
2022-09-24    0.176032
2022-09-25    0.202740
2022-09-26    0.179112
Freq: D, Length: 1000, dtype: float64

## 2. Expanding
（a）expanding函数
普通的expanding函数等价于rolling(window=len(s),min_periods=1)，是对序列的累计计算

In [22]:
# A window as long as the whole series with min_periods=1: each row sums
# everything seen so far, i.e. a running total.
s.rolling(window=len(s),min_periods=1).sum().head()

2020-01-01   -0.060759
2020-01-02    0.151405
2020-01-03    1.084235
2020-01-04    1.898405
2020-01-05    2.767203
Freq: D, dtype: float64

In [23]:
# Same running total via expanding(); the recorded values match the
# rolling(window=len(s), min_periods=1) cell.
s.expanding().sum().head()

2020-01-01   -0.060759
2020-01-02    0.151405
2020-01-03    1.084235
2020-01-04    1.898405
2020-01-05    2.767203
Freq: D, dtype: float64

In [24]:

# A custom reducer can be supplied through apply(). The function is passed
# directly (no lambda wrapper), and raw=True hands each window over as a NumPy
# array, avoiding a Python-level builtin sum() over a Series for every window.
s.expanding().apply(np.sum, raw=True).head()

2020-01-01   -0.060759
2020-01-02    0.151405
2020-01-03    1.084235
2020-01-04    1.898405
2020-01-05    2.767203
Freq: D, dtype: float64