In [1]:
# Print the notebook's metadata header: file name, author, last editor,
# creation time, contact e-mail and a short description.
print("""
@File         : ch11_time_series_new.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-04-07 01:07:23
@Email        : cuixuanstephen@gmail.com
@Description  : 时间序列
""")


@File         : ch11_time_series_new.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-04-07 01:07:23
@Email        : cuixuanstephen@gmail.com
@Description  : 时间序列



In [2]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

## 日期和时间数据的类型及工具

In [3]:
# Capture the current date and time (resolution down to microseconds).
# `datetime` comes from the notebook's import cell rather than being
# imported again in the middle of the narrative.
now = datetime.now()
now

datetime.datetime(2024, 4, 7, 14, 11, 41, 193881)

In [4]:
# The date components are plain attributes; gather them into a tuple.
now.year, now.month, now.day

(2024, 4, 7)

In [5]:
# Subtracting one datetime from another yields a timedelta holding the gap
# between the two moments.
delta = (
    datetime(year=2011, month=1, day=7)
    - datetime(year=2008, month=6, day=24, hour=8, minute=15)
)
delta

datetime.timedelta(days=926, seconds=56700)

In [6]:
# Whole-day part of the difference.
delta.days

926

In [7]:
# Seconds left over beyond the whole days (not the total duration in seconds).
delta.seconds

56700

In [8]:
# Adding a timedelta to a datetime shifts it; the first positional argument
# of timedelta is a number of days, spelled out here as a keyword.
# `datetime` and `timedelta` come from the notebook's import cell.
start = datetime(2011, 1, 7)

start + timedelta(days=12)

datetime.datetime(2011, 1, 19, 0, 0)

In [9]:
# A timedelta can be scaled by an integer: step back two 12-day intervals.
start - timedelta(days=12) * 2

datetime.datetime(2010, 12, 14, 0, 0)

### 字符串与 `datetime` 互相转换

In [10]:
# str() renders a datetime as 'YYYY-MM-DD HH:MM:SS'.
stamp = datetime(year=2011, month=1, day=3)
str(stamp)

'2011-01-03 00:00:00'

In [11]:
# Format with an explicit pattern: four-digit year, two-digit month and day.
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [13]:
# Parse a date string into a datetime by giving strptime the matching
# format string; the time part of the result is midnight.
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [14]:
# Month/day/year strings, each parsed on its own with strptime.
datestrs = ["7/6/2011", "8/6/2011"]
[datetime.strptime(text, "%m/%d/%Y") for text in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [16]:
# pd.to_datetime converts the whole list of date strings in one call and
# returns a DatetimeIndex.
datestrs = ["2011-07-06 12:00:00", "2011-08-06 00:00:00"]
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [18]:
# Add a missing entry (None) after the date strings before converting.
idx = pd.to_datetime([*datestrs, None])
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [19]:
# The third element is the None that was appended; it is represented as NaT.
idx[2]

NaT

In [21]:
# Element-wise missing-value mask; only the NaT position is True.
pd.isna(idx)

array([False, False,  True])

## 时间序列基础知识

In [24]:
# Six irregularly spaced dates used as the index of a small example series.
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
# Draw the values from a locally seeded generator so that every
# Restart & Run All reproduces the same series; the unseeded global RNG
# produced different numbers on each run.
ts = pd.Series(np.random.default_rng(12345).standard_normal(len(dates)),
               index=dates)
ts

2011-01-02   -1.332780
2011-01-05   -0.765269
2011-01-07   -0.516614
2011-01-08   -1.714679
2011-01-10    0.130232
2011-01-12   -1.273256
dtype: float64

In [25]:
# The datetime objects passed as the index are stored as a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [26]:
# Arithmetic aligns on the date labels: ts[::2] keeps every second entry,
# so the labels it lacks come out as NaN in the sum.
ts + ts[::2]

2011-01-02   -2.665561
2011-01-05         NaN
2011-01-07   -1.033229
2011-01-08         NaN
2011-01-10    0.260464
2011-01-12         NaN
dtype: float64

### 索引、选择、子集构造

In [27]:
# Look a value up using one of the index labels (the third timestamp) as key.
stamp = ts.index[2]
ts[stamp]

-0.5166143950185419

In [29]:
# A date string is also accepted as the key and returns that date's value.
ts['2011-01-10']

0.13023207252379462

In [30]:
# 1000 consecutive daily observations starting on 2000-01-01. The values come
# from a locally seeded generator so the series is identical on every run
# instead of changing with the unseeded global RNG.
longer_ts = pd.Series(np.random.default_rng(12345).standard_normal(1000),
                      index=pd.date_range('2000-01-01', periods=1000))
longer_ts

2000-01-01   -0.268960
2000-01-02    0.576116
2000-01-03    1.473300
2000-01-04    1.742472
2000-01-05   -0.113201
                ...   
2002-09-22    0.125951
2002-09-23    0.620604
2002-09-24    0.811293
2002-09-25   -0.005070
2002-09-26    0.220242
Freq: D, Length: 1000, dtype: float64

In [31]:
# A year string selects every entry that falls in that year.
longer_ts['2001']

2001-01-01    0.476797
2001-01-02    0.870043
2001-01-03   -0.048107
2001-01-04   -0.477197
2001-01-05    0.168995
                ...   
2001-12-27    0.762673
2001-12-28   -0.424329
2001-12-29    0.660895
2001-12-30    0.016532
2001-12-31   -0.856455
Freq: D, Length: 365, dtype: float64

In [32]:
# A year-month string selects every entry that falls in that month.
longer_ts['2001-05']

2001-05-01    0.051632
2001-05-02    0.320165
2001-05-03    0.822521
2001-05-04   -2.069660
2001-05-05   -0.582892
2001-05-06    0.931540
2001-05-07    1.103691
2001-05-08   -0.688630
2001-05-09   -2.151092
2001-05-10   -0.720662
2001-05-11    0.388400
2001-05-12   -0.960131
2001-05-13    1.425718
2001-05-14    1.414887
2001-05-15    1.293863
2001-05-16    1.892599
2001-05-17   -0.098360
2001-05-18    0.375078
2001-05-19    1.226785
2001-05-20   -1.216342
2001-05-21    0.501808
2001-05-22   -1.208227
2001-05-23   -0.697562
2001-05-24   -0.179195
2001-05-25    0.623033
2001-05-26    0.627478
2001-05-27   -0.856228
2001-05-28   -0.651084
2001-05-29    0.613136
2001-05-30   -0.964975
2001-05-31   -1.033876
Freq: D, dtype: float64

In [33]:
# Open-ended slice with a datetime object: everything from 2011-01-07 onwards.
ts[datetime(2011, 1, 7):]

2011-01-07   -0.516614
2011-01-08   -1.714679
2011-01-10    0.130232
2011-01-12   -1.273256
dtype: float64

In [34]:
# Range slice with datetime endpoints; both endpoints are part of the result.
ts[datetime(2011, 1, 7):datetime(2011, 1, 10)]

2011-01-07   -0.516614
2011-01-08   -1.714679
2011-01-10    0.130232
dtype: float64

In [35]:
# Redisplay the full series.
ts

2011-01-02   -1.332780
2011-01-05   -0.765269
2011-01-07   -0.516614
2011-01-08   -1.714679
2011-01-10    0.130232
2011-01-12   -1.273256
dtype: float64

In [36]:
# Slice endpoints do not have to be labels in the index: neither 2011-01-06
# nor 2011-01-11 is present, yet the entries lying between them are returned.
ts['2011-01-06': '2011-01-11']

2011-01-07   -0.516614
2011-01-08   -1.714679
2011-01-10    0.130232
dtype: float64

In [37]:
# truncate(after=...) discards the entries dated later than the given date.
ts.truncate(after='2011-01-09')

2011-01-02   -1.332780
2011-01-05   -0.765269
2011-01-07   -0.516614
2011-01-08   -1.714679
dtype: float64

In [38]:
# 100 weekly timestamps anchored on Wednesdays, used as the row index of a
# four-column frame. The values come from a locally seeded generator so the
# frame is identical on every run instead of changing with the unseeded
# global RNG.
dates = pd.date_range('2000-01-01', periods=100, freq='W-WED')
long_df = pd.DataFrame(np.random.default_rng(12345).standard_normal(size=(100, 4)),
                       index=dates,
                       columns=["Colorado", "Texas", "New York", "Ohio"])

In [39]:
# .loc with a year-month string selects the rows dated in May 2001.
long_df.loc['2001-05']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,1.050485,0.597673,-0.434286,0.332044
2001-05-09,0.488788,0.566149,1.036781,0.280128
2001-05-16,-0.224182,-0.461448,-0.307531,-1.447481
2001-05-23,0.130111,0.09353,0.164463,0.934241
2001-05-30,-1.852918,0.294856,-1.972825,-1.303103


### 带有重复索引的时间序列

In [41]:
# Build a series whose index contains the same date (2000-01-02) three times.
dates = pd.DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', 
                          '2000-01-02', '2000-01-03'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [42]:
# False here, because 2000-01-02 occurs more than once in the index.
dup_ts.index.is_unique

False

In [44]:
dup_ts['2000-01-03'] # label is not duplicated

4

In [45]:
dup_ts['2000-01-02'] # duplicated label

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [46]:
# Group on the index itself (level=0) so entries sharing a timestamp are
# aggregated together, then take the mean of each group.
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0.0
2000-01-02    2.0
2000-01-03    4.0
dtype: float64

In [47]:
# Number of entries per distinct timestamp.
grouped.size()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

## 日期的范围、频率以及移位

In [48]:
# Redisplay the irregularly spaced example series.
ts

2011-01-02   -1.332780
2011-01-05   -0.765269
2011-01-07   -0.516614
2011-01-08   -1.714679
2011-01-10    0.130232
2011-01-12   -1.273256
dtype: float64

In [49]:
# resample('D') only hands back a DatetimeIndexResampler object here;
# no aggregation is applied in this cell.
resampler = ts.resample('D')
resampler

<pandas.core.resample.DatetimeIndexResampler object at 0x000002089FCC5E90>