# Chapter11 時間序列

## Time Series Part I

### 日期、時間日期型態和工具

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from datetime import datetime

In [4]:
now=datetime.now()

In [5]:
now

datetime.datetime(2022, 10, 28, 19, 49, 48, 268271)

### 目前時間的年月日。

In [6]:
now.year, now.month, now.day

(2022, 10, 28)

### datetime可以直接進行日期的加減，會出現差距多少天與秒數。

In [7]:
# Subtracting one datetime from another produces a timedelta.
delta = datetime(2012, 10, 19) - datetime(2008, 5, 5, hour=8, minute=15)

In [8]:
delta

datetime.timedelta(days=1627, seconds=56700)

In [9]:
delta.days

1627

In [10]:
delta.seconds

56700

### timedelta(天)：可以用來進行日期的計算。

In [11]:
from datetime import timedelta

In [12]:
start=datetime(2011,1,7)

In [13]:
start+timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [14]:
start - 2*timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

### 字串和時間日期轉換

In [15]:
stamp=datetime(2011,1,13)

### str(date) 會將日期轉成文字。

In [16]:
str(stamp)

'2011-01-13 00:00:00'

### strftime('') 可以將日期格式化
### %Y:2021 %y:21  %m:[01,12]  %d:[01,31] 
### %H:[00,23] %I:[01,12] %M:[00,59] %S:[00,59]
### %w:[0星期日,6星期六] %U:[00,53] 第幾週（星期日為第一天） %W:[00,53] 第幾週（星期一為第一天）
### %F: '%Y-%m-%d' 2022-10-28  %D:'%m/%d/%y' 10/28/22

In [17]:
stamp.strftime('%Y-%m-%d')

'2011-01-13'

In [18]:
value='2022-10-28'

### datetime.strptime(value, 格式) 可以將文字轉為日期。

In [19]:
datetime.strptime(value,'%Y-%m-%d')

datetime.datetime(2022, 10, 28, 0, 0)

In [20]:
datestrs=['7/6/2011','8/6/2011']

In [21]:
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

### dateutil.parser的模組可以直接產生日期，非常方便，但偶爾也會有辨識錯誤的時候，要小心。

In [22]:
from dateutil.parser import parse

In [23]:
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [24]:
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

### 如果是日/月/年，dayfirst=True。

In [25]:
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [26]:
datestrs=['2011-07-06 12:00:00', '2011-08-06 00:00:00']

### pd.to_datetime(文字) 可以直接將文字轉為日期。

In [27]:
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

### pd.to_datetime() 遇到None，會出現NaT，也就是Not a Time。

In [28]:
idx=pd.to_datetime(datestrs+[None])

In [29]:
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [30]:
idx[2]

NaT

In [31]:
pd.isnull(idx)

array([False, False,  True])

### 時間序列基本概念

In [32]:
from datetime import datetime

In [33]:
# Six irregularly spaced days in January 2011.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]

In [34]:
# Six standard-normal draws labelled by the datetimes in `dates`.
ts = pd.Series(data=np.random.randn(6), index=dates)

In [35]:
ts

2011-01-02    0.358977
2011-01-05   -0.183092
2011-01-07    0.716136
2011-01-08    0.897524
2011-01-10   -0.019949
2011-01-12    0.065215
dtype: float64

### datetime的資料會被儲存在DatetimeIndex內。

In [36]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [37]:
ts[::2]

2011-01-02    0.358977
2011-01-07    0.716136
2011-01-10   -0.019949
dtype: float64

### 兩個Series相加時會依index自動對齊，對應不到的index，計算出來的值會呈現NaN。

In [38]:
ts+ts[::2]

2011-01-02    0.717953
2011-01-05         NaN
2011-01-07    1.432273
2011-01-08         NaN
2011-01-10   -0.039898
2011-01-12         NaN
dtype: float64

In [39]:
ts.index.dtype

dtype('<M8[ns]')

### 從pandas series取出日期的index，格式為Timestamp。

In [40]:
stamp=ts.index[0]

In [41]:
stamp

Timestamp('2011-01-02 00:00:00')

### 索引、選取和相減

In [42]:
stamp=ts.index[2]

### 從Pandas Series以日期為index取值時，除了可以使用.index[2]外，也可以直接輸入日期。

In [43]:
ts[stamp]

0.716136489081796

In [44]:
ts['1/10/2011']

-0.019949090408591486

In [45]:
ts['20110110']

-0.019949090408591486

### pd.date_range(起始日, periods=持續幾天) 

In [46]:
# 1000 standard-normal draws on a daily index starting 2000-01-01.
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)

In [47]:
longer_ts

2000-01-01   -1.989683
2000-01-02    1.104973
2000-01-03   -0.067964
2000-01-04   -0.868626
2000-01-05   -0.178573
                ...   
2002-09-22   -0.891946
2002-09-23   -0.398830
2002-09-24   -1.333462
2002-09-25   -0.709664
2002-09-26    0.968782
Freq: D, Length: 1000, dtype: float64

### 輸入四個數字，會被解讀為年份。

In [48]:
longer_ts['2001']

2001-01-01    1.060566
2001-01-02    1.192007
2001-01-03   -1.527857
2001-01-04   -0.843426
2001-01-05    0.468173
                ...   
2001-12-27   -0.396864
2001-12-28    0.080427
2001-12-29    0.500981
2001-12-30   -0.161507
2001-12-31   -0.447260
Freq: D, Length: 365, dtype: float64

### 也可以年搭配月份

In [49]:
longer_ts['2001-05']

2001-05-01    0.311744
2001-05-02    1.770886
2001-05-03    0.316932
2001-05-04   -1.691429
2001-05-05   -2.307274
2001-05-06   -1.155820
2001-05-07    1.186648
2001-05-08   -0.939881
2001-05-09    0.667098
2001-05-10   -0.213028
2001-05-11   -0.211203
2001-05-12   -0.673715
2001-05-13   -1.126643
2001-05-14   -1.809748
2001-05-15    0.860185
2001-05-16    1.026283
2001-05-17   -0.535046
2001-05-18   -0.758183
2001-05-19    0.558369
2001-05-20    1.346997
2001-05-21   -1.513403
2001-05-22    0.036818
2001-05-23    1.240824
2001-05-24   -0.541690
2001-05-25   -1.147801
2001-05-26   -0.104439
2001-05-27    1.032717
2001-05-28   -0.817090
2001-05-29   -0.966421
2001-05-30   -3.110778
2001-05-31   -1.010294
Freq: D, dtype: float64

### 日期也可以使用切片。

In [50]:
ts[datetime(2011,1,7):]

2011-01-07    0.716136
2011-01-08    0.897524
2011-01-10   -0.019949
2011-01-12    0.065215
dtype: float64

In [51]:
ts

2011-01-02    0.358977
2011-01-05   -0.183092
2011-01-07    0.716136
2011-01-08    0.897524
2011-01-10   -0.019949
2011-01-12    0.065215
dtype: float64

### 切片的起訖日期不需要存在於index中，可以用時間戳記做範圍篩選，切掉不需要的資料。

In [52]:
ts['1/6/2011':'1/11/2011']

2011-01-07    0.716136
2011-01-08    0.897524
2011-01-10   -0.019949
dtype: float64

### truncate(after=日期) 跟切片很像，會捨棄該日期之後的資料，只保留該日期（含）以前的資料。

In [53]:
ts.truncate(after='1/9/2011')

2011-01-02    0.358977
2011-01-05   -0.183092
2011-01-07    0.716136
2011-01-08    0.897524
dtype: float64

### pd.date_range(起始日, periods=產生幾筆, freq=間隔多久 )
### freq='W-WED' 表示每週的星期三，因此每筆間隔一週（7天），且第一筆會落在起始日之後的第一個星期三（2000-01-05）。

In [54]:
# 100 consecutive Wednesdays; 2000-01-01 is a Saturday, so the range begins on 2000-01-05.
dates = pd.date_range(start='1/1/2000', periods=100, freq='W-WED')

In [55]:
dates

DatetimeIndex(['2000-01-05', '2000-01-12', '2000-01-19', '2000-01-26',
               '2000-02-02', '2000-02-09', '2000-02-16', '2000-02-23',
               '2000-03-01', '2000-03-08', '2000-03-15', '2000-03-22',
               '2000-03-29', '2000-04-05', '2000-04-12', '2000-04-19',
               '2000-04-26', '2000-05-03', '2000-05-10', '2000-05-17',
               '2000-05-24', '2000-05-31', '2000-06-07', '2000-06-14',
               '2000-06-21', '2000-06-28', '2000-07-05', '2000-07-12',
               '2000-07-19', '2000-07-26', '2000-08-02', '2000-08-09',
               '2000-08-16', '2000-08-23', '2000-08-30', '2000-09-06',
               '2000-09-13', '2000-09-20', '2000-09-27', '2000-10-04',
               '2000-10-11', '2000-10-18', '2000-10-25', '2000-11-01',
               '2000-11-08', '2000-11-15', '2000-11-22', '2000-11-29',
               '2000-12-06', '2000-12-13', '2000-12-20', '2000-12-27',
               '2001-01-03', '2001-01-10', '2001-01-17', '2001-01-24',
      

In [56]:
# 100 x 4 frame of standard-normal noise, indexed by `dates`, one column per state.
long_df = pd.DataFrame(
    data=np.random.randn(100, 4),
    index=dates,
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
)

In [57]:
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,1.904257,-1.322218,0.28346,-1.080033
2001-05-09,1.601939,-0.729314,0.554877,0.826218
2001-05-16,0.205701,0.178225,1.392081,-0.277564
2001-05-23,1.00604,2.009794,0.109584,0.916001
2001-05-30,0.050315,0.677378,-3.057286,-0.664176


### 有重複索引的時間序列

In [58]:
# Three of the five labels are the same day, so this index is deliberately non-unique.
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])

In [59]:
# Integers 0..4 labelled by the (non-unique) timestamps in `dates`.
dup_ts = pd.Series(data=np.arange(5), index=dates)

In [60]:
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

### is_unique 可以判斷是否有重複值。

In [61]:
dup_ts.index.is_unique

False

In [62]:
dup_ts['1/3/2000']

4

In [63]:
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

### 可以使用時間戳記作為groupby的分組依據。

In [64]:
grouped=dup_ts.groupby(level=0)

In [65]:
grouped.mean()

2000-01-01    0.0
2000-01-02    2.0
2000-01-03    4.0
dtype: float64

In [66]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64