# Time Series

---

Created on 2019-05-24

Updated on 2019-05-24

Author: Jiacheng

Github: https://github.com/Jiachengciel/Data_Analysis

---

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib notebook

---

## 1. Date and Time Data Types and Tools
## 日期和时间数据类型及工具

In [3]:
# Capture the current date and time as a datetime object
now = datetime.now()
now

datetime.datetime(2019, 5, 24, 20, 34, 55, 375004)

In [8]:
# A datetime exposes its individual components as attributes
now.year, now.month, now.day

(2019, 5, 24)

In [14]:
# Subtracting one datetime from another yields a timedelta
delta = (
    datetime(year=2019, month=10, day=16)
    - datetime(year=2019, month=9, day=14, hour=12, minute=30)
)
delta

datetime.timedelta(31, 41400)

In [15]:
# Whole-day component of the timedelta
delta.days

31

In [16]:
# Seconds left over after the whole days (not the total duration in seconds)
delta.seconds

41400

In [17]:
from datetime import timedelta

In [18]:
# Adding a timedelta shifts a datetime forward; a bare number means days
start = datetime(year=2019, month=5, day=22)
start + timedelta(days=12)

datetime.datetime(2019, 6, 3, 0, 0)

In [19]:
# timedelta supports scalar multiplication: 2 * 4 days moves back 8 days
start - 2*timedelta(4)

datetime.datetime(2019, 5, 14, 0, 0)

* ### 字符串和datetime的相互转换

In [23]:
# str() renders a datetime as 'YYYY-MM-DD HH:MM:SS'
stamp = datetime(year=2019, month=5, day=24, hour=21, minute=53, second=22)
str(stamp)

'2019-05-24 21:53:22'

In [25]:
# Convert a datetime to a string with an explicit format.
# %w is the weekday as a number (0 = Sunday ... 6 = Saturday); the literal
# 'week' label was missing from the format string, so the cell could not
# produce the recorded output '2019-05-24, week:5'.
stamp.strftime('%Y-%m-%d, week:%w')

'2019-05-24, week:5'

In [29]:
# Convert a string to a datetime by spelling out its exact format
value = '2019-05-24 21:23:22'
datetime.strptime(value, "%Y-%m-%d %H:%M:%S")

datetime.datetime(2019, 5, 24, 21, 23, 22)

In [30]:
# Sample date strings in month/day/year form, reused by the parsing examples below
datestrs = ['7/7/2022', '6/4/2033']

In [31]:
# Parse every month/day/year string in datestrs into a datetime
[datetime.strptime(date_str, '%m/%d/%Y') for date_str in datestrs]

[datetime.datetime(2022, 7, 7, 0, 0), datetime.datetime(2033, 6, 4, 0, 0)]

In [32]:
from dateutil.parser import parse

In [33]:
# dateutil's parse works out the format on its own; no format string needed
parse(value)

datetime.datetime(2019, 5, 24, 21, 23, 22)

In [35]:
# Slash-separated date strings are handled as well
parse(datestrs[0])

datetime.datetime(2022, 7, 7, 0, 0)

In [36]:
# Month names and 12-hour AM/PM times are understood too
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [37]:
# French-style dates put the day before the month; dayfirst=True reads
# '6/12/2012' as 6 December 2012
parse('6/12/2012', dayfirst=True)

datetime.datetime(2012, 12, 6, 0, 0)

In [38]:
# pandas parses a whole list of date strings into a DatetimeIndex in one call
pd.to_datetime(datestrs)

DatetimeIndex(['2022-07-07', '2033-06-04'], dtype='datetime64[ns]', freq=None)

In [40]:
# to_datetime also copes with missing values (None, empty strings, etc.);
# they come back as NaT (Not a Time)
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2022-07-07', '2033-06-04', 'NaT'], dtype='datetime64[ns]', freq=None)

In [41]:
# NaT entries are reported as null
pd.isnull(idx)

array([False, False,  True])

---

## 2. Time Series Basics
## 时间序列基础

In [42]:
# Six irregularly spaced days in January 2011
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]

In [44]:
# Series of six random values indexed by the datetimes in `dates`.
# The values are unseeded random draws, so they differ on every run.
ts = pd.Series(np.random.randn(6), index=dates)
ts

2011-01-02   -0.749213
2011-01-05   -1.251453
2011-01-07    0.260048
2011-01-08   -1.209823
2011-01-10    0.966039
2011-01-12    0.577342
dtype: float64

In [45]:
# A Series built with datetime labels gets a DatetimeIndex
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [46]:
# Arithmetic aligns on the dates; dates missing from either operand give NaN
ts + ts[::2]

2011-01-02   -1.498427
2011-01-05         NaN
2011-01-07    0.520097
2011-01-08         NaN
2011-01-10    1.932077
2011-01-12         NaN
dtype: float64

In [47]:
# Every second element of ts (the right-hand operand of ts + ts[::2])
ts[::2]

2011-01-02   -0.749213
2011-01-07    0.260048
2011-01-10    0.966039
dtype: float64

In [48]:
# The datetime64 dtype stores timestamps at nanosecond resolution
ts.index.dtype

dtype('<M8[ns]')

In [49]:
# Scalar values taken from a DatetimeIndex are pandas Timestamp objects.
# Note: this rebinds `stamp`, which earlier held a plain datetime.
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

* ### 索引、选取、子集构造

In [50]:
# Select a value by its Timestamp label
ts[stamp]

-0.7492134699643734

In [51]:
# A string that can be interpreted as a date works as a label too
ts['1/10/2011']

0.9660386951090641

In [52]:
# Compact YYYYMMDD strings are accepted as well
ts['20110110']

0.9660386951090641

In [53]:
# 1000 consecutive daily timestamps starting 2000-01-01, each paired with a random value
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)
longer_ts

2000-01-01    1.414754
2000-01-02   -1.649033
2000-01-03    0.370279
2000-01-04   -0.017020
2000-01-05    1.699716
2000-01-06   -0.282524
2000-01-07   -1.213512
2000-01-08    0.517558
2000-01-09   -1.416052
2000-01-10    0.970632
2000-01-11    1.021684
2000-01-12   -0.404504
2000-01-13    0.690965
2000-01-14    0.122309
2000-01-15   -0.110744
2000-01-16   -0.509067
2000-01-17    0.564909
2000-01-18   -0.346988
2000-01-19   -1.946213
2000-01-20   -0.749713
2000-01-21    2.706861
2000-01-22    0.267854
2000-01-23   -1.725678
2000-01-24    1.059867
2000-01-25   -1.816758
2000-01-26    0.757713
2000-01-27    0.408359
2000-01-28    0.662216
2000-01-29    0.567927
2000-01-30   -1.130735
                ...   
2002-08-28    1.330663
2002-08-29   -0.078021
2002-08-30    0.678289
2002-08-31    0.346193
2002-09-01    0.825817
2002-09-02   -0.058521
2002-09-03    0.321812
2002-09-04    0.792513
2002-09-05    0.676490
2002-09-06   -0.570551
2002-09-07   -0.091276
2002-09-08   -0.318706
2002-09-09 

In [54]:
# A year string selects every entry that falls in that year
longer_ts['2001']

2001-01-01    1.111406
2001-01-02    0.915124
2001-01-03   -1.192863
2001-01-04    0.539039
2001-01-05    0.534574
2001-01-06   -0.553234
2001-01-07   -0.106109
2001-01-08   -0.801423
2001-01-09    0.665296
2001-01-10   -0.591913
2001-01-11    1.346858
2001-01-12   -1.326738
2001-01-13    0.688294
2001-01-14   -1.069012
2001-01-15   -0.262390
2001-01-16    1.002606
2001-01-17   -1.036942
2001-01-18   -1.676646
2001-01-19    2.384419
2001-01-20    1.495779
2001-01-21    0.708219
2001-01-22    1.257924
2001-01-23    0.168115
2001-01-24    0.361763
2001-01-25   -1.878366
2001-01-26    1.017998
2001-01-27    1.152794
2001-01-28    1.164169
2001-01-29   -0.594082
2001-01-30   -0.004903
                ...   
2001-12-02    0.753186
2001-12-03    0.764384
2001-12-04    1.047229
2001-12-05    0.633832
2001-12-06    0.128939
2001-12-07   -0.538282
2001-12-08   -1.990220
2001-12-09    0.850463
2001-12-10   -0.814233
2001-12-11    0.765570
2001-12-12    0.277187
2001-12-13   -0.982946
2001-12-14 

In [56]:
# A year-month string selects every entry that falls in that month
longer_ts['2001-10']

2001-10-01   -0.074808
2001-10-02   -0.895283
2001-10-03   -0.424031
2001-10-04    1.059479
2001-10-05    0.095160
2001-10-06   -0.370243
2001-10-07   -1.033803
2001-10-08    0.299366
2001-10-09    1.349765
2001-10-10    0.086255
2001-10-11   -0.626680
2001-10-12   -0.298238
2001-10-13   -0.134869
2001-10-14   -0.216471
2001-10-15   -0.172224
2001-10-16    1.829977
2001-10-17   -1.319769
2001-10-18    1.143113
2001-10-19    0.454013
2001-10-20   -0.261370
2001-10-21   -0.991677
2001-10-22    0.255920
2001-10-23    0.232754
2001-10-24    0.866532
2001-10-25    0.786895
2001-10-26   -1.158657
2001-10-27   -0.648966
2001-10-28   -0.044730
2001-10-29   -0.286047
2001-10-30    0.150069
2001-10-31    0.163963
Freq: D, dtype: float64

In [57]:
# Slice with a date string: everything from 2011-01-07 (inclusive) onward
ts['20110107':]

2011-01-07    0.260048
2011-01-08   -1.209823
2011-01-10    0.966039
2011-01-12    0.577342
dtype: float64

In [59]:
# Drop every entry dated after 2011-01-07 (the boundary date itself is kept)
ts.truncate(after='01/07/2011')

2011-01-02   -0.749213
2011-01-05   -1.251453
2011-01-07    0.260048
dtype: float64

In [60]:
# Weekly dates anchored on Wednesdays (freq='W-WED') index a random
# 100x4 DataFrame; .loc with a year-month string selects the rows in May 2001
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = pd.DataFrame(np.random.randn(100, 4),
                       index=dates,
                       columns=['Colorado', 'Texas',
                                'New York', 'Ohio'])
long_df.loc['2001-05']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.75268,0.579645,0.853143,0.381457
2001-05-09,0.630215,1.023626,-1.126651,-0.824568
2001-05-16,-1.395418,-0.751142,-0.242988,0.56647
2001-05-23,0.145933,0.499219,-1.207261,-0.091429
2001-05-30,0.869496,-0.181453,0.28905,1.649062


* ### 带有重复索引的时间序列

In [63]:
# Build an index in which 2000-01-02 appears three times, then label the
# integers 0..4 with it
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])
dup_ts = pd.Series(np.arange(len(dates)), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [64]:
# The index contains repeated timestamps, so it is not unique
dup_ts.index.is_unique

False

In [65]:
# Indexing this series yields either a scalar or a slice, depending on
# whether the timestamp is duplicated; 2000-01-03 occurs once, so a scalar
dup_ts['1/3/2000']

4

In [66]:
# A duplicated timestamp returns a Series holding all matching entries
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64