In [16]:
from datetime import datetime
import pandas as pd

In [2]:
# Capture the current date and time (no tz argument is passed, so the result is a naive datetime).
now = datetime.now()

In [3]:
now

datetime.datetime(2020, 7, 1, 18, 24, 40, 743471)

In [4]:
# Year, month and day components of the captured datetime.
now.year,now.month,now.day

(2020, 7, 1)

In [5]:
# A fixed datetime (midnight on 2020-07-01) used by the string-conversion examples that follow.
stamp = datetime(year=2020, month=7, day=1)

In [6]:
# str() renders the datetime as 'YYYY-MM-DD HH:MM:SS'.
str(stamp)

'2020-07-01 00:00:00'

In [8]:
# Format the datetime as a string using an explicit strftime pattern
# (this goes from datetime to string, not from string to date).
stamp.strftime("%Y-%m-%d")

'2020-07-01'

In [10]:
# dateutil's parser is used below to parse a date string without an explicit format string.
from dateutil.parser import parse

In [13]:
# Only the string is passed; no format specification is given to parse().
parse("2020-07-01")

datetime.datetime(2020, 7, 1, 0, 0)

In [18]:
# Two date strings in different layouts: date only, and date with a time part.
datestrs = [
    "2020-07-01",
    "2020-07-03 00:00:00",
]

In [20]:
# Parse the date strings and make the missing entry explicit: None becomes NaT.
# `datestrs` mixes a date-only string with a date-time string. pandas >= 2.0
# infers a single format from the first element and applies it to the whole
# list, so handing the mixed list straight to pd.to_datetime raises ValueError
# there. Converting element by element first works on every pandas version
# (pd.to_datetime(None) returns None, which the outer call turns into NaT).
idx = pd.to_datetime([pd.to_datetime(s) for s in datestrs + [None]])
idx

DatetimeIndex(['2020-07-01', '2020-07-03', 'NaT'], dtype='datetime64[ns]', freq=None)

In [21]:
# Boolean mask marking which entries of idx are missing (NaT).
pd.isnull(idx)

array([False, False,  True])

### 시계열 기초

- pandas에서 찾아볼 수 있는 가장 기본적인 시계열 객체의 종류는 파이썬 문자열이나 datetime 객체로 표현되는 타임스탬프로 색인된 **Series**이다.

In [26]:
from datetime import datetime
import numpy as np

In [24]:
# Six consecutive calendar days, 2020-07-01 through 2020-07-06.
dates = [datetime(2020, 7, day) for day in range(1, 7)]

In [28]:
# Time series of six random draws indexed by `dates`.
# No seed is set in the visible cells, so the values differ from run to run.
ts = pd.Series(data=np.random.randn(6), index=dates)
ts

2020-07-01    0.054274
2020-07-02    0.403000
2020-07-03    1.252946
2020-07-04    0.684920
2020-07-05   -0.635219
2020-07-06   -1.663174
dtype: float64

In [29]:
# Check the index type: the datetime objects were stored as a DatetimeIndex.
ts.index

DatetimeIndex(['2020-07-01', '2020-07-02', '2020-07-03', '2020-07-04',
               '2020-07-05', '2020-07-06'],
              dtype='datetime64[ns]', freq=None)

In [30]:
# Arithmetic between differently indexed time series is aligned on the dates automatically.
# ts[::2] selects every second element, so the dates it lacks come out as NaN in the sum.
ts + ts[::2]

2020-07-01    0.108549
2020-07-02         NaN
2020-07-03    2.505892
2020-07-04         NaN
2020-07-05   -1.270439
2020-07-06         NaN
dtype: float64

In [32]:
# A single element taken from the DatetimeIndex is a pandas Timestamp.
# Note: this rebinds `stamp`, which earlier held a plain datetime.
stamp = ts.index[0]
stamp

Timestamp('2020-07-01 00:00:00')

### 색인, 선택, 부분 선택

In [34]:
# Take the third index entry and use that timestamp as a label to look up its value.
stamp = ts.index[2]

ts[stamp]

1.2529460843585667

In [40]:
# A string that can be interpreted as a date also works as a label;
# both spellings below refer to 2020-07-01.
print(ts['2020-07-01'])
print(ts['20200701'])

0.054274386836509536
0.054274386836509536


In [44]:
# With a longer series, passing only a year (or a year and month) selects
# just that part of the data. Build 1000 consecutive daily observations
# starting on 2020-01-01; the values are random draws and no seed is set in
# this cell.
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2020', periods=1000),
)
longer_ts

2020-01-01    0.806950
2020-01-02   -0.725303
2020-01-03   -0.049932
2020-01-04    1.559068
2020-01-05   -1.065181
2020-01-06   -0.966798
2020-01-07   -0.283359
2020-01-08   -1.431913
2020-01-09   -0.427461
2020-01-10   -0.931798
2020-01-11   -0.424629
2020-01-12   -1.691623
2020-01-13    1.807351
2020-01-14   -0.932666
2020-01-15   -0.360687
2020-01-16    0.693026
2020-01-17    0.211050
2020-01-18    0.609457
2020-01-19   -0.030732
2020-01-20    0.274425
2020-01-21    1.029461
2020-01-22    1.132328
2020-01-23    0.334287
2020-01-24    1.482782
2020-01-25   -3.291775
2020-01-26    0.982073
2020-01-27    2.500419
2020-01-28    0.437161
2020-01-29   -0.784250
2020-01-30   -0.276050
                ...   
2022-08-28   -0.594818
2022-08-29   -1.362187
2022-08-30   -0.294713
2022-08-31    1.651053
2022-09-01   -0.180731
2022-09-02    0.019350
2022-09-03    0.576186
2022-09-04    0.250389
2022-09-05   -0.547938
2022-09-06    0.368432
2022-09-07   -0.596214
2022-09-08   -1.208504
2022-09-09 

In [46]:
# '2021' is interpreted as a year, so the observations from that year are selected.
longer_ts['2021']

2021-01-01    0.492526
2021-01-02   -0.536421
2021-01-03   -0.137875
2021-01-04   -0.916347
2021-01-05    0.210597
2021-01-06    0.580741
2021-01-07   -1.720000
2021-01-08    1.546061
2021-01-09    0.692369
2021-01-10   -1.110015
2021-01-11    0.744553
2021-01-12    0.183845
2021-01-13    0.538269
2021-01-14    0.066876
2021-01-15    0.401320
2021-01-16    0.201040
2021-01-17    0.997710
2021-01-18   -0.262033
2021-01-19    0.363940
2021-01-20   -0.610084
2021-01-21   -1.575704
2021-01-22   -2.186575
2021-01-23   -1.710334
2021-01-24    0.072193
2021-01-25    0.214304
2021-01-26    1.864906
2021-01-27   -0.350013
2021-01-28   -0.285516
2021-01-29   -0.341554
2021-01-30    0.885437
                ...   
2021-12-02    0.146749
2021-12-03    1.834164
2021-12-04   -0.755034
2021-12-05   -1.680480
2021-12-06   -1.905769
2021-12-07   -1.077578
2021-12-08   -0.177785
2021-12-09   -1.389382
2021-12-10   -0.536220
2021-12-11    0.019320
2021-12-12   -0.471427
2021-12-13   -0.659326
2021-12-14 

In [47]:
# A year-month string works the same way and selects that month.
longer_ts['2020-5']

2020-05-01   -1.914367
2020-05-02   -2.172528
2020-05-03    0.564960
2020-05-04    2.048677
2020-05-05    0.916654
2020-05-06   -0.324513
2020-05-07   -0.629943
2020-05-08    0.871479
2020-05-09   -0.598341
2020-05-10    0.612794
2020-05-11   -0.333920
2020-05-12   -2.183572
2020-05-13    2.056281
2020-05-14    1.723599
2020-05-15   -0.798565
2020-05-16    0.065589
2020-05-17   -1.863095
2020-05-18   -1.822769
2020-05-19   -0.320055
2020-05-20   -0.745262
2020-05-21   -1.140775
2020-05-22   -0.351384
2020-05-23   -1.021577
2020-05-24   -0.500514
2020-05-25   -0.656809
2020-05-26   -1.073100
2020-05-27    0.723270
2020-05-28    1.160726
2020-05-29   -0.067726
2020-05-30   -0.900609
2020-05-31   -1.064962
Freq: D, dtype: float64

In [54]:
# Slicing with datetime objects works the same way as slicing an ordinary Series.
ts

2020-07-01    0.054274
2020-07-02    0.403000
2020-07-03    1.252946
2020-07-04    0.684920
2020-07-05   -0.635219
2020-07-06   -1.663174
dtype: float64

In [55]:
# Open-ended slice starting at 2020-07-03; the start label itself is included.
ts[datetime(2020,7,3) :]

2020-07-03    1.252946
2020-07-04    0.684920
2020-07-05   -0.635219
2020-07-06   -1.663174
dtype: float64

In [56]:
# Open-ended slice up to 2020-07-05; the end label is included in the result.
ts[:datetime(2020,7,5)]

2020-07-01    0.054274
2020-07-02    0.403000
2020-07-03    1.252946
2020-07-04    0.684920
2020-07-05   -0.635219
dtype: float64

대부분의 시계열 데이터는 연대순으로 정렬되기 때문에, 범위를 지정할 때 시계열에 포함되지 않은 타임스탬프를 이용해서도 Series를 나눌 수 있다.

In [57]:
ts

2020-07-01    0.054274
2020-07-02    0.403000
2020-07-03    1.252946
2020-07-04    0.684920
2020-07-05   -0.635219
2020-07-06   -1.663174
dtype: float64

In [58]:
# Date strings can serve as slice bounds too; both endpoints are included.
ts['2020/7/1' : '2020/7/3']

2020-07-01    0.054274
2020-07-02    0.403000
2020-07-03    1.252946
dtype: float64

In [60]:
# truncate(after=...) keeps the rows up to and including the given date and drops the rest.
# Only the `after` bound is passed here; no `before` bound is given.

ts.truncate(after='2020/7/3')

2020-07-01    0.054274
2020-07-02    0.403000
2020-07-03    1.252946
dtype: float64