<a href="https://colab.research.google.com/github/JakeOh/202205_itw_bd34/blob/main/da12_datetime.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# SPX 데이터 프레임

In [2]:
# GitHub URL where the SPX CSV file is hosted
spx_csv = 'https://github.com/wesm/pydata-book/raw/2nd-edition/examples/spx.csv'

In [3]:
# Load the CSV at spx_csv into a DataFrame with default parsing options.
spx = pd.read_csv(spx_csv)

In [4]:
# Preview the first rows of the frame.
spx.head()

Unnamed: 0,Date,SPX
0,1990-02-01 00:00:00,328.79
1,1990-02-02 00:00:00,330.92
2,1990-02-05 00:00:00,331.85
3,1990-02-06 00:00:00,329.66
4,1990-02-07 00:00:00,333.75


In [5]:
# Preview the last rows of the frame.
spx.tail()

Unnamed: 0,Date,SPX
5467,2011-10-10 00:00:00,1194.89
5468,2011-10-11 00:00:00,1195.54
5469,2011-10-12 00:00:00,1207.25
5470,2011-10-13 00:00:00,1203.66
5471,2011-10-14 00:00:00,1224.58


In [6]:
# Summarize column dtypes and non-null counts.
spx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5472 entries, 0 to 5471
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    5472 non-null   object 
 1   SPX     5472 non-null   float64
dtypes: float64(1), object(1)
memory usage: 85.6+ KB


In [9]:
# Derive an integer year column from the first four characters of each date string.
spx['year'] = spx['Date'].str.slice(stop=4).astype('int')

In [10]:
# Check that the derived year column was added.
spx.head()

Unnamed: 0,Date,SPX,year
0,1990-02-01 00:00:00,328.79,1990
1,1990-02-02 00:00:00,330.92,1990
2,1990-02-05 00:00:00,331.85,1990
3,1990-02-06 00:00:00,329.66,1990
4,1990-02-07 00:00:00,333.75,1990


In [14]:
# Keep the leading 'YYYY-MM' part of each date string as a year-month key.
spx['year_month'] = spx['Date'].str.slice(stop=7)

In [15]:
# Check that the derived year_month column was added.
spx.head()

Unnamed: 0,Date,SPX,year,year_month
0,1990-02-01 00:00:00,328.79,1990,1990-02
1,1990-02-02 00:00:00,330.92,1990,1990-02
2,1990-02-05 00:00:00,331.85,1990,1990-02
3,1990-02-06 00:00:00,329.66,1990,1990-02
4,1990-02-07 00:00:00,333.75,1990,1990-02


In [16]:
# Mean SPX per calendar year, grouped on the derived year column.
spx.groupby(by='year')['SPX'].agg('mean')

year
1990     334.123506
1991     376.186324
1992     415.747008
1993     451.614822
1994     460.416508
1995     541.719087
1996     670.494843
1997     873.427787
1998    1085.503254
1999    1327.329563
2000    1427.221071
2001    1194.178992
2002     993.934802
2003     965.227540
2004    1130.649444
2005    1207.229444
2006    1310.461633
2007    1477.184343
2008    1220.042055
2009     948.046389
2010    1139.965516
2011    1276.093015
Name: SPX, dtype: float64

In [17]:
# Mean SPX per year-month, grouped on the derived year_month column.
spx.groupby(by='year_month')['SPX'].agg('mean')

year_month
1990-02     330.452632
1990-03     338.465000
1990-04     338.178000
1990-05     350.250000
1990-06     360.386667
              ...     
2011-06    1287.288636
2011-07    1325.184500
2011-08    1185.305652
2011-09    1173.879048
2011-10    1171.356000
Name: SPX, Length: 261, dtype: float64

Date 컬럼의 데이터 타입이 문자열인 경우 주기별(연도별, 분기별, 월별, ...) 통계를 계산하기 위해서는 문자열을 파싱(parsing)해서 파생변수를 추가하고 groupby 연산을 수행해야 함.

날짜/시간을 표현하는 데이터인 경우 데이터 타입을 datetime 타입으로 변환하면, 주기별 통계를 보다 쉽게 계산할 수 있음.

In [18]:
# Reload the CSV so the frame starts over without the derived year / year_month columns.
spx = pd.read_csv(spx_csv)

In [19]:
# Confirm the reloaded frame has only Date (object dtype) and SPX.
spx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5472 entries, 0 to 5471
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    5472 non-null   object 
 1   SPX     5472 non-null   float64
dtypes: float64(1), object(1)
memory usage: 85.6+ KB


In [20]:
# Convert the Date column's dtype from string (object) to datetime
spx['Date'] = pd.to_datetime(spx['Date'])

In [22]:
# Verify that the Date column is now datetime64[ns].
spx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5472 entries, 0 to 5471
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    5472 non-null   datetime64[ns]
 1   SPX     5472 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 85.6 KB


In [26]:
# Make the Date column the index.
# Computing periodic (yearly, quarterly, ...) statistics with resample
# requires the frame to have a datetime-typed index.
spx_with_index = spx.set_index('Date')

In [27]:
# Display the frame now indexed by Date.
spx_with_index

Unnamed: 0_level_0,SPX
Date,Unnamed: 1_level_1
1990-02-01,328.79
1990-02-02,330.92
1990-02-05,331.85
1990-02-06,329.66
1990-02-07,333.75
...,...
2011-10-10,1194.89
2011-10-11,1195.54
2011-10-12,1207.25
2011-10-13,1203.66


In [28]:
spx_with_index.info()  #> DataFrame with a DatetimeIndex

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 1990-02-01 to 2011-10-14
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SPX     5472 non-null   float64
dtypes: float64(1)
memory usage: 85.5 KB


In [30]:
# Yearly mean of SPX
# NOTE(review): pandas 2.2+ deprecates the 'Y' alias in favor of 'YE' —
# confirm against the pandas version in use before changing it.
spx_with_index.resample('Y')['SPX'].mean()

Date
1990-12-31     334.123506
1991-12-31     376.186324
1992-12-31     415.747008
1993-12-31     451.614822
1994-12-31     460.416508
1995-12-31     541.719087
1996-12-31     670.494843
1997-12-31     873.427787
1998-12-31    1085.503254
1999-12-31    1327.329563
2000-12-31    1427.221071
2001-12-31    1194.178992
2002-12-31     993.934802
2003-12-31     965.227540
2004-12-31    1130.649444
2005-12-31    1207.229444
2006-12-31    1310.461633
2007-12-31    1477.184343
2008-12-31    1220.042055
2009-12-31     948.046389
2010-12-31    1139.965516
2011-12-31    1276.093015
Freq: A-DEC, Name: SPX, dtype: float64

In [31]:
# Quarterly mean of SPX
# NOTE(review): pandas 2.2+ deprecates the 'Q' alias in favor of 'QE' —
# confirm against the pandas version in use before changing it.
spx_with_index.resample('Q')['SPX'].mean()

Date
1990-03-31     334.751951
1990-06-30     349.796508
1990-09-30     335.881746
1990-12-31     316.562031
1991-03-31     352.282951
                 ...     
2010-12-31    1204.585625
2011-03-31    1302.529032
2011-06-30    1318.332857
2011-09-30    1225.268438
2011-12-31    1171.356000
Freq: Q-DEC, Name: SPX, Length: 88, dtype: float64

In [32]:
# Monthly mean of SPX
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favor of 'ME' —
# confirm against the pandas version in use before changing it.
spx_with_index.resample('M')['SPX'].mean()

Date
1990-02-28     330.452632
1990-03-31     338.465000
1990-04-30     338.178000
1990-05-31     350.250000
1990-06-30     360.386667
                 ...     
2011-06-30    1287.288636
2011-07-31    1325.184500
2011-08-31    1185.305652
2011-09-30    1173.879048
2011-10-31    1171.356000
Freq: M, Name: SPX, Length: 261, dtype: float64