In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager,rc

if platform.system()=='Darwin':
	rc('font',family='AppleGothic')
elif platform.system()=='Windows':
	font_name=font_manager.FontProperties(fname='c:/windows/Fonts/malgun.ttf').get_name()

	rc('font',family=font_name)

	
# 그래프에 음수를 사용하기 위한 설정
plt.rcParams['axes.unicode_minus']=False

# 문자열 데이터 시계열 데이터로 가져오기

In [17]:
# 데이터 불러오기.
df=pd.read_csv('./data 4/stock-data.csv')
df.head()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    20 non-null     object
 1   Close   20 non-null     int64 
 2   Start   20 non-null     int64 
 3   High    20 non-null     int64 
 4   Low     20 non-null     int64 
 5   Volume  20 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB
None


In [18]:
# 날짜 자료형으로 변경해서 새로운 필드로 저장
df['NewDate']=pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     20 non-null     object        
 1   Close    20 non-null     int64         
 2   Start    20 non-null     int64         
 3   High     20 non-null     int64         
 4   Low      20 non-null     int64         
 5   Volume   20 non-null     int64         
 6   NewDate  20 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 1.2+ KB


In [19]:
# 새로 만들어진 날짜 컬럼을 인덱스로 지정하고 기존의 날짜 컬럼 삭제
df.set_index('NewDate',inplace=True)
df.drop('Date',axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,Close,Start,High,Low,Volume
NewDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-07-02,10100,10850,10900,10000,137977
2018-06-29,10700,10550,10900,9990,170253
2018-06-28,10400,10900,10950,10150,155769
2018-06-27,10900,10800,11050,10500,133548
2018-06-26,10800,10900,11000,10700,63039


In [24]:
# 첫번째 데이터는 날짜로 변경 가능한 데이터, 두번째 데이터는 변경 불가능한 데이터. 
date_strings=np.array(['03-04-2005 11:35 PM','04-09-2006 09:09 TM'])

# 예외 발생
#  print([pd.to_datetime(date_strings,format='%d-%m-%Y %I:%M %p')] for date in date_strings )

# 옵션을 통해 예외 처리
print([pd.to_datetime(date,format='%d-%m-%Y %I:%M %p',errors='ignore') for date in date_strings])
# 예외 발생시 문자열 그대로 저장. 
# list는 2개의 데이터 자료형이 달라도 문제가 없지만 array나 DataFrame 으로 변경시 문제가 발생할 수 있다. 따라서 권장하지 않는 방법이다. 

print(pd.DataFrame([pd.to_datetime(date,format='%d-%m-%Y %I:%M %p',errors='ignore') for date in date_strings]).info())

[Timestamp('2005-04-03 23:35:00'), '04-09-2006 09:09 TM']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       2 non-null      object
dtypes: object(1)
memory usage: 148.0+ bytes
None


In [26]:
# 변환이 안되는 경우 NaT로 변경
print([pd.to_datetime(date,format='%d-%m-%Y %I:%M %p',errors='coerce') for date in date_strings])

[Timestamp('2005-04-03 23:35:00'), NaT]


In [28]:
# 변환이 안되는 경우 예외발생
#print([pd.to_datetime(date,format='%d-%m-%Y %I:%M %p',errors='raise') for date in date_strings])

## 일정 기간을 나타내는 Period

In [33]:
date_strings=np.array(['2023-01-01','2023-02-02','2023-04-05'])

pddates=pd.to_datetime(date_strings)
# 월단위로 변경. 
pr_months=pddates.to_period(freq='N')
print(pr_months)

PeriodIndex(['2023-01-01 00:00:00.000000000', '2023-02-02 00:00:00.000000000',
             '2023-04-05 00:00:00.000000000'],
            dtype='period[N]')


## 일정 주기의 시간 데이터를 생성


In [34]:
# 2023 년 1월 1일 부터 월단위로 12개 생성
ts_ms=pd.date_range(start='2023-01-01',periods=12,freq='M',tz="Asia/Seoul")
print(ts_ms)

DatetimeIndex(['2023-01-31 00:00:00+09:00', '2023-02-28 00:00:00+09:00',
               '2023-03-31 00:00:00+09:00', '2023-04-30 00:00:00+09:00',
               '2023-05-31 00:00:00+09:00', '2023-06-30 00:00:00+09:00',
               '2023-07-31 00:00:00+09:00', '2023-08-31 00:00:00+09:00',
               '2023-09-30 00:00:00+09:00', '2023-10-31 00:00:00+09:00',
               '2023-11-30 00:00:00+09:00', '2023-12-31 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='M')


## 날짜 차이
* __sub__ 을 이용해서 날짜간 차이를 계산한다.

In [39]:
# print(dir(pd.to_datetime(['01-01-23'])))

ababab


## 날짜 뽑기

In [5]:
# 데이터 불러오기.
df=pd.read_csv('./data 4/stock-data.csv')
df['NewDate']=pd.to_datetime(df['Date'])
df.set_index('NewDate',inplace=True)
df.head()

Unnamed: 0_level_0,Date,Close,Start,High,Low,Volume
NewDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-07-02,2018-07-02,10100,10850,10900,10000,137977
2018-06-29,2018-06-29,10700,10550,10900,9990,170253
2018-06-28,2018-06-28,10400,10900,10950,10150,155769
2018-06-27,2018-06-27,10900,10800,11050,10500,133548
2018-06-26,2018-06-26,10800,10900,11000,10700,63039


In [13]:
# 날짜 인덱스이기 때문에 날짜의 일부분을 가고 인덱싱 가능.
df_y=df.loc['2018']
print(df_y.head())

df_7=df.loc['2018-07']
df_7.head()

                  Date  Close  Start   High    Low  Volume
NewDate                                                   
2018-07-02  2018-07-02  10100  10850  10900  10000  137977
2018-06-29  2018-06-29  10700  10550  10900   9990  170253
2018-06-28  2018-06-28  10400  10900  10950  10150  155769
2018-06-27  2018-06-27  10900  10800  11050  10500  133548
2018-06-26  2018-06-26  10800  10900  11000  10700   63039


Unnamed: 0_level_0,Date,Close,Start,High,Low,Volume
NewDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-07-02,2018-07-02,10100,10850,10900,10000,137977


In [19]:
df_y=df.loc['2018-06-25':'2018-06-30','Start':'Low']
df_y.head()

Unnamed: 0_level_0,Start,High,Low
NewDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-29,10550,10900,9990
2018-06-28,10900,10950,10150
2018-06-27,10800,11050,10500
2018-06-26,10900,11000,10700
2018-06-25,11400,11450,11000


## 대치법

In [25]:
time_index=pd.date_range('01-01-2023',periods=5,freq='MS')
df=pd.DataFrame(index=time_index)
df['Sales']=[1.0,2.0,np.nan,np.nan,5.0]
print(df)

            Sales
2023-01-01    1.0
2023-02-01    2.0
2023-03-01    NaN
2023-04-01    NaN
2023-05-01    5.0


In [27]:
print(df.ffill())

            Sales
2023-01-01    1.0
2023-02-01    2.0
2023-03-01    2.0
2023-04-01    2.0
2023-05-01    5.0


In [29]:
# 선형 보간 
print(df.interpolate())
# 비선형 보간
print(df.interpolate(method='quadratic'))

            Sales
2023-01-01    1.0
2023-02-01    2.0
2023-03-01    3.0
2023-04-01    4.0
2023-05-01    5.0
               Sales
2023-01-01  1.000000
2023-02-01  2.000000
2023-03-01  2.923185
2023-04-01  3.967379
2023-05-01  5.000000


### 이동시간 윈도우

In [31]:
# 이동 시간 평균
df['Stock_Price']=[1,2,3,4,5]
print(df.rolling(window=2).mean()) # window - 값을 예측하기 위해 사용하는 값의 개수

# 지수 이동 평균
print(df.ewm(span=3).mean())

            Sales  Stock_Price
2023-01-01    NaN          NaN
2023-02-01    1.5          1.5
2023-03-01    NaN          2.5
2023-04-01    NaN          3.5
2023-05-01    NaN          4.5
               Sales  Stock_Price
2023-01-01  1.000000     1.000000
2023-02-01  1.666667     1.666667
2023-03-01  1.666667     2.428571
2023-04-01  1.666667     3.266667
2023-05-01  4.473684     4.161290
