In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager, rc

# 한글
if platform.system() == "Windows":
    font_name = font_manager.FontProperties(fname = 'c:/Windows/Fonts/malgun.ttf').get_name()
    rc('font', family = font_name)
elif platform.system() == "Darwin":
    rc('font', family = 'AppleGothic')

plt.rcParams['axes.unicode_minus'] = False

In [3]:
df = pd.read_csv('./data4/stock-data.csv')
print(df.head())
print(df.info())

         Date  Close  Start   High    Low  Volume
0  2018-07-02  10100  10850  10900  10000  137977
1  2018-06-29  10700  10550  10900   9990  170253
2  2018-06-28  10400  10900  10950  10150  155769
3  2018-06-27  10900  10800  11050  10500  133548
4  2018-06-26  10800  10900  11000  10700   63039
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    20 non-null     object
 1   Close   20 non-null     int64 
 2   Start   20 non-null     int64 
 3   High    20 non-null     int64 
 4   Low     20 non-null     int64 
 5   Volume  20 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB
None


In [4]:
# 날짜 자료형으로 변경해서 새로운 필드로 저장
df['NewDate'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     20 non-null     object        
 1   Close    20 non-null     int64         
 2   Start    20 non-null     int64         
 3   High     20 non-null     int64         
 4   Low      20 non-null     int64         
 5   Volume   20 non-null     int64         
 6   NewDate  20 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 1.2+ KB


In [5]:
# 새로 만들어진 날짜 컬럼을 인덱스로 지정하고 기존의 날짜 컬럼 삭제
df.set_index('NewDate', inplace= True)
df.drop('Date', axis = 1, inplace = True)
print(df.head())

            Close  Start   High    Low  Volume
NewDate                                       
2018-07-02  10100  10850  10900  10000  137977
2018-06-29  10700  10550  10900   9990  170253
2018-06-28  10400  10900  10950  10150  155769
2018-06-27  10900  10800  11050  10500  133548
2018-06-26  10800  10900  11000  10700   63039


In [18]:
# 첫 번째 데이터는 날짜로 변경가능한 데이터
# 두 번째 데이터는 날짜로 변경 불가능한 데이터
date_strings = np.array([
    '03-04-2005 11:35 PM', '04-09-2005 09:09 TM'])

# 예외 발생
# print([pd.to_datetime(date, format = '%d-%m-%Y %I:%M %p') for date in date_strings])

# 예외가 발생하면 문자열 그대로 저장
# list 는 2개의 데이터 자료형이 달라고 문제가 없지만
# array 나 dataframe 으로 변환했을 때 문제가 발생할 수 있음 -- 권장하지 않음
''' print([pd.to_datetime(date, format = '%d-%m-%Y %I:%M %p', errors= 'ignore') for date in date_strings])
print(np.array([pd.to_datetime(date, format = '%d-%m-%Y %I:%M %p', errors= 'ignore') for date in date_strings]))
print(pd.DataFrame([pd.to_datetime(date, format = '%d-%m-%Y %I:%M %p', errors= 'ignore') for date in date_strings]).info())
'''

# 변환이 안되는 경우 NaT 로 설정 : 결측치 처리
'''
print(pd.DataFrame([pd.to_datetime(date, format = '%d-%m-%Y %I:%M %p', errors= 'coerce') for date in date_strings]).info())
'''

# 예외를 발생
print(pd.DataFrame([pd.to_datetime(date, format = '%d-%m-%Y %I:%M %p', errors= 'raise') for date in date_strings]).info())


ValueError: time data '04-09-2005 09:09 TM' does not match format '%d-%m-%Y %I:%M %p' (match)

In [19]:
date_strings = np.array([
    '2023-01-01', '2023-02-02', '2023-04-05'])

pddates = pd.to_datetime(date_strings)
# 월 단위로 변경
pr_months = pddates.to_period(freq = 'M')
print(pr_months)

PeriodIndex(['2023-01', '2023-02', '2023-04'], dtype='period[M]')


In [32]:
# 2023년 1월 1일부터 월 단위로 12개를 생성
ts_ms = pd.date_range(start = '2023-01-01', periods = 12, freq = 'MS', tz = 'Asia/Seoul')
ts_ms

DatetimeIndex(['2023-01-01 00:00:00+09:00', '2023-02-01 00:00:00+09:00',
               '2023-03-01 00:00:00+09:00', '2023-04-01 00:00:00+09:00',
               '2023-05-01 00:00:00+09:00', '2023-06-01 00:00:00+09:00',
               '2023-07-01 00:00:00+09:00', '2023-08-01 00:00:00+09:00',
               '2023-09-01 00:00:00+09:00', '2023-10-01 00:00:00+09:00',
               '2023-11-01 00:00:00+09:00', '2023-12-01 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='MS')

In [34]:
# print(dir(pd.to_datetime(['21-08-2023'])))
# print(dir(str))

['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'removeprefix', 'removesuffix', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']


In [35]:
# 데이터를 가져와서 문자열을 날짜로 변경
df = pd.read_csv('./data4/stock-data.csv')
df['newDate'] = pd.to_datetime(df['Date'])
df.set_index('newDate', inplace = True)
print(df.head())

                  Date  Close  Start   High    Low  Volume
newDate                                                   
2018-07-02  2018-07-02  10100  10850  10900  10000  137977
2018-06-29  2018-06-29  10700  10550  10900   9990  170253
2018-06-28  2018-06-28  10400  10900  10950  10150  155769
2018-06-27  2018-06-27  10900  10800  11050  10500  133548
2018-06-26  2018-06-26  10800  10900  11000  10700   63039


In [40]:
# 날짜 인덱스라서 날짜의 일부분을 가지고 인덱싱 가능
df_y = df.loc['2018']
print(df_y.head())

                  Date  Close  Start   High    Low  Volume
newDate                                                   
2018-07-02  2018-07-02  10100  10850  10900  10000  137977
2018-06-29  2018-06-29  10700  10550  10900   9990  170253
2018-06-28  2018-06-28  10400  10900  10950  10150  155769
2018-06-27  2018-06-27  10900  10800  11050  10500  133548
2018-06-26  2018-06-26  10800  10900  11000  10700   63039


In [38]:
df_y = df.loc['2018-07']
print(df_y.head())

                  Date  Close  Start   High    Low  Volume
newDate                                                   
2018-07-02  2018-07-02  10100  10850  10900  10000  137977


In [41]:
df_y = df.loc['2018-06-25' : '2018-06-30']
print(df_y.head())

df_y = df.loc['2018-06-25' : '2018-06-30', 'Start' : 'Low']
print(df_y.head())

                  Date  Close  Start   High    Low  Volume
newDate                                                   
2018-06-29  2018-06-29  10700  10550  10900   9990  170253
2018-06-28  2018-06-28  10400  10900  10950  10150  155769
2018-06-27  2018-06-27  10900  10800  11050  10500  133548
2018-06-26  2018-06-26  10800  10900  11000  10700   63039
2018-06-25  2018-06-25  11150  11400  11450  11000   55519
            Start   High    Low
newDate                        
2018-06-29  10550  10900   9990
2018-06-28  10900  10950  10150
2018-06-27  10800  11050  10500
2018-06-26  10900  11000  10700
2018-06-25  11400  11450  11000


In [51]:
time_index = pd.date_range('01-01-2023', periods = 5, freq = 'M')
dataframe = pd.DataFrame(index = time_index)
dataframe['Sales'] = [1.0, 2.0, np.nan, np.nan, 5.0]
# print(dataframe)

# 앞의 데이터로 채우기
# print(dataframe.ffill())

# 보간법
print(dataframe.interpolate())
print(dataframe.interpolate(method = 'quadratic'))

            Sales
2023-01-31    1.0
2023-02-28    2.0
2023-03-31    3.0
2023-04-30    4.0
2023-05-31    5.0
               Sales
2023-01-31  1.000000
2023-02-28  2.000000
2023-03-31  3.059808
2023-04-30  4.038069
2023-05-31  5.000000


In [57]:
dataframe['Stock_Price'] = [1,2,3,4,5]
# 단순 이동 평균
print(dataframe.rolling(window = 2).mean())

# 지수 이동 평균
print(dataframe.ewm(span = 2).mean())

            Sales  Stock_Price
2023-01-31    NaN          NaN
2023-02-28    1.5          1.5
2023-03-31    NaN          2.5
2023-04-30    NaN          3.5
2023-05-31    NaN          4.5
               Sales  Stock_Price
2023-01-31  1.000000     1.000000
2023-02-28  1.750000     1.750000
2023-03-31  1.750000     2.615385
2023-04-30  1.750000     3.550000
2023-05-31  4.847059     4.520661


In [61]:
import cv2
print(cv2.__version__)

4.8.0
