In [1]:
# 데이터 정규화 : 데이터의 상대적인 크기 차이를 제거
# 정규화 : 모든 실수를 0~1 또는 -1~1 사이의 수로 변환 해주는 작업
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv('./dataset/auto-mpg.csv',header=None)
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
             'acceleration','model year','origin','name']

In [5]:
# 데이터 표준화해주기
df['horsepower'].replace('?',np.nan,inplace=True)
df.dropna(subset=['horsepower'],axis=0,inplace=True)
df['horsepower'] = df['horsepower'].astype('float')

In [6]:
# 정규화
# 해당 열의 최대값으로 모든 데이터를 나누어 저장
df['h_1'] = df.horsepower / abs(df.horsepower.max())
print(df[['horsepower','h_1']])

# 해당 열의 ( 최대값 - 최소값 ) 으로 모든 데이터를 나누어 저장
df['h_2'] = df.horsepower / (df.horsepower.max()-df.horsepower.min())
print(df[['horsepower','h_2']])

     horsepower       h_1
0         130.0  0.565217
1         165.0  0.717391
2         150.0  0.652174
3         150.0  0.652174
4         140.0  0.608696
..          ...       ...
393        86.0  0.373913
394        52.0  0.226087
395        84.0  0.365217
396        79.0  0.343478
397        82.0  0.356522

[392 rows x 2 columns]
     horsepower       h_2
0         130.0  0.706522
1         165.0  0.896739
2         150.0  0.815217
3         150.0  0.815217
4         140.0  0.760870
..          ...       ...
393        86.0  0.467391
394        52.0  0.282609
395        84.0  0.456522
396        79.0  0.429348
397        82.0  0.445652

[392 rows x 2 columns]


In [43]:
# 시계열 데이터
#시간에 따른 변화추이, 순차적인 데이터변화 추이를 살필 때
df = pd.read_csv('./dataset/stock-data.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    20 non-null     object
 1   Close   20 non-null     int64 
 2   Start   20 non-null     int64 
 3   High    20 non-null     int64 
 4   Low     20 non-null     int64 
 5   Volume  20 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB


Unnamed: 0,Date,Close,Start,High,Low,Volume
0,2018-07-02,10100,10850,10900,10000,137977
1,2018-06-29,10700,10550,10900,9990,170253
2,2018-06-28,10400,10900,10950,10150,155769
3,2018-06-27,10900,10800,11050,10500,133548
4,2018-06-26,10800,10900,11000,10700,63039


In [44]:
# Date 컬럼의 자료를 new_Date 로 pandas.datetime 형식으로
df['new_Date'] = pd.to_datetime(df['Date'])
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      20 non-null     object        
 1   Close     20 non-null     int64         
 2   Start     20 non-null     int64         
 3   High      20 non-null     int64         
 4   Low       20 non-null     int64         
 5   Volume    20 non-null     int64         
 6   new_Date  20 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 1.2+ KB


Unnamed: 0,Date,Close,Start,High,Low,Volume,new_Date
0,2018-07-02,10100,10850,10900,10000,137977,2018-07-02
1,2018-06-29,10700,10550,10900,9990,170253,2018-06-29
2,2018-06-28,10400,10900,10950,10150,155769,2018-06-28
3,2018-06-27,10900,10800,11050,10500,133548,2018-06-27
4,2018-06-26,10800,10900,11000,10700,63039,2018-06-26


In [45]:
# 보통 시계열 데이터를 인덱스로 많이 씀
# new_Date 컬럼을 인덱스로 설정
df_new = df.copy()
df_new.set_index('new_Date',inplace=True)
df_new.drop('Date',axis=1,inplace=True)
df_new

Unnamed: 0_level_0,Close,Start,High,Low,Volume
new_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-07-02,10100,10850,10900,10000,137977
2018-06-29,10700,10550,10900,9990,170253
2018-06-28,10400,10900,10950,10150,155769
2018-06-27,10900,10800,11050,10500,133548
2018-06-26,10800,10900,11000,10700,63039
2018-06-25,11150,11400,11450,11000,55519
2018-06-22,11300,11250,11450,10750,134805
2018-06-21,11200,11350,11750,11200,133002
2018-06-20,11550,11200,11600,10900,308596
2018-06-19,11300,11850,11950,11300,180656


In [46]:
# Timestamp를 기간 Period 로 변환
dates = ['2019-01-01','2020-03-01','2021-06-01']
print(type(dates[0]))
# str 을 timestamp 로 변환
ts_dates = pd.to_datetime(dates)
print(ts_dates)

ts_day = ts_dates.to_period(freq='D') # 'D'날짜 'M'월 'A'연도
print(ts_day)

<class 'str'>
DatetimeIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='datetime64[ns]', freq=None)
PeriodIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='period[D]')


In [47]:
# 날짜를 분리
df['Year'] = df['new_Date'].dt.year
df['Month'] = df['new_Date'].dt.month
df['Day'] = df['new_Date'].dt.day
df.head()

Unnamed: 0,Date,Close,Start,High,Low,Volume,new_Date,Year,Month,Day
0,2018-07-02,10100,10850,10900,10000,137977,2018-07-02,2018,7,2
1,2018-06-29,10700,10550,10900,9990,170253,2018-06-29,2018,6,29
2,2018-06-28,10400,10900,10950,10150,155769,2018-06-28,2018,6,28
3,2018-06-27,10900,10800,11050,10500,133548,2018-06-27,2018,6,27
4,2018-06-26,10800,10900,11000,10700,63039,2018-06-26,2018,6,26


In [53]:
# 종가, 시작가, 연, 월, 일, new_Date만 추출 -> df_stock 에 저장
# new_Date를 인덱스로 설정
df_stock = df.loc[:,['Start','Close','Year','Month','Day','new_Date']].copy()
df_stock.set_index('new_Date',inplace=True)
df_stock

Unnamed: 0_level_0,Start,Close,Year,Month,Day
new_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-07-02,10850,10100,2018,7,2
2018-06-29,10550,10700,2018,6,29
2018-06-28,10900,10400,2018,6,28
2018-06-27,10800,10900,2018,6,27
2018-06-26,10900,10800,2018,6,26
2018-06-25,11400,11150,2018,6,25
2018-06-22,11250,11300,2018,6,22
2018-06-21,11350,11200,2018,6,21
2018-06-20,11200,11550,2018,6,20
2018-06-19,11850,11300,2018,6,19
