#### 1) 데이터 표준화 ( Z-score normalzation)

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
df = pd.read_csv("../mtcars.csv")
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [16]:
from sklearn.preprocessing import StandardScaler
zscaler = StandardScaler() # 변수명은 사용하기 편한 변수명으로 사용
df['zscore'] = zscaler.fit_transform(df[['mpg']])
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,zscore
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4,0.153299
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4,0.153299
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1,0.456737
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,0.22073
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,-0.234427


In [18]:
# 확인
print(df['zscore'].mean(), df['zscore'].std())

-4.996003610813204e-16 1.016001016001524


#### 직접 계산

In [19]:
# Z = ( X - 평균 ) / 표준 편차
std = df['mpg'].std() # 표준 편차 구하기
mean_mpg = df['mpg'].mean() # 평균 구하기
df['zscore_self'] = ( df['mpg'] - mean_mpg ) / std
df.head(3)

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,zscore,zscore_self
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4,0.153299,0.150885
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4,0.153299,0.150885
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1,0.456737,0.449543


In [20]:
# 확인
print(df['zscore_self'].mean(), df['zscore_self'].std())

-4.996003610813204e-16 1.0


In [21]:
np_std = np.std(df['mpg']) # 기본값 ddof = 0, 모표준편차(분모의 자유도 n )
# 사이킷런의 StandardScaler는 모표준편차(분모의 자유도 n)를 사용

np_std1 = np.std( df['mpg'], ddof = 1 ) # ddof = 1 표본표준편차 ( 분모의 자유도 n - 1 )
df_std = df['mpg'].std()                # std() 표본표준편차 ( 분모의 자유도 n - 1 )

print('모표준편차:', np_std)
print('표본표준편차1:', np_std1)
print('표본표준편차2:', df_std)

모표준편차: 5.932029552301218
표본표준편차1: 6.026948052089104
표본표준편차2: 6.026948052089104


#### 2) 데이터 정규화 ( min-max normalization )

In [38]:
df = pd.read_csv("../mtcars.csv")
df.head()

from sklearn.preprocessing import MinMaxScaler
mscaler = MinMaxScaler()
df['mpg'] = mscaler.fit_transform(df[['mpg']])
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,0.451064,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,0.451064,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,0.52766,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,0.468085,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,0.353191,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [39]:
# 확인
print(df['mpg'].min(), df['mpg'].max())

0.0 1.0


#### 6. 데이터 합치기

In [40]:
# 행, 열 방향으로 데이터 합치기
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [41]:
# 데이터 2개로 분리
df1 = df.loc[0:30, ] # 0 ~ 30행 데이터
df2 = df.loc[31:60, ] # 31 ~ 60행 데이터

In [42]:
df1.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [43]:
df2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
31,5.4,3.4,1.5,0.4,setosa
32,5.2,4.1,1.5,0.1,setosa
33,5.5,4.2,1.4,0.2,setosa
34,4.9,3.1,1.5,0.2,setosa
35,5.0,3.2,1.2,0.2,setosa


In [47]:
df_sum = pd.concat([df1, df2], axis=0) # 행 방향으로 결합 ( 위, 아래 )
print(df_sum.head())
print(df_sum.shape)

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
(61, 5)


In [51]:
# 데이터 2개로 나누기 ( 열 방향 )
df1 = df.loc[:, 'sepal_length':'petal_length'] # 열 방향으로 결합 (좌, 우)
df2 = df.loc[:, ['petal_width', 'species'] ] # 4~5열 데이터

In [52]:
df_sum = pd.concat([df1, df2], axis=1) # 열 방향으로 결합 ( 좌, 우 )
df_sum.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### 7. 날짜/시간 데이터, index 다루기

In [59]:
# 데이터 만들기
df = pd.DataFrame( {
    '날짜': ['20230105', '20230105', '20230223', '20230223', '20230312', '20230422', '20230511'],
    '물품' : ['A', 'B', 'A', 'B', 'A', 'B', 'A' ],
    '판매수' : [5, 10, 15, 15, 20, 25, 40],
    '개당수익': [500, 600, 500, 600, 600, 700, 600] })
df

Unnamed: 0,날짜,물품,판매수,개당수익
0,20230105,A,5,500
1,20230105,B,10,600
2,20230223,A,15,500
3,20230223,B,15,600
4,20230312,A,20,600
5,20230422,B,25,700
6,20230511,A,40,600


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   날짜      7 non-null      object
 1   물품      7 non-null      object
 2   판매수     7 non-null      int64 
 3   개당수익    7 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 352.0+ bytes


In [62]:
# 데이터 타입 datatime으로 변경
df['날짜'] = pd.to_datetime(df['날짜'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      7 non-null      datetime64[ns]
 1   물품      7 non-null      object        
 2   판매수     7 non-null      int64         
 3   개당수익    7 non-null      int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 352.0+ bytes


In [63]:
# 년, 월, 일 변수(열) 추가하기
df['year'] = df['날짜'].dt.year
df['month'] = df['날짜'].dt.month
df['day'] = df['날짜'].dt.day
df

Unnamed: 0,날짜,물품,판매수,개당수익,year,month,day
0,2023-01-05,A,5,500,2023,1,5
1,2023-01-05,B,10,600,2023,1,5
2,2023-02-23,A,15,500,2023,2,23
3,2023-02-23,B,15,600,2023,2,23
4,2023-03-12,A,20,600,2023,3,12
5,2023-04-22,B,25,700,2023,4,22
6,2023-05-11,A,40,600,2023,5,11


In [65]:
# 날짜 구간 필터링
df [df['날짜'].between('2023-01-01', '2023-02-23') # 좌우 모두 포함
# (주의) 날짜와 시간이 같이 있는 데이터에 between 함수를 쓸 경우 형식이 동일해야 함
# (ex : 2023-01-05 12:30:05 => between('2023-01-05 12:00:00', '2023-01-05 12:44:33') 


SyntaxError: incomplete input (4226567859.py, line 5)