# 데이터 전처리
* 표준화
* 인코딩
* 결측값 처리
* 단순 데이터 분리

## 표준화(정규화)
* 변수마다 다른 값의 범위(스케일)를 일정한 기준으로 맞추는 작업이다. 방법에 따라 평균 0·표준편차 1이 되도록 맞추거나, 정해진 구간(예: 0~1) 사이의 값으로 변환한다.
* sklearn.preprocessing 모듈의 함수 사용
    * scale(x) : 평균과 표준편차를 사용해 표준화(평균 0, 표준편차 1), 많이 씀
    * robust_scale(x): 중위수와 사분위범위를 사용하여 표준화(이상치의 영향을 덜 받음)
    * minmax_scale(x): 최댓값과 최솟값을 사용하여 표준화, 0~1, 많이 씀
    * maxabs_scale(x): 최대 절댓값을 사용하여 표준화. -1~1

In [1]:
# scale(): standardizes each feature so that its mean is 0 and its standard deviation is 1.

import seaborn as sns
iris = sns.load_dataset('iris')  # load the iris sample dataset via seaborn
iris  # notebook display of the loaded frame

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [2]:
x = iris.iloc[:,:-1]  # all rows, every column except the last one (the species label)
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [6]:
from sklearn.preprocessing import scale

# Standardize every feature column of x, then inspect the per-column means.
x_scaled = scale(x)
x_scaled.mean(axis=0)  # means come out at floating-point noise level (~1e-16), i.e. effectively 0

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [8]:
# Print each column mean of the standardized data with 9 decimal places,
# which makes it obvious that the tiny e-16 values are effectively zero.
column_means = x_scaled.mean(axis=0)
for column_mean in column_means:
    print(f'{column_mean:10.9f}')

-0.000000000
-0.000000000
-0.000000000
-0.000000000


In [9]:
from sklearn.preprocessing import robust_scale  # scaling based on the median and interquartile range

iris_robust_scaled = robust_scale(x)
iris_robust_scaled[:5,:]  # first five scaled rows

array([[-0.53846154,  1.        , -0.84285714, -0.73333333],
       [-0.69230769,  0.        , -0.84285714, -0.73333333],
       [-0.84615385,  0.4       , -0.87142857, -0.73333333],
       [-0.92307692,  0.2       , -0.81428571, -0.73333333],
       [-0.61538462,  1.2       , -0.84285714, -0.73333333]])

In [11]:
from sklearn.preprocessing import minmax_scale  # scaling based on each column's min and max (0 to 1)

iris_minmax_scaled = minmax_scale(x)
iris_minmax_scaled[:5,:]  # first five scaled rows

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

In [13]:
from sklearn.preprocessing import maxabs_scale  # scaling based on each column's maximum absolute value (-1 to 1)

iris_maxabs_scaled = maxabs_scale(x)
iris_maxabs_scaled[:5,:]  # first five scaled rows

array([[0.64556962, 0.79545455, 0.20289855, 0.08      ],
       [0.62025316, 0.68181818, 0.20289855, 0.08      ],
       [0.59493671, 0.72727273, 0.1884058 , 0.08      ],
       [0.58227848, 0.70454545, 0.2173913 , 0.08      ],
       [0.63291139, 0.81818182, 0.20289855, 0.08      ]])

## 스케일과 스케일 백
* 앞에서 본 표준화 함수들을 이용해서 표준화를 할 수도 있지만 StandardScaler, MinMaxScaler, MaxAbsScaler 클래스를 이용해서도 표준화를 할 수 있다. (robust_scale()에 대응하는 RobustScaler 클래스도 있다.)
* 이들 클래스를 이용하면 표준화한 값을 inverse_transform()으로 원래의 값 범위로 되돌릴 수 있다.(스케일 백 가능)
* 이들 클래스의 표준화 방법은 각각 scale(), minmax_scale(), maxabs_scale() 함수와 같다.

In [18]:
# Standardize using a StandardScaler object (rather than the scale() function).

import seaborn as sns

iris = sns.load_dataset('iris')
iris.iloc[:5,:-1]  # first five rows of the feature columns (label column excluded)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [15]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# fit_transform learns the scaling parameters from the feature columns and applies them in one call.
# The fitted `sc` object is kept so the scaling can be reversed with inverse_transform.
iris_scaled = sc.fit_transform(iris.iloc[:,:-1])
iris_scaled[:5,:]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

In [22]:
# Scale back: map the standardized values to the original measurement scale.
iris_origin = sc.inverse_transform(iris_scaled)
iris_origin[:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [24]:
from sklearn.preprocessing import MinMaxScaler

ms = MinMaxScaler()
# Same result as minmax_scale(), but the fitted `ms` object can also undo the scaling.
iris_minmax_scaled = ms.fit_transform(iris.iloc[:,:-1])
iris_minmax_scaled[:5,:]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

In [27]:
# Scale back: recover the original values from the min-max scaled data.
iris_minmax_origin = ms.inverse_transform(iris_minmax_scaled)
iris_minmax_origin[:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

## 레이블 인코딩
* 실제 값에 상관없이 0~k-1 까지의 정수로 변환하는 것 (k는 서로 다른 값(클래스)의 개수)
* 문자로 되어 있는 범주형 데이터는 모델에 넣기 전에 숫자로 변환해야 하며, 이때 sklearn.preprocessing 모듈의 LabelEncoder 클래스를 이용한다.

In [4]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

iris = sns.load_dataset('iris')
y = iris.species  # string class labels

# Learn the distinct labels and convert them to integer codes in a single step;
# `le` stays fitted so that le.classes_ can be inspected afterwards.
le = LabelEncoder()
species = le.fit_transform(y)
species

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
le.classes_  # the original labels the encoder learned; position i is the label encoded as integer i

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [26]:
import pandas as pd
# Load the Titanic training set (the path is relative to the notebook's working directory).
df = pd.read_csv('../DataAnalysis/titanic/train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [27]:
y = df.Embarked
y1 = df.Sex
le = LabelEncoder()  # encoder for Embarked
le1 = LabelEncoder()  # separate encoder for Sex, so each column keeps its own label mapping
le.fit(y)  # learn the distinct values present in Embarked
le1.fit(y1)  # learn the distinct values present in Sex
Embarked = le.transform(y)  # integer codes for Embarked
Sex = le1.transform(y1)  # integer codes for Sex

# NOTE(review): Embarked has missing values and they are not filled before encoding,
# so the encoder ends up treating NaN as a class of its own.
Embarked

array([2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2,
       1, 2, 2, 2, 0, 2, 1, 2, 0, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0,
       1, 2, 1, 1, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 3, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 0, 2, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 0, 1, 2, 0, 2, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2,
       2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2,
       1, 2, 2, 2, 2, 0, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1,
       2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 0,
       2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 1,

In [28]:
# Labels learned from Embarked, in the order of their integer codes.
le.classes_

array(['C', 'Q', 'S', nan], dtype=object)

In [29]:
# Labels learned from Sex, in the order of their integer codes.
le1.classes_

array(['female', 'male'], dtype=object)

In [34]:
# Overwrite the original string columns with their integer-encoded versions.
df['Sex'] = Sex
df['Embarked'] = Embarked
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,2
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,2
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,0


## OneHotEncoder(): 사이킷런
## get_dummies(): 판다스
## to_categorical(): 케라스

In [47]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects 2-D input of shape (n_samples, n_features).
# reshape(-1, 1) turns the 1-D array of codes into a 2-D single-column array
# (1-D -> 2-D, not the other way round).
sex_column = Sex.reshape(-1, 1)

enc = OneHotEncoder()
sex_onehot = enc.fit_transform(sex_column)  # learn the categories and encode in one call
sex_onehot.toarray()  # convert the encoder's output to a dense array for display

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [50]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects 2-D input of shape (n_samples, n_features).
# reshape(-1, 1) turns the 1-D array of codes into a 2-D single-column array
# (1-D -> 2-D, not the other way round).
embarked_column = Embarked.reshape(-1, 1)

enc = OneHotEncoder()
embarked_onehot = enc.fit_transform(embarked_column)  # learn the categories and encode in one call
embarked_onehot.toarray()  # convert the encoder's output to a dense array for display

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]])

In [60]:
from sklearn.preprocessing import OneHotEncoder

# One encoder per column; selecting with double brackets (df[['Sex']]) yields a
# single-column DataFrame, which is already 2-D, so no reshape is needed here.
enc, enc1 = OneHotEncoder(), OneHotEncoder()

sex_onehot = enc.fit_transform(df[['Sex']])
embarked_onehot = enc1.fit_transform(df[['Embarked']])

# Wrap the dense one-hot arrays in DataFrames with readable column names.
# NOTE(review): the column names are hard-coded and assume the category order
# matches the label order shown by the LabelEncoders above.
sex_df = pd.DataFrame(sex_onehot.toarray(), columns=['Female', 'Male'])
embarked_df = pd.DataFrame(embarked_onehot.toarray(), columns=['C','Q','S','NaN'])

sex_df

Unnamed: 0,Female,Male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
...,...,...
886,0.0,1.0
887,1.0,0.0
888,1.0,0.0
889,0.0,1.0


In [56]:
# Display the one-hot encoded Embarked columns built above.
embarked_df

Unnamed: 0,C,Q,S,NaN
0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
886,0.0,0.0,1.0,0.0
887,0.0,0.0,1.0,0.0
888,0.0,0.0,1.0,0.0
889,1.0,0.0,0.0,0.0


## 결측값 처리
* 데이터에 누락된 정보(결측값)가 있으면 이 값을 다른 값으로 채워야 한다.
* 결측값 처리는 sklearn.impute 모듈의 SimpleImputer 클래스를 이용하며, 데이터의 평균, 중앙값 또는 최빈값 중 하나로 채울 수 있다.
* sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean', copy=True) (예전 버전에 있던 verbose 인자는 최신 scikit-learn에서 제거됨)
* strategy : 결측값을 채울 방법을 지정하며 mean, median, most_frequent, constant가 있다.

In [73]:
import seaborn as sns
import random

iris = sns.load_dataset('iris')

# Work on an independent copy of the feature columns so that injecting NaNs
# neither writes through to `iris` nor triggers pandas' SettingWithCopy warning.
x = iris.iloc[:, :-1].copy()

# Randomly create missing values: 10 distinct rows in every feature column.
for col in range(x.shape[1]):
    nan_rows = random.sample(range(len(x)), 10)
    # Pass a flat list of row positions; wrapping it in another list would
    # produce a 2-D indexer, which is not a documented iloc input.
    x.iloc[nan_rows, col] = float('nan')
x.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,,0.2
3,4.6,,1.5,
4,5.0,3.6,1.4,0.2


In [74]:
# Per-column means; pandas skips NaN by default, so these are the means of the observed values.
x.mean(axis=0)

sepal_length    5.860714
sepal_width     3.043571
petal_length    3.822857
petal_width     1.211429
dtype: float64

In [84]:
from sklearn.impute import SimpleImputer
import numpy as np

# Replace every NaN with the mean of its column.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_all = imp_mean.fit_transform(x)

# Keep only the first five rows for display and restore the column labels,
# since the imputer output is a plain array without labels.
imp_df = imputed_all[:5]
pd.DataFrame(imp_df, columns=['sepal_length','sepal_width','petal_length','petal_width'])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.860714,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,3.822857,0.2
3,4.6,3.043571,1.5,1.211429
4,5.0,3.6,1.4,0.2


In [85]:
# Show the Titanic frame again (Sex and Embarked were overwritten with integer codes earlier).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,2
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,2
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,0


In [121]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Reload the raw Titanic data so that imputation starts from the original,
# un-encoded columns.
df = pd.read_csv('../DataAnalysis/titanic/train.csv')

# SimpleImputer works on 2-D input, hence the double brackets
# (single-column DataFrames rather than Series).
x = df[['Age']]
x1 = df[['Embarked']]

# Numeric column -> fill with the median; categorical column -> fill with the mode.
imp_agemedian = SimpleImputer(missing_values=np.nan, strategy='median')
imp_embarkedmf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp_df = imp_agemedian.fit_transform(x)
imp_df1 = imp_embarkedmf.fit_transform(x1)

# fit_transform returns arrays of shape (n_samples, 1); flatten them to 1-D
# so that each is assigned to its column unambiguously.
df['Age'] = imp_df.ravel()
df['Embarked'] = imp_df1.ravel()

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
