# Data Preprocessing

준비한 데이터를 알고리즘을 통해 학습시키기 위해 적합한 형태로 가공합니다.

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw dataset from data_preprocessing.csv (decoded as UTF-8),
# then show its dimensions as (rows, columns).
df = pd.read_csv(
    "data_preprocessing.csv",
    encoding='UTF-8',
)
df.shape

(10, 4)

In [5]:
# Preview the first rows of the DataFrame with df.head()
df.head()


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [6]:
# Summarize each column's dtype and non-null count (reveals the missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [9]:
# Split the DataFrame positionally into features and label with .iloc.
#   X: every column except the last (Country, Age, Salary)
#   y: the last column (Purchased)
# Slicing relative to the last column avoids hard-coding the column count.
# .to_numpy() returns the selection as a NumPy array; the pandas docs
# recommend it over the older .values attribute.

X = df.iloc[:, :-1].to_numpy()

y = df.iloc[:, -1].to_numpy()


In [19]:
# Inspect the feature array X
X


array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
# Inspect the label array y
y


array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Missing Value

수집되지 못한 값을 적절하게 추정하여 사용합니다.

In [22]:
# Mean imputation of missing values with sklearn's SimpleImputer:
#   1. build an imputer that replaces np.nan with the column mean
#   2. fit it on the numeric columns X[:, 1:3] (Age, Salary)
# The fitted imputer is applied afterwards with imputer.transform.

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
imputer.fit(X[:, 1:3])



SimpleImputer()

In [23]:
# Apply the fitted imputer to columns 1-2 of X and write the result back in place
X[:,1:3] = imputer.transform(X[:,1:3])

In [24]:
# Inspect X after imputation
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Categorical Value

### Encoding Independent Variable

One Hot Encoding: 원-핫 인코딩은 단어 집합의 크기를 벡터의 차원으로 하고, 표현하고 싶은 단어의 인덱스에 1의 값을 부여하고, 다른 인덱스에는 0을 부여하는 단어의 벡터 표현 방식입니다. 이렇게 표현된 벡터를 원-핫 벡터(One-Hot vector)라고 합니다.

Reference. https://wikidocs.net/22647

In [35]:
# One-hot encode the categorical column of X with a ColumnTransformer.
# `transformers` is a list of (name, transformer, columns) tuples specifying
# the transformer objects to apply to subsets of the data; here OneHotEncoder()
# is applied to column 0 under the name 'encoder'.
# remainder='passthrough' keeps the remaining columns instead of dropping them.
# The output of ct.fit_transform(X) is assigned back to X.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[('encoder', OneHotEncoder(), [0])],
)

X = ct.fit_transform(X)



In [36]:
# Inspect X after one-hot encoding

X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [27]:
# Alternative using pandas: pd.get_dummies

# pd.get_dummies(df.iloc[:,:3]) would apply it to the first three columns
# (by default only the non-numeric columns are encoded)


pd.get_dummies(df['Country'])  # one-hot encode only the values of the Country column

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


### Encoding Dependent Variable

In [38]:
# Inspect y
# NOTE(review): the recorded output below is already integer-coded although the
# LabelEncoder cell comes after this one — presumably this cell was re-run after
# encoding; confirm by re-executing the notebook top to bottom.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [39]:
# Encode the target labels as integers with sklearn's LabelEncoder.
# Create the encoder as `le`.
# Overwrite y with the result of le.fit_transform(y).

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)



In [40]:
# Inspect y after label encoding

y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

# Split Dataset into Training set and Test set

In [42]:
# Hold out part of the data for evaluation with train_test_split.
# test_size=0.2 reserves 20% of the samples for the test set;
# random_state=1 fixes the shuffle so the split is reproducible.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)


In [49]:
# Check the shape of X_train

X_train.shape

(8, 5)

## Numerical Value

### Normalization(MinMaxScaler)

정규화

This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one.  

`X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))`  
`X_scaled = X_std * (max - min) + min`  
`where min, max = feature_range`  


This transformation is often used as an alternative to zero mean, unit variance scaling.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html?highlight=minmax#sklearn.preprocessing.MinMaxScaler

In [50]:
# Min-max scale the numeric columns (index 3 onward) with sklearn's MinMaxScaler.
# Create the scaler as `mms`.
# Fit it on X_train[:, 3:] with .fit_transform() and write the scaled values back in place.
# Reuse the fitted mms on X_test[:, 3:] with .transform() (not fit_transform), writing back
# in place, so the test set is scaled with the statistics learned from the training set.

from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_train[:,3:] = mms.fit_transform(X_train[:,3:])
X_test[:,3:] = mms.transform(X_test[:,3:])



In [51]:
# Inspect the scaled X_train
X_train


array([[0.0, 0.0, 1.0, 0.5120772946859904, 0.11428571428571432],
       [0.0, 1.0, 0.0, 0.5652173913043479, 0.45079365079365075],
       [1.0, 0.0, 0.0, 0.7391304347826089, 0.6857142857142855],
       [0.0, 0.0, 1.0, 0.4782608695652175, 0.37142857142857144],
       [0.0, 0.0, 1.0, 0.0, 0.0],
       [1.0, 0.0, 0.0, 0.9130434782608696, 0.8857142857142857],
       [0.0, 1.0, 0.0, 1.0, 1.0],
       [1.0, 0.0, 0.0, 0.34782608695652173, 0.2857142857142856]],
      dtype=object)

### Standardization

표준화  

Standardize features by removing the mean and scaling to unit variance.  

`z = (x - u) / s`  
`where u is the mean of the training samples or zero if with_mean=False, and`  
`s is the standard deviation of the training samples or one if with_std=False.`  

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html?highlight=scaler#sklearn.preprocessing.StandardScaler

In [52]:
# Standardize the numeric columns (index 3 onward) with sklearn's StandardScaler.
# Create the scaler as `ss`.
# Fit it on X_train[:, 3:] with .fit_transform() and write the scaled values back in place.
# Reuse the fitted ss on X_test[:, 3:] with .transform() (not fit_transform), writing back
# in place, so the test set is scaled with the statistics learned from the training set.
# NOTE(review): if the MinMaxScaler cell was run first, X_train/X_test already hold
# min-max-scaled values here. For non-constant features the resulting z-scores equal
# those of the raw values (min-max scaling is a positive affine map), but ss.mean_ and
# ss.scale_ then describe the min-max-scaled data, not the raw Age/Salary.

from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_train[:,3:] = ss.fit_transform(X_train[:,3:])
X_test[:,3:] = ss.transform(X_test[:,3:])




In [53]:
# Inspect the scaled X_train
X_train


array([[0.0, 0.0, 1.0, -0.19159184384578537, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057581, -0.07013167641635436],
       [1.0, 0.0, 0.0, 0.5667085065333245, 0.6335624327104541],
       [0.0, 0.0, 1.0, -0.3045301939022482, -0.3078661727429788],
       [0.0, 0.0, 1.0, -1.9018011447007983, -1.4204636155515822],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.2326533634535486],
       [0.0, 1.0, 0.0, 1.4379472069688963, 1.5749910381638883],
       [1.0, 0.0, 0.0, -0.740149544120035, -0.5646194287757338]],
      dtype=object)

# Quiz

sklearn.preprocessing.StandardScaler가 사용하는 표준화 방식으로 계산한 z값과 직접 계산한 z값을 비교해 보세요.