# preprocessing 전처리

- Data cleansing
- Data Encoding 텍스트 데이터를 숫자로 변환(범주형 데이터)
- Data Scaling 숫자값 정규화
- Outlier 제거
- Feature Engineering
    - 속성 생성/수정/가공

## Data Encoding

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Label Encoder
- 범주형 데이터에 대해 적절히 숫자로 변환하는 것

In [5]:
from sklearn.preprocessing import LabelEncoder

items = ['TV', '믹서기', '냉장고', '선풍기', '전자렌지', '컴퓨터', '선풍기', '믹서기']
encoder = LabelEncoder() # fit and transform are the typical preprocessing calls

# fit() learns the distinct labels; transform() maps each item to its integer code
encoder.fit(items)
encoded_items = encoder.transform(items)

encoded_items # look closely: the codes follow the alphabetical (sorted) order of the labels


array([0, 2, 1, 3, 4, 5, 3, 2])

In [6]:
# classes_ holds the labels with duplicates removed, in sorted order
encoder.classes_

array(['TV', '냉장고', '믹서기', '선풍기', '전자렌지', '컴퓨터'], dtype='<U4')

### One-hot Encoder

주어진 데이터를 희소배열형태로 변환. (One-Vs-Rest 배열)
희소배열이란 대부분이 0이고, 특정인덱스만 값을 가지고 있는 배열

In [9]:
from sklearn.preprocessing import OneHotEncoder
items = ['TV', '믹서기', '냉장고', '선풍기', '전자렌지', '컴퓨터', '선풍기', '믹서기']

# Convert to a 2-D shape: one row per item, a single column
items = np.array(items).reshape(-1,1)
print(items)

# One-hot encoding: fit learns the categories, transform builds the indicator matrix
encoder = OneHotEncoder()
encoder.fit(items)
oh_labels = encoder.transform(items)
oh_labels

[['TV']
 ['믹서기']
 ['냉장고']
 ['선풍기']
 ['전자렌지']
 ['컴퓨터']
 ['선풍기']
 ['믹서기']]


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (8, 6)>

In [10]:
# Printing the sparse matrix lists only the stored (row, column) coordinates and their values
print(oh_labels)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (8, 6)>
  Coords	Values
  (0, 0)	1.0
  (1, 2)	1.0
  (2, 1)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 3)	1.0
  (7, 2)	1.0


In [12]:
# toarray() expands the sparse matrix into a dense 2-D array
print(oh_labels.toarray())

[[1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [13]:
# categories_ is a list of arrays of the learned categories (a single array here, in sorted order)
encoder.categories_

[array(['TV', '냉장고', '믹서기', '선풍기', '전자렌지', '컴퓨터'], dtype='<U4')]

### DataFrame에서 One-hot Encoding하기

In [15]:
# Single-column frame of appliance names, used as input for the get_dummies examples.
df = pd.DataFrame(
    data=['TV', '믹서기', '냉장고', '선풍기', '전자렌지', '컴퓨터', '선풍기', '믹서기'],
    columns=['items'],
)
df

Unnamed: 0,items
0,TV
1,믹서기
2,냉장고
3,선풍기
4,전자렌지
5,컴퓨터
6,선풍기
7,믹서기


In [16]:
pd.get_dummies(df, dtype=int) # dtype=int gives 0/1 columns; by default the values are True/False

Unnamed: 0,items_TV,items_냉장고,items_믹서기,items_선풍기,items_전자렌지,items_컴퓨터
0,1,0,0,0,0,0
1,0,0,1,0,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,0,0,1,0
5,0,0,0,0,0,1
6,0,0,0,1,0,0
7,0,0,1,0,0,0


In [17]:
# Without dtype, the indicator columns hold True/False
pd.get_dummies(df)

Unnamed: 0,items_TV,items_냉장고,items_믹서기,items_선풍기,items_전자렌지,items_컴퓨터
0,True,False,False,False,False,False
1,False,False,True,False,False,False
2,False,True,False,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,True,False
5,False,False,False,False,False,True
6,False,False,False,True,False,False
7,False,False,True,False,False,False


In [22]:
# Convert the DataFrame to an ndarray; both forms print the same array
print(pd.get_dummies(df, dtype=int).to_numpy())
print(np.array(pd.get_dummies(df, dtype=int)))

[[1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]
 [0 0 0 1 0 0]
 [0 0 1 0 0 0]]
[[1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]
 [0 0 0 1 0 0]
 [0 0 1 0 0 0]]


## Feature Scaling
- 표준정규화 StandardScaler
    - 평균 0, 표준편차 1인 값으로 변환
    - MinMaxScaler에 비해 이상치에 덜 민감하고, 선형 회귀, 로지스틱 회귀 등의 알고리즘에 적합하다.


- 최소최대정규화 MinMaxScaler
    - 0 ~ 1 사이의 값으로 변환
    - SVM, KNN과 같은 거리 기반 모델에 적합하다.
    - 이상치에 민감하게 반응한다. 이상치가 있는 경우 데이터가 왜곡될 수 있다.


In [25]:
from sklearn.datasets import load_iris

# Load the iris dataset and look at its feature names and raw feature matrix
iris = load_iris()
print(iris.feature_names)
# NOTE(review): not the last expression of the cell, so this result is not displayed
iris.keys()
iris.data


['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [29]:
# StandardScaler approach
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()
standard_scaler.fit(iris.data) # computes the mean and standard deviation

standard_scaler.transform(iris.data) # (value - mean) / standard deviation

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [32]:
# MinMaxScaler: (value - min) / (max - min)

from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
minmax_scaler.fit(iris.data)
minmax_scaler.transform(iris.data)




array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [34]:
# Simple example: 20 -> 0.0, 30 -> 0.5, 40 -> 1.0
# A fresh scaler is used so the iris-fitted `minmax_scaler` is not refit on this toy data.
MinMaxScaler().fit_transform([[20],[30],[40]])

array([[0. ],
       [0.5],
       [1. ]])

In [None]:
# Scaling must be applied in the same way to the train data and the test data.
# - fit(): train data
# - transform(): train data and test data

# Split into train/test data
from sklearn.model_selection import train_test_split
train_input, test_input, train_label, test_label = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

In [None]:
# Apply standardization: fit on the training split, then transform both splits
standard_scaler = StandardScaler()
standard_scaler.fit(train_input) # mean / standard deviation taken from the training data

# Keep both scaled arrays in variables so later cells (e.g. model training) can use them
train_standard = standard_scaler.transform(train_input) # scaled training data
test_standard = standard_scaler.transform(test_input)   # scaled test data
test_standard # last expression, shown as the cell output

In [None]:
# Min-max normalization: fit on the training split, then transform both splits
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(train_input)

# Keep both scaled arrays in variables so later cells (e.g. model training) can use them
train_minmax = minmax_scaler.transform(train_input)
test_minmax = minmax_scaler.transform(test_input)
test_minmax # last expression, shown as the cell output

## 타이타닉 생존율 예측에 필요한 전처리

In [75]:
# Load the Titanic data from a CSV file (relative path) and display it
df = pd.read_csv('./data/titanic.csv')
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [85]:
from sklearn.preprocessing import LabelEncoder

def fillna(df):
    '''
    Fill the missing values of the Age, Cabin and Embarked columns.

    - Age: replaced with the mean of the column
    - Cabin: replaced with the placeholder 'N'
    - Embarked: replaced with the placeholder 'N'

    The work is done on a copy, so the DataFrame passed in is not modified.
    Returns the filled copy.
    '''
    df = df.copy()  # do not write into the caller's DataFrame
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    return df

def drop_features(df):
    '''
    Remove the attributes that are irrelevant to model training.

    Returns the result of DataFrame.drop without the PassengerId, Name and Ticket columns.
    '''
    unused = ['PassengerId', 'Name', 'Ticket']
    return df.drop(columns=unused)

def encode_features(df):
    '''
    Encode the categorical columns as integers.

    - Cabin is first reduced to its first character.
    - Sex, Cabin and Embarked are then each label-encoded with their own LabelEncoder.

    The work is done on a copy, so the DataFrame passed in is not modified.
    Returns the encoded copy.
    '''
    df = df.copy()  # do not write into the caller's DataFrame
    df['Cabin'] = df['Cabin'].str[:1]  # keep only the character at index 0
    categories = ['Sex', 'Cabin', 'Embarked']

    for cate in categories:
        # A new encoder per column, fitted on that column's own values
        label_encoder = LabelEncoder()
        df[cate] = label_encoder.fit_transform(df[cate])
    return df

def preprocess_data(df):
    '''
    Run the whole preprocessing sequence and return the processed DataFrame.

    Steps: drop unused columns -> fill missing values -> encode categorical columns.

    Dropping comes first because drop_features returns the result of
    DataFrame.drop (a new DataFrame), so the column assignments made by the
    later steps do not modify the df passed in by the caller. The dropped
    columns are not used by the other steps, so the result is the same as
    filling before dropping.
    '''
    df = drop_features(df)
    df = fillna(df)
    df = encode_features(df)
    return df


In [86]:
# Count the missing values per column after dropping the unused columns
drop_features(df).isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [88]:
# Apply the three preprocessing steps in order: drop -> fill -> encode
temp = (
    df.pipe(drop_features)
      .pipe(fillna)
      .pipe(encode_features)
)
temp

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.000000,1,0,7.2500,7,3
1,1,1,0,38.000000,1,0,71.2833,2,0
2,1,3,0,26.000000,0,0,7.9250,7,3
3,1,1,0,35.000000,1,0,53.1000,2,3
4,0,3,1,35.000000,0,0,8.0500,7,3
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,7,3
887,1,1,0,19.000000,0,0,30.0000,1,3
888,0,3,0,29.699118,1,2,23.4500,7,3
889,1,1,1,26.000000,0,0,30.0000,2,0


In [92]:
# 훈련/테스트 데이터 분리
from sklearn.model_selection import train_test_split


In [90]:
# 특성 스케일링


In [91]:
# 모델 LogisticRegression 훈련

In [None]:
#