# Preprocessing (전처리)
- Data Cleansing
- Data Encoding: 텍스트 데이터 > 숫자로 변환 (범주형 데이터)
- Data Scaling: 숫자값 정규화
- Data Outlier: 이상치 처리
- Feature Engineering: 속성 생성/수정/가공

In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Encoding

### Label Encoder
- 범주형 데이터에 대해 적절히 숫자로 변환하는 것

In [95]:
from sklearn.preprocessing import LabelEncoder

# Sample categorical values; some of them appear more than once.
items = ['TV','냉장고','세탁기','PC', '전기난로','PC', 'TV', '믹서기', 'PC']

encoder = LabelEncoder()
# fit() removes duplicates and sorts the values in ascending order; each
# value's position (index) in that sorted list becomes its integer code.
encoder.fit(items)
encoded_items = encoder.transform(items)
encoded_items

array([1, 2, 4, 0, 5, 0, 1, 3, 0])

In [96]:
encoder.classes_

array(['PC', 'TV', '냉장고', '믹서기', '세탁기', '전기난로'], dtype='<U4')

### One-hot Encoder
- 주어진 데이터를 희소배열(One-vs-Rest 형태의 배열)로 반환
- 희소배열이란 대부분 0이고 특정 인덱스만 값을 가지고 있는 배열

In [97]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder needs 2-D input, so reshape the flat list into one column.
items_2d = np.array(items).reshape(-1,1)

encoder = OneHotEncoder()
encoder.fit(items_2d)
oh_items = encoder.transform(items_2d)
# The result is a sparse matrix: printing it lists only the non-zero entries.
print(oh_items)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (9, 6)>
  Coords	Values
  (0, 1)	1.0
  (1, 2)	1.0
  (2, 4)	1.0
  (3, 0)	1.0
  (4, 5)	1.0
  (5, 0)	1.0
  (6, 1)	1.0
  (7, 3)	1.0
  (8, 0)	1.0


In [98]:
print(oh_items.toarray())

[[0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0.]]


In [99]:
encoder.categories_

[array(['PC', 'TV', '냉장고', '믹서기', '세탁기', '전기난로'], dtype='<U4')]

- DataFrame에서 One-hot encoding

In [100]:
# Same sample values as a single-column DataFrame, used for pandas one-hot encoding.
df = pd.DataFrame({
    'items': ['TV','냉장고','세탁기','PC', '전기난로','PC', 'TV', '믹서기', 'PC']
})
df

Unnamed: 0,items
0,TV
1,냉장고
2,세탁기
3,PC
4,전기난로
5,PC
6,TV
7,믹서기
8,PC


In [101]:
# One-hot encode with pandas; dtype=int gives integer 0/1 columns.
df_dummies = pd.get_dummies(df, dtype=int)
df_dummies

# Two ways to convert the encoded frame to a NumPy array.
np.array(df_dummies) # or df_dummies.to_numpy()
df_dummies.to_numpy()

array([[0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0]])

### Data Scaling (Feature Scaling)
- Scaling 작업은 train 데이터, test 데이터에 동일하게 적용해야 함
    - fit(): train 데이터
    - transform(): train 데이터, test 데이터

In [102]:
from sklearn.datasets import load_iris

# Iris dataset; its .data array is the input for the scaling examples below.
iris_ds = load_iris()

#### 표준정규화 (Standard Scaler)
- 평균이 0, std가 1인 값으로 변환
- 데이터가 정규분포인 경우 더욱 적합
- 이상치에 덜 민감
- 선형회귀(Linear Regression) 및 로지스틱 회귀(Logistic Regression) 등의 알고리즘에 적합

In [103]:
from sklearn.preprocessing import StandardScaler

standard_sc = StandardScaler()
# fit() learns the scaling statistics from the data; transform() applies them.
standard_sc.fit(iris_ds.data)
standard_sc.transform(iris_ds.data)

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

#### 최소최대정규화 (MinMaxScaler)
- 0과 1 사이의 값으로 데이터 변환
- 이상치에 민감하게 반응, 이상치가 있는 경우 데이터 왜곡 가능성
- SVM 및 KNN과 같은 거리 기반 모델에 적합

In [104]:
from sklearn.preprocessing import MinMaxScaler

minmax_sc = MinMaxScaler()
# Example: minmax_sc.fit_transform([[20], [30], [40]]) -> [[0.], [0.5], [1.]]
# fit() learns the scaling range from the data; transform() applies it.
minmax_sc.fit(iris_ds.data)
minmax_sc.transform(iris_ds.data)

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

---

### 타이타닉 생존율 예측에 필요한 데이터 전처리 연습하기

In [None]:
# [ Preprocessing functions ]
def fillna(df):
    """
    Fill missing values in the Titanic columns, modifying ``df`` in place.

    - Age: replaced with the mean of the Age column
    - Cabin: replaced with the placeholder 'N'
    - Embarked: replaced with the placeholder 'N'

    Returns the same DataFrame object that was passed in.
    """
    # The Age mean is computed up front, before any column is touched.
    replacements = {
        'Age': df['Age'].mean(),
        'Cabin': 'N',
        'Embarked': 'N',
    }
    for column, value in replacements.items():
        df[column] = df[column].fillna(value)
    return df
    

def drop_feature(df):
    """
    Return a new DataFrame without the columns that are irrelevant to training.

    Dropped columns: PassengerId, Name, Ticket.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    return df.drop(columns=unused_columns)

def encode_feature(df):
    """
    Label-encode the categorical columns Sex, Cabin and Embarked in place.

    A separate LabelEncoder is fitted for each column, and the column is
    overwritten with the resulting integer codes.

    Returns the same DataFrame object that was passed in.
    """
    for column in ('Sex', 'Cabin', 'Embarked'):
        df[column] = LabelEncoder().fit_transform(df[column])
    return df
    
    

def preprocess_data(df):
    """
    Run the preprocessing steps in order and return the result:
    fill missing values, drop unused columns, then encode categorical columns.
    """
    return encode_feature(drop_feature(fillna(df)))

def scaling_feature(train_input, test_input):
    """
    Standardize the features with a StandardScaler.

    The scaler is fitted on ``train_input`` only; the same fitted scaler is
    then applied to both ``train_input`` and ``test_input``.

    Returns a ``(scaled_train, scaled_test)`` tuple.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_input)
    scaled_test = scaler.transform(test_input)
    return scaled_train, scaled_test
    

In [143]:
# Load the data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the Titanic CSV (path is relative to the notebook) and preview the first rows.
titanic_df = pd.read_csv('./data/titanic.csv')
display(titanic_df.head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [144]:
# Preprocessing: fill missing values, drop unused columns, encode categoricals
titanic_df = preprocess_data(titanic_df)
display(titanic_df.head())
print(len(titanic_df))


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,146,3
1,1,1,0,38.0,1,0,71.2833,81,0
2,1,3,0,26.0,0,0,7.925,146,3
3,1,1,0,35.0,1,0,53.1,55,3
4,0,3,1,35.0,0,0,8.05,146,3


891


In [145]:
# Split the data (inputs vs. label, then train vs. test)
titanic_features = titanic_df.loc[
    :, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
].to_numpy()
titanic_label = titanic_df['Survived'].to_numpy()

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    titanic_features,
    titanic_label,
    random_state=42,
)

print(titanic_features.shape, titanic_label.shape)

(891, 8) (891,)


In [163]:
# Feature scaling (StandardScaler): fitted on X_train, applied to both splits
X_scaled_train, X_scaled_test = scaling_feature(X_train, X_test)
print('Features type:', X_scaled_train.dtype, X_scaled_test.dtype)
print('Label Type:', y_train.dtype, y_test.dtype)

Features type: float64 float64
Label Type: int64 int64


In [None]:
# Train a LogisticRegression model on the scaled training features
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_scaled_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [165]:
# Evaluation: score the model on the training split and on the test split
print("Train Accuracy:", lr.score(X_scaled_train, y_train))
# This line scores the test split, so it is labelled as test accuracy.
print("Test Accuracy:", lr.score(X_scaled_test, y_test))

Train Accuracy: 0.8023952095808383
Train Accuracy: 0.8026905829596412
