# preprocessing (전처리)

- Data Cleansing (정제)
- Feature Engineering : 속성에 대한 생성/변환
- Data Encoding : 텍스트 데이터(범주형 데이터)를 숫자로 변환
- Data Scaling : 숫자값 정규화
- Outlier : 이상치 처리

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Data Encoding

##### Label Ecoder
- 범주형 데이터에 대해 적절한 숫자로 변환

In [37]:
from sklearn.preprocessing import LabelEncoder

# 범주형 데이터
candies = ['캬라멜', '커피사탕', '땅콩캬라멜', '아몬드사탕', '페레로로쉐', '커피사탕', '아몬드사탕']

lb_enc = LabelEncoder()

lb_enc.fit(candies)
encoded_candies = lb_enc.transform(candies)
encoded_candies

array([2, 3, 0, 1, 4, 3, 1])

In [38]:
lb_enc.classes_

array(['땅콩캬라멜', '아몬드사탕', '캬라멜', '커피사탕', '페레로로쉐'], dtype='<U5')

##### One-hot Encoder
- 주어진 데이터를 희소배열로 변환 (One-vs-Resr 배열)
- 희소배열이란? 대부분이 0이고 특정 인덱스만 값을 가지고 있는 배열

In [39]:
from sklearn.preprocessing import OneHotEncoder

candies_2d = np.array(candies).reshape(-1, 1)

oh_enc = OneHotEncoder()

# 인코딩을 하기 위해서는 2차원이여야됨
oh_enc.fit(candies_2d)  # 중복 제거 및 오름차순 정렬 후, 해당 인덱스에만 1을 준 희소행렬
encoded_candies = oh_enc.transform(candies_2d)
encoded_candies

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7 stored elements and shape (7, 5)>

In [40]:
print(encoded_candies)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7 stored elements and shape (7, 5)>
  Coords	Values
  (0, 2)	1.0
  (1, 3)	1.0
  (2, 0)	1.0
  (3, 1)	1.0
  (4, 4)	1.0
  (5, 3)	1.0
  (6, 1)	1.0


In [41]:
print(encoded_candies.toarray())

[[0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]]


In [42]:
oh_enc.categories_

[array(['땅콩캬라멜', '아몬드사탕', '캬라멜', '커피사탕', '페레로로쉐'], dtype='<U5')]

In [43]:
# DataFrame에서 One-hot encoding 하기
candies_df = pd.DataFrame({
    'Candy' : ['캬라멜', '커피사탕', '땅콩캬라멜', '아몬드사탕', '페레로로쉐', '커피사탕', '아몬드사탕']
})
# candies_df

df_dummies = pd.get_dummies(candies_df)

# dataframe => ndarray 변환
# df_dummies.to_numpy()
np.array(df_dummies)

array([[False, False,  True, False, False],
       [False, False, False,  True, False],
       [ True, False, False, False, False],
       [False,  True, False, False, False],
       [False, False, False, False,  True],
       [False, False, False,  True, False],
       [False,  True, False, False, False]])

### Data Scaling

In [44]:
from sklearn.datasets import load_iris

iris_ds = load_iris()
iris_features = iris_ds.data

##### Standard Scaler (표준정규화, Z-변환)
- 평균이 0, 표준편차가 1인 값으로 데이터 스케일을 변환
- 데이터가 정규분포인 경우 더욱 적합
- 이상치에 덜 민감 (선형회귀, 로지스틱 회귀 등의 알고리즘에 적합)

In [45]:
from sklearn.preprocessing import StandardScaler

standard_sc = StandardScaler()
standard_sc.fit(iris_features)
standard_sc.transform(iris_features)

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

##### MinMax Scaler (최소최대정규화, 0-1변환)

- 0~1 사이의 값으로 데이터 스케일을 변환 (최솟값 == 0, 최댓값 1)
- 거리 기반 모델에 적합 (KNN, SVM 등)
- 이상치에 민감하게 반응 (이상치가 있는 경우 데이터 왜곡 가능성 O)

In [51]:
from sklearn.preprocessing import MinMaxScaler

minmax_sc = MinMaxScaler()
minmax_sc.fit(iris_features)
minmax_sc.transform(iris_features)

# minmax_sc.fit_transform([[20], [30], [40]])
# (값 - 최소값) / (최대값 - 최소값)

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

### [한번 해보기] 타이타닉 생존율 예측에 필요한 전처리 해보기

##### 1. 데이터 로드

In [50]:
df = pd.read_csv('./data/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


##### 2. 전처리 (-> 전처리 함수)

In [109]:
from sklearn.model_selection import train_test_split

def fillna(df):
    '''
    결측치 처리
    - Age: 평균치로 대체
    - Cabin: 기본값 'N'으로 대체
    - Embarked: 기본값 'N'으로 대체
    '''
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    return df


def drop_feature(df):
    '''
    모델 훈련과 관련 없는 속성 제거
    - PassengerId, Name, Ticket
    '''
    df = df.drop('PassengerId', axis=1)
    df = df.drop('Name', axis=1)
    df = df.drop('Ticket', axis=1)
    return df


def encode_feature(df):
    '''
    범주형 데이터를 "숫자"로 인코딩
    - Sex, Cabin, Embarked
    - [tip] Cabin은 각각 다른 문자열 데이터이므로 앞 글자만 가져와서 범주형으로 치환
    '''
    lb_enc = LabelEncoder()
    
    df['Cabin'] = df['Cabin'].str[:1]
    # 인코딩을 하기 위해서는 2차원이여야됨
    lb_enc.fit(df['Sex'])  # 중복 제거 및 오름차순 정렬 후, 해당 인덱스에만 1을 준 희소행렬
    df['Sex'] = lb_enc.transform(df['Sex'])
    
    lb_enc.fit(df['Embarked'])  # 중복 제거 및 오름차순 정렬 후, 해당 인덱스에만 1을 준 희소행렬
    df['Embarked'] = lb_enc.transform(df['Embarked'])
    return df


def preprocess_data(df):
    '''
    전처리 함수 모두 호출
    '''
    df = drop_feature(df)
    df = fillna(df)
    df = encode_feature(df)
    return df


def scaling_feature(train_data, test_data):
    '''
    특성 스케일링 (정규화)
    '''
    mean = np.mean(train_data, axis=0)
    std = np.std(train_data, axis=0)
    train_scaled = (train_data - mean) / std
    
    mean = np.mean(test_data, axis=0)
    std = np.std(test_data, axis=0)
    test_scaled = (test_data - mean) / std
    return train_scaled, test_scaled

In [53]:
# 전처리 함수 호출
preprocess_data(df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.000000,1,0,7.2500,N,3
1,1,1,0,38.000000,1,0,71.2833,C,0
2,1,3,0,26.000000,0,0,7.9250,N,3
3,1,1,0,35.000000,1,0,53.1000,C,3
4,0,3,1,35.000000,0,0,8.0500,N,3
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,N,3
887,1,1,0,19.000000,0,0,30.0000,B,3
888,0,3,0,29.699118,1,2,23.4500,N,3
889,1,1,1,26.000000,0,0,30.0000,C,0


##### 3. 데이터 분리

In [None]:
X_data = df.drop('Survived', axis=1)
y_data = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42)

array([[299, 1, 'Saalfeld, Mr. Adolphe', ..., 30.5, 'C106', 'S'],
       [885, 3, 'Sutehall, Mr. Henry Jr', ..., 7.05, nan, 'S'],
       [248, 2, 'Hamalainen, Mrs. William (Anna)', ..., 14.5, nan, 'S'],
       ...,
       [861, 3, 'Hansen, Mr. Claus Peter', ..., 14.1083, nan, 'S'],
       [436, 1, 'Carter, Miss. Lucile Polk', ..., 120.0, 'B96 B98', 'S'],
       [103, 1, 'White, Mr. Richard Frasar', ..., 77.2875, 'D26', 'S']],
      shape=(668, 11), dtype=object)

##### 4. 특성 스케일링

In [110]:
# 스케일링 함수 호출
train_scaled, test_scaled = scaling_feature(X_train.to_numpy(), y_train.to_numpy())

TypeError: can only concatenate str (not "float") to str

##### 5. LogisticRegression 모델 학습

In [None]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(train_scaled, test_scaled)

ValueError: could not convert string to float: 'Saalfeld, Mr. Adolphe'

##### 6. 평가