# 범주형 데이터 다루기

## 5.1 순서가 없는 범주형 특성 인코딩하기

In [1]:
#원핫 인코딩
import numpy as np
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer

In [2]:
# Single-column feature matrix: one state name per observation (row).
feature = np.array(
    [[state] for state in ("Texas", "California", "Texas", "Delaware", "Texas")]
)

In [4]:
# Create the one-hot encoder and let it learn the classes present in `feature`.
one_hot = LabelBinarizer().fit(feature)

# One-hot encode `feature` with the fitted encoder.
one_hot.transform(feature)


array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [7]:
# Classes learned while fitting.  Per the outputs recorded in this notebook,
# column i of the encoded matrix corresponds to classes_[i].
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [8]:
# Reverse the one-hot encoding: encode `feature`, then map the indicator rows
# back to their original labels.
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [11]:
# One-hot encoding with pandas
import pandas as pd

# dtype=int pins the indicator columns to 0/1.  pandas >= 2.0 returns boolean
# (True/False) columns by default, which would not match a 0/1 table.
pd.get_dummies(feature[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [12]:
# scikit-learn can also handle observations that carry several classes each.
# Every tuple below lists the classes attached to one observation.
multiclass_feature=[('Texas','Florida'),
                   ('California','Alabama'),
                   ('Texas', 'Florida'),
                   # NOTE(review): 'Delware' looks like a typo for 'Delaware'; it is
                   # kept because the recorded outputs were produced with this spelling.
                   ('Delware','Florida'),
                   ('Texas','Alabama')]

In [15]:
# Multi-class one-hot encoding: create the encoder ...
one_hot_multiclass = MultiLabelBinarizer()

# ... then fit it on the class tuples and encode them in one step.  A row gets
# a 1 in every column whose class appears in that observation's tuple.
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [18]:
# Classes learned while fitting; they label the columns of the encoded matrix.
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [21]:
# OneHotEncoder handles several features at once and accepts both integer and string values

from sklearn.preprocessing import OneHotEncoder

In [31]:
# Small two-column sample (state name, code given as a string).
# NOTE(review): not referenced by any other cell visible in this notebook.
datalist = [["Texas",'1'],["California",'1']]

In [45]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Two categorical columns per observation: a state name and a numeric code.
# NumPy has no mixed str/int element type, so every entry ends up as a string.
feature = np.array([
    ("Texas", 1),
    ("California", 1),
    ("Texas", 3),
    ("Delaware", 1),
    ("Texas", 1),
])

In [47]:
# Ask for a dense array instead of a sparse matrix.  The keyword was renamed
# from `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was
# removed in 1.4, so try the new name first and fall back for old releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse=False)

# Fit on both columns of `feature` and encode them; each column contributes
# one indicator column per distinct value.
one_hot_encoder.fit_transform(feature)

array([[0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0.]])

## 5.2 순서가 있는 범주형 특성 인코딩하기

In [33]:
# One ordered categorical column, "Score", with two observations per level.
dataframe = pd.DataFrame(
    ["Low", "Low", "Medium", "Medium", "High", "High"],
    columns=["Score"],
)

In [34]:
# Mapping dictionary: each level is assigned its rank in the intended order
# (Low=1 < Medium=2 < High=3).
scale_mapper = {
    level: rank
    for rank, level in enumerate(("Low", "Medium", "High"), start=1)
}


In [35]:
# Translate each level into its rank.  Series.map does a per-value lookup and
# returns an integer Series directly.  Series.replace only reached int64 by
# implicitly downcasting the object column, which recent pandas deprecates.
# NOTE: a level missing from scale_mapper becomes NaN here instead of being
# left unchanged.
dataframe["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
5    3
Name: Score, dtype: int64

In [38]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
# Two columns per observation: an ordered level and a number.  Mixing str and
# int in one array makes NumPy store every entry as a string.
features = np.array([
    ("Low", 10),
    ("High", 50),
    ("Medium", 3),
])

In [40]:
# Ordinal encoder created with its default settings.
# NOTE(review): the name is misspelled ("oridinal"); it is kept because the
# following cells refer to it.
oridinal_encoder = OrdinalEncoder()

In [41]:
# Fit on both columns and replace every value by its integer code.
# NOTE(review): per the recorded outputs the codes follow the sorted string
# order of each column (High=0, Low=1, Medium=2 and '10'=0, '3'=1, '50'=2),
# not the semantic Low < Medium < High or numeric order.
oridinal_encoder.fit_transform(features)

array([[1., 0.],
       [0., 2.],
       [2., 1.]])

In [42]:
# Categories found for each column; a value's position in its array is the
# code it receives in the encoded output.
oridinal_encoder.categories_

[array(['High', 'Low', 'Medium'], dtype='<U6'),
 array(['10', '3', '50'], dtype='<U6')]

## 5.3 특성 딕셔너리를 인코딩하기


In [48]:
from sklearn.feature_extraction import DictVectorizer

# One dictionary per observation, mapping a feature name to its value.
# Not every observation defines every feature.
data_dict = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

In [59]:
# Vectorizer configured to return a dense array rather than a sparse matrix.
dictvectorizer = DictVectorizer(sparse = False)
# Fit on the list of dictionaries and convert it into a feature matrix.
features= dictvectorizer.fit_transform(data_dict)

In [60]:
# Resulting matrix: one row per dictionary.  Per the recorded output and
# feature names, the columns are Blue, Red, Yellow and a key that is absent
# from a dictionary shows up as 0.
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [61]:
# Column names of the vectorized matrix, as a plain list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it but returns an ndarray, so convert it
# back to a list to keep the original return type.
if hasattr(dictvectorizer, "get_feature_names_out"):
    features_name = dictvectorizer.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    features_name = dictvectorizer.get_feature_names()

In [62]:
# Show the feature names; they label the columns of `features`.
features_name

['Blue', 'Red', 'Yellow']

In [64]:
import pandas as pd
# Wrap the vectorized matrix in a DataFrame, labelling its columns with the
# feature names reported by the vectorizer.
pd.DataFrame(features, columns = features_name)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [65]:
# Four dictionaries, one per document, each mapping a word to a number.
doc_1, doc_2, doc_3, doc_4 = (
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
)



In [66]:
# Collect the per-document dictionaries into a list.
doc_word = [doc_1, doc_2, doc_3, doc_4]

In [67]:
# Re-fit the existing vectorizer on the document list and convert it to a
# matrix.  doc_word holds the same dictionaries as data_dict, so the recorded
# result equals `features`.
dictvectorizer.fit_transform(doc_word)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

## 5.4 누락된 클래스 값 대처하기


In [68]:
#라이브러리
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


In [69]:
# Complete observations: column 0 is the class (0 or 1), columns 1-2 are the
# numeric features used to predict it.
X = np.array([
    (0, 2.10, 1.45),
    (1, 1.18, 1.33),
    (0, 1.22, 1.27),
    (1, -0.21, -1.19),
])


In [70]:
# Feature matrix whose class column (column 0) is missing for every row.
X_with_nan = np.array([
    (np.nan, 0.87, 1.31),
    (np.nan, -0.67, -0.22),
])



In [73]:
# Train a KNN classifier (3 neighbours, distance-weighted votes): the feature
# columns of X are the predictors and column 0 is the target class.
clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
trained_model = clf.fit(X[:, 1:], X[:, 0])

In [75]:
# Predict the missing class of each incomplete row from its feature columns
# (everything except column 0).
imputed_values = trained_model.predict(X_with_nan[:,1:])

In [76]:
# Put the predicted classes back as the first column, in front of the
# original feature columns of the incomplete rows.
predicted_column = imputed_values[:, np.newaxis]
x_with_imputed = np.hstack((predicted_column, X_with_nan[:, 1:]))

In [80]:
# Stack the two feature matrices: the rows with imputed classes first, then
# the originally complete rows.
np.vstack((x_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [82]:
# Alternative: fill the missing values with the most frequent value
from sklearn.impute import SimpleImputer

# Stack the incomplete rows on top of the complete ones so the imputer sees
# every observation.
x_complete = np.vstack((X_with_nan,X))

# NOTE(review): in this data 0 and 1 each occur twice in the class column;
# the recorded output shows the missing entries filled with 0.
imputer = SimpleImputer(strategy ="most_frequent")
imputer.fit_transform(x_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 5.5 불균형한 클래스 다루기
