In [3]:
import numpy as np
import pandas as pd

# 6.2.1 결측치 처리

In [5]:
df = pd.DataFrame([
    [42,'male',12,'reading','class2'],
    [35,'unknown',3,'cooking','class1'],
    [1000,'female',7,'cycling','class3'],
    [1000,'unknown',21,'unknown','unknown']
])
df.columns = [
    'age','gender','month_birth','hobby','target'
]

In [6]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42,male,12,reading,class2
1,35,unknown,3,cooking,class1
2,1000,female,7,cycling,class3
3,1000,unknown,21,unknown,unknown


In [7]:
df['age'].unique()

array([  42,   35, 1000])

In [8]:
df['gender'].unique()

array(['male', 'unknown', 'female'], dtype=object)

In [9]:
df['month_birth'].unique()

array([12,  3,  7, 21])

In [10]:
df['hobby'].unique()

array(['reading', 'cooking', 'cycling', 'unknown'], dtype=object)

In [11]:
df['target'].unique()

array(['class2', 'class1', 'class3', 'unknown'], dtype=object)

In [12]:
df.loc[df['age']>150,['age']] = np.nan
df.loc[df['gender']=='unknown',['gender']] = np.nan
df.loc[df['month_birth']>12,['month_birth']] = np.nan
df.loc[df['hobby']=='unknown',['hobby']] = np.nan
df.loc[df['target']=='unknown',['target']] = np.nan

In [13]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3
3,,,,,


In [14]:
df.isnull().sum()

age            2
gender         2
month_birth    1
hobby          1
target         1
dtype: int64

In [15]:
# 결측치를 포함한 행(row) 삭제
df2 = df.dropna(axis=0)
df2

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2


In [16]:
# 결측치를 포함한 열(column) 삭제
df3 = df.dropna(axis=1)
df3

0
1
2
3


In [17]:
# 모든 값이 결측치인 행 삭제
df4 = df.dropna(how='all')
df4

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [18]:
# 값이 2개 미만인 행 삭제
df5 = df.dropna(thresh=2)
df5

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [19]:
# 특정 열에 결측치가 있는 경우 행 삭제
df6 = df.dropna(subset=['gender'])
df6

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
2,,female,7.0,cycling,class3


In [20]:
# 결측치 대체하기
alter_values = {'age':0,
               'gender': 'U',
               'month_birth': 0,
               'hobby': 'U',
               'target': 'class4'}
df7 = df.fillna(value = alter_values)
df7

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


# 6.2.2 클래스 라벨 설정

In [39]:
from sklearn.preprocessing import LabelEncoder

In [40]:
df8 = df7
class_label = LabelEncoder()
data_value = df8['target'].values
y_new = class_label.fit_transform(data_value)
y_new

array([0, 1, 2, 3])

In [41]:
df8['target'] = y_new
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,0
1,35.0,U,3.0,cooking,1
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [28]:
y_ori = class_label.inverse_transform(y_new)
y_ori

array(['class2', 'class1', 'class3', 'class4'], dtype=object)

In [29]:
df8['target'] = y_ori
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [32]:
# 클래스 라벨링
y_arr = df8['target'].values
y_arr.sort()
y_arr

array(['class1', 'class2', 'class3', 'class4'], dtype=object)

# 6.2.3 원-핫 인코딩

In [42]:
df9 = df8
df9['target'] = df9['target'].astype(str)
df10 = pd.get_dummies(df9['target'])
print(df10)

   0  1  2  3
0  1  0  0  0
1  0  1  0  0
2  0  0  1  0
3  0  0  0  1


In [43]:
df9['target'] = df9['target'].astype(str)
df11 = pd.get_dummies(df9['target'], drop_first=True)
print(df11)

   1  2  3
0  0  0  0
1  1  0  0
2  0  1  0
3  0  0  1


In [44]:
df12 = df8
df13 = pd.get_dummies(df12)
df13

Unnamed: 0,age,month_birth,gender_U,gender_female,gender_male,hobby_U,hobby_cooking,hobby_cycling,hobby_reading,target_0,target_1,target_2,target_3
0,42.0,12.0,0,0,1,0,0,0,1,1,0,0,0
1,35.0,3.0,1,0,0,0,1,0,0,0,1,0,0
2,0.0,7.0,0,1,0,0,0,1,0,0,0,1,0
3,0.0,0.0,1,0,0,1,0,0,0,0,0,0,1


In [45]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder = OneHotEncoder()
y = df7[['target']]
y_hot = hot_encoder.fit_transform(y)
print(y_hot.toarray())

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [46]:
from tensorflow.keras.utils import to_categorical
y_hotec = to_categorical(y)
print(y_hotec)

ModuleNotFoundError: No module named 'tensorflow'

# 6.2.4 데이터 스케일링

In [53]:
# 표준화 스케일링
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(df8[['month_birth']])
x_std = std.transform(df8[['month_birth']])

In [54]:
x_std2 = std.fit_transform(df8[['month_birth']])
x_std2

array([[ 1.44444444],
       [-0.55555556],
       [ 0.33333333],
       [-1.22222222]])

In [57]:
np.mean(x_std)

-5.551115123125783e-17

In [59]:
np.std(x_std)

1.0

In [62]:
# 로버스트 스케일링
from sklearn.preprocessing import RobustScaler

robust = RobustScaler()
robust.fit(df8[['month_birth']])
x_robust = robust.transform(df8[['month_birth']])
x_robust

array([[ 1.16666667],
       [-0.33333333],
       [ 0.33333333],
       [-0.83333333]])

In [64]:
# 최소-최대 스케일링
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
minmax.fit(df8[['month_birth']])
x_minmax = minmax.transform(df8[['month_birth']])
x_minmax

array([[1.        ],
       [0.25      ],
       [0.58333333],
       [0.        ]])

In [66]:
# 노멀 스케일링
from sklearn.preprocessing import Normalizer

normal = Normalizer()
normal.fit(df8[['age','month_birth']])
x_normal = normal.transform(df8[['age','month_birth']])
x_normal

array([[0.96152395, 0.27472113],
       [0.99634665, 0.08540114],
       [0.        , 1.        ],
       [0.        , 0.        ]])