###### https://medium.com/jameslearningnote/%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90-%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E7%AC%AC2-4%E8%AC%9B-%E8%B3%87%E6%96%99%E5%89%8D%E8%99%95%E7%90%86-missing-data-one-hot-encoding-feature-scaling-3b70a7839b4a

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from io import StringIO

# 一.缺失值的處理

1.丟棄：如果資料量夠多，可以直接丟棄含有缺失值的資料。

2.補值：常見的補值方式有補固定值、平均值、眾數、中位數。較進階的補值方法有最近鄰插補法（在資料中找出與缺失樣本最接近的樣本，用它的值來補足缺失的特徵）、迴歸法（以其他特徵為輸入建立迴歸模型，預測帶有缺失的特徵來補值），或是插值法（例如拉格朗日插值法）。

In [2]:
# Inline CSV used to demonstrate missing-value handling. Rows start at
# column 0 so that source indentation never leaks into the parsed fields,
# and every row spells out all five columns; an empty field is a missing
# value once parsed.
csv_data = '''A,B,C,D,E
5.0,2.0,3.0,,6
1.0,6.0,,8.0,5
0.0,11.0,12.0,4.0,5
3.0,,3.0,5.0,
5.0,1.0,4.0,2.0,4
'''

In [6]:
# Wrap the CSV text in an in-memory text buffer so that read_csv can parse
# it the same way it would parse a CSV file, yielding a pandas DataFrame.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


# Missing Data(空值資料處理)

In [8]:
# Drop every row that contains at least one missing value.
df.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2,0.0,11.0,12.0,4.0,5.0
4,5.0,1.0,4.0,2.0,4.0


In [9]:
# Drop a row only when all of its values are missing.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [10]:
# Keep only the rows whose value in column C is present, i.e. drop the rows
# where C is missing.
df[df['C'].notna()]

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [12]:
# Replace every missing value with 0.
df.fillna(value=0)

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,0.0,6.0
1,1.0,6.0,0.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,0.0,3.0,5.0,0.0
4,5.0,1.0,4.0,2.0,4.0


In [18]:
# Fill the missing entries of column B with the mean of column B.
b_mean = df['B'].mean()
df['B'] = df['B'].fillna(b_mean)
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [28]:
# Fill the missing entries of column C with its mode (most frequent value).
# mode() returns a Series, so its first entry is taken.
c_mode = df['C'].mode()[0]
df['C'] = df['C'].fillna(c_mode)
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,3.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [29]:
# Fill the missing entries of column D with the median of column D.
d_median = df['D'].median()
df['D'] = df['D'].fillna(d_median)
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,3.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [31]:
# Fill the missing entries of column E with the smallest value in column E.
e_min = df['E'].min()
df['E'] = df['E'].fillna(e_min)
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,3.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,4.0
4,5.0,1.0,4.0,2.0,4.0


# 二.類別資料的處理（有序、無序）

# Categorical Data (類別資料處理)

In [39]:
# Small example table with two categorical columns (color, size), a numeric
# price column and a class label; column names are supplied at construction.
df2 = pd.DataFrame(
    [
        ['green', 'M', 10.1, 1],
        ['red', 'L', 13.5, 2],
        ['blue', 'XL', 15.3, 1],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,2
2,blue,XL,15.3,1


In [40]:
# Encode the size labels as integers that keep their order (M < L < XL).
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
encoded_size = df2['size'].map(size_mapping)
df2['size'] = encoded_size
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,2
2,blue,3,15.3,1


In [42]:
# One-hot encode the color column: pass get_dummies the column to encode.
# dtype=int pins 0/1 integer indicators; without it pandas >= 2.0 returns
# boolean (True/False) columns.
pd.get_dummies(df2['color'], dtype=int)

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [43]:
# One-hot encode the color column, prefixing each new column with "color_".
# dtype=int pins 0/1 integer indicators; without it pandas >= 2.0 returns
# boolean (True/False) columns.
onehot_encoding = pd.get_dummies(df2['color'], prefix='color', dtype=int)

In [44]:
# Remove the original text column from df2. The axis is selected by keyword
# (columns=) because passing it positionally, as in drop('color', 1), was
# deprecated in pandas 1.3 and is rejected by pandas 2.0 and later.
df2 = df2.drop(columns='color')
df2

  df2 = df2.drop('color', 1)


Unnamed: 0,size,price,classlabel
0,1,10.1,1
1,2,13.5,2
2,3,15.3,1


In [45]:
# Place the one-hot columns side by side with the remaining columns of df2.
pd.concat([onehot_encoding, df2], axis='columns')

Unnamed: 0,color_blue,color_green,color_red,size,price,classlabel
0,0,1,0,1,10.1,1
1,0,0,1,2,13.5,2
2,1,0,0,3,15.3,1


# 三.資料特徵縮放

# Normalization(資料區間縮放)

In [46]:
from IPython.display import Math

In [47]:
# Load the iris dataset, report its class names, and combine the feature
# columns with the target column into a single DataFrame.
iris = datasets.load_iris()
print(f"target_names: {iris['target_names']}")
x = pd.DataFrame(iris['data'], columns=iris['feature_names'])
y = pd.DataFrame(iris['target'], columns=['target'])
data = pd.concat([x, y], axis='columns')
data.head(3)

target_names: ['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [48]:
# Min-max normalization: subtract the column minimum and divide by the
# column range (max - min). The expression is continued across lines with
# parentheses, because a comment placed after a backslash line continuation
# is a SyntaxError.
sepal_length = data['sepal length (cm)']
data['sepal length (cm)'] = (
    (sepal_length - sepal_length.min())
    / (sepal_length.max() - sepal_length.min())
)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0.222222,3.5,1.4,0.2,0
1,0.166667,3.0,1.4,0.2,0
2,0.111111,3.2,1.3,0.2,0
3,0.083333,3.1,1.5,0.2,0
4,0.194444,3.6,1.4,0.2,0


# Standardization(資料標準化)

##### 相較於 Normalization（min-max 縮放），Standardization 是減去平均值後再除以標準差，縮放結果不是由最大值與最小值決定，因此較不容易受 outlier 影響。標準化後資料的平均值為 0、標準差為 1，但分佈的形狀不會改變，資料並不會因此變成常態分佈。

In [49]:
# Render the standardization formula: subtract the mean, then divide by the
# standard deviation.
standardization_formula = r'x^{(i)}_{std}=\frac{x^{(i)}-\mu_{x}}{\sigma_{x}}'
Math(standardization_formula)

<IPython.core.display.Math object>

In [52]:
# Standardization: subtract the column mean and divide by the column
# standard deviation as computed by Series.std().
sepal_width = data['sepal width (cm)']
data['sepal width (cm)'] = (sepal_width - sepal_width.mean()) / sepal_width.std()
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0.222222,1.015602,1.4,0.2,0
1,0.166667,-0.131539,1.4,0.2,0
2,0.111111,0.327318,1.3,0.2,0
3,0.083333,0.097889,1.5,0.2,0
4,0.194444,1.24503,1.4,0.2,0
