In [1]:
# 載入函式庫
import numpy as np
# from fancyimpute import KNN ## only applied to python 3.6 and no more maintainance
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer # replace fancyimpute
from sklearn.datasets import make_blobs

In [2]:
# 產生模擬的特徵矩陣
features, _ = make_blobs(n_samples = 1000,
                         n_features = 2,
                         random_state = 1)

In [3]:
# 特徵標準化
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

In [4]:
standardized_features

array([[ 0.87301861,  1.31426523],
       [-0.67073178, -0.22369263],
       [ 2.1048424 ,  1.45332359],
       ...,
       [ 1.18998798,  1.33439442],
       [ 1.22406396,  1.27667052],
       [-0.21664919, -1.19113343]])

In [5]:
# 以一缺漏值替代第一特徵的第一個值
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

In [6]:
standardized_features

array([[        nan,  1.31426523],
       [-0.67073178, -0.22369263],
       [ 2.1048424 ,  1.45332359],
       ...,
       [ 1.18998798,  1.33439442],
       [ 1.22406396,  1.27667052],
       [-0.21664919, -1.19113343]])

In [7]:
# 在特徴矩陣中預測缺漏值-以KNN方式
imputer = KNNImputer()
features_knn_imputed = imputer.fit_transform(standardized_features)

# 比較真值與推算值
print("原值:", true_value)
print("使用KNN方法補值:", features_knn_imputed[0,0])

原值: 0.8730186113995938
使用KNN方法補值: 1.0959262913919632


In [8]:
# Alternative imutating method
# from sklearn.preprocessing import Imputer 
# New in version 0.20: SimpleImputer replaces the previous sklearn.preprocessing.Imputer estimator which is now removed.
from sklearn.impute import SimpleImputer

# Create imputer
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# 推算數值
features_mean_imputed = mean_imputer.fit_transform(features)

# 比較真值與推算值
print("原值:", true_value)
print("使用平均法補值:", features_mean_imputed[0,0])

原值: 0.8730186113995938
使用平均法補值: -3.058372724614996
