In [2]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the Iris dataset and pull out the feature matrix and the label vector.
iris = load_iris()
X = iris.data
y = iris.target

# Seed NumPy's global RNG so the same entries are blanked on every run.
random_seed = 42
np.random.seed(random_seed)

# Blank out 5% of all feature values: draw that many distinct flat indices.
num_values_to_remove = int(0.05 * X.size)
values_to_remove = np.random.choice(X.size, num_values_to_remove, replace=False)

# Work on a flattened copy so iris.data itself is left untouched, mark the
# drawn positions as missing, then restore the original 2-D shape.
original_shape = X.shape
X_flat = X.ravel().copy()
X_flat[values_to_remove] = np.nan
X = X_flat.reshape(original_shape)

In [3]:
# Mean imputation: fit the imputer on X and fill every missing entry in one pass.
imputer = SimpleImputer(strategy='mean')
X_mean = imputer.fit_transform(X)

# Show the first five rows of the filled matrix as a quick sanity check.
print(X_mean[:5])

[[5.1        3.5        3.86478873 0.2       ]
 [4.9        3.         1.4        0.2       ]
 [4.7        3.2        3.86478873 0.2       ]
 [4.6        3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]]


In [4]:
# Mode imputation: fit the imputer on X and fill every missing entry using
# the 'most_frequent' strategy.
imputer = SimpleImputer(strategy='most_frequent')
X_most_frequent = imputer.fit_transform(X)

# Show the first five rows of the filled matrix as a quick sanity check.
print(X_most_frequent[:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.4 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [5]:
# Median imputation: fit the imputer on X and fill every missing entry using
# the 'median' strategy.
imputer = SimpleImputer(strategy='median')
X_median = imputer.fit_transform(X)

# Show the first five rows of the filled matrix as a quick sanity check.
print(X_median[:5])

[[5.1 3.5 4.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 4.4 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


# 无缺失值

In [6]:
# Baseline: evaluate KNN on a fresh copy of the Iris data (no values removed).
iris = load_iris()

# Number of neighbours used by the classifier.
k = 3

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Build the classifier, fit it on the training split, then predict the
# held-out labels.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Report accuracy, followed by macro-averaged precision, recall and F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='macro'))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# 均值填充

In [7]:
# Split the mean-imputed features with the same test_size and random_state
# as the baseline cell.
# NOTE(review): X_mean was imputed over all rows before this split, so the
# fill values also reflect rows that end up in the test set.
X_mean_train, X_mean_test, y_mean_train, y_mean_test = train_test_split(
    X_mean,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Refit the existing `knn` estimator on the imputed training split and
# predict the held-out labels.
y_mean_pred = knn.fit(X_mean_train, y_mean_train).predict(X_mean_test)

# Report accuracy, followed by macro-averaged precision, recall and F1.
print("Accuracy:", accuracy_score(y_mean_test, y_mean_pred))
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, scorer(y_mean_test, y_mean_pred, average='macro'))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# 中位数填充

In [8]:
# Split the median-imputed features with the same test_size and random_state
# as the baseline cell.
# NOTE(review): X_median was imputed over all rows before this split, so the
# fill values also reflect rows that end up in the test set.
X_median_train, X_median_test, y_median_train, y_median_test = train_test_split(
    X_median,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Refit the existing `knn` estimator on the imputed training split and
# predict the held-out labels.
y_median_pred = knn.fit(X_median_train, y_median_train).predict(X_median_test)

# Report accuracy, followed by macro-averaged precision, recall and F1.
print("Accuracy:", accuracy_score(y_median_test, y_median_pred))
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, scorer(y_median_test, y_median_pred, average='macro'))


Accuracy: 0.9666666666666667
Precision: 0.9722222222222222
Recall: 0.9629629629629629
F1-score: 0.9658994032395567


# 众数填充

In [9]:
# Split the mode-imputed features with the same test_size and random_state
# as the baseline cell.
# NOTE(review): X_most_frequent was imputed over all rows before this split,
# so the fill values also reflect rows that end up in the test set.
(
    X_most_frequent_train,
    X_most_frequent_test,
    y_most_frequent_train,
    y_most_frequent_test,
) = train_test_split(X_most_frequent, iris.target, test_size=0.2, random_state=42)

# Refit the existing `knn` estimator on the imputed training split and
# predict the held-out labels.
y_most_frequent_pred = knn.fit(
    X_most_frequent_train, y_most_frequent_train
).predict(X_most_frequent_test)

# Report accuracy, followed by macro-averaged precision, recall and F1.
print("Accuracy:", accuracy_score(y_most_frequent_test, y_most_frequent_pred))
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, scorer(y_most_frequent_test, y_most_frequent_pred, average='macro'))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


|   |Accuracy|Precision|Recall|F1-score|
|----|--------|---------|------|--------|
|源数据|1.0|1.0|1.0|1.0|
|均值填充|1.0|1.0|1.0|1.0|
|众数填充|1.0|1.0|1.0|1.0|
|中位数填充|0.9666666666666667|0.9722222222222222|0.9629629629629629|0.9658994032395567|