# 任务：半监督学习算法进行数字识别

## 1. 导入各种包

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets
from sklearn.semi_supervised import label_propagation
from sklearn.metrics import classification_report,confusion_matrix

## 2. 加载digits数据集，从中随机选取100个样本打类别标签

In [None]:
digits = datasets.load_digits()

# Shuffle the index of every sample in the dataset. The full permutation is
# kept on purpose: truncating it to the first 100 entries would leave only
# the labeled samples, so there would be nothing left to propagate labels to.
indices = np.random.permutation(len(digits.data))

X = digits.data[indices]
y = digits.target[indices]
images = digits.images[indices]

n_total_samples = len(y)
n_labeled_points = 100  # the first 100 shuffled samples keep their labels
max_iterations = 10  # number of iterations

# Positions (within the shuffled arrays) of the samples treated as unlabeled:
# everything after the first n_labeled_points entries.
unlabeled_indices = np.arange(n_labeled_points, n_total_samples)

## 3. 运用sklearn包中label_propagation算法训练模型，对未标记样本进行预测

In [None]:
# Fit the semi-supervised model once and evaluate it on the unlabeled samples.
# The labeled/unlabeled split is never modified here, so refitting in a loop
# would rebuild the same model from the same inputs on every pass.
if len(unlabeled_indices) == 0:
    # Nothing to predict: skip the evaluation instead of referencing
    # predictions that were never computed.
    print("no unlabeled items left to label")
else:
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # -1 marks a sample as unlabeled

    lp_model = label_propagation.LabelSpreading()
    lp_model.fit(X, y_train)

    # transduction_ holds the label assigned to each training sample;
    # keep only the entries for the samples that were hidden from the model.
    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]  # ground-truth labels for comparison

    cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
    print(cm)
    print(classification_report(true_labels, predicted_labels))

## 4. 请根据 Co-training（协同训练）算法自行编写代码，实现半监督学习数字识别任务，并与上一个算法进行比较分析（选做）