# 任务：半监督学习算法进行数字识别

## 1. 导入各种包

In [26]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets
from sklearn.semi_supervised import label_propagation
from sklearn.metrics import classification_report,confusion_matrix

## 2. 加载digits数据集，从中随机选取100个样本打类别标签

In [27]:
digits = datasets.load_digits()
# indices是随机产生的0-1796个数字，且打乱
# _________________________
indices = list(range(1797))
np.random.shuffle(indices)

X = digits.data[indices]
y = digits.target[indices]
images = digits.images[indices]

n_total_samples = len(y) 
n_labeled_points = 100 # 标注好的数据共100条
max_iterations = 10 # 迭代次数

# unlabeled_indices = _________________________ # 未标注的数据索引
unlabeled_indices = np.arange(n_labeled_points, n_total_samples)

## 3. 运用sklearn包中label_propagation算法训练模型，对未标记样本进行预测

In [28]:
for i in range(max_iterations):
    if len(unlabeled_indices) == 0:
        print("no unlabeled items left to label") # 没有未标记的标签了，全部标注好了
        break

    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1 #把未标注的数据全部标记为-1

    # 请在下方补充训练模型与预测代码
    lp_model = label_propagation.LabelSpreading(max_iter=max_iterations, gamma=0.25)
    lp_model.fit(X, y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices] # 真实的标签

cm = confusion_matrix(true_labels,predicted_labels,labels = lp_model.classes_)
print(cm)
print(classification_report(true_labels,predicted_labels))

[[162   0   0   0   0   0   0   0   0   0]
 [  0 138  13   0   0   0  11   0   1  10]
 [  0   2 163   0   0   0   0   1   2   0]
 [  0   0   1 158   0   0   0   1  11   3]
 [  0   0   0   0 163   3   0   3   0   0]
 [  0   0   0   0   0 168   1   0   0   2]
 [  0   8   0   0   0   0 162   0   1   0]
 [  0   4   0   0   0   0   0 160   6   0]
 [  0   4   4   0   0   0   0   0 157   1]
 [  0   0   0   1   0   4   0   0  13 155]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       162
           1       0.88      0.80      0.84       173
           2       0.90      0.97      0.93       168
           3       0.99      0.91      0.95       174
           4       1.00      0.96      0.98       169
           5       0.96      0.98      0.97       171
           6       0.93      0.95      0.94       171
           7       0.97      0.94      0.96       170
           8       0.82      0.95      0.88       166
       

## 4. 请根据cotraining算法自行编写代码，实现半监督学习数字识别任务，与上一个算法进行比较分析。（选做）

In [31]:
from sklearn.naive_bayes import GaussianNB

# 分割特征集为两部分
X1, X2 = X[:, :32], X[:, 32:]

# Co-training算法
def cotraining(X1, X2, y, labeled_indices, unlabeled_indices, max_iter=10):
    clf1 = GaussianNB()
    clf2 = GaussianNB()

    for _ in range(max_iter):
        clf1.fit(X1[labeled_indices], y[labeled_indices])
        clf2.fit(X2[labeled_indices], y[labeled_indices])
        
        y_pred1 = clf1.predict(X1[unlabeled_indices])
        y_pred2 = clf2.predict(X2[unlabeled_indices])
        
        newly_labeled_indices = []
        for i in range(len(unlabeled_indices)):
            if y_pred1[i] == y_pred2[i]:
                newly_labeled_indices.append(unlabeled_indices[i])
        
        if not newly_labeled_indices:
            break
        
        labeled_indices = np.concatenate((labeled_indices, newly_labeled_indices))
        unlabeled_indices = np.setdiff1d(unlabeled_indices, newly_labeled_indices)
    
    # 最终模型训练
    clf1.fit(X1[labeled_indices], y[labeled_indices])
    return clf1

# 训练Co-training模型
y_train = y.copy()
labeled_indices = np.arange(0, n_labeled_points)
cotrained_model = cotraining(X1, X2, y_train, labeled_indices, unlabeled_indices, max_iterations)

# 预测并评估Co-training模型
predicted_labels = cotrained_model.predict(X1[unlabeled_indices])
true_labels = y[unlabeled_indices]
cm = confusion_matrix(true_labels, predicted_labels, labels = lp_model.classes_)
print(cm)
print(classification_report(true_labels, predicted_labels))


[[159   2   0   0   0   0   0   0   0   1]
 [  0 129   1   0   0   0  36   0   7   0]
 [  4   9 101   6   0   0   2   1  45   0]
 [  0   3  24  52   0   0   1   6  88   0]
 [  5  29   0   0  95   0  10   1  18  11]
 [  2   4   3   0   0 102   7   1  49   3]
 [  4   1   0   0   5   0 150   0  10   1]
 [  0   8   5   4   0   0   0  55  46  52]
 [  0  20   8   1   0   0   0   0 132   5]
 [  0  16   4   0   0   0   0  10  78  65]]
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       162
           1       0.58      0.75      0.65       173
           2       0.69      0.60      0.64       168
           3       0.83      0.30      0.44       174
           4       0.95      0.56      0.71       169
           5       1.00      0.60      0.75       171
           6       0.73      0.88      0.80       171
           7       0.74      0.32      0.45       170
           8       0.28      0.80      0.41       166
       