# 两步走方法
## 1. 先利用正样本和无标记样本训练分类器，并由此得到可靠的负样例

In [37]:
from sklearn.datasets import fetch_mldata
import pandas as pd
import numpy as np

In [38]:
# Load the 'MNIST original' dataset, using ./datasets/ as the data home.
mnist = fetch_mldata('MNIST original', data_home='./datasets/')
# Pull out the feature matrix and the label vector separately.
X = mnist['data']
y = mnist['target']
# Keep an independent copy of the true labels, because `y` is overwritten
# in place by the cells that follow.
y_orig = y.copy()



In [39]:
# Indices of every sample whose true label is 1 (the positive class).
select_index = np.flatnonzero(y == 1).tolist()
# Randomly keep 3000 of them, without replacement, as the labelled positives.
# NOTE(review): no random seed is set, so this selection differs between runs.
select_index_size = np.random.choice(select_index, replace=False, size=3000)
# Every other sample is treated as unlabelled and marked -1 (not 0).
# A boolean mask is used instead of testing `i not in select_index_size` for
# each row: that test rescans the whole NumPy array on every iteration.
# `other_index` is now an ascending ndarray rather than a list; it holds the
# same indices in the same order and indexes arrays the same way.
is_unlabelled = np.ones(len(X), dtype=bool)
is_unlabelled[select_index_size] = False
other_index = np.flatnonzero(is_unlabelled)
y[other_index] = -1

In [40]:
from sklearn.ensemble import RandomForestClassifier
# First-stage forest: 1000 trees, n_jobs=-1 to use every available core.
rf = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
)

In [41]:
# Fit on the full dataset: the 3000 chosen positives carry label 1 and all
# remaining rows carry -1 (unlabelled).
rf.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [42]:
# One row per sample:
#   truth      - original label
#   label      - label shown to the model (1 or -1 at this point)
#   output_std - column 1 of the first-stage forest's predict_proba output
#                (presumably the score for label 1; confirm against rf.classes_)
results = pd.DataFrame(
    dict(
        truth=y_orig,
        label=y,
        output_std=rf.predict_proba(X)[:, 1],
    ),
    columns=['truth', 'label', 'output_std'],
)

## 2. 利用可靠负样例与确定的正样例重新训练一个分类器，再对无标签数据进行再次预测

In [56]:
# Score every sample with the first-stage forest, then keep column 1 of the
# resulting probability matrix.
class_scores = rf.predict_proba(X)
prob = class_scores[:, 1]

In [76]:
# Reliable negatives: samples whose first-stage score is exactly 0.
# (The recorded run found 38714 of them.)
select_all_index_negative = np.flatnonzero(prob == 0).tolist()
# Draw 3000 of those at random, without replacement.
select_index_negative = np.random.choice(
    select_all_index_negative,
    size=3000,
    replace=False,
)

array([ 5250, 43015, 21943, ..., 18747, 20014, 36674])

In [80]:
# Indices of the rows that are neither labelled positives nor chosen reliable
# negatives; these stay unlabelled and are scored by the second-stage model.
# A boolean mask is used instead of testing `i not in ...` against two NumPy
# arrays for each row: every such test is a linear scan of the array.
# `other_index_unlabel` is now an ascending ndarray rather than a list; it
# holds the same indices in the same order.
still_unlabelled = np.ones(len(X), dtype=bool)
still_unlabelled[select_index_size] = False
still_unlabelled[select_index_negative] = False
other_index_unlabel = np.flatnonzero(still_unlabelled)
# Mark the reliable negatives with label 0.
y[select_index_negative] = 0

In [105]:
# Assemble the second-stage training set:
#   positives -> rows at select_index_size, label 1
#   negatives -> rows at select_index_negative, label 0
# The rows still to be scored are the ones at other_index_unlabel (label -1).
X_postive, y_postive = X[select_index_size], y[select_index_size]
X_negative, y_negative = X[select_index_negative], y[select_index_negative]
# Stack positives first, then negatives, along the sample axis.
X_train = np.concatenate([X_postive, X_negative])
y_train = np.concatenate([y_postive, y_negative])
X_test = X[other_index_unlabel]

In [107]:
# Second-stage forest with the same settings as the first one, fitted only on
# the labelled positives and the sampled reliable negatives.
rf1 = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
)
rf1.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [108]:
# Score the remaining unlabelled rows with the second-stage forest and keep
# column 1 of the probability matrix.
test_scores = rf1.predict_proba(X_test)
predicts = test_scores[:, 1]

In [112]:
# Sanity check: there should be one score per row of X_test.
np.shape(predicts)

(64000,)

In [113]:
# One row per remaining unlabelled sample:
#   truth      - original label
#   label      - label shown to the model
#   output_std - second-stage forest score
results1 = pd.DataFrame(
    dict(
        truth=y_orig[other_index_unlabel],
        label=y[other_index_unlabel],
        output_std=predicts,
    ),
    columns=['truth', 'label', 'output_std'],
)

In [121]:
# Rows whose true label is 1 and whose second-stage score is above 0.9.
results1.loc[results1['output_std'].gt(0.9) & results1['truth'].eq(1)]

Unnamed: 0,truth,label,output_std
5556,1.0,-1.0,0.999
5557,1.0,-1.0,1.000
5558,1.0,-1.0,1.000
5560,1.0,-1.0,1.000
5562,1.0,-1.0,1.000
5563,1.0,-1.0,1.000
5564,1.0,-1.0,0.997
5565,1.0,-1.0,1.000
5566,1.0,-1.0,1.000
5567,1.0,-1.0,1.000


In [123]:
# Let A be the set of rows the model flags as positive and B the set of rows
# that are truly positive.
# A row is flagged as positive when its score reaches this threshold; it is
# named once here instead of being repeated in every filter.
score_threshold = 0.08
predicted_positive = results1['output_std'] >= score_threshold
actual_positive = results1['truth'] == 1

# Count rows by summing the boolean masks instead of building filtered frames.
A_B = int((predicted_positive & actual_positive).sum())   # |A ∩ B|
A = int(predicted_positive.sum())                         # |A|
B = int(actual_positive.sum())                            # |B|

# Precision(A, B) = |A ∩ B| / |A|: the share of flagged rows that are truly
# positive. Reported as 0 when nothing is flagged, rather than raising
# ZeroDivisionError.
Precision_A_B = A_B / A * 100 if A else 0.0
# NOTE(review): the "直接应用标准分类器" prefix (directly applying the standard
# classifier) looks copied from another experiment; this notebook evaluates
# the two-step model. Confirm the intended wording before changing it.
print("直接应用标准分类器，计算精确率Precision = %.3f %%" % Precision_A_B)
# Recall(A, B) = |A ∩ B| / |B|: the share of truly positive rows that are
# flagged. Reported as 0 when there are no true positives.
Recall_A_B = A_B / B * 100 if B else 0.0
# The original message labelled this value as precision (精确率); it is recall.
print("直接应用标准分类器，计算召回率Recall = %.3f %%" % Recall_A_B)

直接应用标准分类器，计算精确率Precision = 23.708 %
直接应用标准分类器，计算精确率Recall = 99.979 %
