## 功能简介

使用pu-learning算法解决样本负例非常少的情况。

pu-learning主要有三种思路，这里使用pu-bagging和two-step的方法，介绍详解参考文章或者博客：

参考文章：https://roywright.me/2017/11/16/positive-unlabeled-learning/

引用的baggingPU.py来自：https://github.com/roywright/pu_learning/blob/master/baggingPU.py

In [37]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer  
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from xgboost import XGBClassifier
from baggingPU import BaggingClassifierPU
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from math import isnan
from numpy import NaN
from numpy import nan
import pickle
import json
from collections import Counter
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

## 1、获取数据

In [38]:
connection = pymysql.Connect(
    host="localhost",
    port=3306,
    user="root",
    passwd="root",
    charset="utf8",
    db="project_researchers"
)

In [39]:
def get_train_data(connection):
    """
    查询数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 1 and teac_id > 174 and category is not null
     UNION ALL
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 0 and teac_id > 64438 and category is not null
     UNION ALL 
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, 0
     FROM classifier_isTeacher_xgbc WHERE label is null and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

train_data = get_train_data(connection)
print("shape of train_data:", train_data.shape)
print("train_data.info():", train_data.info())

shape of train_data: (198901, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198901 entries, 0 to 198900
Data columns (total 21 columns):
bys_cn                194419 non-null float64
hindex_cn             198331 non-null float64
a_paper               198901 non-null int64
b_paper               198901 non-null int64
c_paper               198901 non-null int64
papernum2017          198901 non-null int64
papernum2016          198901 non-null int64
papernum2015          198901 non-null int64
papernum2014          198901 non-null int64
papernum2013          198901 non-null int64
num_journal           198901 non-null int64
num_conference        198901 non-null int64
degree                198621 non-null float64
pagerank              198621 non-null float64
degree_centrality     198621 non-null float64
diff_year             198621 non-null float64
coauthors_top10000    198901 non-null int64
coauthors_top20000    198901 non-null int64
coauthors_top30000    198901 non-null int64
categor

In [40]:
def get_test_data(connection):
    """
    查询测试数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 1 and teac_id <= 174 and category is not null
     UNION ALL
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 0 and teac_id <= 64438 and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

test_data = get_test_data(connection)
print("shape of test_data:", test_data.shape)
print("test_data.info():", test_data.info())

shape of test_data: (850, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 21 columns):
bys_cn                834 non-null float64
hindex_cn             850 non-null int64
a_paper               850 non-null int64
b_paper               850 non-null int64
c_paper               850 non-null int64
papernum2017          850 non-null int64
papernum2016          850 non-null int64
papernum2015          850 non-null int64
papernum2014          850 non-null int64
papernum2013          850 non-null int64
num_journal           850 non-null int64
num_conference        850 non-null int64
degree                849 non-null float64
pagerank              849 non-null float64
degree_centrality     849 non-null float64
diff_year             849 non-null float64
coauthors_top10000    850 non-null int64
coauthors_top20000    850 non-null int64
coauthors_top30000    850 non-null int64
category              850 non-null int64
label                 850 non-null 

## 2、处理数据

In [41]:
# 对缺失值进行处理
# Method1：直接将含有缺失字段的值去掉
columns_name_zero = ['bys_cn', 'hindex_cn', 'degree', 'pagerank', 'degree_centrality', 'diff_year']
for column_name in columns_name_zero:
    train_data[column_name].fillna(0, inplace=True)
    test_data[column_name].fillna(0, inplace=True)
print("info of train_data::", train_data.info())
print("info of test_data::", test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198901 entries, 0 to 198900
Data columns (total 21 columns):
bys_cn                198901 non-null float64
hindex_cn             198901 non-null float64
a_paper               198901 non-null int64
b_paper               198901 non-null int64
c_paper               198901 non-null int64
papernum2017          198901 non-null int64
papernum2016          198901 non-null int64
papernum2015          198901 non-null int64
papernum2014          198901 non-null int64
papernum2013          198901 non-null int64
num_journal           198901 non-null int64
num_conference        198901 non-null int64
degree                198901 non-null float64
pagerank              198901 non-null float64
degree_centrality     198901 non-null float64
diff_year             198901 non-null float64
coauthors_top10000    198901 non-null int64
coauthors_top20000    198901 non-null int64
coauthors_top30000    198901 non-null int64
category              198901 non-null int

In [44]:
# 将category变成离散值（object），并将train_data变成向量（离散值变成one-hot == 使用DictVectorizer）

# training data
train_data[['category']] = train_data[['category']].astype(object)
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])
print("****", X_train.shape)

vec = DictVectorizer()   

# 因为fDictVectorizer().fit_transform()需要的参数时list of dict，所以这里将其转化
headers_train = list(X_train.columns)
value_df_train = X_train.values
feature_list_train = []
for value_train in value_df_train:
    feature_dict = {}
    for i in range(0, len(headers_train)):
        if headers_train[i]=='category':
            feature_dict[headers_train[i]] = str(value_train[i])
        else:
            feature_dict[headers_train[i]] = value_train[i]
    feature_list_train.append(feature_dict)
print(len(feature_list_train), ' ', len(feature_list_train[0]))

X_train = vec.fit_transform(feature_list_train)
print("info of X_train:", X_train.shape)
print("y_train::", Counter(y_train))

# testing data
test_data[['category']] = test_data[['category']].astype(object)
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

headers_test = list(X_test.columns)
value_df_test = X_test.values
# print(headers)
# print(value_df.shape)
feature_list_test = []
for value_test in value_df_test:
    feature_dict = {}
    for i in range(0, len(headers_test)):
        if headers_test[i]=='category':
            feature_dict[headers_test[i]] = str(value_test[i])
        else:
            feature_dict[headers_test[i]] = value_test[i]
    feature_list_test.append(feature_dict)
print(len(feature_list_test), ' ', len(feature_list_test[0]))

X_test = vec.transform(feature_list_test)
print("info of X_test:", X_test.shape)
print("y_test::", Counter(y_test))

**** (198901, 20)
198901   20
info of X_train: (198901, 22)
y_train:: Counter({0: 182066, 1: 16835})
850   20
info of X_test: (850, 22)
y_test:: Counter({0: 700, 1: 150})


## 3、PU-Learning

### 3.1 pu-bagging方法

pu-bagging借助了bagging的思想，步骤如下：

（1）采样与正例相同大小的无标签数据当做负样本

（2）使用正例和负例训练分类器，预测除此正例和负例之外的数据标签

（3）重复多次，取预测的平均值

In [47]:
def train_and_test_XGBC_puBagging(X_train, y_train, X_test, y_test):
    
    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0:-3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0:-3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)
    
    y_train_origin = y_train.copy()
    
#     MAX_SAMPLES = 2 * np.array(y_train).tolist().count(1)
#     print(MAX_SAMPLES)

    bc = BaggingClassifierPU(
        DecisionTreeClassifier(),
        n_estimators=300,  # 1000 trees as usual
        max_samples=sum(y_train),  # Balance the positives and unlabeled in each bag
    )
    bc.fit(X_train, y_train)
    
    y_test_predict = bc.predict(X_test)
    
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    
    # 统计总共的数据预测出来多少教师
    y_all_predict = bc.predict(X_train)
    print(Counter(y_all_predict))


# 调用预测函数
X_train_copy = deepcopy(X_train)
y_train_copy = deepcopy(y_train)
X_test_copy = deepcopy(X_test)
y_test_copy = deepcopy(y_test)
train_and_test_XGBC_puBagging(X_train, y_train, X_test, y_test)

type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (198901, 19)
shape of X_train:: (198901, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (850, 19)
shape of X_test:: (850, 22)
              precision    recall  f1-score   support

           0    0.95714   0.97953   0.96821       684
           1    0.90667   0.81928   0.86076       166

   micro avg    0.94824   0.94824   0.94824       850
   macro avg    0.93190   0.89940   0.91448       850
weighted avg    0.94729   0.94824   0.94722       850

Counter({0: 147926, 1: 50975})


### 3.2 Two-step

two-step的思想如下：

（1）首先将所有的无标签数据当做负样本，和所有正例当做训练集训练分类器，识别出无标签样本数据中可靠的负例，将其当做真正的负例。

（2）使用正例和Step1中的可靠负例训练分类器，在挑选中可靠负例，不断迭代（本次实验迭代了10次）。

In [49]:
def train_and_test_XGBC_puTwoStep(X_train, y_train, X_test, y_test):
    
    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0:-3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0:-3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)
    
    # Create a new target vector, with 1 for positive, -1 for unlabeled, and 
    # 0 for "reliable negative" (there are no reliable negatives to start with)
    y_train_c = 2*y_train - 1 
    
    # Get the scores from RandomForestClassifier
    rf = RandomForestClassifier(n_estimators = 10)   # Use 1000 trees
    rf.fit(X_train, y_train_c)
    pred = rf.predict_proba(X_train)[:,1]
    print("pred is::", pred)
    
    # Find the range of scores given to positive data points
    # range_P = [min(pred * (y_train_c > 0)), max(pred * (y_train_c > 0))]
    range_P = [0.0, 0.9]

    # STEP 1
    # If any unlabeled point has a score above all known positives, 
    # or below all known positives, label it accordingly
    iP_new = y_train_c[(y_train_c < 0) & (pred >= range_P[1])].index
    iN_new = y_train_c[(y_train_c < 0) & (pred <= range_P[0])].index
    y_train_c.loc[iP_new] = 1
    y_train_c.loc[iN_new] = 0
    
    
    # Classifier to be used for step 2
    rf2 = RandomForestClassifier(n_estimators = 10)

    # Limit to 10 iterations (this is arbitrary, but 
    # otherwise this approach can take a very long time)
    for i in range(10):
        # If step 1 didn't find new labels, we're done
        if len(iP_new) + len(iN_new) == 0 and i > 0:
            break

        print('Step 1 labeled %d new positives and %d new negatives.' % (len(iP_new), len(iN_new)))
        print('Doing step' + str(i+1) + '... ', end = '')

        # STEP 2
        # Retrain on new labels and get new scores
        rf2.fit(X_train, y_train_c)
        pred = rf2.predict_proba(X_train)[:,-1]

        # Find the range of scores given to positive data points
        # range_P = [min(pred * (y_train_c > 0)), max(pred * (y_train_c > 0))]
        range_P = [0.0, 0.9]
        print("range_P is::", range_P)

        # Repeat step 1
        iP_new = y_train_c[(y_train_c < 0) & (pred >= range_P[1])].index
        iN_new = y_train_c[(y_train_c < 0) & (pred <= range_P[0])].index
        y_train_c.loc[iP_new] = 1
        y_train_c.loc[iN_new] = 0
        
    y_test_predict = rf2.predict(X_test)
    for i in range(len(y_test_predict)):
        if y_test_predict[i] == -1:
            y_test_predict[i] = 0
    print(classification_report(y_test_predict, y_test, target_names=[ '0', '1'], digits=5))
    
    # 统计总共的数据预测出来多少教师
    y_all_predict = rf2.predict(X_train)
    print(Counter(y_all_predict))

# 调用预测函数
X_train_copy = deepcopy(X_train)
y_train_copy = deepcopy(y_train)
X_test_copy = deepcopy(X_test)
y_test_copy = deepcopy(y_test)
train_and_test_XGBC_puTwoStep(X_train, y_train, X_test, y_test)

type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (198901, 19)
shape of X_train:: (198901, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (850, 19)
shape of X_test:: (850, 22)
pred is:: [0.7 0.8 0.8 ... 0.3 0.  0. ]
Step 1 labeled 0 new positives and 143341 new negatives.
Doing step1... range_P is:: [0.0, 0.9]
Step 1 labeled 1 new positives and 15622 new negatives.
Doing step2... range_P is:: [0.0, 0.9]
Step 1 labeled 0 new positives and 5528 new negatives.
Doing step3... range_P is:: [0.0, 0.9]
Step 1 labeled 0 new positives and 2774 new negatives.
Doing step4... range_P is:: [0.0, 0.9]
Step 1 labeled 0 new positives and 1493 new negatives.
Doing step5... range_P is:: [0.0, 0.9]
Step 1 labeled 0 new positives and 969 new negatives.
Doing step6... range_P is:: [0.0, 0.9]
Step 1 labeled 0 new positives and 663 new negatives.
Doing step7... range_P is:: [0.0, 0.9]
Step 1 lab

### 分析

通过上述结果发现，对于本任务而言，使用pu-bagging的效果要好于使用two-step的效果。