## 功能简介

在有标签数据集中，正例数目17000左右，负例数目1709个，所以存在严重的不平衡问题，所以我们尝试解决这个问题。

为了解决这个问题，我们需要处理不平衡数据，本py文件使用的是过采样的方法，使用的SMOTE，RandomOverSampler和ADASYN。

特征选择使用的是RFECV（由featureSelectionBasic.ipynb得到）

参考文章：https://beckernick.github.io/oversampling-modeling/

In [12]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.feature_extraction import DictVectorizer  
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support
from imblearn.over_sampling import SMOTE,ADASYN,RandomOverSampler
import pickle
import json
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

## 1、获取数据

In [13]:
connection = pymysql.Connect(
    host="localhost",
    port=3306,
    user="root",
    passwd="root",
    charset="utf8",
    db="project_researchers"
)

In [14]:
def get_train_data(connection):
    """
    查询训练数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 1 and teac_id > 174 and category is not null
     UNION ALL
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 0 and teac_id > 64438 and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

train_data = get_train_data(connection)
print("shape of train_data:", train_data.shape)
print("train_data.info():", train_data.info())

shape of train_data: (17844, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 21 columns):
bys_cn                17608 non-null float64
hindex_cn             17707 non-null float64
a_paper               17844 non-null int64
b_paper               17844 non-null int64
c_paper               17844 non-null int64
papernum2017          17844 non-null int64
papernum2016          17844 non-null int64
papernum2015          17844 non-null int64
papernum2014          17844 non-null int64
papernum2013          17844 non-null int64
num_journal           17844 non-null int64
num_conference        17844 non-null int64
degree                17774 non-null float64
pagerank              17774 non-null float64
degree_centrality     17774 non-null float64
diff_year             17774 non-null float64
coauthors_top10000    17844 non-null int64
coauthors_top20000    17844 non-null int64
coauthors_top30000    17844 non-null int64
category              17844 n

In [15]:
def get_test_data(connection):
    """
    查询测试数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 1 and teac_id <= 174 and category is not null
     UNION ALL
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 0 and teac_id <= 64438 and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

test_data = get_test_data(connection)
print("shape of test_data:", test_data.shape)
print("test_data.info():", test_data.info())

shape of test_data: (850, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 21 columns):
bys_cn                834 non-null float64
hindex_cn             850 non-null int64
a_paper               850 non-null int64
b_paper               850 non-null int64
c_paper               850 non-null int64
papernum2017          850 non-null int64
papernum2016          850 non-null int64
papernum2015          850 non-null int64
papernum2014          850 non-null int64
papernum2013          850 non-null int64
num_journal           850 non-null int64
num_conference        850 non-null int64
degree                849 non-null float64
pagerank              849 non-null float64
degree_centrality     849 non-null float64
diff_year             849 non-null float64
coauthors_top10000    850 non-null int64
coauthors_top20000    850 non-null int64
coauthors_top30000    850 non-null int64
category              850 non-null int64
label                 850 non-null 

## 2、处理数据

In [16]:
# 对缺失值进行处理
# Method1：直接将含有缺失字段的值去掉
train_data = train_data.dropna()
print("shape of train_data::", train_data.shape)
print("train_data.info()::", train_data.info())
test_data = test_data.dropna()
print("shape of test_data::", test_data.shape)
print("test_data.info()::", test_data.info())
print("y_test_label::", Counter(test_data["label"]))

shape of train_data:: (17539, 21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17539 entries, 0 to 17843
Data columns (total 21 columns):
bys_cn                17539 non-null float64
hindex_cn             17539 non-null float64
a_paper               17539 non-null int64
b_paper               17539 non-null int64
c_paper               17539 non-null int64
papernum2017          17539 non-null int64
papernum2016          17539 non-null int64
papernum2015          17539 non-null int64
papernum2014          17539 non-null int64
papernum2013          17539 non-null int64
num_journal           17539 non-null int64
num_conference        17539 non-null int64
degree                17539 non-null float64
pagerank              17539 non-null float64
degree_centrality     17539 non-null float64
diff_year             17539 non-null float64
coauthors_top10000    17539 non-null int64
coauthors_top20000    17539 non-null int64
coauthors_top30000    17539 non-null int64
category              17539 

In [17]:
# 将category变成离散值（object），并将train_data变成向量（离散值变成one-hot）

# training data
train_data[['category']] = train_data[['category']].astype(object)
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])
print("****", X_train.shape)

vec = DictVectorizer()   

# 因为fDictVectorizer().fit_transform()需要的参数时list of dict，所以这里将其转化
headers_train = list(X_train.columns)
value_df_train = X_train.values
feature_list_train = []
for value_train in value_df_train:
    feature_dict = {}
    for i in range(0, len(headers_train)):
        if headers_train[i]=='category':
            feature_dict[headers_train[i]] = str(value_train[i])
        else:
            feature_dict[headers_train[i]] = value_train[i]
    feature_list_train.append(feature_dict)
print(len(feature_list_train), ' ', len(feature_list_train[0]))

X_train = vec.fit_transform(feature_list_train)
print("info of X_train:", X_train.shape)
print("y_train::", Counter(y_train))

# testing data
test_data[['category']] = test_data[['category']].astype(object)
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

headers_test = list(X_test.columns)
value_df_test = X_test.values
# print(headers)
# print(value_df.shape)
feature_list_test = []
for value_test in value_df_test:
    feature_dict = {}
    for i in range(0, len(headers_test)):
        if headers_test[i]=='category':
            feature_dict[headers_test[i]] = str(value_test[i])
        else:
            feature_dict[headers_test[i]] = value_test[i]
    feature_list_test.append(feature_dict)
print(len(feature_list_test), ' ', len(feature_list_test[0]))

X_test = vec.transform(feature_list_test)
print("info of X_test:", X_test.shape)
print("y_test::", Counter(y_test))

**** (17539, 20)
17539   20
info of X_train: (17539, 22)
y_train:: Counter({1: 16551, 0: 988})
833   20
info of X_test: (833, 22)
y_test:: Counter({0: 683, 1: 150})


## 3、获取需要预测的数据

In [18]:
def get_predict_data(connection):
    """
    获取需要预测的数据，包括训练集中的特征
    :param connection:
    :return:
    """
    sql_select = """
    SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category 
    FROM classifier_isTeacher_xgbc WHERE label is null and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category']
    data = df[all_features]
    return data

data_test = get_predict_data(connection)
print("shape of data_test:", data_test.shape)
print("data_test.info():", data_test.info())

shape of data_test: (181057, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181057 entries, 0 to 181056
Data columns (total 20 columns):
bys_cn                176811 non-null float64
hindex_cn             180624 non-null float64
a_paper               181057 non-null int64
b_paper               181057 non-null int64
c_paper               181057 non-null int64
papernum2017          181057 non-null int64
papernum2016          181057 non-null int64
papernum2015          181057 non-null int64
papernum2014          181057 non-null int64
papernum2013          181057 non-null int64
num_journal           181057 non-null int64
num_conference        181057 non-null int64
degree                180847 non-null float64
pagerank              180847 non-null float64
degree_centrality     180847 non-null float64
diff_year             180847 non-null float64
coauthors_top10000    181057 non-null int64
coauthors_top20000    181057 non-null int64
coauthors_top30000    181057 non-null int64
category

## 4、处理predictData

In [19]:
# 使用0进行填充
data_test_fill = data_test.fillna(0)
print("info of data_test_fill::", data_test_fill.info())

# 因为fDictVectorizer().fit_transform()需要的参数时list of dict，所以这里将其转化
headers_test_all = list(data_test_fill.columns)
value_df_train_all = data_test_fill.values
feature_list_train_all = []
for value_test_all in value_df_train_all:
    feature_dict = {}
    for i in range(0, len(headers_test_all)):
        if headers_test_all[i]=='category':
            feature_dict[headers_test_all[i]] = str(value_test_all[i])
        else:
            feature_dict[headers_test_all[i]] = value_test_all[i]
    feature_list_train_all.append(feature_dict)
print(len(feature_list_train_all), ' ', len(feature_list_train_all[0]))

X_test_all = vec.transform(feature_list_train_all)

print("shape of X_test_all::", X_test_all.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181057 entries, 0 to 181056
Data columns (total 20 columns):
bys_cn                181057 non-null float64
hindex_cn             181057 non-null float64
a_paper               181057 non-null int64
b_paper               181057 non-null int64
c_paper               181057 non-null int64
papernum2017          181057 non-null int64
papernum2016          181057 non-null int64
papernum2015          181057 non-null int64
papernum2014          181057 non-null int64
papernum2013          181057 non-null int64
num_journal           181057 non-null int64
num_conference        181057 non-null int64
degree                181057 non-null float64
pagerank              181057 non-null float64
degree_centrality     181057 non-null float64
diff_year             181057 non-null float64
coauthors_top10000    181057 non-null int64
coauthors_top20000    181057 non-null int64
coauthors_top30000    181057 non-null int64
category              181057 non-null int

## 5、使用SMOTE方法

In [23]:
def train_and_test_XGBC_SMOTE(X_train, y_train, X_test, y_test, X_test_all):
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)

    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0:-3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0:-3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    sm = SMOTE(random_state=12, ratio=1.0)
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    print("shape of X_train_res::", X_train_res.shape)
    print("shape of y_train_res::", y_train_res.shape)
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_res, y_train_res)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))

    # 归一化
    X_continuous_test_all = ss.transform(X_test_all[:, 0:-3])
    print("type of X_continuous_test_all::", type(X_continuous_test_all))
    print("shape of X_continuous_test_all::", X_continuous_test_all.shape)
    
    # 将连续值和离散值拼接
    print("shape:::", X_test_all[:, -3:].shape)
    X_test_all = np.hstack((X_continuous_test_all.A, X_test_all[:, -3:].A))
    print("shape of X_test::", X_test_all.shape)
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))
    
# 调用预测函数
X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
train_and_test_XGBC_SMOTE(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all)

X_train.shape (17539, 22)
X_test.shape (833, 22)
type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (833, 19)
shape of X_test:: (833, 22)
shape of X_train:: (17539, 22)
shape of X_test:: (833, 22)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
shape of X_train_res:: (33102, 22)
shape of y_train_res:: (33102,)
              precision    recall  f1-score   support

           0    0.86823   0.99164   0.92584       598
           1    0.96667   0.61702   0.75325       235

   micro avg    0.88595   0.88595   0.88595       833
   macro avg    0.91745   0.80433   0.83954       833
weighted avg    0.89600   0.88595   0.87715       833

type of X_continuous_test_all:: <class 'scipy.sparse.csr.csr_

## 6、使用ADASYN方法

In [24]:
def train_and_test_XGBC_ADASYN(X_train, y_train, X_test, y_test, X_test_all):
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)

    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0:-3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0:-3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    sm = ADASYN(random_state=12, ratio=1.0)
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    print("shape of X_train_res::", X_train_res.shape)
    print("shape of y_train_res::", y_train_res.shape)
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_res, y_train_res)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))

    # 归一化
    X_continuous_test_all = ss.transform(X_test_all[:, 0:-3])
    print("type of X_continuous_test_all::", type(X_continuous_test_all))
    print("shape of X_continuous_test_all::", X_continuous_test_all.shape)
    
    # 将连续值和离散值拼接
    print("shape:::", X_test_all[:, -3:].shape)
    X_test_all = np.hstack((X_continuous_test_all.A, X_test_all[:, -3:].A))
    print("shape of X_test::", X_test_all.shape)
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))
    
# 调用预测函数
X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
train_and_test_XGBC_ADASYN(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all)

X_train.shape (17539, 22)
X_test.shape (833, 22)
type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (833, 19)
shape of X_test:: (833, 22)
shape of X_train:: (17539, 22)
shape of X_test:: (833, 22)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
shape of X_train_res:: (33028, 22)
shape of y_train_res:: (33028,)
              precision    recall  f1-score   support

           0    0.90776   0.98569   0.94512       629
           1    0.94000   0.69118   0.79661       204

   micro avg    0.91357   0.91357   0.91357       833
   macro avg    0.92388   0.83843   0.87087       833
weighted avg    0.91566   0.91357   0.90875       833

type of X_continuous_test_all:: <class 'scipy.sparse.csr.csr_

## 7、使用随机上采样方法

In [25]:
def train_and_test_XGBC_ROS(X_train, y_train, X_test, y_test, X_test_all):
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)

    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0:-3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0:-3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    sm = RandomOverSampler(random_state=12, ratio=1.0)
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    print("shape of X_train_res::", X_train_res.shape)
    print("shape of y_train_res::", y_train_res.shape)
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_res, y_train_res)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))

    # 归一化
    X_continuous_test_all = ss.transform(X_test_all[:, 0:-3])
    print("type of X_continuous_test_all::", type(X_continuous_test_all))
    print("shape of X_continuous_test_all::", X_continuous_test_all.shape)
    
    # 将连续值和离散值拼接
    print("shape:::", X_test_all[:, -3:].shape)
    X_test_all = np.hstack((X_continuous_test_all.A, X_test_all[:, -3:].A))
    print("shape of X_test::", X_test_all.shape)
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))
    
# 调用预测函数
X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
train_and_test_XGBC_ROS(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all)

X_train.shape (17539, 22)
X_test.shape (833, 22)
type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (833, 19)
shape of X_test:: (833, 22)
shape of X_train:: (17539, 22)
shape of X_test:: (833, 22)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
shape of X_train_res:: (33102, 22)
shape of y_train_res:: (33102,)
              precision    recall  f1-score   support

           0    0.95461   0.98341   0.96880       663
           1    0.92667   0.81765   0.86875       170

   micro avg    0.94958   0.94958   0.94958       833
   macro avg    0.94064   0.90053   0.91877       833
weighted avg    0.94891   0.94958   0.94838       833

type of X_continuous_test_all:: <class 'scipy.sparse.csr.csr_

从上面我们可以知道，使用随机上采用的结果比较好，这个结果和使用pu-learning的相当。