In [1]:
import pandas as pd
import sys
import os
import pickle
import numpy as np
import h5py
import config # 自定义配置文件
import time
from datetime import datetime

# 不显示VisibleDeprecation警告
import warnings
warnings.filterwarnings("ignore", category=Warning)

In [2]:
# 导入自定义模块
sys.path.append(config.Py_path) # 添加路径
from SaveAndLoad import save_pkl,load_pkl # 数据文件持久化与加载
from BuildModel import get_train_test,BF_nn_Model,BF_XGB_Model
from ModelEvaluation import nnModel_ACC,Model_ACC

In [3]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [4]:
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")
col_list = ["Education","age","gender"]

In [38]:
# 加载TFIDF词向量数据
TFIDF_HW_sp_toarray = load_pkl("TFIDF_HW_sp_toarray")

In [39]:
with h5py.File(config.WV_Data_path + "TFIDF_sp_toarray.hdf5", 'r') as f:  # 读取的时候是‘r’
    print(f.keys())
    TFIDF_sp_toarray = f.get("TFIDF_sp_toarray")[:]

KeysView(<HDF5 file "TFIDF_sp_toarray.hdf5" (mode r)>)


In [40]:
# 加载W2V词向量数据
W2V_X_sp = load_pkl("W2V_X_sp")
W2V_HW_sp = load_pkl("W2V_HW_sp")

In [41]:
# 加载D2V词向量数据
D2V_X_sp = load_pkl("D2V_X_sp")

In [5]:
X_col = ['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]

In [7]:
pred_dict = {}
for y_col in col_list:
    # 构建特征与目标
    X = np.array(data[X_col])
    Y = data[y_col]
    num_class = len(Y.value_counts())
    if len(Y.unique()) > 2:
        Y[Y==6] = 0
    else:
        Y[Y==2] = 0
        
    y = np.array([Y]).T
    
    # 交叉检验
    KF = KFold(
    n_splits=3, # 折叠次数；拆分数量
    shuffle=True, # 是否在拆分成批之前对数据进行混洗
 )
    for k,(tr,te) in enumerate(KF.split(X,y)):
        # 构建训练集与测试集
        X_train = X.take(tr,axis = 0) # numpy按行取值
        X_test = X.take(te,axis = 0)
        y_train = Y.take(tr,axis = 0)
        y_test = Y.take(te,axis = 0)
        
        XG = BF_XGB_Model(X_train, X_test, y_train, y_test)
        print('{} {} stack:{}  XG'.format(datetime.now(),y_col,k+1),end="\t")
        Model_ACC(XG,X_train,X_test,y_train,y_test)
    XG_pred = XG.predict_proba(X)
    pred_dict[y_col] = XG_pred

2023-03-07 15:40:06.653841 Education stack:1  XG	训练集准确度: 0.586471, 测试集准确度: 0.406866
2023-03-07 15:40:08.844978 Education stack:2  XG	训练集准确度: 0.593779, 测试集准确度: 0.396499
2023-03-07 15:40:10.993231 Education stack:3  XG	训练集准确度: 0.586676, 测试集准确度: 0.413225
2023-03-07 15:40:13.192347 age stack:1  XG	训练集准确度: 0.570239, 测试集准确度: 0.392080
2023-03-07 15:40:15.362540 age stack:2  XG	训练集准确度: 0.568369, 测试集准确度: 0.387661
2023-03-07 15:40:17.508798 age stack:3  XG	训练集准确度: 0.563307, 测试集准确度: 0.373279
2023-03-07 15:40:18.424348 gender stack:1  XG	训练集准确度: 0.731282, 测试集准确度: 0.565262
2023-03-07 15:40:19.310976 gender stack:2  XG	训练集准确度: 0.729498, 测试集准确度: 0.559313
2023-03-07 15:40:20.186633 gender stack:3  XG	训练集准确度: 0.723148, 测试集准确度: 0.562128


In [126]:
pred = np.hstack([pred_dict["Education"],pred_dict["age"],pred_dict["gender"]])

In [127]:
# 划分数据集
X_tr, X_te, Y_tr, Y_te = get_train_test(pred,data["Education"])

In [128]:
NN_Model = BF_nn_Model(X_tr, X_te, Y_tr, Y_te)

Train on 12355 samples, validate on 5296 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [48]:
NN_pred = NN_Model.predict_proba(pred) # 测试集预测

In [129]:
nn_pred_dict = {}
for y_col in col_list:
    print('{} {} stack: NN'.format(datetime.now(),y_col),end="\t")
    # 划分数据集
    X_tr, X_te, Y_tr, Y_te = get_train_test(pred,data[y_col])
    NN_Model = BF_nn_Model(X_tr, X_te, Y_tr, Y_te)
    NN_pred = NN_Model.predict_proba(pred) # 测试集预测
    nn_pred_dict[y_col] = NN_pred # 保存结果
    

2023-03-08 10:16:08.516506 Education stack: NN	Train on 12355 samples, validate on 5296 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
2023-03-08 10:16:13.836679 age stack: NN	Train on 12355 samples, validate on 5296 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
2023-03-08 10:16:18.780885 gender stack: NN	Train on 12355 samples, validate on 5296 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [133]:
nn_pred = np.hstack([nn_pred_dict["Education"],nn_pred_dict["age"],nn_pred_dict["gender"]])

In [134]:
nn_pred.shape

(17651, 14)

In [149]:
save_pkl(nn_pred,"AF_nn_pred")

持久化存储路径：./data/WordVectorData/AF_nn_pred.feat


In [141]:
# 划分数据集
X_tr, X_te, Y_tr, Y_te = get_train_test(nn_pred,data["Education"])

In [142]:
LR_RF_model = LogisticRegression(max_iter=10000)
LR_RF_model.fit(X_tr,Y_tr)
Model_ACC(LR_RF_model,X_tr,X_te,Y_tr,Y_te)

训练集准确度: 0.543828, 测试集准确度: 0.533233


In [143]:
XG_Model = xgb.XGBClassifier()
XG_Model.fit(X_tr,Y_tr,eval_metric='auc')
Model_ACC(XG_Model,X_tr,X_te,Y_tr,Y_te)

训练集准确度: 0.886443, 测试集准确度: 0.505287


In [52]:
LR_pred = LR_RF_model.predict_proba(NN_pred)

In [114]:
LR_pred.shape

(17651, 6)

In [144]:
X_tr, X_te, Y_tr, Y_te = get_train_test(np.hstack([W2V_X_sp,nn_pred]),data["Education"])

In [145]:
model = LogisticRegression(max_iter=10000)
model.fit(X_tr,Y_tr)
Model_ACC(model,X_tr,X_te,Y_tr,Y_te)

训练集准确度: 0.653824, 测试集准确度: 0.630665


In [146]:
model_nn = BF_nn_Model(X_tr,X_te,Y_tr,Y_te)
nnModel_ACC(model_nn,X_tr,X_te,Y_tr,Y_te)

Train on 12355 samples, validate on 5296 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
训练集准确度: 0.637070, 测试集准确度: 0.624622


In [94]:
from sklearn.model_selection import GridSearchCV

In [93]:
model_xgb = xgb.XGBClassifier(
    booster="gbtree",  # # 基分类器；gbtree 树模型，gbliner 线性模型
    objective="multi:softprob",  # 目标函数；multi：softprob 返回概率，multi：softmax multi：softmax
    max_depth=12,  # 树的深度
    min_child_weight=1,  # 最小叶子节点样本权重和
    subsample=0.8,  # 随机选择XX%样本建立决策树
    colsample_bytree=0.8,  # 构造每个树时列的子采样率
)
model_xgb.fit(X_tr,Y_tr,
              eval_metric="merror" # 多分类对数损失（交叉熵）
             )
Model_ACC(model_xgb,X_tr,X_te,Y_tr,Y_te)

训练集准确度: 1.000000, 测试集准确度: 0.632175


In [100]:
XGB_CV_params = {
        'max_depth': range(3,16,1),
        'subsample': [round(i,1) for i in np.arange(0.1,1.1,0.1)],
        'colsample_bytree' : [round(i,1) for i in np.arange(0.1,1.1,0.1)],
}

In [101]:
model_xgb = xgb.XGBClassifier(
    booster="gbtree",  # # 基分类器；gbtree 树模型，gbliner 线性模型
    objective="multi:softprob",  # 目标函数；multi：softprob 返回概率，multi：softmax multi：softmax
    min_child_weight=1,  # 最小叶子节点样本权重和
)
Grid_xgb = GridSearchCV(model_xgb,param_grid=XGB_CV_params,cv=5)

In [103]:
start = time.time()
Grid_xgb.fit(X_tr,Y_tr,eval_metric="merror")
end = time.time()
print('程序运行时间：%.2f分' %((end - start)/60))

































































































































































































































































































































































































































































程序运行时间：892.50分


In [104]:
means = Grid_xgb.cv_results_['mean_test_score']
params = Grid_xgb.cv_results_['params']
for i in range(len(means)):
    print('训练过程分数及参数：',means[i],params[i] )
print('\n最好的分数以及参数：',Grid_xgb.best_score_,Grid_xgb.best_params_)

训练过程分数及参数： 0.5969243221367867 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.1}
训练过程分数及参数： 0.6188587616349656 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.2}
训练过程分数及参数： 0.6246054229057062 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.3}
训练过程分数及参数： 0.6298664508296236 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.4}
训练过程分数及参数： 0.6322136786726021 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.5}
训练过程分数及参数： 0.6340752731687576 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.6}
训练过程分数及参数： 0.635936867664913 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.7}
训练过程分数及参数： 0.6391744233104006 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.8}
训练过程分数及参数： 0.6365843787940105 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 0.9}
训练过程分数及参数： 0.6369081343585593 {'colsample_bytree': 0.1, 'max_depth': 3, 'subsample': 1.0}
训练过程分数及参数： 0.5820315661675435 {'colsample_bytree': 0.1, 'max_depth': 4, 'subsample': 0.1}
训练过程分数及参数： 

训练过程分数及参数： 0.634237150951032 {'colsample_bytree': 0.6, 'max_depth': 3, 'subsample': 0.7}
训练过程分数及参数： 0.6370700121408337 {'colsample_bytree': 0.6, 'max_depth': 3, 'subsample': 0.8}
训练过程分数及参数： 0.6396600566572237 {'colsample_bytree': 0.6, 'max_depth': 3, 'subsample': 0.9}
训练过程分数及参数： 0.6376365843787941 {'colsample_bytree': 0.6, 'max_depth': 3, 'subsample': 1.0}
训练过程分数及参数： 0.5838931606636989 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.1}
训练过程分数及参数： 0.606717927964387 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.2}
训练过程分数及参数： 0.6202347227842978 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.3}
训练过程分数及参数： 0.6260623229461756 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.4}
训练过程分数及参数： 0.6260623229461756 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.5}
训练过程分数及参数： 0.630028328611898 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.6}
训练过程分数及参数： 0.6292998785916633 {'colsample_bytree': 0.6, 'max_depth': 4, 'subsample': 0.7}
训练过程分数及参数： 0.

In [147]:
model_xgb = xgb.XGBClassifier(
#         num_class=num_class,
    booster="gbtree",  # # 基分类器；gbtree 树模型，gbliner 线性模型
    objective="multi:softprob",  # 目标函数；multi：softprob 返回概率，multi：softmax multi：softmax
    max_depth=8,  # 最大树深度
    min_child_weight=1,  # 最小叶子节点样本权重和
    subsample=0.8,  # 随机选择训练实例的子样本比率
    colsample_bytree=0.7,  # 构造每个树时列的子采样率
)
model_xgb.fit(X_tr,Y_tr,eval_metric="merror")
Model_ACC(model_xgb,X_tr,X_te,Y_tr,Y_te)

训练集准确度: 1.000000, 测试集准确度: 0.623489
