In [1]:
import pandas as pd
import sys
import os
import pickle
import config # 自定义配置文件
import numpy as np

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils
import xgboost as xgb
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
# 传入特征与目标列进行划分训练集与测试集
def get_train_test(X,Y):
    X_train, X_test,  Y_train, Y_test = train_test_split(X, # 特征
                                                         Y, # 目标
                                                         test_size = 0.3, # 测试集大小为30%
                                                         random_state = 10)
    return X_train, X_test,  Y_train, Y_test

In [5]:
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")
col_list = ["Education","age","gender"]

In [6]:
W2V_nn_pred_df = pd.read_csv(config.FEData_path + "W2V_nn.csv")
W2V_HW_pred_df = pd.read_csv(config.FEData_path + "W2V_HW_nn.csv")
TFIDF_HW_pred_df = pd.read_csv(config.FEData_path + "TFIDF_HW_nn.csv")
TFIDF_nn_pred_df = pd.read_csv(config.FEData_path + "TFIDF_nn.csv")

In [52]:
pred_ALL = np.array(pd.concat([W2V_nn_pred_df,W2V_HW_pred_df,TFIDF_HW_pred_df,TFIDF_nn_pred_df,
                              data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 
                                    'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]]],axis=1))

In [53]:
X_tr, X_te, Y_tr, Y_te = get_train_test(pred_ALL,data["Education"])

In [61]:
# 目标列类别顺序化
uq = Y_tr.unique() # 取目标列类别
uq.sort() # 排序

Y_tr[Y_tr == uq[-1]] = 0
Y_te[Y_te == uq[-1]] = 0

In [54]:
def XGB_Model(X,Y):
    # 划分数据集
    X_tr, X_te, Y_tr, Y_te = get_train_test(X,Y)
    
    # 目标列类别顺序化
    uq = Y_tr.unique() # 取目标列类别
    uq.sort() # 排序
        
    Y_tr[Y_tr == uq[-1]] = 0
    Y_te[Y_te == uq[-1]] = 0
    
    # 模型实例化
    XGB_Model = xgb.XGBClassifier(use_label_encoder=False)
    # 模型训练
    XGB_Model = XGB_Model.fit(X_tr,Y_tr)
    print(XGB_Model)
    # 模型预测
    pred_train = XGB_Model.predict(X_tr)
    pred_test = XGB_Model.predict(X_te)
    
    # 输出预测结果
    train_acc = accuracy_score(Y_tr, pred_train)
    test_acc = accuracy_score(Y_te, pred_test)
    print ("训练集准确率: {0:.4f}, 测试集准确率: {1:.4f}".format(train_acc, test_acc))
    
    # 输出模型
    return XGB_Model

### W2V_XGB

#### W2V_nn_XGB

In [64]:
pred = np.array(pd.concat([W2V_nn_pred_df,
                          data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 
                                'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]]],axis=1))

In [65]:
W2V_nn_XGB = XGB_Model(pred,data["Education"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
训练集准确率: 0.8836, 测试集准确率: 0.4031


#### W2V_HW_XGB

In [66]:
pred = np.array(pd.concat([W2V_HW_pred_df,
                          data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 
                                'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]]],axis=1))

In [67]:
W2V_HW_XGB = XGB_Model(pred,data["Education"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
训练集准确率: 0.8853, 测试集准确率: 0.4045


### TFIDF_XGB

#### TFIDF_nn_XGB

In [68]:
pred = np.array(pd.concat([TFIDF_nn_pred_df,
                          data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 
                                'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]]],axis=1))

In [69]:
TFIDF_nn_XGB = XGB_Model(pred,data["Education"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
训练集准确率: 0.8687, 测试集准确率: 0.4052


#### TFIDF_HW_XGB

In [70]:
pred = np.array(pd.concat([TFIDF_HW_pred_df,
                          data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 
                                'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]]],axis=1))

In [71]:
TFIDF_HW_XGB = XGB_Model(pred,data["Education"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
训练集准确率: 0.8902, 测试集准确率: 0.4092


### Stacking

In [72]:
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [73]:
STKC = StackingClassifier(classifiers=[W2V_nn_XGB, W2V_HW_XGB, TFIDF_nn_XGB, TFIDF_HW_XGB],
                          use_probas=False, # 类别概率值作为meta-classfier的输入
                          average_probas=False,  # 是否对每一个类别产生的概率值做平均
                          meta_classifier=xgb.XGBClassifier())

In [74]:
STKC.fit(X_tr,Y_tr)







StackingClassifier(classifiers=[XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0,
                                              gpu_id=-1, importance_type='gain',
                                              interaction_constraints='',
                                              learning_rate=0.300000012,
                                              max_delta_step=0, max_depth=6,
                                              min_child_weight=1, missing=nan,
                                              monotone_constraints='()',
                                              n_estimators=100, n_jobs=12,
                                              num_par...
                                                 importance_type='gain',
                                                 i

In [75]:
# 模型预测
pred_train = STKC.predict(X_tr)
pred_test = STKC.predict(X_te)

# 输出预测结果
train_acc = accuracy_score(Y_tr, pred_train)
test_acc = accuracy_score(Y_te, pred_test)
print ("训练集准确率: {0:.4f}, 测试集准确率: {1:.4f}".format(train_acc, test_acc))

训练集准确率: 0.9657, 测试集准确率: 0.4065
