In [1]:
import pandas as pd
import sys
import os
import pickle
import config # 自定义配置文件
import numpy as np

### 数据载入

#### DF数据

In [2]:
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")
col_list = ["Education","age","gender"]

#### 词向量数据

In [19]:
def load_pkl(pkl_name):
    # 加载文件
    pkl_file = open(config.WV_Data_path + pkl_name + '.feat','rb')
    load_data = pickle.load(pkl_file)
    pkl_file.close()
    return load_data

In [4]:
# 加载TFIDF词向量数据
TFIDF_sp = load_pkl("TFIDF_sp")
TFIDF_HW_sp = load_pkl("TFIDF_HW_sp")

print("TFIDF_sp\t",TFIDF_sp.shape)
print("TFIDF_HW_sp\t",TFIDF_HW_sp.shape)

TFIDF_sp	 (17651, 336460)
TFIDF_HW_sp	 (17651, 12699)


In [5]:
# 加载W2V词向量数据
W2V_X_sp = load_pkl("W2V_X_sp")
W2V_HW_sp = load_pkl("W2V_HW_sp")

print("W2V_X_sp\t",W2V_X_sp.shape)
print("W2V_HW_sp\t",W2V_HW_sp.shape)

W2V_X_sp	 (17651, 100)
W2V_HW_sp	 (17651, 100)


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
# 传入特征与目标列进行划分训练集与测试集
def get_train_test(X,Y):
    X_train, X_test,  Y_train, Y_test = train_test_split(X, # 特征
                                                         Y, # 目标
                                                         test_size = 0.3, # 测试集大小为30%
                                                         random_state = 10)
    return X_train, X_test,  Y_train, Y_test

In [6]:
def SNN_Model(X_tr, X_te, Y_tr, Y_te):

    num_class = len(Y_tr.value_counts()) # 计算目标列类别数量
    input_num = X_tr.shape[1] # 输入层尺寸
    
    # One-Hot编码
    y_train = np_utils.to_categorical(Y_tr)
    y_test = np_utils.to_categorical(Y_te)
    
    model = tf.keras.Sequential() # 实例化
    # 输入层
    model.add(Dense(300,input_shape=(input_num,))) # 全连接层
    
    # 隐含层
    model.add(Dropout(0.5)) # 随机失活
    model.add(Activation('tanh')) # 激活函数,tanh
    model.add(Dense(y_train.shape[1])) # 全连接层
    
    # 输出层
    model.add(Activation('softmax')) # 激活函数,softmax
    
    # 配置训练方法
    model.compile(loss='categorical_crossentropy', # 损失函数，分类交叉熵
                          optimizer='adadelta', # 优化器，自适应增量 Adaptive Delta
                          metrics=['accuracy']) # 准确率评测，精确度
    
    print("模型各层的参数状况")
    print(model.summary()) # 查看模型
    
    # 模型训练
    history = model.fit(
            X_tr, y_train, # XY
            verbose=2,# 0 为不在标准输出流输出日志信息；1 为输出进度条记录；2 没有进度条，只是输出一行记录
            epochs=35, # 训练次数,训练模型的迭代数
#             batch_size=128, # 批处理大小,每次梯度更新的样本数
            validation_data=(X_te, y_test),# 验证数据
            shuffle=True, # 在每个epoch之前对训练数据进行洗牌
           )
    
    # 预测
    pred_train = model.predict_proba(X_tr) # 训练集预测
    pred_test = model.predict_proba(X_te) # 测试集预测
    
    return pred_train,pred_test

In [7]:
def ACC(y_train,y_test,pred_train,pred_test):
    # 准确度计算
    train_acc = accuracy_score(y_train,pred_train)
    test_acc = accuracy_score(y_test,pred_test)
    print("训练集准确度: {0:.4f}, 测试集准确度: {1:.4f}".format(train_acc, test_acc))
    

In [45]:
pred_df = pd.DataFrame()
for col in col_list:
    
    print(col,end="\t")# 打印当前训练目标列列名
    
    # 划分数据集
    # 训练特征集，测试特征集合，训练目标列，测试目标列
    X_tr, X_te, Y_tr, Y_te = get_train_test(TFIDF_sp,data[col]) 
    
    # 构建模型并进行训练
    # 输出为One-Hot编码后对应索引上概率值
    pred_tr,pred_te = SNN_Model(X_tr, X_te, Y_tr, Y_te) 
    
    # 根据概率值最大值输出预测类别
    pred_train = np.argmax(pred_tr,axis=1)
    pred_test = np.argmax(pred_te,axis=1)
    
    # 准确度计算
    ACC(Y_tr,Y_te,pred_train,pred_test)
    
    # 输出结果进行纵向拼接
    pred = np.vstack([pred_tr,pred_te])
    # 输出结果存储
    for i in range(pred.shape[1]):
        col_name = "{}_{}".format(col,i+1)
        pred_df[col_name] = pred[:,i]
        
    # 打印存储对象情况
    print(pred_df.columns)

    print("="*50)

Education	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 300)               100938300 
_________________________________________________________________
dropout_6 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation_12 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 7)                 2107      
_________________________________________________________________
activation_13 (Activation)   (None, 7)                 0         
Total params: 100,940,407
Trainable params: 100,940,407
Non-trainable params: 0
_________________________________________________________________
None
Train on 12355 samples, validate on 5296 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
训练集准

#### TFIDF_X_sp

In [70]:
TFIDF_nn_pred_df = pd.DataFrame()
for col in col_list:
    
    print(col,end="\t")# 打印当前训练目标列列名
    
    # 划分数据集
    # 训练特征集，测试特征集合，训练目标列，测试目标列
    X_tr, X_te, Y_tr, Y_te = get_train_test(TFIDF_sp,data[col]) 
    
    # 构建模型并进行训练
    # 输出为One-Hot编码后对应索引上概率值
    pred_tr,pred_te = SNN_Model(X_tr, X_te, Y_tr, Y_te) 
    
    # 根据概率值最大值输出预测类别
    pred_train = np.argmax(pred_tr,axis=1)
    pred_test = np.argmax(pred_te,axis=1)
    
    # 准确度计算
    ACC(Y_tr,Y_te,pred_train,pred_test)
    
    # 输出结果进行纵向拼接
    pred = np.vstack([pred_tr,pred_te])
    # 输出结果存储
    for i in range(pred.shape[1]):
        col_name = "{}_{}".format(col,i+1)
        TFIDF_nn_pred_df[col_name] = pred[:,i]
        
    # 打印存储对象情况
    print(TFIDF_nn_pred_df.columns)

    print("="*50)

Education	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 300)               100938300 
_________________________________________________________________
dropout_9 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation_18 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 7)                 2107      
_________________________________________________________________
activation_19 (Activation)   (None, 7)                 0         
Total params: 100,940,407
Trainable params: 100,940,407
Non-trainable params: 0
_________________________________________________________________
None
Train on 12355 samples, validate on 5296 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35

Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
训练集准确度: 1.0000, 测试集准确度: 0.5903
Index(['Education_1', 'Education_2', 'Education_3', 'Education_4',
       'Education_5', 'Education_6', 'Education_7', 'age_1', 'age_2', 'age_3',
       'age_4', 'age_5', 'age_6', 'age_7'],
      dtype='object')
gender	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 300)               100938300 
_________________________________________________________________
dropout_11 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation

Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
训练集准确度: 1.0000, 测试集准确度: 0.8259
Index(['Education_1', 'Education_2', 'Education_3', 'Education_4',
       'Education_5', 'Education_6', 'Education_7', 'age_1', 'age_2', 'age_3',
       'age_4', 'age_5', 'age_6', 'age_7', 'gender_1', 'gender_2', 'gender_3'],
      dtype='object')


In [None]:
# 保存模型输出结果
TFIDF_nn_pred_df.to_csv("TFIDF_nn.csv",index=False)

#### TFIDF_HW_sp

In [164]:
TFIDF_HW_pred_df = pd.DataFrame()
for col in col_list:
    
    print(col,end="\t")# 打印当前训练目标列列名
    
    # 划分数据集
    # 训练特征集，测试特征集合，训练目标列，测试目标列
    X_tr, X_te, Y_tr, Y_te = get_train_test(TFIDF_HW_sp,data[col]) 
    
    # 构建模型并进行训练
    # 输出为One-Hot编码后对应索引上概率值
    pred_tr,pred_te = SNN_Model(X_tr, X_te, Y_tr, Y_te) 
    
    # 根据概率值最大值输出预测类别
    pred_train = np.argmax(pred_tr,axis=1)
    pred_test = np.argmax(pred_te,axis=1)
    
    # 准确度计算
    ACC(Y_tr,Y_te,pred_train,pred_test)
    
    # 输出结果进行纵向拼接
    pred = np.vstack([pred_tr,pred_te])
    # 输出结果存储
    for i in range(pred.shape[1]):
        col_name = "{}_{}".format(col,i+1)
        TFIDF_HW_pred_df[col_name] = pred[:,i]
        
    # 打印存储对象情况
    print(TFIDF_HW_pred_df.columns)

    print("="*50)

Education	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_54 (Dense)             (None, 300)               3810000   
_________________________________________________________________
dropout_27 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation_54 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_55 (Dense)             (None, 7)                 2107      
_________________________________________________________________
activation_55 (Activation)   (None, 7)                 0         
Total params: 3,812,107
Trainable params: 3,812,107
Non-trainable params: 0
_________________________________________________________________
None
Train on 12355 samples, validate on 5296 samples
Epoch 1/35
 - 23s - loss: 1.2196 - acc: 0.4718 - val_loss:

Epoch 35/35
 - 21s - loss: 0.1304 - acc: 0.9645 - val_loss: 2.7164 - val_acc: 0.4449
训练集准确度: 0.9771, 测试集准确度: 0.4449
Index(['Education_1', 'Education_2', 'Education_3', 'Education_4',
       'Education_5', 'Education_6', 'Education_7', 'age_1', 'age_2', 'age_3',
       'age_4', 'age_5', 'age_6', 'age_7'],
      dtype='object')
gender	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 300)               3810000   
_________________________________________________________________
dropout_29 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation_58 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 3)                 903       
_____________________________________________________________

In [165]:
# 保存模型输出结果
TFIDF_HW_pred_df.to_csv("TFIDF_HW_nn.csv",index=False)

#### W2V_X_sp

In [166]:
W2V_nn_pred_df = pd.DataFrame()
for col in col_list:
    
    print(col,end="\t")# 打印当前训练目标列列名
    
    # 划分数据集
    # 训练特征集，测试特征集合，训练目标列，测试目标列
    X_tr, X_te, Y_tr, Y_te = get_train_test(W2V_X_sp,data[col]) 
    
    # 构建模型并进行训练
    # 输出为One-Hot编码后对应索引上概率值
    pred_tr,pred_te = SNN_Model(X_tr, X_te, Y_tr, Y_te) 
    
    # 根据概率值最大值输出预测类别
    pred_train = np.argmax(pred_tr,axis=1)
    pred_test = np.argmax(pred_te,axis=1)
    
    # 准确度计算
    ACC(Y_tr,Y_te,pred_train,pred_test)
    
    # 输出结果进行纵向拼接
    pred = np.vstack([pred_tr,pred_te])
    # 输出结果存储
    for i in range(pred.shape[1]):
        col_name = "{}_{}".format(col,i+1)
        W2V_nn_pred_df[col_name] = pred[:,i]
        
    # 打印存储对象情况
    print(W2V_nn_pred_df.columns)

    print("="*50)

Education	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_60 (Dense)             (None, 300)               30300     
_________________________________________________________________
dropout_30 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation_60 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_61 (Dense)             (None, 7)                 2107      
_________________________________________________________________
activation_61 (Activation)   (None, 7)                 0         
Total params: 32,407
Trainable params: 32,407
Non-trainable params: 0
_________________________________________________________________
None
Train on 12355 samples, validate on 5296 samples
Epoch 1/35
 - 2s - loss: 1.1215 - acc: 0.5184 - val_loss: 1.1013

训练集准确度: 0.5428, 测试集准确度: 0.5387
Index(['Education_1', 'Education_2', 'Education_3', 'Education_4',
       'Education_5', 'Education_6', 'Education_7', 'age_1', 'age_2', 'age_3',
       'age_4', 'age_5', 'age_6', 'age_7'],
      dtype='object')
gender	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_64 (Dense)             (None, 300)               30300     
_________________________________________________________________
dropout_32 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation_64 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_65 (Dense)             (None, 3)                 903       
_________________________________________________________________
activation_65 (Activation)   (None, 3)                 0         
Total params: 

In [167]:
# 保存模型输出结果
W2V_nn_pred_df.to_csv("W2V_nn.csv",index=False)

#### W2V_HW_sp

In [168]:
W2V_HW_pred_df = pd.DataFrame()
for col in col_list:
    
    print(col,end="\t")# 打印当前训练目标列列名
    
    # 划分数据集
    # 训练特征集，测试特征集合，训练目标列，测试目标列
    X_tr, X_te, Y_tr, Y_te = get_train_test(W2V_HW_sp,data[col]) 
    
    # 构建模型并进行训练
    # 输出为One-Hot编码后对应索引上概率值
    pred_tr,pred_te = SNN_Model(X_tr, X_te, Y_tr, Y_te) 
    
    # 根据概率值最大值输出预测类别
    pred_train = np.argmax(pred_tr,axis=1)
    pred_test = np.argmax(pred_te,axis=1)
    
    # 准确度计算
    ACC(Y_tr,Y_te,pred_train,pred_test)
    
    # 输出结果进行纵向拼接
    pred = np.vstack([pred_tr,pred_te])
    # 输出结果存储
    for i in range(pred.shape[1]):
        col_name = "{}_{}".format(col,i+1)
        W2V_HW_pred_df[col_name] = pred[:,i]
        
    # 打印存储对象情况
    print(W2V_HW_pred_df.columns)

    print("="*50)

Education	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             (None, 300)               30300     
_________________________________________________________________
dropout_33 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation_66 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_67 (Dense)             (None, 7)                 2107      
_________________________________________________________________
activation_67 (Activation)   (None, 7)                 0         
Total params: 32,407
Trainable params: 32,407
Non-trainable params: 0
_________________________________________________________________
None
Train on 12355 samples, validate on 5296 samples
Epoch 1/35
 - 2s - loss: 1.1340 - acc: 0.5051 - val_loss: 1.2427

训练集准确度: 0.5335, 测试集准确度: 0.5321
Index(['Education_1', 'Education_2', 'Education_3', 'Education_4',
       'Education_5', 'Education_6', 'Education_7', 'age_1', 'age_2', 'age_3',
       'age_4', 'age_5', 'age_6', 'age_7'],
      dtype='object')
gender	模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_70 (Dense)             (None, 300)               30300     
_________________________________________________________________
dropout_35 (Dropout)         (None, 300)               0         
_________________________________________________________________
activation_70 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_71 (Dense)             (None, 3)                 903       
_________________________________________________________________
activation_71 (Activation)   (None, 3)                 0         
Total params: 

In [169]:
# 保存模型输出结果
W2V_HW_pred_df.to_csv("W2V_HW_nn.csv",index=False)

In [8]:
W2V_nn_pred_df = pd.read_csv(config.FEData_path + "W2V_nn.csv")
W2V_HW_pred_df = pd.read_csv(config.FEData_path + "W2V_HW_nn.csv")
TFIDF_HW_pred_df = pd.read_csv(config.FEData_path + "TFIDF_HW_nn.csv")
TFIDF_nn_pred_df = pd.read_csv(config.FEData_path + "TFIDF_nn.csv")

In [14]:
X_tr, X_te, Y_tr, Y_te = get_train_test(np.array(pd.concat([data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum']],
                                                            W2V_nn_pred_df,W2V_HW_pred_df,TFIDF_HW_pred_df,TFIDF_nn_pred_df],axis=1)),data["Education"])

In [48]:
X_tr, X_te, Y_tr, Y_te = get_train_test(W2V_nn_pred_df,data["Education"])

In [60]:
from sklearn import svm
# 模型实例化
clf = svm.SVC()

In [61]:
clf.fit(X_tr,Y_tr) # 模型训练

SVC()

In [62]:
# 预测
pred_train = clf.predict(X_tr)
pred_test = clf.predict(X_te)

#准确率
train_acc = accuracy_score(Y_tr, pred_train)
test_acc = accuracy_score(Y_te, pred_test)
print ("训练集准确率: {0:.4f}, 测试集准确率: {1:.4f}".format(train_acc, test_acc))

训练集准确率: 0.3180, 测试集准确率: 0.0646


In [35]:
import xgboost as xgb

In [36]:
XGB_Model = xgb.XGBClassifier(use_label_encoder=False)

In [37]:
Y_tr[Y_tr==6] = 0
Y_te[Y_te==6] = 0

In [38]:
XGB_Model = XGB_Model.fit(X_tr,Y_tr)
XGB_Model



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [39]:
XGB_Model.score(X_tr,Y_tr)

0.9546742209631728

In [40]:
XGB_Model.score(X_te,Y_te)

0.40823262839879154

In [151]:
np.argmax(XGB_Model.predict_proba(X_te),axis=1)

array([1, 1, 2, ..., 1, 1, 1], dtype=int64)

In [None]:
model_list = ["W2V","W2V_HW","TFIDF","TFIDF_HW"]
for i in model_list:
    