In [1]:
import pandas as pd
import sys
import os
import pickle
import config # 自定义配置文件
import numpy as np

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils
import xgboost as xgb
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# 传入特征与目标列进行划分训练集与测试集
def get_train_test(X,Y):
    X_train, X_test,  Y_train, Y_test = train_test_split(X, # 特征
                                                         Y, # 目标
                                                         test_size = 0.3, # 测试集大小为30%
                                                         random_state = 10)
    return X_train, X_test,  Y_train, Y_test

In [4]:
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")
col_list = ["Education","age","gender"]

In [5]:
W2V_nn_pred_df = pd.read_csv(config.FEData_path + "W2V_nn.csv")
W2V_HW_pred_df = pd.read_csv(config.FEData_path + "W2V_HW_nn.csv")
TFIDF_HW_pred_df = pd.read_csv(config.FEData_path + "TFIDF_HW_nn.csv")
TFIDF_nn_pred_df = pd.read_csv(config.FEData_path + "TFIDF_nn.csv")

In [6]:
pred_ALL = np.array(pd.concat([W2V_nn_pred_df,W2V_HW_pred_df,TFIDF_HW_pred_df,TFIDF_nn_pred_df,
                              data[['SpaceNum', 'SpaceRATIO','LinkNum', 'LinkRATIO', 'TextSum', 
                                    'TextMax', 'TextMin', 'TextMedian','TextMean', 'SearchNum',]]],axis=1))

In [8]:
X_tr, X_te, Y_tr, Y_te = get_train_test(pred_ALL,data["Education"])

In [9]:
num_class = len(Y_tr.value_counts()) # 计算目标列类别数量
input_num = X_tr.shape[1] # 输入层尺寸

In [10]:
# One-Hot编码
y_train = np_utils.to_categorical(Y_tr)
y_test = np_utils.to_categorical(Y_te)

In [11]:
model = tf.keras.Sequential() # 实例化
# 输入层
model.add(Dense(300,input_shape=(input_num,))) # 全连接层

# 隐含层
model.add(Dropout(0.5)) # 随机失活
model.add(Activation('tanh')) # 激活函数,tanh
model.add(Dense(y_train.shape[1])) # 全连接层

# 输出层
model.add(Activation('softmax')) # 激活函数,softmax

# 配置训练方法
model.compile(loss='categorical_crossentropy', # 损失函数，分类交叉熵
                      optimizer='adadelta', # 优化器，自适应增量 Adaptive Delta
                      metrics=['accuracy']) # 准确率评测，精确度

print("模型各层的参数状况")
print(model.summary()) # 查看模型

模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               23700     
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
activation (Activation)      (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 2107      
_________________________________________________________________
activation_1 (Activation)    (None, 7)                 0         
Total params: 25,807
Trainable params: 25,807
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# 模型训练
history = model.fit(
        X_tr, y_train, # XY
        verbose=2,# 0 为不在标准输出流输出日志信息；1 为输出进度条记录；2 没有进度条，只是输出一行记录
        epochs=35, # 训练次数,训练模型的迭代数
#             batch_size=128, # 批处理大小,每次梯度更新的样本数
        validation_data=(X_te, y_test),# 验证数据
        shuffle=True, # 在每个epoch之前对训练数据进行洗牌
       )

Train on 12355 samples, validate on 5296 samples
Epoch 1/35
 - 1s - loss: 1.4038 - acc: 0.3568 - val_loss: 1.5261 - val_acc: 0.4156
Epoch 2/35
 - 0s - loss: 1.3117 - acc: 0.3841 - val_loss: 1.8038 - val_acc: 0.4156
Epoch 3/35
 - 0s - loss: 1.2857 - acc: 0.3891 - val_loss: 1.4162 - val_acc: 0.4156
Epoch 4/35
 - 0s - loss: 1.2722 - acc: 0.3950 - val_loss: 1.4739 - val_acc: 0.3153
Epoch 5/35
 - 0s - loss: 1.2693 - acc: 0.3986 - val_loss: 1.3338 - val_acc: 0.3148
Epoch 6/35
 - 0s - loss: 1.2654 - acc: 0.4035 - val_loss: 1.4013 - val_acc: 0.4158
Epoch 7/35
 - 0s - loss: 1.2630 - acc: 0.3988 - val_loss: 1.6429 - val_acc: 0.4158
Epoch 8/35
 - 0s - loss: 1.2627 - acc: 0.4011 - val_loss: 1.4598 - val_acc: 0.4158
Epoch 9/35
 - 0s - loss: 1.2652 - acc: 0.4007 - val_loss: 1.4894 - val_acc: 0.3153
Epoch 10/35
 - 0s - loss: 1.2659 - acc: 0.3991 - val_loss: 1.7027 - val_acc: 0.4158
Epoch 11/35
 - 0s - loss: 1.2635 - acc: 0.4049 - val_loss: 1.3947 - val_acc: 0.4156
Epoch 12/35
 - 0s - loss: 1.2622 - a

In [13]:
# 预测
pred_train = model.predict_proba(X_tr) # 训练集预测
pred_test = model.predict_proba(X_te) # 测试集预测

In [26]:
# 准确度计算
train_acc = accuracy_score(Y_tr,np.argmax(pred_train,axis=1))
test_acc = accuracy_score(Y_te,np.argmax(pred_test,axis=1))
print("训练集准确度: {0:.4f}, 测试集准确度: {1:.4f}".format(train_acc, test_acc))

训练集准确度: 0.3191, 测试集准确度: 0.3153
