In [1]:
import pandas as pd
import sys
import os
import pickle
import numpy as np

import config # 自定义配置文件

In [2]:
# 导入自定义模块
sys.path.append(config.Py_path) # 添加路径
from SaveAndLoad import save_pkl,load_pkl # 数据文件持久化与加载
from BuildModel import get_train_test,BF_nn_Model
from ModelEvaluation import nnModel_ACC,Model_ACC

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
from tensorflow.keras.models import load_model

#### DF数据

In [4]:
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")
col_list = ["Education","age","gender"]

#### 词向量数据

In [5]:
# 加载TFIDF词向量数据
TFIDF_sp = load_pkl("TFIDF_sp")
TFIDF_HW_sp = load_pkl("TFIDF_HW_sp")

In [6]:
# 加载W2V词向量数据
W2V_X_sp = load_pkl("W2V_X_sp")
W2V_HW_sp = load_pkl("W2V_HW_sp")

### DBSCAN

In [7]:
from sklearn.cluster import DBSCAN

In [21]:
DB_Model = DBSCAN(
        eps=0.5,  # 邻域的距离阈值ϵ\epsilon
        min_samples=5,  # 核心对象所需要的邻域的样本数阈值
        metric='euclidean', # 度量方式;欧氏距离
        algorithm='auto',  # 近邻算法求解方式;auto为自动选择
#         leaf_size=30,  # 近邻算法求解方式
        n_jobs=6 # CPU并行数
        )

In [None]:
DB_CLUSTER = DB_Model.fit_predict(TFIDF_sp.toarray())

In [None]:
DB_CLUSTER

In [None]:
DB_Model.labels_ # 分类结果

In [8]:
DB_Model_2 = DBSCAN(
        eps=0.5,  # 邻域的距离阈值ϵ\epsilon
        min_samples=5,  # 核心对象所需要的邻域的样本数阈值
        metric='euclidean', # 度量方式;欧氏距离
        algorithm='auto',  # 近邻算法求解方式;auto为自动选择
#         leaf_size=30,  # 近邻算法求解方式
        n_jobs=6 # CPU并行数
        )

In [9]:
DB_CLUSTER_2 = DB_Model_2.fit_predict(W2V_X_sp)
DB_CLUSTER_2

array([ 0,  0,  0, ..., -1,  0, -1], dtype=int64)

In [60]:
pd.DataFrame(DB_CLUSTER_2).value_counts()

-1     10148
 0      7386
 10       14
 9        11
 8        10
 12        7
 7         7
 6         7
 5         7
 16        7
 17        6
 13        5
 11        5
 14        5
 4         5
 3         5
 1         5
 18        5
 15        3
 2         3
dtype: int64

### TFIDF_NN

In [39]:
DB_CLUSTER_2.reshape((DB_CLUSTER_2.shape[0], 1))

array([[ 0],
       [ 0],
       [ 0],
       ...,
       [-1],
       [ 0],
       [-1]], dtype=int64)

In [41]:
np.hstack((W2V_X_sp, DB_CLUSTER_2.reshape((DB_CLUSTER_2.shape[0], 1))))

(17651, 101)

In [93]:
X_tr, X_te, Y_tr, Y_te = get_train_test(np.hstack((W2V_X_sp, DB_CLUSTER_2.reshape((DB_CLUSTER_2.shape[0], 1)))),data["age"])

In [94]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils

In [95]:
# One-Hot编码
y_train = np_utils.to_categorical(Y_tr)
y_test = np_utils.to_categorical(Y_te)

num_class = len(Y_tr.value_counts())  # 计算目标列类别数量
input_num = X_tr.shape[1]  # 输入层尺寸

In [96]:
model = tf.keras.Sequential()  # 实例化
# 输入层
model.add(Dense(300, input_shape=(input_num,)))  # 全连接层

# 隐含层
model.add(Dropout(0.3))  # 随机失活
model.add(Activation('tanh'))  # 激活函数,tanh
model.add(Dense(y_train.shape[1]))  # 全连接层

# 输出层
model.add(Activation('softmax'))  # 激活函数,softmax

# 配置训练方法
model.compile(loss='categorical_crossentropy',  # 损失函数，分类交叉熵
              optimizer='adadelta',  # 优化器，自适应增量 Adaptive Delta
              metrics=['accuracy'])  # 准确率评测，精确度

print("模型各层的参数状况")
print(model.summary())  # 查看模型

模型各层的参数状况
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 300)               30600     
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation_4 (Activation)    (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 7)                 2107      
_________________________________________________________________
activation_5 (Activation)    (None, 7)                 0         
Total params: 32,707
Trainable params: 32,707
Non-trainable params: 0
_________________________________________________________________
None


In [97]:
# 早停
from tensorflow.keras.callbacks import EarlyStopping
earlystop = EarlyStopping(monitor = 'val_loss',
                          mode='min',
                          min_delta = 0.01,
                          patience = 5,
                          verbose = 1,
                          )

In [98]:
# 早停并保存最优模型
from tensorflow.keras.callbacks import ModelCheckpoint
mc_earlystop = ModelCheckpoint(filepath=config.Model_path + "text_model.h5",
                     monitor='val_loss',
                     mode='max',
                     verbose=1,
                     save_best_only=True)

In [99]:
# 模型训练
history = model.fit(
    X_tr, y_train,  # XY
    # verbose=2,  # 0 为不在标准输出流输出日志信息；1 为输出进度条记录；2 没有进度条，只是输出一行记录
    epochs=100,  # 训练次数,训练模型的迭代数
    batch_size=128, # 批处理大小,每次梯度更新的样本数
    validation_data=(X_te, y_test),  # 验证数据
    shuffle=True,  # 在每个epoch之前对训练数据进行洗牌
    callbacks = [mc_earlystop], # 早停
)

Train on 12355 samples, validate on 5296 samples
Epoch 1/100

Epoch 00001: val_loss improved from -inf to 1.17624, saving model to ./Model/text_model.h5
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.17624
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.17624
Epoch 4/100

Epoch 00004: val_loss did not improve from 1.17624
Epoch 5/100

Epoch 00005: val_loss did not improve from 1.17624
Epoch 6/100

Epoch 00006: val_loss did not improve from 1.17624
Epoch 7/100

Epoch 00007: val_loss did not improve from 1.17624
Epoch 8/100

Epoch 00008: val_loss did not improve from 1.17624
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.17624
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.17624
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.17624
Epoch 12/100

Epoch 00012: val_loss did not improve from 1.17624
Epoch 13/100

Epoch 00013: val_loss did not improve from 1.17624
Epoch 14/100

Epoch 00014: val_loss did not improve from 1.17624
Epoch 15/1


Epoch 00043: val_loss did not improve from 1.17624
Epoch 44/100

Epoch 00044: val_loss did not improve from 1.17624
Epoch 45/100

Epoch 00045: val_loss did not improve from 1.17624
Epoch 46/100

Epoch 00046: val_loss did not improve from 1.17624
Epoch 47/100

Epoch 00047: val_loss did not improve from 1.17624
Epoch 48/100

Epoch 00048: val_loss did not improve from 1.17624
Epoch 49/100

Epoch 00049: val_loss did not improve from 1.17624
Epoch 50/100

Epoch 00050: val_loss did not improve from 1.17624
Epoch 51/100

Epoch 00051: val_loss did not improve from 1.17624
Epoch 52/100

Epoch 00052: val_loss did not improve from 1.17624
Epoch 53/100

Epoch 00053: val_loss did not improve from 1.17624
Epoch 54/100

Epoch 00054: val_loss did not improve from 1.17624
Epoch 55/100

Epoch 00055: val_loss did not improve from 1.17624
Epoch 56/100

Epoch 00056: val_loss did not improve from 1.17624
Epoch 57/100

Epoch 00057: val_loss did not improve from 1.17624
Epoch 58/100

Epoch 00058: val_loss di


Epoch 00086: val_loss did not improve from 1.17624
Epoch 87/100

Epoch 00087: val_loss did not improve from 1.17624
Epoch 88/100

Epoch 00088: val_loss did not improve from 1.17624
Epoch 89/100

Epoch 00089: val_loss did not improve from 1.17624
Epoch 90/100

Epoch 00090: val_loss did not improve from 1.17624
Epoch 91/100

Epoch 00091: val_loss did not improve from 1.17624
Epoch 92/100

Epoch 00092: val_loss did not improve from 1.17624
Epoch 93/100

Epoch 00093: val_loss did not improve from 1.17624
Epoch 94/100

Epoch 00094: val_loss did not improve from 1.17624
Epoch 95/100

Epoch 00095: val_loss did not improve from 1.17624
Epoch 96/100

Epoch 00096: val_loss did not improve from 1.17624
Epoch 97/100

Epoch 00097: val_loss did not improve from 1.17624
Epoch 98/100

Epoch 00098: val_loss did not improve from 1.17624
Epoch 99/100

Epoch 00099: val_loss did not improve from 1.17624
Epoch 100/100

Epoch 00100: val_loss did not improve from 1.17624


In [100]:
# 计算模型准确度
model_pred_tr,model_pred_te = nnModel_ACC(model,X_tr, X_te, Y_tr, Y_te,ReYN = True)

训练集准确度: 0.5658, 测试集准确度: 0.5657


In [101]:
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression(max_iter=10000)

LR_model.fit(model_pred_tr,Y_tr)

Model_ACC(LR_model,model_pred_tr, model_pred_te, Y_tr, Y_te)

训练集准确度: 0.5723, 测试集准确度: 0.5712


In [102]:
from sklearn import svm

# 模型实例化
clf = svm.SVC(probability=True)

clf.fit(model_pred_tr,Y_tr) # 模型训练

Model_ACC(clf,model_pred_tr, model_pred_te, Y_tr, Y_te)

训练集准确度: 0.5759, 测试集准确度: 0.5689


In [103]:
import xgboost as xgb

XGB_Model = xgb.XGBClassifier(use_label_encoder=False)

Y_tr[Y_tr==6] = 0
Y_te[Y_te==6] = 0

XGB_Model = XGB_Model.fit(model_pred_tr,Y_tr)
XGB_Model



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [104]:
Model_ACC(XGB_Model,model_pred_tr, model_pred_te, Y_tr, Y_te)

训练集准确度: 0.8376, 测试集准确度: 0.5508


In [105]:
from sklearn.tree import DecisionTreeClassifier

In [106]:
for Mnum in range(350,410,5):
    Tr_Model = DecisionTreeClassifier(criterion='entropy',min_samples_leaf=Mnum,max_depth=500)
    Tr_Model.fit(model_pred_tr,Y_tr)
    Model_ACC(Tr_Model,model_pred_tr, model_pred_te, Y_tr, Y_te)

训练集准确度: 0.5733, 测试集准确度: 0.5674
训练集准确度: 0.5733, 测试集准确度: 0.5638
训练集准确度: 0.5732, 测试集准确度: 0.5633
训练集准确度: 0.5732, 测试集准确度: 0.5633
训练集准确度: 0.5732, 测试集准确度: 0.5633
训练集准确度: 0.5730, 测试集准确度: 0.5633
训练集准确度: 0.5730, 测试集准确度: 0.5633
训练集准确度: 0.5730, 测试集准确度: 0.5633
训练集准确度: 0.5730, 测试集准确度: 0.5657
训练集准确度: 0.5730, 测试集准确度: 0.5657
训练集准确度: 0.5735, 测试集准确度: 0.5646
训练集准确度: 0.5730, 测试集准确度: 0.5657


In [89]:
Tr_Model = DecisionTreeClassifier(criterion='gini',min_samples_leaf=30)

In [90]:
Tr_Model.fit(model_pred_tr,Y_tr)

DecisionTreeClassifier(min_samples_leaf=30)

In [91]:
Model_ACC(Tr_Model,model_pred_tr, model_pred_te, Y_tr, Y_te)

训练集准确度: 0.8323, 测试集准确度: 0.8121


In [107]:
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression

STKC = StackingClassifier(classifiers=[LR_model, clf, XGB_Model],
                          use_probas=False, # 类别概率值作为meta-classfier的输入
                          average_probas=False,  # 是否对每一个类别产生的概率值做平均
                          meta_classifier=xgb.XGBClassifier(use_label_encoder=False))

STKC.fit(model_pred_tr,Y_tr)

Model_ACC(STKC,model_pred_tr, model_pred_te, Y_tr, Y_te)

训练集准确度: 0.8376, 测试集准确度: 0.5508
