In [1]:
import pandas as pd
import sys
import os
import pickle
import numpy as np
import h5py
import config # 自定义配置文件
import time
from datetime import datetime

# 不显示VisibleDeprecation警告
import warnings
warnings.filterwarnings("ignore", category=Warning)

In [2]:
# 导入自定义模块
sys.path.append(config.Py_path) # 添加路径
from SaveAndLoad import save_pkl,load_pkl # 数据文件持久化与加载
from BuildModel import get_train_test,BF_nn_Model,BF_XGB_Model
from ModelEvaluation import Model_ACC_proba,Model_ACC

In [3]:
from tensorflow.keras.models import load_model

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

### 数据载入

#### DF数据

In [5]:
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")
col_list = ["Education","age","gender"]

In [6]:
# 加载TFIDF词向量数据
TFIDF_HW_sp_toarray = load_pkl("TFIDF_HW_sp_toarray")

In [7]:
with h5py.File(config.WV_Data_path + "TFIDF_sp_toarray.hdf5", 'r') as f:  # 读取的时候是‘r’
    print(f.keys())
    TFIDF_sp_toarray = f.get("TFIDF_sp_toarray")[:]

KeysView(<HDF5 file "TFIDF_sp_toarray.hdf5" (mode r)>)


In [8]:
# 加载W2V词向量数据
W2V_X_sp = load_pkl("W2V_X_sp")
W2V_HW_sp = load_pkl("W2V_HW_sp")

In [9]:
# 加载D2V词向量数据
D2V_X_sp = load_pkl("D2V_X_sp")
D2V_HW_X_sp = load_pkl("D2V_HW_X_sp")

In [10]:
# 加载人工特征二层模型输出
AF_nn_pred = load_pkl("AF_nn_pred")

In [11]:
pred_dict = {}
for col_y in col_list:
    # 生成特征与目标
    X = np.hstack([D2V_HW_X_sp,D2V_X_sp])
    Y = data[col_y]
    
    if len(Y.unique()) > 2:
        Y[Y==6] = 0
    else:
        Y[Y==2] = 0
    # 交叉检验
    KF = KFold(
        n_splits=3, # 折叠次数；拆分数量
        shuffle=True, # 是否在拆分成批之前对数据进行混洗
     )
    for k,(tr,te) in enumerate(KF.split(X,Y)):
        # 训练集与测试集划分
        X_train = X.take(tr,axis = 0) # numpy按行取值
        X_test = X.take(te,axis = 0) 
        y_train = Y[tr]
        y_test = Y[te]

        print('{} {}stack:{}  NN_Model'.format(datetime.now(),col_y,k+1),end="\t")
        NN_Model = BF_nn_Model(X_train, X_test, y_train, y_test) # NN模型构建及训练
        Model_ACC_proba(NN_Model,X_train, X_test, y_train, y_test) # 测试泛化效果
    NN_pred = NN_Model.predict_proba(X) # 输出概率结果
    pred_dict[col_y] = NN_pred # 保存结果


2023-03-09 10:47:47.243984 Educationstack:1  NN_Model	训练集准确度: 0.585281, 测试集准确度: 0.574269
2023-03-09 10:47:49.317417 Educationstack:2  NN_Model	训练集准确度: 0.579332, 测试集准确度: 0.571890
2023-03-09 10:47:51.354480 Educationstack:3  NN_Model	训练集准确度: 0.583956, 测试集准确度: 0.596294
2023-03-09 10:47:53.267725 agestack:1  NN_Model	训练集准确度: 0.564035, 测试集准确度: 0.570360
2023-03-09 10:47:55.082377 agestack:2  NN_Model	训练集准确度: 0.566500, 测试集准确度: 0.552175
2023-03-09 10:47:56.940036 agestack:3  NN_Model	训练集准确度: 0.565517, 测试集准确度: 0.563488
2023-03-09 10:47:58.983812 genderstack:1  NN_Model	训练集准确度: 0.831818, 测试集准确度: 0.827498
2023-03-09 10:48:00.947540 genderstack:2  NN_Model	训练集准确度: 0.830798, 测试集准确度: 0.832257
2023-03-09 10:48:02.911110 genderstack:3  NN_Model	训练集准确度: 0.831322, 测试集准确度: 0.825429


In [105]:
pred_dict = {}
for col_y in col_list:
    # 生成特征与目标
    X = np.hstack([D2V_HW_X_sp,D2V_X_sp])
    Y = data[col_y]
    
    if len(Y.unique()) > 2:
        Y[Y==6] = 0
    else:
        Y[Y==2] = 0
    # 交叉检验
    KF = KFold(
        n_splits=3, # 折叠次数；拆分数量
        shuffle=True, # 是否在拆分成批之前对数据进行混洗
     )
    for k,(tr,te) in enumerate(KF.split(X,Y)):
        # 训练集与测试集划分
        X_train = X.take(tr,axis = 0) # numpy按行取值
        X_test = X.take(te,axis = 0) 
        y_train = Y[tr]
        y_test = Y[te]

        print('{} {}stack:{}  NN_Model'.format(datetime.now(),col_y,k+1),end="\t")
        NN_Model = BF_nn_Model(X_train, X_test, y_train, y_test) # NN模型构建及训练
        Model_ACC_proba(NN_Model,X_train, X_test, y_train, y_test) # 测试泛化效果
    NN_pred = NN_Model.predict_proba(X) # 输出概率结果
    pred_dict[col_y] = NN_pred # 保存结果


2023-03-08 14:28:02.869740 Educationstack:1  NN_Model	训练集准确度: 0.675533, 测试集准确度: 0.609789
2023-03-08 14:28:14.294966 Educationstack:2  NN_Model	训练集准确度: 0.674259, 测试集准确度: 0.615738
2023-03-08 14:28:25.984380 Educationstack:3  NN_Model	训练集准确度: 0.676496, 测试集准确度: 0.610573
2023-03-08 14:28:38.371096 agestack:1  NN_Model	训练集准确度: 0.665250, 测试集准确度: 0.582427
2023-03-08 14:28:50.251622 agestack:2  NN_Model	训练集准确度: 0.664145, 测试集准确度: 0.584296
2023-03-08 14:29:01.693766 agestack:3  NN_Model	训练集准确度: 0.663664, 测试集准确度: 0.587455
2023-03-08 14:29:14.633857 genderstack:1  NN_Model	训练集准确度: 0.863092, 测试集准确度: 0.834636
2023-03-08 14:29:27.883278 genderstack:2  NN_Model	训练集准确度: 0.861902, 测试集准确度: 0.831747
2023-03-08 14:29:40.235484 genderstack:3  NN_Model	训练集准确度: 0.862339, 测试集准确度: 0.824919


In [12]:
first_floor_pred_nn = np.hstack([pred_dict["Education"],pred_dict["age"],pred_dict["gender"]])

In [13]:
# 划分数据集
X_tr, X_te, Y_tr, Y_te = get_train_test(first_floor_pred_nn,data["Education"])

In [14]:
XG = BF_XGB_Model(X_tr, X_te, Y_tr, Y_te)
Model_ACC(XG,X_tr, X_te, Y_tr, Y_te)

训练集准确度: 0.772481, 测试集准确度: 0.573452


In [15]:
lr = LogisticRegression(max_iter=100000)
lr.fit(X_tr,Y_tr)
Model_ACC(lr,X_tr, X_te, Y_tr, Y_te)

训练集准确度: 0.592635, 测试集准确度: 0.581193


In [16]:
pred_dict = {}
for col_y in col_list:
    # 生成特征与目标
    X = np.hstack([D2V_HW_X_sp,D2V_X_sp])
    Y = data[col_y]
    
    if len(Y.unique()) > 2:
        Y[Y==6] = 0
    else:
        Y[Y==2] = 0
    # 交叉检验
    KF = KFold(
        n_splits=3, # 折叠次数；拆分数量
        shuffle=True, # 是否在拆分成批之前对数据进行混洗
     )
    for k,(tr,te) in enumerate(KF.split(X,Y)):
        # 训练集与测试集划分
        X_train = X.take(tr,axis = 0) # numpy按行取值
        X_test = X.take(te,axis = 0) 
        y_train = Y[tr]
        y_test = Y[te]

        print('{} {} stack:{}  XGB_Model'.format(datetime.now(),col_y,k+1),end="\t")
        XGB_Model = BF_XGB_Model(X_train, X_test, y_train, y_test) # NN模型构建及训练
        Model_ACC_proba(XGB_Model,X_train, X_test, y_train, y_test) # 测试泛化效果
    XGB_pred = XGB_Model.predict_proba(X) # 输出概率结果
    pred_dict[col_y] = XGB_pred # 保存结果


2023-03-09 10:48:31.422039 Educationstack:1  XGB_Model	训练集准确度: 0.976715, 测试集准确度: 0.563052
2023-03-09 10:49:02.497978 Educationstack:2  XGB_Model	训练集准确度: 0.973315, 测试集准确度: 0.575289
2023-03-09 10:49:33.876108 Educationstack:3  XGB_Model	训练集准确度: 0.979181, 测试集准确度: 0.552949
2023-03-09 10:50:05.094666 agestack:1  XGB_Model	训练集准确度: 0.980624, 测试集准确度: 0.539939
2023-03-09 10:50:35.907880 agestack:2  XGB_Model	训练集准确度: 0.980454, 测试集准确度: 0.531441
2023-03-09 10:51:05.727177 agestack:3  XGB_Model	训练集准确度: 0.980031, 测试集准确度: 0.530682
2023-03-09 10:51:37.025521 genderstack:1  XGB_Model	训练集准确度: 0.986148, 测试集准确度: 0.818831
2023-03-09 10:51:49.107229 genderstack:2  XGB_Model	训练集准确度: 0.989972, 测试集准确度: 0.805744
2023-03-09 10:52:01.398377 genderstack:3  XGB_Model	训练集准确度: 0.988273, 测试集准确度: 0.813700


In [110]:
pred_dict = {}
for col_y in col_list:
    # 生成特征与目标
    X = np.hstack([D2V_HW_X_sp,D2V_X_sp])
    Y = data[col_y]
    
    if len(Y.unique()) > 2:
        Y[Y==6] = 0
    else:
        Y[Y==2] = 0
    # 交叉检验
    KF = KFold(
        n_splits=3, # 折叠次数；拆分数量
        shuffle=True, # 是否在拆分成批之前对数据进行混洗
     )
    for k,(tr,te) in enumerate(KF.split(X,Y)):
        # 训练集与测试集划分
        X_train = X.take(tr,axis = 0) # numpy按行取值
        X_test = X.take(te,axis = 0) 
        y_train = Y[tr]
        y_test = Y[te]

        print('{} {}stack:{}  XGB_Model'.format(datetime.now(),col_y,k+1),end="\t")
        XGB_Model = BF_XGB_Model(X_train, X_test, y_train, y_test) # NN模型构建及训练
        Model_ACC_proba(XGB_Model,X_train, X_test, y_train, y_test) # 测试泛化效果
    XGB_pred = XGB_Model.predict_proba(X) # 输出概率结果
    pred_dict[col_y] = XGB_pred # 保存结果


2023-03-08 14:29:58.122976 Educationstack:1  XGB_Model	训练集准确度: 0.989292, 测试集准确度: 0.551496
2023-03-08 14:30:28.012006 Educationstack:2  XGB_Model	训练集准确度: 0.989207, 测试集准确度: 0.538239
2023-03-08 14:30:58.153359 Educationstack:3  XGB_Model	训练集准确度: 0.988528, 测试集准确度: 0.541730
2023-03-08 14:31:28.200964 agestack:1  XGB_Model	训练集准确度: 0.990567, 测试集准确度: 0.524643
2023-03-08 14:31:58.675426 agestack:2  XGB_Model	训练集准确度: 0.990652, 测试集准确度: 0.521244
2023-03-08 14:32:29.267575 agestack:3  XGB_Model	训练集准确度: 0.991417, 测试集准确度: 0.512324
2023-03-08 14:32:59.929541 genderstack:1  XGB_Model	训练集准确度: 0.991842, 测试集准确度: 0.788239
2023-03-08 14:33:12.062074 genderstack:2  XGB_Model	训练集准确度: 0.992691, 测试集准确度: 0.782291
2023-03-08 14:33:24.178654 genderstack:3  XGB_Model	训练集准确度: 0.991247, 测试集准确度: 0.783104


In [17]:
first_floor_pred_XGB = np.hstack([pred_dict["Education"],pred_dict["age"],pred_dict["gender"]])

In [19]:
np.hstack([first_floor_pred_XGB,AF_nn_pred]).shape

(17651, 28)

In [84]:
# 划分数据集
X_tr, X_te, Y_tr, Y_te = get_train_test(np.hstack([first_floor_pred_XGB,AF_nn_pred]),data["Education"])

In [20]:
# 划分数据集
X_tr, X_te, Y_tr, Y_te = get_train_test(first_floor_pred_XGB,data["Education"])

In [116]:
X_tr.shape

(12355, 14)

In [117]:
X_te.shape

(5296, 14)

In [21]:
nn = BF_nn_Model(X_tr, X_te, Y_tr, Y_te)
Model_ACC_proba(nn,X_tr, X_te, Y_tr, Y_te)

训练集准确度: 0.838365, 测试集准确度: 0.828927


In [22]:
lr = LogisticRegression(max_iter=100000)
lr.fit(X_tr,Y_tr)
Model_ACC(lr,X_tr, X_te, Y_tr, Y_te)

训练集准确度: 0.844031, 测试集准确度: 0.835536


In [23]:
# 模型实例化
clf = svm.SVC(probability=True)
clf.fit(X_tr,Y_tr) # 模型训练
Model_ACC(clf,X_tr, X_te, Y_tr, Y_te)

训练集准确度: 0.853420, 测试集准确度: 0.842145
