In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import HuichuanFlow as ms
from HuichuanFlow.trainer import SimpleTrainer

# Load the Titanic data set and drop the columns that are not used as features.
data = pd.read_csv("../data/titanic.csv").drop(
    ["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

# Encoders shared by every categorical column.
le = LabelEncoder()
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    ohe = OneHotEncoder(sparse=False)


def _one_hot_encode(column, fill_value):
    """Return a dense one-hot matrix for a categorical column.

    Missing entries are replaced by ``fill_value`` first, so they end up as
    their own category. The column is label-encoded with the shared ``le``
    and the resulting integer codes are one-hot encoded with the shared
    ``ohe``; both encoders are re-fitted on every call.

    Parameters
    ----------
    column : pandas.Series
        The categorical column to encode.
    fill_value : scalar
        Replacement for missing values in ``column``.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(column), number of distinct categories).
    """
    codes = le.fit_transform(column.fillna(fill_value))
    return ohe.fit_transform(codes.reshape(-1, 1))


# One-hot encode the categorical features.
Pclass = _one_hot_encode(data["Pclass"], 0)
Sex = _one_hot_encode(data["Sex"], "")
Embarked = _one_hot_encode(data["Embarked"], "")

# Numeric features, with missing values replaced by 0.
numeric = data[["Age", "SibSp", "Parch", "Fare"]].fillna(0).values

# Full feature matrix; column layout is
# [Pclass one-hot | Sex one-hot | Age, SibSp, Parch, Fare | Embarked one-hot].
features = np.concatenate([Pclass, Sex, numeric, Embarked], axis=1)

# Labels: maps a 0/1 "Survived" column to -1/+1
# (assumes Survived only contains 0 and 1 -- TODO confirm).
labels = data["Survived"].values * 2 - 1

# Number of feature columns in the full feature matrix
dimension = features.shape[1]

# Dimension of each categorical embedding vector
k = 2

# Input node for the first-order (linear) term: one sample's full feature
# vector as a (dimension, 1) column; not trainable, fed through the trainer
x = ms.core.Variable(dim=(dimension, 1), init=False, trainable=False)

# Input nodes for the one-hot vectors of the three categorical features;
# each is sized by the width of the corresponding one-hot matrix
x_Pclass = ms.core.Variable(dim=(Pclass.shape[1], 1), init=False, trainable=False)
x_Sex = ms.core.Variable(dim=(Sex.shape[1], 1), init=False, trainable=False)
x_Embarked = ms.core.Variable(dim=(Embarked.shape[1], 1), init=False, trainable=False)


# Input node for the label of one sample (a single value)
label = ms.core.Variable(dim=(1, 1), init=False, trainable=False)

# Trainable weight row vector of the first-order term
w = ms.core.Variable(dim=(1, dimension), init=True, trainable=True)

# Trainable embedding matrices of the categorical features: one k-dimensional
# column per category, so (k, n_categories) times a one-hot (n_categories, 1)
# vector yields a (k, 1) embedding
E_Pclass = ms.core.Variable(dim=(k, Pclass.shape[1]), init=True, trainable=True)
E_Sex = ms.core.Variable(dim=(k, Sex.shape[1]), init=True, trainable=True)
E_Embarked = ms.core.Variable(dim=(k, Embarked.shape[1]), init=True, trainable=True)

# Trainable bias
b = ms.core.Variable(dim=(1, 1), init=True, trainable=True)


# Wide part: linear combination w * x of all features, shape (1, 1)
wide = ms.ops.MatMul(w, x)


# Deep part: the three embedding vectors (embedding matrix times one-hot input)
embedding_Pclass = ms.ops.MatMul(E_Pclass, x_Pclass)
embedding_Sex = ms.ops.MatMul(E_Sex, x_Sex)
embedding_Embarked = ms.ops.MatMul(E_Embarked, x_Embarked)

# Concatenate the three embedding vectors into one vector of 3 * k entries
embedding = ms.ops.Concat(
        embedding_Pclass,
        embedding_Sex,
        embedding_Embarked
        )

# First hidden layer; arguments are presumably (input node, input size,
# output size, activation) -- TODO confirm against ms.layer.fc
hidden_1 = ms.layer.fc(embedding, 3 * k, 8, "ReLU")

# Second hidden layer: 8 -> 4 units with ReLU
hidden_2 = ms.layer.fc(hidden_1, 8, 4, "ReLU")

# Output layer of the deep part: 4 -> 1 unit, no activation (None)
deep = ms.layer.fc(hidden_2, 4, 1, None)

# Model output (logit): sum of the wide part, the deep part and the bias
output = ms.ops.Add(wide, deep, b)

# Predicted probability: logistic function applied to the logit
predict = ms.ops.Logistic(output)

# Loss: log loss applied to label * output; the labels are encoded as -1/+1
# earlier in this cell, so the product is the signed margin of the sample
loss = ms.ops.loss.LogLoss(ms.ops.Multiply(label, output))

# Optimisation hyper-parameters.
learning_rate = 0.005
batch_size = 16

# Adam optimizer minimising `loss` over the default computation graph.
optimizer = ms.optimizer.Adam(ms.default_graph, loss, learning_rate)

# Evaluation metrics comparing the model output with the label.
# NOTE(review): the metrics receive the raw logit `output`, not the
# probability node `predict`; if the metric ops threshold their input as a
# probability, `predict` should be passed instead -- confirm against the
# HuichuanFlow metrics implementation.
accuracy = ms.ops.metrics.Accuracy(output, label)
precision = ms.ops.metrics.Precision(output, label)
recall = ms.ops.metrics.Recall(output, label)
auc = ms.ops.metrics.ROC_AUC(output, label)
# Built for completeness; it is not handed to the trainer below.
roc = ms.ops.metrics.ROC(output, label)

# The trainer feeds the four input nodes and the label node, and reports the
# listed metrics after each epoch. `batch_size` is passed through instead of
# repeating the literal so the two can no longer drift apart.
trainer = SimpleTrainer(
    [x, x_Pclass, x_Sex, x_Embarked], label, loss, optimizer,
    epoches=20, batch=batch_size, eval_on_train=True,
    metrics_ops=[accuracy, precision, recall, auc])

# Map every input node (by name) to its data. The categorical nodes are fed
# the one-hot matrices themselves rather than hard-coded column slices of
# `features`, so the data always matches the node sizes, which are derived
# from the same matrices.
train_inputs = {
    x.name: features,
    x_Pclass.name: Pclass,
    x_Sex.name: Sex,
    x_Embarked.name: Embarked,
}

# NOTE(review): the same data is used for training and for evaluation, so the
# reported metrics describe the fit on the training set, not generalisation.
trainer.train_and_eval(train_inputs, labels, train_inputs, labels)


[INIT] Variable weights init finished
- Epoch [1] train start, batch size: 8, train data size: 4
-- iteration [99] finished, time cost: 0.12  and loss value: 0.659642
-- iteration [199] finished, time cost: 0.11  and loss value: 0.669629
-- iteration [299] finished, time cost: 0.12  and loss value: 1.825668
-- iteration [399] finished, time cost: 0.10  and loss value: 0.839026
-- iteration [499] finished, time cost: 0.13  and loss value: 0.523250
-- iteration [599] finished, time cost: 0.11  and loss value: 1.321726
-- iteration [699] finished, time cost: 0.11  and loss value: 0.385233
-- iteration [799] finished, time cost: 0.13  and loss value: 0.401682
- Epoch [1] train finished, time cost: 1.03
Epoch [1] evaluation metrics Accuracy: 0.6162 Precision: 0.0000 Recall: 0.0000 ROC_AUC: 0.4325 
- Epoch [2] train start, batch size: 8, train data size: 4
-- iteration [99] finished, time cost: 0.12  and loss value: 0.349593
-- iteration [199] finished, time cost: 0.10  and loss value: 0.455

-- iteration [799] finished, time cost: 0.14  and loss value: 0.487644
- Epoch [11] train finished, time cost: 1.28
Epoch [11] evaluation metrics Accuracy: 0.6824 Precision: 0.8554 Recall: 0.2076 ROC_AUC: 0.8064 
- Epoch [12] train start, batch size: 8, train data size: 4
-- iteration [99] finished, time cost: 0.15  and loss value: 0.494306
-- iteration [199] finished, time cost: 0.14  and loss value: 0.708669
-- iteration [299] finished, time cost: 0.13  and loss value: 0.084468
-- iteration [399] finished, time cost: 0.13  and loss value: 0.708765
-- iteration [499] finished, time cost: 0.14  and loss value: 0.359338
-- iteration [599] finished, time cost: 0.15  and loss value: 0.835247
-- iteration [699] finished, time cost: 0.13  and loss value: 0.288332
-- iteration [799] finished, time cost: 0.14  and loss value: 0.472165
- Epoch [12] train finished, time cost: 1.23
Epoch [12] evaluation metrics Accuracy: 0.6712 Precision: 0.8451 Recall: 0.1754 ROC_AUC: 0.8141 
- Epoch [13] train

NameError: name 'sig' is not defined

In [None]:
# Save the model through a Saver rooted at ./epoches10; judging by the file
# names, presumably the graph description goes to a JSON file and the
# weights to an NPZ archive -- TODO confirm against ms.trainer.Saver.
saver = ms.trainer.Saver(
    './epoches10'
)
saver.save(
    weights_file_name='my_weights.npz',
    model_file_name='my_model.json',
)