note: 这个代码是为了测试sentence embedding model的效果--将每个属性分开训练向量化模型

## 1 训练sentence embedding model，将元组转换为向量

### 1.1 加载数据，取出无关元组

In [1]:
import pandas as pd
import numpy as np
import string
import math
import matplotlib.pyplot as plt
import warnings
import random
warnings.filterwarnings("ignore")  #忽略告警
import os
import torch
from sentence_transformers import util
os.environ['CUDA_VISIBLE_DEVICES']='0'

# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.5  # 程序最多只能占用指定gpu50%的显存
# config.gpu_options.allow_growth = True      #程序按需申请内存
# sess = tf.Session(config = config)

file_path = 'data/Structured/DBLP-Scholar/'

table_a_df = pd.read_csv(file_path + 'tableA.csv')
table_b_df = pd.read_csv(file_path + 'tableB.csv')
train_df = pd.read_csv(file_path + 'train.csv')
valid_df = pd.read_csv(file_path + 'valid.csv')
test_df = pd.read_csv(file_path + 'test.csv')

In [2]:
train_df.shape, test_df.shape, valid_df.shape, table_a_df.shape, table_b_df.shape

((17223, 3), (5742, 3), (5742, 3), (2616, 5), (64263, 5))

In [3]:
table_a_df.head()

Unnamed: 0,id,title,authors,venue,year
0,0,towards a cooperative transaction model - the ...,"m rusinkiewicz , w klas , t tesch , j wфsch , ...",vldb,1995
1,1,sql/xml is making good progress,"a eisenberg , j melton",sigmod record,2002
2,2,using formal methods to reason about semantics...,"p ammann , s jajodia , i ray",vldb,1995
3,3,editor 's notes,l liu,sigmod record,2002
4,4,report on the acm fourth international worksho...,,,2002


In [4]:
train_df.head()

Unnamed: 0,ltable_id,rtable_id,label
0,2563,6425,0
1,1861,64163,1
2,2187,64008,1
3,611,15632,0
4,631,27684,0


In [5]:
table_a_df.loc[730,:]

id                                                       730
title      reconciling schemas of disparate data sources ...
authors                       a doan , p domingos , a halevy
venue                                      sigmod conference
year                                                    2001
Name: 730, dtype: object

## 2 构建AVB模型，无监督训练encoder,用于后续的分类模型

In [6]:
# 导入相关库
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import os
from tensorflow.contrib import slim
from tensorflow.contrib import layers as tflayers
from torch.autograd import Variable
import torchvision
import torch.utils.data as Data
import torch

from sklearn import metrics

In [7]:
# 定义全局超参数变量
global max_batch_n, batch_index
global LR_PRIMAL, LR_DUAL, X_dim, h_dim, z_dim, eps_dim, eps_nbasis
global encoder, decoder, discriminator

### 2.1 加载相关所需参数和函数

In [8]:
# 构造一个将所有句子，转换为tuple vector 的函数
def TupleEmbedding(all_sentences, sentence_embedding_model):
    '''
    all_sentences: 包含所有元组句子的列表
    sentence_embedding_model: 将句子转换为句子向量的模型
    founction：将所有的句子，转换为对应的sentence vector
    '''
    sentences_vector = []
    for sen in all_sentences:
        sentences_vector.append(sentence_embedding_model.encode(sen))
    return np.array(sentences_vector)

# 构造一个产生分批数据的函数
def GetBatchData(train_loader):
    '''
    传入torch的dataloader,然后生成对应的分批数据
    '''
    global batch_index
    for step, (batch_x, batch_y) in enumerate(train_loader):
        if step != batch_index:
            continue
        # batch_index = (batch_index + 1) % max_batch_n 
        # 对batch_x的维度进行修改，转换为（batch_size, -1）
        sample_x = batch_x.view(BATCH_SIZE, -1).data.numpy()
    return sample_x


def lrelu(x, leak=0.2, name="lrelu"):
    return tf.maximum(x, leak*x)

# 模型计算图构建当中，所需的相关函数
def get_zlogprob(z):
    # temp = tf.clip_by_value(predict_1, 1e-8, 1.0)
    logprob = -0.5 * tf.reduce_sum(z*z  + np.log(2*np.pi), [1])
    return logprob

def bit_product_sum(x, y):
    return sum([item[0] * item[1] for item in zip(x, y)])

def get_reconstr_err(de_x, x):
    x_multi = tf.multiply(x, x)
    de_x_multi = tf.multiply(de_x, de_x)
    x_de_x_multi = tf.multiply(x, de_x)

    x_mul_sum = tf.reduce_sum(x_multi)
    de_x_mul_sum = tf.reduce_sum(de_x_multi)
    x_de_mul_sum = tf.reduce_sum(x_de_x_multi)

    cosine = tf.divide(x_de_mul_sum, tf.add(x_mul_sum, de_x_mul_sum))
    reconst_err = tf.subtract(2., cosine)

    return reconst_err

### 2.2 构建编码器、解码器、判别器函数

In [9]:
## 构建编码器，解码器，判别器模型
def encoder_func(x):
    size = x.shape[0]
    # eps = tf.random_normal(tf.stack([eps_nbasis, size, eps_dim]))

    net = slim.fully_connected(x, h_dim, activation_fn=tf.nn.softplus)
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.softplus)
    # net = slim.dropout(net, 0.3, is_training=False)
    # net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.relu)
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.softplus)
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.softplus)

    z_mean = slim.fully_connected(net, z_dim, activation_fn=None)
    z_logstd = slim.fully_connected(net, z_dim, activation_fn=None)
    # logstd = log(std)
    return z_mean, z_logstd

def decoder_func(z):
    net = z
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.softplus)
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.softplus)
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.softplus)
    xlogits = slim.fully_connected(net, X_dim, activation_fn=None)
    return xlogits

def discriminator_func(x, z):
    # Theta
    with tf.variable_scope("theta"):
        fc_argscope = slim.arg_scope([slim.fully_connected], activation_fn=lrelu)
        with fc_argscope:
            net = slim.fully_connected(x, 1024, scope='fc_0')
            net = slim.fully_connected(net, 1024, scope='fc_1')
            net = slim.fully_connected(net, 1024, scope='fc_2')
        theta = slim.fully_connected(net, 8192, activation_fn=tf.nn.elu, scope='theta',
                  weights_initializer=tf.truncated_normal_initializer(stddev=1e-5))

    with tf.variable_scope("s"):
        fc_argscope = slim.arg_scope([slim.fully_connected],
          activation_fn=lrelu)
        with fc_argscope:
            net = slim.fully_connected(z, 1024, scope='fc_0')
            net = slim.fully_connected(net, 1024, scope='fc_1')
            net = slim.fully_connected(net, 1024, scope='fc_2')
        s = slim.fully_connected(net, 8192, activation_fn=None, scope='s')

    with tf.variable_scope("xonly"):
        fc_argscope = slim.arg_scope([slim.fully_connected],
          activation_fn=lrelu)
        with fc_argscope:
            net = slim.fully_connected(x, 1024, scope='fc_0')
            net = slim.fully_connected(net, 1024, scope='fc_1')
            net = slim.fully_connected(net, 1024, scope='fc_2')
        Tx = slim.fully_connected(net, 1, activation_fn=None, scope='Tx',
              weights_initializer=tf.truncated_normal_initializer(stddev=1e-5))

    with tf.variable_scope("zonly"):
        fc_argscope = slim.arg_scope([slim.fully_connected],
          activation_fn=lrelu)
        with fc_argscope:
            net = slim.fully_connected(z, 1024, scope='fc_0')
            net = slim.fully_connected(net, 1024, scope='fc_1')
            net = slim.fully_connected(net, 1024, scope='fc_2')
        Tz = slim.fully_connected(net, 1, activation_fn=None, scope='Tz',
              weights_initializer=tf.truncated_normal_initializer(stddev=1e-5))

    T = tf.reduce_sum(theta * s, [1], keep_dims=True) + Tx + Tz
    T = tf.squeeze(T, 1)
    T += 0.5 * tf.reduce_sum(tf.square(z), [1])
    return T


### 2.3 构建模型训练计算图

In [10]:
## 构建计算图

def BuildComputationalGraph(x_real):
    '''
    构建计算图，输出两个损失值，和对应的参数
    '''
    # x_real = GetBatchData(train_loader)
    # x_real = tf.placeholder(tf.float32, shape=[BATCH_SIZE, X_dim])
    z_sampled = tf.random_normal([BATCH_SIZE, z_dim])
    eps = tf.random_normal([BATCH_SIZE, z_dim])

    # z_real, Ez, Varz = encoder(x_real) # 编码器的输出
    mean, logstd = encoder(x_real)
    std = tf.exp(logstd)
    z_real = mean + eps*std

    z_mean = tf.stop_gradient(mean)
    var = tf.multiply(std, std)
    z_var = tf.stop_gradient(var)
    z_std = tf.stop_gradient(std)

    z_norm = (z_real - z_mean) / z_std
    # z_norm = z_real
    Td = discriminator(x_real, z_norm) # 后验
    Ti = discriminator(x_real, z_sampled) # 先验
    logz = -0.5 * tf.reduce_sum(z_real*z_real  + np.log(2*np.pi), [1])
    # # 防止log操作出现nan
    z_var = tf.clip_by_value(z_var, 1e-8, 1.0)
    logr = -0.5 * tf.reduce_sum(z_norm*z_norm + tf.log(z_var) + np.log(2*np.pi), [1])

    decoder_out = decoder(z_real)  # 解码器的输出

    # 计算图所需参数
    c_dim = 1
    beta = 1
    factor = 10.0 / (X_dim * c_dim)

    # # Primal loss
    reconst_err = get_reconstr_err(decoder_out, x_real)
    KL = Td + logr - logz
#     KL = Td
    # KL = tf.reduce_sum(0.5*tf.square(z_real) - logstd - 0.5, 1)
    ELBO = reconst_err + KL
    #可以适当增加beta 和 factor的大小
    loss_primal = factor * tf.reduce_mean(3*reconst_err + beta*KL)

    # Mean values
    ELBO_mean = tf.reduce_mean(ELBO)
    KL_mean = tf.reduce_mean(KL)
    reconst_err_mean = tf.reduce_mean(reconst_err)

    # Dual loss
    d_loss_d = tf.reduce_mean(
      tf.nn.sigmoid_cross_entropy_with_logits(logits=Td, labels=tf.ones_like(Td)))
    d_loss_i = tf.reduce_mean(
      tf.nn.sigmoid_cross_entropy_with_logits(logits=Ti, labels=tf.zeros_like(Ti)))
    loss_dual = d_loss_i + d_loss_d
    # 获取需要更新的参数
    qvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
    pvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "decoder")
    dvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "discriminator")

    td = tf.reduce_mean(Td)
#     gz = tf.reduce_mean(logz)
#     gr = tf.reduce_mean(logr)

    return loss_primal, loss_dual, pvars+qvars, dvars, ELBO_mean, reconst_err_mean, KL_mean,td

def GetTrainOp(loss_primal, loss_dual, vars_primal, vars_dual):
    learning_rate = LR_PRIMAL
    learning_rate_adversary = LR_DUAL

    # Train step
    primal_optimizer = tf.train.AdamOptimizer(learning_rate, use_locking=True, beta1=0.5)
    adversary_optimizer = tf.train.AdamOptimizer(learning_rate_adversary, use_locking=True, beta1=0.5)

    primal_grads = primal_optimizer.compute_gradients(loss_primal, var_list=vars_primal)
    adversary_grads = adversary_optimizer.compute_gradients(loss_dual, var_list=vars_dual)

    primal_grads = [(grad, var) for grad, var in primal_grads if grad is not None]
    adversary_grads = [(grad, var) for grad, var in adversary_grads if grad is not None]

    allgrads = [grad for grad, var in primal_grads + adversary_grads]
    with tf.control_dependencies(allgrads):
        primal_train_step = primal_optimizer.apply_gradients(primal_grads)
        adversary_train_step = adversary_optimizer.apply_gradients(adversary_grads)

    train_op = tf.group(primal_train_step, adversary_train_step)

    return train_op

### 2.4 模型训练

In [11]:
# global batch_index, h_dim
batch_index = 0

# 定义一些构建计算图所需函数
def Get2_WD(a_e, a_v, b_e, b_v):
    '''
    传入批量的两个多元高斯分布的均值和方差,(batch_size, z_dim)
    计算这一批多元高斯分布的平均距离,根据是否匹配，分别计算距离
    和对应的距离向量
    '''
    ab_e = tf.subtract(a_e, b_e)
    ab_v = tf.subtract(a_v, b_v)

    ab_e2 = tf.multiply(ab_e, ab_e)
    ab_v2 = tf.multiply(ab_v, ab_v)

    e_v = tf.add(ab_e2, ab_v2)
    e_v_sum = tf.reduce_mean(e_v, 1)

    return e_v_sum, e_v

# 构建二分类网络结构
def ClassifyModel(x):
    net = x
    h_dim = 300
    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.relu)
    net = slim.dropout(net, 0.3, is_training=False)

    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.relu)
    net = slim.dropout(net, 0.3, is_training=False)

    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.relu)
    net = slim.dropout(net, 0.3, is_training=False)

    net = slim.fully_connected(net, h_dim, activation_fn=tf.nn.relu)

    prediction = slim.fully_connected(net, 2, activation_fn=tf.nn.softmax)
    return prediction

classify_model = tf.make_template('ClassifyModel', ClassifyModel)

In [12]:
# # 保存X矩阵的内容
# X = sentences_vector
# with open('X_array1_10.txt', 'w') as fp:
#   for item in X:
#     temp = [str(i) for i in item]
#     s = ' '.join(temp)
#     fp.write(s + '\n')

In [13]:
# 读取保存的X矩阵
X = []
with open(file_path + 'X_array0_100.txt', 'r') as fp:
    temp = fp.readlines()
for item in temp:
    s = item.strip('\n')
    slist = s.split()
    slist = [eval(i) for i in slist]
    X.append(slist)
X = np.array(X)
X = torch.FloatTensor(X).data.numpy()
print(X.shape)
sentences_vector = X

(66879, 768)


In [14]:
# 提取训练集合测试集当中的数据进行训练SE模型
a_id_list = []
b_id_list = []
for i in range(train_df.shape[0]):
    a_id_list.append(train_df.loc[i, 'ltable_id'])
    b_id_list.append(train_df.loc[i, 'rtable_id'])
for i in range(test_df.shape[0]):
    a_id_list.append(test_df.loc[i, 'ltable_id'])
    b_id_list.append(test_df.loc[i, 'rtable_id'])
a_id_list = list(set(a_id_list))
b_id_list = list(set(b_id_list))
    
X_train_test = []
for a_id in a_id_list:
    X_train_test.append(sentences_vector[a_id])
for b_id in b_id_list:
    X_train_test.append(sentences_vector[table_a_df.shape[0] + b_id])

X_train_test = np.array(X_train_test)
X_train_test = torch.FloatTensor(X_train_test).data.numpy()
print(X_train_test.shape)

(11996, 768)


In [15]:
## 构建批处理操作
BATCH_SIZE = 20
Y_fake = torch.linspace(1, 100, X_train_test.shape[0]) # 为X_attr 构建一个虚假的Y值
torch_dataset = Data.TensorDataset(torch.FloatTensor(X_train_test), Y_fake)

train_loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

## 定义超参数
X_dim = X.shape[1]
h_dim = 300
z_dim = 200
eps_dim = 32
eps_nbasis = 16

# 获取模型变量
encoder = tf.make_template('encoder', encoder_func)
decoder = tf.make_template('decoder', decoder_func)
discriminator = tf.make_template('discriminator', discriminator_func)

# 构建计算图
LR_PRIMAL = 1e-5
LR_DUAL = 2e-5

x_real = tf.placeholder(tf.float32, shape=[BATCH_SIZE, X_dim])
loss_primal, loss_dual, vars_primal, vars_dual, ELBO_mean, reconst_err_mean, KL_mean,td = BuildComputationalGraph(x_real)
train_op1 = GetTrainOp(loss_primal, loss_dual, vars_primal, vars_dual)

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
sentences_vector = X
# 先给train_df添加一个属性列similar:表示两个元组对应向量之间的余弦相似度
train_df['similar'] = 0.0

# 计算出train_df中所有元组对，对应的余弦相似度
for i in range(train_df.shape[0]):
    a_id = train_df.loc[i, 'ltable_id']
    b_id = train_df.loc[i, 'rtable_id']
    em_a = sentences_vector[a_id]
    em_b = sentences_vector[table_a_df.shape[0] + b_id]
    train_df.loc[i, 'similar'] = float(util.cos_sim(em_a, em_b))

for i in range(valid_df.shape[0]):
    a_id = valid_df.loc[i, 'ltable_id']
    b_id = valid_df.loc[i, 'rtable_id']
    em_a = sentences_vector[a_id]
    em_b = sentences_vector[table_a_df.shape[0] + b_id]
    valid_df.loc[i, 'similar'] = float(util.cos_sim(em_a, em_b))

# 分别将匹配的、不匹配的原组对，进行分开
pairs_0, pairs_1 = [], []

pairs_0.extend(train_df[train_df['label'] == 0].values.tolist())
pairs_0.extend(valid_df[valid_df['label'] == 0].values.tolist())

pairs_1.extend(train_df[train_df['label'] == 1].values.tolist())
pairs_1.extend(valid_df[valid_df['label'] == 1].values.tolist())

In [17]:
# 设置批处理函数所需的，索引
global id_0, id_1, num_0, num_1
id_0, id_1 = 0, 0
num_0, num_1 = 15, 5
total_num01 = num_0 + num_1

# 构建新的生产批数据的函数
def GetBatchData_2(pairs_0, pairs_1, sentences_vector):
    '''
    新的批数据生产函数
    '''
    # 根据batch_size设置pairs_0和pairs_1，每次抽样多少个数据
    # pairs_0 每批次15个，pairs_1 每批次5个
    global id_0, id_1, num_0, num_1
    
    len_0, len_1 = len(pairs_0), len(pairs_1)
#     move_0 = len_0 // num_0
#     move_1 = len_1 // num_1
#     move_0, move_1 = 100, 50
    x_a, x_b, y = [], [], []
    
    # 对pairs_0进行采样
    for i in range(num_0):
        id_0 = (id_0 + 1) % len_0
        x_a.append(sentences_vector[ int(pairs_0[id_0][0]) ])
        x_b.append(sentences_vector[table_a_df.shape[0] + int(pairs_0[id_0][1]) ])
        y.append(int(pairs_0[id_0][2]))
        
    for i in range(num_1):
        id_1 = (id_1 + 1) % len_1
        x_a.append(sentences_vector[ int(pairs_1[id_1][0]) ])
        x_b.append(sentences_vector[table_a_df.shape[0] + int(pairs_1[id_1][1]) ])
        y.append(int(pairs_1[id_1][2]) )
    
    x_a = np.array(x_a).reshape(total_num01, -1)
    x_b = np.array(x_b).reshape(total_num01, -1)
    y = np.array(y).reshape(total_num01, -1)
    
    return x_a, x_b, y



In [18]:
def TrainClassifyModel(x_a, x_b, y):

    # 对数据进行编码处理
    ez_a, logstd_a = encoder(x_a)
    ez_b, logstd_b = encoder(x_b)
    # 这里的vz 表示z的标准差（不修改名字了，因该是std_z_a）
    vz_a = tf.exp(logstd_a)
    vz_b = tf.exp(logstd_b)

    # 计算2-WassersteinDistance, (batch_size, 1), (batch_size, z_dim)
    WD_disc, WD_vector = Get2_WD(ez_a, vz_a, ez_b, vz_b)

    # 计算 模型的预测,每组数据对应匹配的概率，（batch_size, 2）
    # x_data = tf.concat([x_a, x_b], axis=-1)
    prediction = classify_model(WD_vector)
    predict_0 = tf.slice(prediction, [0,0],[-1,1])
    predict_1 = tf.slice(prediction, [0,1],[-1,1])

    # 构造对比损失函数-论文当中的,Cost-effictive ...
    M = 0.5  # 调节encoder参数更新的超参数
    class_w = 1.0  # 增加匹配样例的权重
    # 对概率值进行范围限制，防止log()操作出现nan
    predict_1 = tf.clip_by_value(predict_1, 1e-8, 1.0)
    predict_0 = tf.clip_by_value(predict_0, 1e-8, 1.0)
    loss_1 = -tf.add(tf.multiply(3*y, tf.log(predict_1)), tf.multiply(1*(1.0-y), tf.log(predict_0)))
    loss_2 = tf.add(tf.multiply(3*y, WD_disc), tf.multiply(1.0-y, tf.maximum(0.0,M-WD_disc)))

    loss_1 = tf.reduce_mean(loss_1)
    loss_2 = tf.reduce_mean(loss_2)
    total_loss = loss_1 + loss_2

    # 获取 模型需要更新的参数
    encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
    classify_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "ClassifyModel")

    # optimizer = tf.train.AdamOptimizer(LR)
    # loss1_op = optimizer.minimize(loss_1, var_list=classify_vars+encoder_vars)
    # loss2_op = optimizer.minimize(loss_2, var_list=encoder_vars)
    train_op2 = GetTrainOp(2*loss_1, 1*loss_2, encoder_vars+classify_vars, encoder_vars)
    # train_op = optimizer.minimize(total_loss, var_list=encoder_vars+classify_vars)
    return train_op2, loss_1, loss_2

LR_PRIMAL = 1e-5
LR_DUAL = 2e-5

x_a = tf.placeholder(tf.float32, shape=[total_num01, X_dim])
x_b = tf.placeholder(tf.float32, shape=[total_num01, X_dim])
y = tf.placeholder(tf.float32, shape=[total_num01, 1])
train_op2, loss_1, loss_2 = TrainClassifyModel(x_a, x_b, y)

In [19]:
#训练计算图
sess = tf.InteractiveSession()  # 创建会话交互类
sess.run(tf.global_variables_initializer())  # 初始化变量

LR_PRIMAL = 1e-5
LR_DUAL = 2e-5

batch_index = 0
max_batch_n = X_train_test.shape[0] // BATCH_SIZE
progress = tqdm_notebook(range(60*max_batch_n))
for i in progress:
    x = GetBatchData(train_loader)
    batch_index = (batch_index + 1) % max_batch_n

    # sess.run(train_op1)
    _,loss_p,loss_d, r_e,kl,td1 = sess.run([train_op1,loss_primal,loss_dual,reconst_err_mean,KL_mean,td], feed_dict={x_real:x})
    progress.set_description('loss_primal= %.4f, loss_dual= %.4f, re=%.4f - ' % (loss_p,loss_d,r_e))

# # 保存对应的参数
# saver = tf.train.Saver()
# saver.save(sess,'AVB_paramter.ckpt')

  0%|          | 0/35940 [00:00<?, ?it/s]

*   loss_primal= 0.0417, loss_dual= 3.6338, re=1.8010 
*   loss_primal= 0.0386, loss_dual= 3.9267, re=1.8149

In [20]:
# 获取模型测试用的test数据
sentences_vector = X
X_a = []
X_b = []
## valid_df，转换为test_df
for i in range(test_df.shape[0]):
    a_id = test_df.loc[i, 'ltable_id']
    b_id = test_df.loc[i, 'rtable_id']
    X_a.append(sentences_vector[a_id])
    X_b.append(sentences_vector[table_a_df.shape[0] + b_id])
X_a = np.array(X_a)
X_b = np.array(X_b)

Y = test_df['label'].values

LR_PRIMAL = 1e-5
LR_DUAL = 2e-5

In [None]:
# 训练过程
# x_data = np.concatenate([X_a, X_b], axis=-1)
id_0, id_1 = 0, 0
max_batch_n_2 = train_df.shape[0] // total_num01
epoch_n = 2000 * max_batch_n_2
progress = tqdm_notebook(range(epoch_n))

# temp_i = 0
# temp_list = [15,15]
for i in progress:
    # 更新参数
    # sess.run(train_op2)
    Xa, Xb, Yy = GetBatchData_2(pairs_0, pairs_1, sentences_vector)
    id_0 += 1
    id_1 += 1

    _, loss1, loss2 = sess.run([train_op2, loss_1, loss_2], feed_dict={x_a:Xa, x_b:Xb, y:Yy})
    progress.set_description('loss1 = %.5f, loss2 = %.4f - '%(loss1,loss2))

    num_0 = random.randint(8,total_num01)
    num_1 = total_num01 - num_0      

    # 用测试集进行验证
    if (i < max_batch_n_2 * 500):
        temp = 3*max_batch_n_2
    else:
        temp = max_batch_n_2
#     temp = max_batch_n_2
    if i % temp == 0:
        e_a, v_a = encoder(X_a)
        e_b, v_b = encoder(X_b)
        
        v_a = tf.exp(v_a)
        v_b = tf.exp(v_b)
        
        wd_disc, wd_vector = Get2_WD(e_a, v_a, e_b, v_b)

        # sess.run(wd_vector)
        output = classify_model(wd_vector)
        # output = tf.argmax(output, 1)
        output = sess.run(output)
        out = np.argmax(output, 1)
        # 计算交叉熵误差
        # accuracy = sum(out == Y) / X_a.shape[0]
        # print('epoch = %d, accuracy = %.4f' % (i, accuracy))
        # 计算模型效果
        precision = metrics.precision_score(Y, out)
        recall = metrics.recall_score(Y, out)
        f1_score = metrics.f1_score(Y, out)
        if f1_score > 0.95:
            print('epoch = %d, precision = %.3f, recall = %.3f, f1_score = %.4f'%(i,precision,recall,f1_score))


  0%|          | 0/1722000 [00:00<?, ?it/s]

*   precision = 0.982, recall = 0.986, f1_score = 0.9843(单属性50次训练)（ACM-S）
*   precision = 0.867, recall = 0.929, f1_score = 0.8966(单属性训练100次）（beer-S)
*   precision = 1.000, recall = 1.000, f1_score = 1.0000（单属性100次）（Fodor-S)
*   precision = 0.982, recall = 0.982, f1_score = 0.9820(单属性100次）（DA-D）
*   precision = 0.923, recall = 0.889, f1_score = 0.9057(单属性100次）（iTA-S）（需要提高效果）
*   precision = 0.595, recall = 0.757, f1_score = 0.6667()(AB-T)
*   precision = 0.922, recall = 0.912, f1_score = 0.9173()(DS-S)(需要重新实验)
*   precision = 0.694, recall = 0.926, f1_score = 0.7937()(iTA-D)
*   precision = 0.697, recall = 0.852, f1_score = 0.7667()(AG-S)
*   precision = 0.930, recall = 0.900, f1_score = 0.9150()(DS-D)(有待提高)