In [None]:
import argparse
import torch
import time
import json
import numpy as np
import math
import random
import codecs
import os
from pytorchtools import EarlyStopping
from sklearn.model_selection import train_test_split
from utils_train import save_data, batch_generator, valid_loss, generate_idx_word, remove_temporary_file
from generate_boundary_train_data import generate_boundary_train_data, generate_boundary_start_index, load_data, merge_boundary_train_data, merge_boundary_train_data_final
from generate_number_train_data import generate_number_train_data, save_number_train_data, merge_number_train_data, merge_number_train_data_final

In [None]:
# Fix one seed for every random number generator used in this notebook
# (Python's `random`, NumPy and PyTorch on CPU and CUDA) so runs are repeatable.
# `random_state` carries the same value under a second name.
seed = random_state = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [None]:
class Model(torch.nn.Module):
    """Convolutional sequence tagger fed by two frozen embedding tables.

    Every token id is looked up in both a general and a domain-specific
    embedding matrix; the two vectors are concatenated and passed through a
    stack of 1-D convolutions, then a linear layer produces per-token class
    logits. The output layer is either a CRF (``crf=True``) or a plain softmax.

    Args:
        gen_emb: NumPy matrix (vocab, dim) with the general embeddings.
        domain_emb: NumPy matrix (vocab, dim) with the domain embeddings.
            Both matrices are indexed with the same token ids.
        num_classes: Number of tag classes predicted per token.
        dropout: Dropout probability applied before each convolution stage.
        crf: If true, use an AllenNLP ``ConditionalRandomField`` output layer.
    """

    def __init__(self, gen_emb, domain_emb, num_classes=3, dropout=0.55, crf=True):
        super(Model, self).__init__()
        # Both embedding tables are loaded from the given matrices and frozen
        # (requires_grad=False), so they are never updated during training.
        self.gen_embedding = torch.nn.Embedding(gen_emb.shape[0], gen_emb.shape[1])
        self.gen_embedding.weight=torch.nn.Parameter(torch.from_numpy(gen_emb), requires_grad=False)
        self.domain_embedding = torch.nn.Embedding(domain_emb.shape[0], domain_emb.shape[1])
        self.domain_embedding.weight=torch.nn.Parameter(torch.from_numpy(domain_emb), requires_grad=False)

        # First stage: two parallel convolutions (kernel 5 and kernel 3) whose
        # 128-channel outputs are concatenated into 256 channels in forward().
        self.conv1=torch.nn.Conv1d(gen_emb.shape[1]+domain_emb.shape[1], 128, 5, padding=2)
        self.conv2=torch.nn.Conv1d(gen_emb.shape[1]+domain_emb.shape[1], 128, 3, padding=1)
        self.dropout=torch.nn.Dropout(dropout)

        # Paddings are chosen so every convolution preserves the sequence length.
        self.conv3=torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv4=torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv5=torch.nn.Conv1d(256, 256, 5, padding=2)
        self.linear_ae=torch.nn.Linear(256, num_classes)
        self.crf_flag=crf
        if self.crf_flag:
            # Imported lazily so allennlp is only required when the CRF layer is used.
            from allennlp.modules import ConditionalRandomField
            self.crf=ConditionalRandomField(num_classes)

    def forward(self, x, x_len, x_mask, x_tag=None, testing=False):
        """Compute the training loss or, when ``testing`` is true, predictions.

        Args:
            x: LongTensor of token ids, shape (batch, seq).
            x_len: Sequence lengths; only used by the softmax training branch,
                where they are passed to ``pack_padded_sequence`` (which expects
                them in decreasing order).
            x_mask: Mask over ``x``; only used by the CRF branches.
            x_tag: Gold tags. A padded tensor for the CRF branch; for the softmax
                branch an object whose ``.data`` is aligned with the packed
                logits (presumably a ``PackedSequence`` built by the caller).
            testing: Select inference (true) or loss computation (false).

        Returns:
            * testing + CRF: the result of ``crf.viterbi_tags``.
            * testing + softmax: log-probabilities, shape (batch, seq, classes).
            * training + CRF: negated CRF log-likelihood.
            * training + softmax: NLL loss over the non-padded positions.
        """
        x_emb=torch.cat((self.gen_embedding(x), self.domain_embedding(x)), dim=2)
        # Conv1d expects (batch, channels, seq), hence the transpose.
        x_emb=self.dropout(x_emb).transpose(1, 2)
        x_conv=torch.nn.functional.relu(torch.cat((self.conv1(x_emb), self.conv2(x_emb)), dim=1))

        x_conv=self.dropout(x_conv)
        x_conv=torch.nn.functional.relu(self.conv3(x_conv))
        x_conv=self.dropout(x_conv)
        x_conv=torch.nn.functional.relu(self.conv4(x_conv))
        x_conv=self.dropout(x_conv)
        x_conv=torch.nn.functional.relu(self.conv5(x_conv))
        # Back to (batch, seq, channels) for the per-token linear layer.
        x_conv=x_conv.transpose(1, 2)
        x_logit=self.linear_ae(x_conv)
        if testing:
            if self.crf_flag:
                score=self.crf.viterbi_tags(x_logit, x_mask)
            else:
                # Normalise over the class axis. The explicit dim replaces the
                # deprecated implicit-dimension call (and the pair of transposes
                # that was needed to make the implicit choice hit the class axis).
                score=torch.nn.functional.log_softmax(x_logit, dim=2)
        else:
            if self.crf_flag:
                score=-self.crf(x_logit, x_tag, x_mask)
            else:
                # Packing drops the padded positions so they do not contribute to the loss.
                x_logit=torch.nn.utils.rnn.pack_padded_sequence(x_logit, x_len, batch_first=True)
                # Packed data is (tokens, classes): normalise over dim 1.
                score=torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x_logit.data, dim=1), x_tag.data)
        return score

In [None]:
def train(train_X, train_y, valid_X, valid_y, model, model_fn, optimizer, parameters, run_epoch, epochs, batch_size, crf, generate_data, early_stopping, earlystopping, domain):
    """Train ``model`` for up to ``epochs`` epochs, checkpointing the best epoch.

    After every epoch the loss on the training and validation sets is recorded
    and the whole model is saved to ``model_fn`` whenever the validation loss
    improves. When ``generate_data`` is true, the model's predictions on the
    training set are collected during the epoch and handed to the project
    helpers (``save_data``, ``load_data``, ``generate_boundary_*``,
    ``save_number_train_data``) that write the post-processing training data.

    Args:
        train_X, train_y: NumPy arrays with token ids / tags, one row per
            sentence; id 0 in ``train_X`` is treated as padding.
        valid_X, valid_y: Validation arrays in the same layout.
        model: The network to train (called as in ``Model.forward``).
        model_fn: Checkpoint path; also passed to ``generate_idx_word``.
        optimizer: Optimizer over ``parameters``.
        parameters: Trainable parameters, used for gradient clipping.
        run_epoch: Index of the current run, forwarded to the data helpers.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        crf: Whether the model uses the CRF output layer.
        generate_data: Whether to dump per-epoch predictions on the training set.
        early_stopping: Callable tracker invoked as ``early_stopping(loss, model)``
            and exposing ``early_stop``.
        earlystopping: Enables the early-stopping check.
        domain: Dataset name, forwarded to the data helpers.

    Returns:
        ``(train_history, valid_history, epoch_end)``. ``epoch_end`` is the epoch
        at which early stopping fired, ``0`` if early stopping is enabled but
        never fired, and ``epochs - 1`` if early stopping is disabled.
    """
    # clip_grad_norm was renamed to clip_grad_norm_; prefer the in-place name and
    # fall back to the old one on PyTorch versions that predate the rename.
    clip_gradients = getattr(torch.nn.utils, "clip_grad_norm_", None) or torch.nn.utils.clip_grad_norm

    best_loss=float("inf")
    valid_history=[]
    train_history=[]
    idx_word = generate_idx_word(model_fn)
    # Bound before the loop so the return below never hits an undefined name
    # (previously possible with earlystopping=True and epochs == 0).
    epoch_end = 0
    for epoch in range(epochs):
        pred_y=np.zeros((train_X.shape[0], train_X.shape[1]), np.int16)
        # Row offset of each mini-batch inside train_X.
        # NOTE(review): assumes batch_generator walks train_X in order with the
        # same batch_size, so batch i starts at offset[i] - confirm in utils_train.
        offset = range(0, train_X.shape[0], batch_size)
        results = []
        for i_th, batch in enumerate(batch_generator(train_X, train_y, batch_size, crf=crf)):
            batch_train_X, batch_train_y, batch_train_X_len, batch_train_X_mask=batch
            loss = model(batch_train_X, batch_train_X_len, batch_train_X_mask, batch_train_y)
            optimizer.zero_grad()
            loss.backward()
            clip_gradients(parameters, 1.)
            optimizer.step()

            if generate_data:
                # Predict on the rows of this mini-batch with dropout disabled.
                model.eval()
                start=offset[i_th]
                chunk_X=train_X[start:start+batch_size]
                chunk_len=np.sum(chunk_X!=0, axis=1)
                # Sort by decreasing length (the softmax branch packs sequences).
                batch_idx=chunk_len.argsort()[::-1]
                batch_print_X_len=chunk_len[batch_idx]
                batch_print_X_mask=(chunk_X!=0)[batch_idx].astype(np.uint8)
                batch_print_X=chunk_X[batch_idx]
                batch_print_X_mask=torch.autograd.Variable(torch.from_numpy(batch_print_X_mask).long().cuda())
                batch_print_X=torch.autograd.Variable(torch.from_numpy(batch_print_X).long().cuda())
                batch_pred_y=model(batch_print_X, batch_print_X_len, batch_print_X_mask, testing=True)
                # Inverse permutation: restores the original row order.
                r_idx=batch_idx.argsort()
                if crf:
                    # Each CRF result is indexed with [0] to get the tag sequence.
                    for ix, sorted_pos in enumerate(r_idx):
                        tag_path=batch_pred_y[sorted_pos][0]
                        pred_y[start+ix, :len(tag_path)]=tag_path
                else:
                    batch_pred_y=batch_pred_y.data.cpu().numpy().argmax(axis=2)[r_idx]
                    pred_y[start:start+batch_size,:batch_pred_y.shape[1]]=batch_pred_y
                model.train()

        if generate_data:
            # One "<word> <gold> <pred>" line per non-padding token of each sentence.
            for j_th in range(len(train_X)):
                tokens=[idx_word[w] for w in train_X[j_th] if w != 0]
                result=[" ".join([token, str(gold_tag), str(pred_tag)])
                        for token, gold_tag, pred_tag in zip(tokens, train_y[j_th], pred_y[j_th])]
                results.append(result)
            save_data(domain, results, epoch, run_epoch)
            context, query, answer, answer_start_index = load_data(domain, epoch, run_epoch)
            update_index = generate_boundary_start_index(context, answer_start_index, answer)
            generate_boundary_train_data(domain, context, query, answer, update_index, epoch, run_epoch)
            save_number_train_data(domain, run_epoch, epoch)

        loss=valid_loss(model, train_X, train_y, crf=crf)
        train_history.append(loss)
        loss=valid_loss(model, valid_X, valid_y, crf=crf)
        valid_history.append(loss)
        if loss<best_loss:
            best_loss=loss
            torch.save(model, model_fn)
        # Reshuffle the training rows for the next epoch.
        shuffle_idx=np.random.permutation(len(train_X))
        train_X=train_X[shuffle_idx]
        train_y=train_y[shuffle_idx]
        if(epoch % 10 == 0):
            print(str(epoch) + '/' + str(epochs))

        if earlystopping:
            early_stopping(loss,model)
            if early_stopping.early_stop:
                epoch_end = epoch
                print('当前epoch为：' + str(epoch) + ' 已执行提前停止')
                break
    # The best checkpoint stays on disk at model_fn. The former trailing
    # `model = torch.load(model_fn)` only rebound a local name, so it had no
    # effect on the caller and has been dropped.
    if not earlystopping:
        epoch_end = epochs - 1
    return train_history, valid_history, epoch_end

In [None]:
def run(domain, data_dir, model_dir, valid_split, runs, epochs, lr, dropout, batch_size, crf, generate_data, earlystopping, patience):
    """Train ``runs`` independent models on one domain and merge the generated data.

    Reads ``<data_dir>gen.vec.npy``, ``<data_dir><domain>_emb.vec.npy`` and
    ``<data_dir><domain>.npz`` (arrays ``train_X`` / ``train_y``). The last
    ``valid_split`` rows are held out for validation; the rest is used for
    training. The split is positional, not random.

    Args:
        domain: Dataset name used to build file names.
        data_dir: Directory prefix of the prepared data (concatenated as-is, so
            it must end with a path separator).
        model_dir: Directory prefix for checkpoints; run ``r`` is saved to
            ``model_dir + domain + str(r)``.
        valid_split: Number of trailing rows held out for validation.
        runs: Number of models to train.
        epochs: Maximum epochs per run.
        lr: Adam learning rate.
        dropout: Dropout probability of the model.
        batch_size: Mini-batch size.
        crf: Use the CRF output layer instead of softmax.
        generate_data: Produce the training data for the post-processing modules.
        earlystopping: Enable early stopping.
        patience: Early-stopping patience.

    Raises:
        ValueError: If ``valid_split`` does not leave at least one training and
            one validation row.
    """
    gen_emb=np.load(data_dir+"gen.vec.npy")
    domain_emb=np.load(data_dir+domain+"_emb.vec.npy")
    # Read each array exactly once and close the archive afterwards; indexing an
    # open .npz re-reads the array from disk on every access.
    with np.load(data_dir+domain+".npz") as ae_data:
        all_X=ae_data['train_X']
        all_y=ae_data['train_y']

    # Guard the negative slices below: with valid_split == 0, `[:-0]` would be an
    # empty training set and `[-0:]` would put every row into validation.
    if not 0 < valid_split < len(all_X):
        raise ValueError("valid_split must be between 1 and %d, got %r" % (len(all_X) - 1, valid_split))

    valid_X=all_X[-valid_split:]
    valid_y=all_y[-valid_split:]
    train_X=all_X[:-valid_split]
    train_y=all_y[:-valid_split]

    print("数据集总大小：", len(all_X))
    print("训练集大小：", len(train_X))
    print("验证集大小：", len(valid_X))

    epochs_end = []

    for r in range(runs):
        print('正在训练第 ' + str(r + 1) + '轮')
        model=Model(gen_emb, domain_emb, 3, dropout, crf)
        model.cuda()
        print(model)
        # The embedding tables are frozen, so only the remaining parameters are optimised.
        parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer=torch.optim.Adam(parameters, lr=lr)
        early_stopping = EarlyStopping(patience, verbose = False)
        train_history, valid_history, epoch_end = train(train_X, train_y, valid_X, valid_y, model, model_dir+domain+str(r),
                                                        optimizer, parameters, r, epochs, batch_size, crf, generate_data, early_stopping, earlystopping, domain)
        if generate_data:
            # train() reports 0 when early stopping never fired; use `epochs` then.
            last_epoch = epoch_end if epoch_end != 0 else epochs
            epochs_end.append(last_epoch)
            merge_boundary_train_data(domain, r, last_epoch)
            merge_number_train_data(domain, r, last_epoch)
    if generate_data:
        merge_boundary_train_data_final(domain, runs, epochs_end)
        merge_number_train_data_final(domain, runs, epochs_end)
        generate_number_train_data(domain, runs, epochs_end)
    # The generated temporary files are very big (about 6G for each domain); you can decide whether to delete them.
    remove_temporary_file(domain, runs)

In [None]:
def str2bool(value):
    """Convert a command-line string to a bool.

    ``argparse`` with ``type=bool`` calls ``bool(text)``, which is true for every
    non-empty string, so ``--crf False`` would enable the flag. This converter
    accepts the usual spellings instead.

    Args:
        value: A bool (returned unchanged) or a string such as "true"/"false",
            "yes"/"no", "1"/"0", "on"/"off" (case-insensitive).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


def build_arg_parser():
    """Build the command-line parser holding all training options and defaults."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str, default="model/DECNN/")
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--runs', type=int, default=5)
    # You can set this parameter to [laptop], [restaurant], [restaurant14], [restaurant15].
    parser.add_argument('--domain', type=str, default="laptop")
    # If you set the above parameter --domain to [laptop] or [restaurant], set this parameter to [data/prep_data/];
    # otherwise set it to [data/prep_data_15/].
    parser.add_argument('--data_dir', type=str, default="data/prep_data/")
    # Number of trailing training rows held out for validation.
    parser.add_argument('--valid', type=int, default=150)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--dropout', type=float, default=0.55)
    # You can replace the softmax layer with a CRF layer.
    parser.add_argument('--crf', type=str2bool, default=False)
    # This parameter decides whether to generate the training data for the post-process modules.
    # If you just want to train DE-CNN, you can set this parameter to False.
    parser.add_argument('--generate_data', type=str2bool, default=True)
    # Early-stopping mechanism.
    parser.add_argument('--earlystopping', type=str2bool, default=False)
    parser.add_argument('--patience', type=int, default=30)
    return parser


if __name__ == "__main__":
    parser = build_arg_parser()
    # parse_known_args tolerates unrecognised arguments instead of exiting
    # (presumably so the notebook kernel's own argv entries do not abort parsing).
    args = parser.parse_known_args()[0]

    run(args.domain, args.data_dir, args.model_dir, args.valid, args.runs, args.epochs, args.lr, args.dropout, args.batch_size, args.crf, args.generate_data, args.earlystopping, args.patience)