In [1]:
from tqdm import tqdm, trange
import numpy as np

In [2]:
def process_file(input_file, output_file):
    """
    Convert a Delete & Generate data file into the format used for Delete,
    Retrieve and Generate training, keeping ALL attribute words.

    Every input line is split on the "<START>" marker. The part before the
    marker (with "<POS>", "<NEG>" and "<CON_START>" removed) is the content;
    the part after it (with "<END>" removed) is the full sentence. The
    attribute words are the sentence tokens that do not occur in the content.

    Each output line has the form:
        <ATTR_WORDS> attrs <CON_START> content <START> sentence <END>

    Lines that contain no "<START>" marker are skipped, which matches the
    behaviour of process_file_v1.

    input_file: string : Path of the input file
    output_file: string : Path of the output file (overwritten)
    """
    with open(input_file) as fp:
        lines = fp.read().splitlines()

    with open(output_file, "w") as out_fp:
        for line in tqdm(lines):
            parts = line.split("<START>")
            if len(parts) < 2:
                # No "<START>" marker: there is no sentence part to process.
                continue

            con = parts[0].replace("<POS>", "").replace("<NEG>", "").replace("<CON_START>", "")
            sen = parts[1].replace("<END>", "")

            # Set membership keeps the attribute filter linear in sentence length.
            content_tokens = set(con.split())
            att_tokens = [tok for tok in sen.split() if tok not in content_tokens]

            # NOTE: an earlier version tracked the largest attribute count in an
            # uninitialised local (`max_atts`), which raised UnboundLocalError on
            # the first line. The value was never used, so it has been removed.
            att_words = " ".join(att_tokens)
            out_str = "<ATTR_WORDS> " + att_words + " <CON_START> " + con.strip() + " <START> " + sen.strip() + " <END>" + "\n"
            out_fp.write(out_str)
    

In [7]:
def process_file_v1(input_file, output_file, keep_ratio=0.7):
    """
    Convert a Delete & Generate data file into the format used for Delete,
    Retrieve and Generate training, keeping only a random subset of the
    attribute words.

    Every input line is split on the "<START>" marker. The part before the
    marker (with "<POS>", "<NEG>" and "<CON_START>" removed) is the content;
    the part after it (with "<END>" removed) is the full sentence. The
    attribute words are the sentence tokens that do not occur in the content.

    When a line has more than two attribute words, int(keep_ratio * count) of
    them are picked at random (in shuffled order), so generation is more than
    just filling in the blanks. With two or fewer attribute words, all of them
    are kept in their original order.

    Each output line has the form:
        <ATTR_WORDS> attrs <CON_START> content <START> sentence <END>

    Lines that contain no "<START>" marker are skipped.

    The random choice uses NumPy's global random state (np.random.shuffle);
    call np.random.seed(...) beforehand for reproducible output.

    input_file: string : Path of the input file
    output_file: string : Path of the output file (overwritten)
    keep_ratio: float : Fraction of attribute words to keep, between 0 and 1
                        (default 0.7)

    Raises ValueError if keep_ratio is outside [0, 1].
    """
    if not 0.0 <= keep_ratio <= 1.0:
        raise ValueError("keep_ratio must be between 0 and 1, got {!r}".format(keep_ratio))

    with open(input_file) as fp:
        lines = fp.read().splitlines()

    with open(output_file, "w") as out_fp:
        for line in tqdm(lines):
            parts = line.split("<START>")
            if len(parts) < 2:
                # Malformed line (no "<START>" marker): skip it explicitly
                # instead of hiding every possible error behind a bare except.
                continue

            con = parts[0].replace("<POS>", "").replace("<NEG>", "").replace("<CON_START>", "")
            sen = parts[1].replace("<END>", "")

            # Set membership keeps the attribute filter linear in sentence length.
            content_tokens = set(con.split())
            att_tokens = [tok for tok in sen.split() if tok not in content_tokens]

            if len(att_tokens) > 2:
                # Shuffle the positions and keep the first int(ratio * n) of them.
                order = np.arange(len(att_tokens))
                np.random.shuffle(order)
                n_keep = int(keep_ratio * len(att_tokens))
                att_tokens = [att_tokens[pos] for pos in order[:n_keep]]
            # Otherwise (two or fewer attribute words) keep all of them.

            att_words = " ".join(att_tokens)
            out_str = "<ATTR_WORDS> " + att_words + " <CON_START> " + con.strip() + " <START> " + sen.strip() + " <END>" + "\n"
            out_fp.write(out_str)

In [5]:
import os

# Dataset locations, given relative to data_dir below.
# (Original note on the input dataset: amazon / yelp / imagecaption.)
inp_dataset = "data/processed_files/"  # files read by the conversion cells
out_dataset = "data/processed_files/dre_model"  # where converted files are written

# All paths are resolved against the directory the notebook is run from.
data_dir = os.getcwd()

In [14]:
# Convert the train split. Each pair is (input file name, output file name);
# note that "en_2.txt" is written out as "en.txt".
for src_name, dst_name in (("all.txt", "all.txt"), ("en_2.txt", "en.txt"), ("trump.txt", "trump.txt")):
    src_path = os.path.join(data_dir, "{}/train/{}".format(inp_dataset, src_name))
    dst_path = os.path.join(data_dir, "{}/train/{}".format(out_dataset, dst_name))
    # open(..., "w") does not create missing directories, so make sure the
    # output folder exists before converting.
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    process_file_v1(src_path, dst_path)

100%|██████████| 23094/23094 [00:02<00:00, 9560.85it/s] 
100%|██████████| 9382/9382 [00:01<00:00, 5325.39it/s]
100%|██████████| 13711/13711 [00:01<00:00, 13663.81it/s]


In [15]:
# Convert the test split. Each pair is (input file name, output file name);
# note that "en_2.txt" is written out as "en.txt".
for src_name, dst_name in (("all.txt", "all.txt"), ("en_2.txt", "en.txt"), ("trump.txt", "trump.txt")):
    src_path = os.path.join(data_dir, "{}/test/{}".format(inp_dataset, src_name))
    dst_path = os.path.join(data_dir, "{}/test/{}".format(out_dataset, dst_name))
    # open(..., "w") does not create missing directories, so make sure the
    # output folder exists before converting.
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    process_file_v1(src_path, dst_path)

100%|██████████| 91782/91782 [00:02<00:00, 36074.50it/s]
100%|██████████| 91633/91633 [00:02<00:00, 37409.91it/s]
100%|██████████| 148/148 [00:00<00:00, 9592.31it/s]


In [16]:
# Convert the dev split. Each pair is (input file name, output file name);
# note that "en_2.txt" is written out as "en.txt".
for src_name, dst_name in (("all.txt", "all.txt"), ("en_2.txt", "en.txt"), ("trump.txt", "trump.txt")):
    src_path = os.path.join(data_dir, "{}/dev/{}".format(inp_dataset, src_name))
    dst_path = os.path.join(data_dir, "{}/dev/{}".format(out_dataset, dst_name))
    # open(..., "w") does not create missing directories, so make sure the
    # output folder exists before converting.
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    process_file_v1(src_path, dst_path)

100%|██████████| 3175/3175 [00:00<00:00, 7288.05it/s]
100%|██████████| 1294/1294 [00:00<00:00, 4483.99it/s]
100%|██████████| 1880/1880 [00:00<00:00, 10229.35it/s]


In [None]:
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_train.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train.txt")
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_train_1.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1.txt")
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_train_0.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0.txt")

In [None]:
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_test.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test.txt")
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_test_1.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_1.txt")
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_test_0.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_0.txt")

In [None]:
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_dev.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev.txt")
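# NOTE(review): the two calls below cross the labels (sentiment_dev_0 is written to sentiment_dev_1 and vice versa), unlike the train/test cells above — confirm this is intentional before re-enabling.
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_dev_0.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1.txt")
# process_file_v1("./processed_files_with_bert_with_best_head/sentiment_dev_1.txt","./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0.txt")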