In [2]:
from datasets import load_from_disk, concatenate_datasets, load_dataset
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import torch
import time
from tqdm import tqdm
import os
# from multiprocess import set_start_method
from glob import glob
import json



# def set_up_device():
#     # single gpu
#     gpu_index = 1
#     if torch.cuda.is_available():
#         dev = f"cuda:{gpu_index}"
#     else:
#         dev = "cpu"
#     return gpu_index, torch.device(dev)
#     # return gpu_index, torch.device("cpu")

def load_multitask_datasets(dataset_disk_paths):
    """Load several on-disk datasets and concatenate them into one.

    For every path the ``train`` split is used when the loaded object is a
    split mapping that contains one; otherwise the loaded object is used as-is.

    Args:
        dataset_disk_paths: sized iterable of paths passed to ``load_from_disk``.

    Returns:
        The result of ``concatenate_datasets`` over the per-path datasets, in
        the order the paths were given.
    """
    print("picked datasets x prompt_template number:", len(dataset_disk_paths))
    datasets = []
    print('loading datasets train split ...')
    for p in tqdm(dataset_disk_paths):
        loaded_dataset = load_from_disk(p)
        # Only a split mapping (dict-like, split name -> dataset) can have a
        # "train" key. Guarding with isinstance keeps `in` a key lookup: on a
        # flat dataset the membership test would be answered by iterating over
        # every row, which is very slow for large datasets.
        if isinstance(loaded_dataset, dict) and "train" in loaded_dataset:
            loaded_dataset_train_split = loaded_dataset['train']
        else:
            print(f'INFO: no train split found, using entire dataset: {p}')
            loaded_dataset_train_split = loaded_dataset

        datasets.append(loaded_dataset_train_split)
    concatenated = concatenate_datasets(datasets)
    print("concatenated dataset:", concatenated)
    return concatenated

def load_single_dataset(dataset_path):
    """Load one on-disk dataset and return its ``train`` split when present.

    Args:
        dataset_path: path passed to ``load_from_disk``.

    Returns:
        ``loaded['train']`` when the loaded object is a split mapping with a
        ``train`` key; otherwise the loaded object itself.
    """
    loaded_dataset = load_from_disk(dataset_path)
    # Only a split mapping (dict-like, split name -> dataset) can have a
    # "train" key. Guarding with isinstance keeps `in` a key lookup: on a flat
    # dataset the membership test would be answered by iterating over every
    # row, which is very slow for large datasets.
    if isinstance(loaded_dataset, dict) and "train" in loaded_dataset:
        loaded_dataset_train_split = loaded_dataset['train']
    else:
        print(f'INFO: no train split found, using entire dataset: {dataset_path}')
        loaded_dataset_train_split = loaded_dataset
    return loaded_dataset_train_split

@torch.no_grad()
def setup_retriever(saved_index_dir, device):
    """Load the indexed dataset and the DPR question encoder used for retrieval.

    Reads ``config.json`` and ``index.faiss`` from ``saved_index_dir``, rebuilds
    the dataset from the config's ``dataset_paths`` and attaches the FAISS index
    to it under the name ``'embeddings'``.

    Args:
        saved_index_dir: directory containing ``index.faiss`` and ``config.json``.
        device: device the question encoder is moved to.

    Returns:
        Tuple ``(ds_with_embeddings, q_encoder, q_tokenizer)``.
    """
    # load dataset and index
    index_path = os.path.join(saved_index_dir, 'index.faiss')
    config_json = os.path.join(saved_index_dir, 'config.json')

    # Context manager so the config file handle is closed right after parsing.
    with open(config_json) as config_file:
        config = json.load(config_file)
    dataset_paths = config['dataset_paths']
    ds_with_embeddings = load_multitask_datasets(dataset_paths)
    # NOTE(review): device=-1 is forwarded unchanged to load_faiss_index; what a
    # negative value selects depends on the installed `datasets` version - confirm.
    ds_with_embeddings.load_faiss_index('embeddings', index_path, device=-1)

    q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    q_encoder.to(device)
    q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    return ds_with_embeddings, q_encoder, q_tokenizer

@torch.no_grad()
def retrieve(ds_with_embeddings, q_encoder, q_tokenizer, question, device, topk=3):
    """Encode ``question`` and look up its nearest neighbours in the index.

    Args:
        ds_with_embeddings: dataset exposing ``get_nearest_examples`` with an
            index named ``'embeddings'``.
        q_encoder: callable question encoder; element ``[0][0]`` of its output
            is used as the query vector.
        q_tokenizer: tokenizer called with ``return_tensors="pt"`` and
            ``truncation=True``.
        question: text to encode.
        device: device the tokenized inputs are moved to before encoding.
        topk: number of neighbours requested (``k``).

    Returns:
        Tuple ``(scores, retrieved_examples)`` as produced by
        ``get_nearest_examples``.
    """
    # Tokenize, then move the encoded inputs to the encoder's device.
    encoded_inputs = q_tokenizer(question, return_tensors="pt", truncation=True)
    encoded_inputs = encoded_inputs.to(device)

    # First output, first row -> numpy vector on the CPU for the index lookup.
    encoder_outputs = q_encoder(**encoded_inputs)
    query_vector = encoder_outputs[0][0].cpu().numpy()

    scores, retrieved_examples = ds_with_embeddings.get_nearest_examples(
        'embeddings', query_vector, k=topk
    )
    return scores, retrieved_examples


In [25]:

# Sanity check: concatenate two datasets, look at the row on the boundary
# between them, then drop the first dataset's rows again with select().
ds = load_single_dataset('/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_answer_the_following_q')
print(ds)
print(ds[0]['inputs_pretokenized'])
print('---------------------------')

ds2 = load_single_dataset("/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_mrpc_equivalent")
print(ds2)
print(ds2[0]['inputs_pretokenized'])
print('---------------------------')
concatenated_ds = concatenate_datasets([ds,ds2])
print(concatenated_ds)
print(concatenated_ds[0]['inputs_pretokenized'])

# Derive the boundary from the first dataset instead of hard-coding its row
# count, so the check still points at the first row after `ds` if `ds` changes.
first_ds_rows = ds.num_rows
print(concatenated_ds[first_ds_rows]['inputs_pretokenized'])

print(range(first_ds_rows, concatenated_ds.num_rows))
concatenated_ds = concatenated_ds.select(range(first_ds_rows, concatenated_ds.num_rows))
print(concatenated_ds)
print(concatenated_ds[0]['inputs_pretokenized'])


Dataset({
    features: ['inputs', 'inputs_pretokenized', 'targets', 'targets_pretokenized'],
    num_rows: 10000
})

Given the following passage

"When he became First Consul and later Emperor, Napoleon eschewed his general's uniform and habitually wore the simple green colonel uniform (non-Hussar) of a colonel of the Chasseur à Cheval of the Imperial Guard, the regiment that often served as his personal escort, with a large bicorne. He also habitually wore (usually on Sundays) the blue uniform of a colonel of the Imperial Guard Foot Grenadiers (blue with white facings and red cuffs). He also wore his Légion d'honneur star, medal and ribbon, and the Order of the Iron Crown decorations, white French-style culottes and white stockings. This was in contrast to the gorgeous and complex uniforms with many decorations of his marshals and those around him.",

answer the following question. Note that the answer is present within the text.

Question: What jewelry like accessories did he wear? 

In [17]:
# Dataset directories (one per prompt template) that are loaded together below.
tobe_removed = [
    "/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_answer_the_following_q",
    "/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_generate_question",
    "/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_tell_what_it_is",
    "/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_based_on",
    "/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_question_context_answer",
]

# Concatenate their train splits; the helper prints the combined dataset itself.
tobe_removed_ds = load_multitask_datasets(tobe_removed)

# Echo the path list (not the dataset) for the record.
print(tobe_removed)

picked datasets x prompt_template number: 5
loading datasets train split ...


100%|██████████| 5/5 [00:00<00:00, 17.48it/s]

concatenated dataset: Dataset({
    features: ['inputs', 'inputs_pretokenized', 'targets', 'targets_pretokenized'],
    num_rows: 50000
})
['/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_answer_the_following_q', '/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_generate_question', '/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_tell_what_it_is', '/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_based_on', '/cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/adversarial_qa_droberta_question_context_answer']





In [None]:
saved_index_dir_path="/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_3-part-1"

# load dataset and index
index_path = os.path.join(saved_index_dir_path,'index_.faiss')
config_json = os.path.join(saved_index_dir_path,'config_.json')

# Context manager so the config file handle is closed right after parsing.
with open(config_json) as config_file:
    config = json.load(config_file)
key_name, dataset_paths = config['key_name'], config['dataset_paths']
ds_with_embeddings = load_multitask_datasets(dataset_paths)
ds_with_embeddings.load_faiss_index('embeddings', index_path)

# Keep everything from row 50000 onwards.
# NOTE(review): 50000 presumably is the row count of the datasets being removed
# and assumes they come first in config['dataset_paths'] - confirm.
# NOTE(review): select() returns a new dataset; confirm whether the FAISS index
# attached above is still present on it and whether its row ids still line up
# once the leading rows are gone.
print(range(50000, ds_with_embeddings.num_rows))
ds_with_embeddings = ds_with_embeddings.select(range(50000, ds_with_embeddings.num_rows))
print(ds_with_embeddings)

In [None]:
# Persist the current `ds_with_embeddings`: first its FAISS index, then the rows.
# Relies on `saved_index_dir_path` and `ds_with_embeddings` set by an earlier cell.

# The index is written back into the directory it was loaded from, as index.faiss.
output_index_dir = saved_index_dir_path
saving_index_path = os.path.join(output_index_dir, "index.faiss")
# saving_config_json_path = os.path.join(output_dir, "config.json")
print('saving faiss index to ', saving_index_path)
ds_with_embeddings.save_faiss_index('embeddings', saving_index_path)

# The dataset itself goes under indexed_datasets/<basename of the index directory>.
output_dataset_dir = "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/indexed_datasets"
print('saving dataset ...')
saving_dataset_path = os.path.join(output_dataset_dir, os.path.basename(saved_index_dir_path))
ds_with_embeddings.save_to_disk(saving_dataset_path)


In [39]:
# Index directories to process; commented-out entries are skipped on this run.
saved_index_dir_paths = [
    # "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_1",
    "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_3-part-1",
    # "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_3-part-2"
]

output_dataset_dir = "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/indexed_datasets"

ds_list = []
for saved_index_dir_path in saved_index_dir_paths:
    # load dataset and index
    index_path = os.path.join(saved_index_dir_path,'index.faiss')
    config_json = os.path.join(saved_index_dir_path,'config.json')

    # Context manager so each config file handle is closed before the next iteration.
    with open(config_json) as config_file:
        config = json.load(config_file)
    key_name, dataset_paths = config['key_name'], config['dataset_paths']
    ds_with_embeddings = load_multitask_datasets(dataset_paths)
    # The rows are written to disk before the index is attached.
    # NOTE(review): presumably because save_to_disk rejects a dataset that still
    # carries an index - confirm before reordering these calls.
    ds_with_embeddings.save_to_disk(os.path.join(output_dataset_dir, os.path.basename(saved_index_dir_path)))
    print('save to disk ... ', os.path.basename(saved_index_dir_path))
    ds_with_embeddings.load_faiss_index('embeddings', index_path)
    ds_list.append(ds_with_embeddings)

# ds_list.append(load_from_disk(os.path.join(output_dataset_dir,"p3_subset_6_3-part-1")))

# concatenate
# concatenated_ds = concatenate_datasets(ds_list)
# print('saving concatenated dataset ...')
# saving_dataset_path = os.path.join(output_dataset_dir, "concatenated_6-1_6-3-part-2")
# ds_with_embeddings.save_to_disk(saving_dataset_path)

picked datasets x prompt_template number: 89
loading datasets train split ...


 13%|█▎        | 12/89 [00:00<00:03, 22.14it/s]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_qqp_answer_sampled


 16%|█▌        | 14/89 [00:20<03:26,  2.75s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_qqp_meaning_sampled


 17%|█▋        | 15/89 [00:31<04:55,  4.00s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_qqp_duplicate_sampled


 18%|█▊        | 16/89 [00:42<06:25,  5.28s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_qqp_duplicate_or_not_sampled


 19%|█▉        | 17/89 [00:42<05:04,  4.23s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_qqp_quora_sampled


 22%|██▏       | 20/89 [00:55<04:36,  4.01s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/glue_qqp_same_thing_sampled


 96%|█████████▌| 85/89 [01:32<00:12,  3.20s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/dbpedia_14_given_a_choice_of_categories__sampled


 97%|█████████▋| 86/89 [01:55<00:19,  6.45s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_sampled


 98%|█████████▊| 87/89 [02:17<00:18,  9.46s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/dbpedia_14_pick_one_category_for_the_following_text_sampled


100%|██████████| 89/89 [02:35<00:00,  1.75s/it]

INFO: no train split found, using entire dataset: /cephfs/user/mikeeewang/summer_22/workspace/data/p3_subset/dbpedia_14_given_a_list_of_category_what_does_the_title_belong_to_sampled





concatenated dataset: Dataset({
    features: ['inputs', 'inputs_pretokenized', 'targets', 'targets_pretokenized', 'answer_choices'],
    num_rows: 6421376
})
save to disk ...  p3_subset_6_3-part-1


In [4]:
# try loading the entire dataset:
# Each shard was saved under indexed_datasets/<name>; its FAISS index is read
# from output_indexing/<name>/index.faiss (same basename, different root).
# NOTE(review): the TypeError recorded in this cell's output mentions a keyword
# 'deivce', which does not appear in the code below (it passes `device`), so
# that output looks stale - re-run the cell to refresh it.
concatenated_shard_paths = [
    "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/indexed_datasets/p3_subset_6_1",
    "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/indexed_datasets/p3_subset_6_3-part-1",
    "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/indexed_datasets/p3_subset_6_3-part-2"
]
saved_index_dir = "/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing"

reloaded_ds_list = []
for shard in concatenated_shard_paths:
    print("loading:",shard)
    reloaded = load_from_disk(shard)
    # Index file lives in the directory named after the shard.
    index_path = os.path.join(saved_index_dir, os.path.basename(shard), "index.faiss")
    print('index_path:',index_path)
    reloaded.load_faiss_index('embeddings', index_path, device=-1)
    reloaded_ds_list.append(reloaded)
    print(reloaded)

# Merge the shards into one dataset (rebinds the notebook-level `ds_with_embeddings`).
ds_with_embeddings = concatenate_datasets(reloaded_ds_list)
print(ds_with_embeddings)

# Compare which indexes each shard reports with what the concatenated dataset reports.
# Note: the loop variable rebinds the notebook-level name `ds` set by an earlier cell.
for ds in reloaded_ds_list:
    print(ds.list_indexes())

print(ds_with_embeddings.list_indexes())

# ds_with_embeddings.save_faiss_index('embeddings', 'tmp.faiss')

# # try faiss
# retriever_device = torch.device("cuda:7")
# q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
# q_encoder.to(retriever_device)
# q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# input = "hello world"
# scores, retreived_examples = retrieve(
#     ds_with_embeddings,
#     q_encoder,
#     q_tokenizer,
#     question=input,
#     device=retriever_device,
#     topk=5
# )
# print(retreived_examples["inputs_pretokenized"])
# print(retreived_examples["targets_pretokenized"])


loading: /cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/indexed_datasets/p3_subset_6_1
index_path: /cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_1/index.faiss


TypeError: load_faiss_index() got an unexpected keyword argument 'deivce'

In [None]:

saved_index_dir_path="/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_1"

# load dataset and index
# Only the paths are built and the dataset rows loaded here; this cell does not
# attach the FAISS index at `index_path` to the dataset.
index_path = os.path.join(saved_index_dir_path,'index.faiss')
config_json = os.path.join(saved_index_dir_path,'config.json')

# Context manager so the config file handle is closed right after parsing.
with open(config_json) as config_file:
    config = json.load(config_file)
key_name, dataset_paths = config['key_name'], config['dataset_paths']
ds_with_embeddings = load_multitask_datasets(dataset_paths)


In [3]:
# Retrieval settings for this session.
retrieve_num, concat_num = 1, 1
retriever_device = torch.device("cuda:1")

# Encoder and tokenizer are loaded from the same checkpoint name.
dpr_question_checkpoint = "facebook/dpr-question_encoder-single-nq-base"
q_encoder = DPRQuestionEncoder.from_pretrained(dpr_question_checkpoint)
q_encoder.to(retriever_device)
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(dpr_question_checkpoint)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# retriever_device = torch.device("cuda:1")
# retrieve_num = 1
# concat_num = 1
# saved_index_dir_path="/cephfs/user/mikeeewang/summer_22/code/t-zero/retrieval_indexing/output_indexing/p3_subset_6_1"
# ds_with_embeddings, q_encoder, q_tokenizer = setup_retriever(
#     saved_index_dir=saved_index_dir_path,
#     device=retriever_device
# )

In [4]:
# Smoke-test the retriever with a throwaway query and show the top-5 hits.
# Relies on `ds_with_embeddings`, `q_encoder`, `q_tokenizer` and
# `retriever_device` set by earlier cells.
# The query is bound to `query_text` rather than `input`, so the builtin
# input() is not shadowed for the rest of the kernel session.
query_text = "hello world"
scores, retreived_examples = retrieve(
    ds_with_embeddings,
    q_encoder,
    q_tokenizer,
    question=query_text,
    device=retriever_device,
    topk=5
)
print(retreived_examples["inputs_pretokenized"])
print(retreived_examples["targets_pretokenized"])

['Jubilation greets boy #39;s miracle rescue ALMOST four days after being engulfed by a collapsing hill, two-year-old Yuta Minagawa was lifted alive yesterday afternoon from the wreckage of his mother #39;s car. \nIs this a piece of news regarding world politics, sports, business, or science and technology? ', '\nI know that the answer to the question "What was Walt Disney\'s last name?" is in "Several works from the Golden Age of Animation matched the action to classical music. Notable examples are Walt Disney\'s Fantasia, Tom and Jerry\'s Johann Mouse, and Warner Bros.\' Rabbit of Seville and What\'s Opera, Doc?.". Can you tell me what it is? ', 'Jubilation greets boy #39;s miracle rescue ALMOST four days after being engulfed by a collapsing hill, two-year-old Yuta Minagawa was lifted alive yesterday afternoon from the wreckage of his mother #39;s car. \n\nWhich of the following sections of a newspaper would this article likely appear in? World News, Sports, Business, or Science and 