# 1. Install jsonlines, NumPy
```
pip install jsonlines numpy
```


# 1-2. Install beir Library
1) Create a virtual environment (the first line is the general form; the second is the concrete command used in this guide)
```
python -m venv {name of virtual environment}
python -m venv beir_env
```

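2) Activate the virtual environment in bash (the `Scripts/activate` path applies to Git Bash on Windows; on Linux/macOS use `bin/activate` instead)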
```
source {name of virtual environment}/Scripts/activate
source beir_env/Scripts/activate
```

# 2. TriviaQA Dataset Download
```
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz
tar -xvf triviaqa-rc.tar.gz
```

# 3. Import Needed Libraries

In [3]:
!pip install beir

Collecting beir
  Using cached beir-2.0.0-py3-none-any.whl
Collecting sentence-transformers (from beir)
  Using cached sentence_transformers-3.0.0-py3-none-any.whl.metadata (10 kB)
Collecting pytrec-eval (from beir)
  Using cached pytrec_eval-0.5-cp311-cp311-win_amd64.whl
Collecting faiss-cpu (from beir)
  Using cached faiss_cpu-1.8.0-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Using cached elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting datasets (from beir)
  Using cached datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets->beir)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting xxhash (from datasets->beir)
  Using cached xxhash-3.4.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->beir)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub>=0.21.2 (from datasets->beir)


In [4]:
from tqdm import tqdm
import jsonlines
import numpy as np
import os
import json
from beir.datasets.data_loader import GenericDataLoader

  from tqdm.autonotebook import tqdm


# 4. Main Functions

### retrieve_eval_data_loader()

In [None]:
def retrieve_eval_data_loader(data_name, query_file, corpus_file, qrels_file):
    """Load an evaluation dataset through BEIR's ``GenericDataLoader``.

    Args:
        data_name: Dataset directory name; files are read from
            ``datasets/{data_name}/``.
        query_file: Forwarded to the loader as ``query_file``.
        corpus_file: Forwarded to the loader as ``corpus_file``.
        qrels_file: Forwarded to ``load`` as its ``split`` argument
            (e.g. "train" or "dev").

    Returns:
        The ``(corpus, queries, qrels)`` triple produced by the loader.
    """
    dataset_dir = f"datasets/{data_name}/"

    print("=====> Loading QA Dataset")
    print("=====> Loading Pool of Documents")

    loader = GenericDataLoader(
        data_folder=dataset_dir,
        corpus_file=corpus_file,
        query_file=query_file,
    )
    corpus, queries, qrels = loader.load(split=qrels_file)

    return corpus, queries, qrels

### custom_data_loader()

In [None]:
def custom_data_loader(args):
    """Read the QA pairs and the document pool and shape them for retrieval.

    Args:
        args: Object exposing ``data_name``, ``qa_data`` (path to a JSON Lines
            file whose records have ``"query"`` and ``"answers"`` keys) and
            ``documents_pool`` (path to a JSON Lines file whose records have
            ``"title"`` and ``"text"`` keys).

    Returns:
        A 6-tuple ``(retrieval_queries, retrieval_corpus, questions, answers,
        titles, texts)``:

        * ``retrieval_queries`` maps ``"{data_name}_{i}"`` to the i-th question.
        * ``retrieval_corpus`` maps ``str(i)`` to
          ``{"title": titles[i], "text": texts[i]}``.
        * ``questions``, ``answers``, ``titles`` and ``texts`` are lists in
          file order.

    Raises:
        ValueError: If ``args.data_name`` is not ``'triviaqa'``, the only
            dataset this loader handles.
    """
    if args.data_name != 'triviaqa':
        # Fail fast with a clear message: without this guard the return
        # statement would reference names that were never bound.
        raise ValueError(
            f"Unsupported data_name {args.data_name!r}; only 'triviaqa' is supported."
        )

    questions = []
    answers = []
    with jsonlines.open(args.qa_data) as reader:
        for record in reader.iter():
            questions.append(record["query"])
            answers.append(record["answers"])

    # Query ids follow the "{data_name}_{index}" scheme.
    retrieval_queries = {}
    for i, question in enumerate(tqdm(questions, desc="QA Data Preprocessing")):
        retrieval_queries[f"{args.data_name}_{i}"] = question

    titles = []
    texts = []
    with jsonlines.open(args.documents_pool) as reader:
        for record in reader.iter():
            texts.append(record["text"])
            titles.append(record["title"])

    # Document ids are the stringified position in the pool file.
    retrieval_corpus = {}
    for i in tqdm(range(len(titles)), desc="Documents_Pool Preprocessing"):
        retrieval_corpus[str(i)] = {"title": titles[i], "text": texts[i]}

    return retrieval_queries, retrieval_corpus, questions, answers, titles, texts

### make_new_dataset()

In [None]:
def make_new_dataset(args, retrieved_doc):
    """Attach the top-scoring retrieved documents to each question and save as JSON.

    Args:
        args: Object exposing ``data_name``, ``num_retrieval``,
            ``output_folder`` and ``retrieval_method``, plus whatever
            ``custom_data_loader`` reads from it.
        retrieved_doc: Mapping from query id ``"{data_name}_{i}"`` to a mapping
            ``{doc_id: score}``. Each ``doc_id`` must be convertible with
            ``int()`` and is used as an index into the loaded titles/texts.

    Side effects:
        Writes ``{output_folder}/{data_name}/{retrieval_method}_retrieved_docs.doc``
        (a UTF-8 JSON list with one item per question), creating the directory
        if needed.

    Raises:
        KeyError: If ``retrieved_doc`` has no entry for one of the questions.
    """
    _, _, questions, answers, titles, texts = custom_data_loader(args)
    print("=====> Starting Construction of Dataset")

    res = []
    # Iterate over questions (not documents): retrieval results are keyed by
    # query id, so the loop length must match the number of questions.
    for i, (question, answer) in enumerate(zip(questions, answers)):
        doc_scores = retrieved_doc['{}_{}'.format(args.data_name, i)]
        doc_ids = list(doc_scores.keys())
        scores = np.array(list(doc_scores.values()))
        order = np.argsort(scores)[::-1]  # positions sorted by descending score
        # Native Python numbers so json.dumps can serialize them regardless of
        # the NumPy dtype the scores were promoted to.
        native_scores = scores.tolist()

        ctxs = []
        for rank in range(min(len(doc_ids), args.num_retrieval)):
            pos = order[rank]
            doc_idx = int(doc_ids[pos])
            ctxs.append({
                'id': doc_idx,
                'title': titles[doc_idx],
                'text': texts[doc_idx],
                'score': native_scores[pos],
            })

        res.append({'question': question, 'answer': answer, 'contexts': ctxs})

    # The file lives in a per-dataset subdirectory, so create that full path.
    out_dir = os.path.join(args.output_folder, args.data_name)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{args.retrieval_method}_retrieved_docs.doc")

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be explicit rather than the platform default.
    with open(out_path, 'w', encoding='utf-8') as writer:
        writer.write(json.dumps(res, indent=4, ensure_ascii=False) + "\n")

    print("=====> All Procedure is finished!")

# 5. Run

In [None]:
class Args:
    """Static run configuration read by the loader and dataset-building functions."""

    # Dataset selector and retrieval settings.
    data_name: str = 'triviaqa'
    retrieval_method: str = 'retrieval_method'
    num_retrieval: int = 5

    # Input locations.
    qa_data: str = '/content/qa/wikipedia-dev.json'  # path to the TriviaQA QA data
    documents_pool: str = '/content/documents_pool.json'  # path to the document pool data

    # Output location.
    output_folder: str = 'output_folder'


args = Args()

In [None]:
# Example retrieval results: random scores for a few document ids per query.
# The generator is seeded so the demo output is reproducible, and the query
# ids are built from args.data_name so they match the keys make_new_dataset
# looks up.
NUM_DEMO_QUERIES = 5
NUM_DEMO_DOCS = 10
rng = np.random.default_rng(42)
retrieved_doc = {
    f'{args.data_name}_{i}': {str(j): rng.random() for j in range(NUM_DEMO_DOCS)}
    for i in range(NUM_DEMO_QUERIES)
}

make_new_dataset(args, retrieved_doc)