# Preprocessing of ASQA dataset

In [None]:
# Make sure the ASQA output directories exist under the project root.
import os

root = os.environ['PROJECT_ROOT']
for subfolder in ("source_documents", "human"):
    # exist_ok=True makes re-running this cell harmless.
    os.makedirs(os.path.join(root, "data", "ASQA", subfolder), exist_ok=True)


After downloading ASQA according to the instructions on the official [GitHub](https://github.com/google-research/language/tree/master/language/asqa) and copying the downloaded files to `data/ASQA/source_documents`, we take some additional preprocessing steps here. This notebook cuts down the dev set data significantly through the following steps:

1. Remove annotations that do not cite sources
2. Remove annotations based on wikipedia pages that do not exist (anymore)
3. Remove questions without any remaining annotations after steps 1 and 2.
4. Select 100 of the remaining questions.
5. Use only one annotation per question, remove additional annotations.
6. Create a list of all the cited wikipedia pages in the remaining annotations.
7. Scrape these wikipedia pages to construct a corpus.

This leaves us with 100 Questions, 100 annotated long-form answers, and all the necessary documents / knowledge to answer the questions.


In [None]:
# Number of questions kept in the final subset (used as the slice length in Step 4).
target_n_questions = 100

## Step 1 - remove annotations without cited sources

In [None]:
import json
from collections import defaultdict
import copy
import wikipediaapi
import tqdm
import os
root = os.environ['PROJECT_ROOT']

split = 'dev'  # train or dev

# Load the selected split of the raw ASQA file.
with open(f"{root}/data/ASQA/source_documents/ASQA.json", encoding="utf8") as f:
    data = json.load(f)[split]

keys = list(data.keys())
N = len(keys)  # number of questions before any filtering

# Filter a deep copy so the freshly loaded structure is not modified.
data_filtered = copy.deepcopy(data)
total_annotations = 0
removed_annotations = 0
for key in keys:
    kept = []
    for annotation in data_filtered[key]['annotations']:
        total_annotations += 1
        if len(annotation['knowledge']) == 0:
            # No cited source: this annotation is dropped.
            removed_annotations += 1
        else:
            kept.append(annotation)
    # Slice assignment updates the existing list object in place.
    data_filtered[key]['annotations'][:] = kept

print(f"Removed annotations: {removed_annotations} out of {total_annotations}")
data = data_filtered

## Step 2 - Remove annotations based on non-reachable wikipedia pages

This is not the most efficient way of working - we are scraping many more pages than we will end up using. However, this is the simplest method that retains easy control over the final number of QA pairs.

In [None]:
# Collect the cited pages: per question key in `wiki_pages`, and overall in `wiki_pages_set`.
wiki_pages = defaultdict(set)
wiki_pages_set = set()
for key, entry in data.items():
    for annotation in entry['annotations']:
        for knowledge in annotation['knowledge']:
            page_name = knowledge['wikipage']
            wiki_pages[key].add(page_name)
            wiki_pages_set.add(page_name)

print(f"Total wikipedia pages referenced: {len(wiki_pages_set)}")



In [None]:
# Determine which referenced pages exist on Wikipedia.
# A cached result is re-used when both JSON files can be read; otherwise every
# page in `wiki_pages_set` is queried and the resulting title lists are cached.
existing_pages_path = f"{root}/data/ASQA/source_documents/existing_pages.json"
non_existing_pages_path = f"{root}/data/ASQA/source_documents/non_existing_pages.json"

try:
    with open(existing_pages_path, "r", encoding="utf8") as f:
        existing_pages = json.load(f)

    with open(non_existing_pages_path, "r", encoding="utf8") as f:
        non_existing_pages = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Only a missing or unreadable cache triggers scraping. Anything else
    # (e.g. KeyboardInterrupt, permission problems) propagates instead of
    # silently starting a long scraping run.
    print("JSON files not found. Scraping wikipedia...")
    wiki_wiki = wikipediaapi.Wikipedia(user_agent='RAG-Eval', language='en')
    existing_pages = {}
    non_existing_pages = {}
    for page in tqdm.tqdm(list(wiki_pages_set)):
        scraped_page = wiki_wiki.page(page)
        if scraped_page.exists():
            existing_pages[page] = scraped_page
        else:
            non_existing_pages[page] = scraped_page

    # Only the page names are cached, so a cache hit yields lists of names while
    # a fresh scrape leaves dicts keyed by name; both support len() and `in`.
    with open(existing_pages_path, "w", encoding="utf8") as f:
        json.dump(list(existing_pages.keys()), f)

    with open(non_existing_pages_path, "w", encoding="utf8") as f:
        json.dump(list(non_existing_pages.keys()), f)


print(f"Existing pages: {len(existing_pages)}")
print(f"Non-existing pages: {len(non_existing_pages)}")

In [None]:
# Drop every annotation that cites at least one page listed in `non_existing_pages`.
# The filtering is applied to a deep copy, which then replaces `data`.
filtered_data = copy.deepcopy(data)

# `non_existing_pages` is a list of names (loaded from cache) or a dict keyed by
# name (fresh scrape); a set gives constant-time membership tests for both.
missing_pages = set(non_existing_pages)

total_annotations = 0
removed_annotations = 0
for key in data:
    kept = []
    for annotation in filtered_data[key]['annotations']:
        total_annotations += 1
        cites_missing_page = any(
            knowledge['wikipage'] in missing_pages
            for knowledge in annotation['knowledge']
        )
        if cites_missing_page:
            removed_annotations += 1
        else:
            kept.append(annotation)
    filtered_data[key]['annotations'] = kept

print(f"Removed annotations: {removed_annotations} out of {total_annotations}")
data = filtered_data

## Step 3 - Remove questions without any remaining annotations

In [None]:
# Drop every question whose annotation list is empty.
empty_keys = [key for key, entry in data.items() if len(entry['annotations']) == 0]

filtered_data = copy.deepcopy(data)
for key in empty_keys:
    del filtered_data[key]
removed_questions = len(empty_keys)

print(f"Removed {removed_questions} out of {N} questions.")
N -= removed_questions  # keep the running question count in sync
data = filtered_data

## Step 4 and 5 - Select 100 questions, and one annotation per question

In [None]:
# Step 4: keep only the first `target_n_questions` questions (dict order).
data = dict(list(data.items())[:target_n_questions])

# Step 5: on a deep copy, keep only the first annotation of every question.
filtered_data = copy.deepcopy(data)
total_annotations = sum(len(entry['annotations']) for entry in data.values())
deleted_annotations = 0
for entry in filtered_data.values():
    surplus = len(entry['annotations']) - 1
    if surplus > 0:
        del entry['annotations'][1:]
        deleted_annotations += surplus

print(f"Deleted {deleted_annotations} out of {total_annotations} annotations.")

data = filtered_data


## Step 6 - create a list of all the cited wikipedia pages in the remaining annotations

In [None]:
# Rebuild the page index for the questions still in `data`:
# `wiki_pages` maps question key -> cited pages, `wiki_pages_set` is their union.
wiki_pages = defaultdict(set)
wiki_pages_set = set()
for key in data:
    for annotation in data[key]['annotations']:
        cited_pages = [knowledge['wikipage'] for knowledge in annotation['knowledge']]
        for cited_page in cited_pages:
            wiki_pages_set.add(cited_page)
            wiki_pages[key].add(cited_page)

print(f"Total wikipedia pages referenced: {len(wiki_pages_set)}")

## Step 7 - Scrape the pages to get the corpus


In [None]:
# Fetch every page in `wiki_pages_set`; pages reported as missing are logged and skipped.
wiki_wiki = wikipediaapi.Wikipedia(user_agent='RAG-Eval', language='en')
corpus_pages = {}
for page in tqdm.tqdm(list(wiki_pages_set)):
    scraped_page = wiki_wiki.page(page)
    if not scraped_page.exists():
        print(f"WARNING: {page} was not found")
        continue
    corpus_pages[page] = scraped_page


In [None]:
# Map each scraped page name to the page object's `.text` attribute.
corpus_texts = {}
for page_name, page_obj in corpus_pages.items():
    corpus_texts[page_name] = page_obj.text

## Save all the files

In [None]:
# Write the corpus (page name -> text) to disk as indented JSON.
corpus_path = f"{root}/data/ASQA/corpus.json"
with open(corpus_path, "w", encoding="utf8") as corpus_file:
    json.dump(corpus_texts, corpus_file, indent=3)

In [None]:
# Pair each ambiguous question with the long answer of its first annotation.
qa_pairs = [
    {
        "question": entry['ambiguous_question'],
        "reference": entry['annotations'][0]['long_answer'],
    }
    for entry in data.values()
]

with open(f"{root}/data/ASQA/human/questions.json", "w", encoding="utf8") as f:
    json.dump(qa_pairs, f, indent=3)
    

# ====================