# Parser for Question-Answer Dataset
This notebook parses the corpus of manually-generated factoid questions from Wikipedia articles. 

**Acknowledgments**
http://www.cs.cmu.edu/~ark/QA-data/

These data were collected by Noah Smith, Michael Heilman, Rebecca Hwa, Shay Cohen, Kevin Gimpel, and many students at Carnegie Mellon University and the University of Pittsburgh between 2008 and 2010.
 
Their research project was supported by NSF IIS-0713265 (to Smith), an NSF Graduate Research Fellowship (to Heilman), NSF IIS-0712810 and IIS-0745914 (to Hwa), and Institute of Education Sciences, U.S. Department of Education R305B040063 (to Carnegie Mellon).

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/data/datasets/cmu_wiki_qa/cmu_parser.ipynb)

In [1]:
# uncomment and run below lines to set up if running in colab
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/data/datasets/cmu_wiki_qa/
# !pip install -r requirements.txt

In [13]:
# download the dataset

import requests
import tarfile

with requests.get("http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz", stream=True) as resp:
    with tarfile.open(fileobj=resp.raw) as f:
        f.extractall()  # extract to ./Question_Answer_Dataset_v1.2

print("Done.")

In [2]:
# global settings

FOLDER = "Question_Answer_Dataset_v1.2"  # input folder containing subfolders with Q&A csv files
CSV = "question_answer_pairs.txt"  # csv files to look for in these subfolders
SOURCE = "wikipedia/cmu_qa"  # source to use in the parquet for each row
MUST_INCLUDE_ENTITY = True  # questions must include a reference to the article
MUST_BE_QUALITY = True  # only quality answers are accepted

In [3]:
import os
import io
import re
import json

from tqdm import tqdm

import numpy as np
import pandas as pd

In [4]:
# get all files and concatenate them into a single df
data = None
for file in os.listdir(FOLDER):
    if os.path.isdir(os.path.join(FOLDER, file)):
        for match in os.listdir(os.path.join(FOLDER, file)):
            if match.strip() == CSV:
                match = os.path.join(FOLDER, file, match)
                with open(match, "r", encoding="ISO-8859-1") as m:
                    raw = m.read().encode("utf-8").decode("utf-8")
                data = pd.concat([data, pd.read_csv(io.StringIO(raw), sep="\t")])
data.drop_duplicates()
data.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,data/set3/a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,data/set3/a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,data/set3/a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,data/set3/a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,data/set3/a4


In [5]:
# clean up the df, remove duplicates and answers that are way too short, etc.
clean = {col: [] for col in ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]}

for name, group in tqdm(data.groupby("ArticleTitle")):
    entity = "|".join([re.escape(item) for item in name.lower().strip().split("_")])
    for question, qgroup in group.groupby("Question"):
        # see if the question includes the article title (instead of referring to it, such as he, she, it, etc.)
        if MUST_INCLUDE_ENTITY and not re.findall(r"(?i)\b(" + entity + r").?", question):
            continue
        # use the longest answer
        qgroup.sort_values(by="Answer", key=lambda x: x.str.len(), inplace=True, ascending=False)
        for _, row in qgroup.iterrows():
            if not row["Answer"] or pd.isna(row["Answer"]):
                continue  # no answer given
            quality = True
            if re.findall(r"(?i)^(yes|no)\W*$", str(row["Answer"])):
                quality = False  # yes / no answer
            elif len(row["Answer"]) < 4:
                quality = False  # short answer
            elif len(row["Answer"].split()) < 2:
                quality = False  # one word answer
            if not quality and MUST_BE_QUALITY:
                continue  # skip low quality

            clean["INSTRUCTION"].append(question.strip())
            clean["RESPONSE"].append(row["Answer"])
            clean["SOURCE"].append(SOURCE)
            clean["METADATA"].append(
                json.dumps(
                    {
                        "article_title": name,
                        "article_file": row["ArticleFile"],
                        "difficulty_questioner": row["DifficultyFromQuestioner"],
                        "diffculty_answerer": row["DifficultyFromAnswerer"],
                        # "quality": quality,
                    }
                )
            )
            break  # include only one answer for each question

100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 46.44it/s]


In [6]:
# remove accidental duplicates
clean = pd.DataFrame(clean)
clean.sort_values(by="RESPONSE", key=lambda x: x.str.len(), inplace=True, ascending=False)
clean.drop_duplicates(subset=["INSTRUCTION"], inplace=True)
clean.sort_index(inplace=True)
clean.head()

Unnamed: 0,INSTRUCTION,RESPONSE,SOURCE,METADATA
0,How long was Lincoln's legal Career?,23 years,wikipedia/cmu_qa,"{""article_title"": ""Abraham_Lincoln"", ""article_..."
1,How many long was Lincoln's formal education?,18 months.,wikipedia/cmu_qa,"{""article_title"": ""Abraham_Lincoln"", ""article_..."
2,What trail did Lincoln use a Farmers' Almanac in?,"he defended William ""Duff"" Armstrong",wikipedia/cmu_qa,"{""article_title"": ""Abraham_Lincoln"", ""article_..."
3,When did Lincoln first serve as President?,"March 4, 1861",wikipedia/cmu_qa,"{""article_title"": ""Abraham_Lincoln"", ""article_..."
4,Which county was Lincoln born in?,"Southeast Hardin County, Kentucky",wikipedia/cmu_qa,"{""article_title"": ""Abraham_Lincoln"", ""article_..."


In [8]:
print(f"Retrieved {len(clean) / len(data) * 100.:.2f}% of all questions ({len(clean)})")

Retrieved 18.71% of all questions (748)
