# Automatically generate Q & A pairs from the WikiData graph

See README.md for more information.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/notebooks/data-augmentation/wikidata-qa/wikidata.ipynb)

In [1]:
# Uncomment and run the lines below to set up the environment when running in Colab.
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/notebooks/data-augmentation/wikidata-qa
# !pip install -r requirements.txt

In [2]:
import requests
import json
import datetime
import time
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm

from typing import Optional, Any

In [3]:
class WikiGraph:
    """Client for the WikiData API that caches entities and builds Q&A text from them.

    Entities are requested from ``https://www.wikidata.org/w/api.php`` and kept in
    ``self.cache``, keyed by language, then QID, then fetch depth. When a file
    path is supplied the cache is also persisted to that path as CSV.
    """

    # HTTP headers sent with every API request.
    HEADER = {
        "User-Agent": "Mozilla/5.0 (compatible; WikiDataGraphCrawler/0.1)",
    }
    # Pause between consecutive API calls, expressed in milliseconds.
    TIMER = 200  # wait ms between calls

    def __init__(self, file: Optional[str] = None, language: str = "en", seed: int = 12345678) -> None:
        self.file = file
        assert language in ("en",), f"This language is not yet supported: {language}"
        self.language = language
        self.cache = {self.language: {}}
        np.random.seed(seed)
        self.calls = 0
        self.last_call = 0
        if self.file:
            self._load()

    def _save(self) -> None:
        if not self.file:
            return
        df = {"language": [], "qid": [], "depth": [], "desc": [], "graph": []}
        for lang in self.cache:
            for qid in self.cache[lang]:
                for depth in self.cache[lang][qid]:
                    df["language"].append(lang)
                    df["qid"].append(qid)
                    df["depth"].append(depth)
                    df["desc"].append(json.dumps(self.cache[lang][qid][depth]["desc"]))
                    df["graph"].append(json.dumps(self.cache[lang][qid][depth]["graph"]))
        df = pd.DataFrame(df)
        df.to_csv(self.file, index=False)

    def _load(self) -> None:
        assert self.file
        try:
            df = pd.read_csv(self.file)
        except FileNotFoundError:
            return
        self.cache = {}
        for index, row in df.iterrows():
            if row["language"] not in self.cache:
                self.cache[row["language"]] = {}
            if row["qid"] not in self.cache[row["language"]]:
                self.cache[row["language"]][row["qid"]] = {}
            self.cache[row["language"]][row["qid"]][row["depth"]] = {
                "desc": row["desc"] if isinstance(row["desc"], dict) else json.loads(row["desc"]),
                "graph": row["graph"] if isinstance(row["graph"], dict) else json.loads(row["graph"]),
            }

    def _get(self, params: dict) -> dict:
        """Send a rate-limited GET request to the WikiData API and return the decoded JSON.

        Args:
            params: Query parameters forwarded to the API.

        Returns:
            The parsed JSON response.

        Raises:
            Exception: With ``(code, info)`` as arguments when the response
                contains an ``"error"`` entry.
        """
        self.calls += 1
        # TIMER is in milliseconds while time.time() is in seconds, so convert
        # the elapsed time before comparing; otherwise every call sleeps ~TIMER ms.
        elapsed_ms = (time.time() - self.last_call) * 1000.0
        wait_ms = max(0.0, self.TIMER - elapsed_ms)
        if wait_ms:
            time.sleep(wait_ms / 1000.0)
        data = requests.get("https://www.wikidata.org/w/api.php", headers=self.HEADER, params=params)
        self.last_call = time.time()
        result = data.json()
        if "error" in result:
            raise Exception(result["error"]["code"], result["error"]["info"])
        return result

    def search(self, query: str) -> list:
        params = {"action": "wbsearchentities", "search": query.strip(), "language": self.language, "format": "json"}
        result = self._get(params)

        if "search" not in result or not result["search"]:
            return []
        output = []
        for item in result["search"]:
            allow = False
            if "display" in item:
                if "label" in item["display"]:
                    if "language" in item["display"]["label"] and item["display"]["label"]["language"] == self.language:
                        allow = True
                if not allow and "description" in item["display"]:
                    if (
                        "language" in item["display"]["description"]
                        and item["display"]["description"]["language"] == self.language
                    ):
                        allow = True
            if not allow and "match" in item:
                if "language" in item["match"] and item["match"]["language"] == self.language:
                    allow = True
            if allow:
                output.append({key: item[key] if key in item else "" for key in ["id", "label", "description"]})
        return output

    def _fetch(self, qid: str, depth: int = 1) -> str:
        qid = qid.upper() if isinstance(qid, str) else f"Q{qid}"
        if qid in self.cache[self.language] and self.cache[self.language][qid]:
            largest = int(sorted(self.cache[self.language][qid].keys())[-1])
            if largest >= depth:
                return self.cache[self.language][qid][largest]["desc"]["label"]
        else:
            self.cache[self.language][qid] = {}

        params = {"action": "wbgetentities", "ids": qid, "language": self.language, "format": "json"}
        result = self._get(params)

        if "entities" not in result or qid not in result["entities"] or not result["entities"][qid]:
            raise ValueError(f"No entities found for {qid}")

        hit = result["entities"][qid]
        desc = {"qid": qid, "language": self.language, "label": "", "aliases": [], "description": ""}
        if "labels" in hit and self.language in hit["labels"] and "value" in hit["labels"][self.language]:
            desc["label"] = hit["labels"][self.language]["value"]
        # elif self.language != "en" and "en" in hit["labels"] and "value" in hit["labels"]["en"]:
        #    desc["label"] = hit["labels"]["en"]["value"]

        if "aliases" in hit and self.language in hit["aliases"]:
            desc["aliases"] = [item["value"] for item in hit["aliases"][self.language] if "value" in item]
        if (
            "descriptions" in hit
            and self.language in hit["descriptions"]
            and "value" in hit["descriptions"][self.language]
        ):
            desc["description"] = hit["descriptions"][self.language]["value"]

        graph = {}
        if "claims" in hit and depth > 0:
            for key in tqdm(hit["claims"]):
                if "datavalue" not in hit["claims"][key][0]["mainsnak"]:
                    continue
                results = []
                for elem in hit["claims"][key]:
                    item = elem["mainsnak"]["datavalue"]["value"]
                    if isinstance(item, dict) and "id" in item and item["id"] == qid:
                        results.append(desc["label"])
                    else:
                        results.append(self._parse(item, qid, depth))
                graph[key] = results

        self.cache[self.language][qid][depth] = {"desc": deepcopy(desc), "graph": deepcopy(graph)}
        self._save()

        return desc["label"]

    def _parse(self, item: Any, qid: str, depth: int) -> str:
        result = ""
        if isinstance(item, dict):
            if "amount" in item:
                unit = item["unit"].split("/Q")[-1] if "unit" in item else ""
                result = item["amount"][(1 if item["amount"][0] == "+" else 0) :]
                if unit and unit != "1":
                    unit = f"Q{unit}"
                    if unit == qid:
                        unit = desc["label"]
                    else:
                        unit = self._fetch(unit, 0)
                    if unit:
                        result = f"{result} {unit}"
            if "latitude" in item and "longitude" in item:
                result = f'{item["latitude"]} {item["longitude"]}'
            elif "time" in item:
                result = str(item["time"])
                if "T00:00:00Z" in result:
                    result = result.split("T00:00:00Z")[0]
                    if "-00-00" in result:
                        result = result.split("-00-00")[0]
                if result[0] == "+":
                    result = result[1:]
                elif result[0] == "-":
                    if self.language == "en":
                        result = f"{result} BC"
            elif "id" in item:
                result = self._fetch(item["id"], depth - 1)
        elif isinstance(item, (str, int, float, bool)):
            result = str(item)
        return result

    def _zalgo(self, question: str) -> str:
        if len(question) > 2 and np.random.choice([True, False]):
            if np.random.choice([True, False]):
                # make it lowercase or all caps
                if np.random.choice([True, False]):
                    question = question.upper()
                else:
                    question = question.lower()
            if np.random.choice([True, False]):
                # add typo: remove characters at random
                question = "".join([c for c, v in zip(question, np.random.normal(0, 1, len(question))) if v < 3.0])
            if np.random.choice([True, False]):
                # add typo: swap characters
                n = np.random.randint(len(question) - 1)
                question = question[:n] + question[n + 1] + question[n] + question[n + 2 :]

            # question marks
            if np.random.choice([True, False]):
                if question[-1] == "?":
                    if np.random.choice([True, False]):
                        question = question[:-1]
                    else:
                        for i in range(np.random.randint(5)):
                            question += "?"
                elif np.random.choice([True, False]):
                    question = question[:-1]
        return question

    def generate(self, qid: str, zalgo: bool = False, **kwargs):
        self._fetch(qid)
        if self.language == "en":
            return self._generate_en(qid=qid, zalgo=zalgo, **kwargs)
        else:
            raise NotImplementedError(f"Unknown language: {self.language}")

    def _generate_en(
        self, qid: str, zalgo: bool = False, pronoun: str = "it", proper: bool = True
    ) -> str:  # it is a proper noun
        def _pronoun(which: str) -> tuple:
            if which in ("he", "him", "his"):
                return "he", "him", "his"
            elif which in ("she", "her"):
                return "she", "her", "her"
            elif which in ("it", "its"):
                return "it", "it", "its"
            else:
                return "they", "them", "their"

        def _add_a(name: str) -> str:
            if np.random.choice([True, False]):
                return f"the {name}"
            elif name[0].lower() in ("a", "e", "i", "o", "u"):
                return f"an {name}"
            else:
                return f"a {name}"

        sub, obj, pos = _pronoun(pronoun)

        # question
        Q = {
            "P6": [
                "Do you know who the prime minister of {name} is?",
                "Who is the president of {name}?",
                "Who is the governor of {name}?",
            ],
            "P17": [
                "Can you tell me Which country {name} is in?",
                "Which country is {name} located in?",
                "Where is {name} located in the world?",
            ],
            "P19": ["Do you know Where {name} was born at?", "What is {name}'s place of birth?"],
            "P20": ["Can you tell me where {name} died?", "Where has {name} died?"],
            "P22": ["Do you know who {name}'s father is?", "What is {name}'s father called?", "Who is {name}'s dad?"],
            "P25": [
                "Tell me who {name}'s mother is.",
                "What is {name}'s mother called?",
                "Who is {name}'s mom?",
                "Who's {name}'s mum?",
            ],
            "P27": [
                "Do you have any information on what country {name} is from?",
                "Where is {name} from?",
                "Where does {name} originate from?",
                "What is {name}'s country of origin?",
            ],
            "P30": [
                "Do you happen to know what continent {name} is under?",
                "Which continent is {name} in?",
                "Which continent does {name} belong to?",
            ],
            "P36": [
                "Please tell me, what the capital of {name} is?",
                "What's {name}'s capital city? Thank you in advance!",
            ],
            "P37": [
                "Tell me what the offical language of {name} is?",
                "What language do they speak in {name}?",
                "How do they speak in {name}?",
                "What languages they understand in {name}?",
            ],
            "P38": ["Do you know what {name}'s currency is?", "What currency do they use in {name}?"],
            "P40": [
                "List {name}'s children.",
                "Who are {name}'s children?",
                "What are the names of {name}'s children?",
                "Does {name} have children?",
                "How many children does {name} have?",
                "Does {name} have any kids?",
                "How many children does {name} have?",
            ],
            "P50": ["Give me the name of the author for {name}.", "Who wrote {name}?", "Who's the author for {name}?"],
            "P57": [
                "Do you know who directed {name}?",
                "Who directed {name}?",
                "Who is the director of {name}?",
                "{name} is directed by whom?",
            ],
            "P61": ["Do you know who invented {name}?", "Who discovered {name}?", "{name} was invented by whom?"],
            "P106": [
                "List the places {name} works at.",
                "Where does {name} work at?",
                "What is {name}'s occupation?",
                "What does {name} do?",
                "What does {name} work?",
                "Where does {name} work at?",
                "What does {name} work in?",
            ],
            "P138": [
                "Describe what {name} was named after.",
                "Do you know what {name} was named after?",
                "What was {name} named after?",
                "Who was {name} named after?",
                "Why is {name} called {name}?",
                "Why is {name} named like that?",
            ],
            "P169": [
                "Tell me who is {name} the CEO of.",
                "Who's {name} the CEO of?",
                "Which company is {name} the CEO of?",
            ],
            "P170": [
                "Tell me more about the creator of {name}.",
                "Who crated {name}?",
                "Who is {name}'s creator?",
                "Who made {name}?",
                "Who is reponsible for {name}?",
            ],
            "P225": [
                "Describe {name} to me in latin.",
                "What is {name}'s scientific name?",
                "What is {name}'s taxon name?",
                "How do you say {name} in latin?",
                "What is {name} in latin?",
            ],
            "P246": [
                "Tell me {name}'s formula.",
                "What is the formula for {name}?",
                "What is the chemical formula of {name}?",
                "What is the molecular formula of {name}?",
                "Which chemical element is {name}?",
                "Describe the chemical compound for {name}.",
                "What is the chemical symbol for {name}?",
            ],
            "P274": [
                "Tell me the chemical formula for {name}.",
                "What is the formula for {name}?",
                "What is the chemical formula of {name}?",
                "What is the molecular formula of {name}?",
                "Which chemical element is {name}?",
                "Describe the chemical compound for {name}.",
                "What is the chemical symbol for {name}?",
            ],
            "P275": [
                "Describe {name}'s license.",
                "What's {name}'s license?",
                "Is {name} copyrighted?",
                "Does {name} have a copyright license?",
                "What license is associated with {name}?",
            ],
            "P366": [
                "Give me use cases for {name}.",
                "What's a use-case for {name}?",
                "What is {name}'s main use case?",
                "How is {name} used?",
                "What is {name} good for?",
            ],
            "P487": [
                "Say in emoji: {name}.",
                "Is there an emoji for {name}?",
                "Which unicode character does represent {name}?",
            ],
            "P509": ["Can you tell me how {name} died?", "What did {name} die of?", "What caused {name}'s death?"],
            "P527": [
                "What are the ingredients of {name}?",
                "What are {name}s made of?",
                "What are {name}s created from?",
                "What are the parts of {name}?",
            ],
            "P569": [
                "Do you know when {name} was born?",
                "When did {name} born?",
                "When was {name} born?",
                "When is {name}'s birthday?",
            ],
            "P570": [
                "Do you have information on the date of {name}'s death?",
                "When did {name} die?",
                "Is {name} dead?",
                "Is {name} still alive?",
            ],
            "P571": [
                "Do you have information on when {name} was first created? Thanks!",
                "When was {name} created?",
                "When was {name} first released?",
            ],
            "P575": [
                "Please tell me when {name} was first discovered. Thank you!",
                "When was {name} invented?",
                "What was the date when {name} was finally discovered?",
            ],
            "P576": [
                "Do you know when {name} was discontinued?",
                "When was {name} demolished?",
                "At what time was {name} dissolved?",
            ],
            "P580": [
                "Can you recall when {name} started?",
                "When did {name} start?",
                "What was the starting date for {name}?",
                "When did {name} break out?",
            ],
            "P582": [
                "Do you have information on the date when {name} ended?",
                "When did {name} end?",
                "What was the ending date of {name}?",
                "When was {name} finally over?",
            ],
            "P625": [
                "Give me the coordinates for {name}!",
                "Locate {name}.",
                "What is {name}'s location?",
                "Where can I find {name}?",
                "What are the GPS coordinates for {name}?",
            ],
            "P837": [
                "Tell me when {name} is!",
                "When is {name} celebrated?",
                "On which day is {name}?",
                "When is {name} day?",
            ],
            "P856": [
                "Give me the URL for {name}.",
                "What's the URL for {name}? Thanks!",
                "What's {name}'s website?",
                "What is the offical website for {name}?",
                "Can you tell me the link to {name}?",
            ],
            "P973": [
                "Return the URL for {name}!",
                "Where can I find more information on {name}?",
                "Where can I read more abou {name} online?",
                "Is there a site that explains {name} in detail?",
            ],
            "P1082": [
                "Count the number of people who live in {name}!",
                "What is {name}'s population?",
                "What is the population of {name}?",
                "How many people live in {name}?",
            ],
            "P1120": [
                "Do you know the number of people who died in {name}?",
                "How many people have died due to {name}?",
                "How many people have lost their lives in {name}?",
                "What is the number of fatalities after {name}?",
                "How many people have lost their lives in {name}?",
            ],
            "P2043": ["Calculate the length of {name}!", "How long is {name}?", "What is {name}'s length?"],
            "P2044": [
                "Do you know how tall {name} is?",
                "How tall is {name}?",
                "How high is {name}?",
                "How many meters is {name} above sea level?",
                "What is {name}'s elevation?",
            ],
            "P2046": ["Is {name} big?", "How big is {name}?", "What is the area of {name}?", "How big is {name}?"],
            "P2049": [
                "Describe the width of {name}.",
                "What's {name}'s width?",
                "How wide is {name}?",
                "What's the width of {name}?",
            ],
            "P2250": [
                "Do you know how long {name} lives?",
                "What is the life expectancy of {name}?",
                "How long do {name}s live?",
            ],
            "P2283": [
                "Describe {name} in detail.",
                "How does {name} work?",
                "What makes {name} work in theory? Thanks for the answer!",
            ],
            "P3063": ["I need information on the gestation period of {name}s.", "How long are {name}s pregnant?"],
            "P3373": [
                "List {name}'s siblings.",
                "Who are {name}'s siblings?",
                "What are the names of {name}'s brothers and sisters?" "Does {name} have any siblings?",
                "Does {name} have a brother or sister?",
                "How many siblings does {name} have?",
                "How many brothers and sisters does {name} have?",
            ],
            "P4511": ["Calculate the depth of {name}!", "How deep is {name}?", "What is {name}'s vertical depth?"],
            "P4733": [
                "Do you know the noise {name} makes?",
                "What does {name} say?",
                "What sound does {name} make?",
                "How does {name} sound like?",
            ],
            "P7767": ["How would you serve {name} for me?", "How are {name}s served?"],
        }
        # reference to name in question
        Qp = {
            "P6": ["Tell me who {pos} governor is!", "Who's the governor?", "Who is {pos} president?"],
            "P17": [
                "Do you know which country is that in?",
                "Which country is {sub} in?",
                "Under which country is {sub} located?",
            ],
            "P19": [
                "Can you tell me the place {sub} was born at? Thanks!",
                "Where was {sub} born?",
                "What is {pos} place of birth?",
            ],
            "P20": [
                "Where di {sub} die, can you tell me that?",
                "Where did {sub} die?",
                "What is the place of {pos} death?",
            ],
            "P22": [
                "Who is {pos} father, respond with his name. Thank you.",
                "What is {pos} father called?",
                "Who is {pos} dad?",
                "What's {pos} father's name?",
            ],
            "P25": [
                "Who is {pos} mother, respond with her name.",
                "What is {pos} mother called?",
                "Who is {pos} mom?",
                "Who's {pos} mum?",
                "What's {pos} mother's name?",
            ],
            "P27": [
                "Can you tell me where {sub} came from?",
                "Where is {sub} from?",
                "Where does {sub} come from?",
                "Where does {sub} originate from?",
            ],
            "P30": [
                "Tell me the name of the continent {sub} is in.",
                "Which continent is {sub} in?",
                "Which continent does {sub} belong to?",
            ],
            "P36": ["Do you know {pos} capital?", "What is {pos} capital called?", "What's the name of {pos} capital?"],
            "P37": [
                "Describe {pos} official language.",
                "What is {pos} offical language?",
                "What language do they speak there?",
            ],
            "P38": [
                "Tell me more about {pos} currency.",
                "What is {pos} currency?",
                "Which currencies are used there?",
            ],
            "P40": [
                "I need more information on {pos} children.",
                "Who are {pos} children?",
                "What are the names of {pos} kids?",
                "How many children does {sub} have?",
                "Does {sub} have kids?",
                "Does {sub} have any children?",
                "How many kids {sub} got?",
            ],
            "P50": [
                "Please, describe {pos} author.",
                "Who wrote {obj}?",
                "Who's {pos} author?",
                "Who {pos} author is?",
            ],
            "P57": [
                "Who is {obj} director, do you have information on that in your database?",
                "Who directed {obj}?",
                "Who is {pos} director?",
            ],
            "P61": ["Tell me who invented {obj}!", "Do you know who discovered {obj} first?"],
            "P106": [
                "Do you have data on {pos} jobs?",
                "Where does {sub} work at?",
                "What does {sub} do for a living?",
                "What's {pos} job?",
                "What is {pos} occupation?",
            ],
            "P138": [
                "Explain how {sub} got {pos} name!",
                "How did {sub} get {pos} name?",
                "Where did {sub} get {pos} name from?",
                "Why is {sub} called {name}?",
            ],
            "P169": [
                "Give me information on the companies {sub} is the CEO at.",
                "Is {sub} the CEO of a company?",
                "Which company is {sub} the CEO of?",
            ],
            "P170": [
                "Who made {obj}, can you tell me?",
                "Who crated {obj}?",
                "Who is {pos} creator?",
                "Who made {obj}?",
            ],
            "P225": [
                "Translate {pos} name to latin.",
                "What is {pos} scientific name?",
                "How do you call {obj} in latin?",
                "How to say {obj} in latin?",
            ],
            "P246": [
                "Give me {pos} formula.",
                "What is {pos} formula?",
                "What is {pos} chemical formula?",
                "What is {pos} molecular formula?",
                "Which chemical element is {sub}?",
                "Describe the chemical compound for {obj}.",
                "What is the chemical symbol for {obj}?",
            ],
            "P274": [
                "Describe {pos} chemical formula!",
                "What is {pos} formula?",
                "What is {pos} chemical formula?",
                "What is {pos} molecular formula?",
                "Which chemical element is {sub}?",
                "Describe the chemical compound for {obj}.",
                "What is the chemical symbol for {obj}?",
            ],
            "P275": [
                "Do you know which license {pos} is under?",
                "What's {pos} license?",
                "Is {sub} copyrighted?",
                "What license was {sub} released under?",
            ],
            "P366": [
                "And {pos} use cases are?",
                "What is {pos} main use case?",
                "How is {sub} used?",
                "What is {sub} good for?",
                "What does {sub} do?",
            ],
            "P487": [
                "Write {obj} down using emojis only.",
                "Does {sub} have an emoji?",
                "Is there a unicode character for {sub}?",
            ],
            "P509": [
                "Can you tell me hat {sub} died of?",
                "What did {sub} die of?",
                "What was the cause of {pos} death?",
            ],
            "P527": [
                "List {pos} parts.",
                "What are {pos} ingredients?",
                "What are they made of?",
                "What are their parts?",
            ],
            "P569": ["{pos} birthday is?", "When did {sub} born?", "When was {sub} born?", "When is {pos} birthday?"],
            "P570": ["Is {sub} dead?", "When did {sub} die?", "Is {sub} dead?", "Did {sub} die?"],
            "P571": [
                "Do you know the date of {pos} inception?",
                "When was {sub} first released?",
                "And when was {sub} actually created?",
            ],
            "P575": ["Tell me the date of {pos} discovery!", "When was {sub} invented then?"],
            "P576": [
                "Can you tell me the date {sub} was dinally discontinued?",
                "When was {sub} demolished?",
                "At what time was {sub} dissolved?",
            ],
            "P580": ["Write down the exact date {sub} started!", "When did {sub} start?", "When did {sub} break out?"],
            "P582": ["Write down the exact date {sub} ended.", "When was {sub} finally over?", "When did {sub} end?"],
            "P625": ["I need {pos} GPS location!", "What is {pos} GPS location?", "What are {pos} coordinates?"],
            "P837": ["When would you celebrate {obj}?", "When is {sub} celebrated?", "On which day is {sub}?"],
            "P856": [
                "Send me {pos} web address.",
                "What's the address of {pos} website?",
                "What is {pos} offical website?",
                "Can you tell me the link to {obj}?",
            ],
            "P973": [
                "Can you give me more information on {obj}?",
                "Where can I find more info on {obj}?",
                "Where can I read more about {obj} online?",
                "Is there a site that explains {obj} in detail?",
            ],
            "P1082": ["Estimate {pos} population.", "How many people live there?", "How large is {pos} population?"],
            "P1120": [
                "Estimate the number of people who died in the event!",
                "How many people died?",
                "How deadly was {sub}?",
                "How many fatalities were there?",
            ],
            "P2043": ["Do you know {pos} length?", "How long is {sub}?", "What'S the length of {obj}?"],
            "P2044": [
                "Is {sub} tall?",
                "How tall is {sub}?",
                "How high is {sub}?",
                "Is {name} above sea level?",
                "What is {pos} elevation?",
            ],
            "P2046": ["Provide information on {pos} area.", "How big is {sub}?", "How big is {pos} area?"],
            "P2049": ["Calculate {pos} width!", "How wide is {sub}?", "What is {pos} width?"],
            "P2250": ["Can you tell me how long {sub} live?", "What is {pos} life expectancy?", "How long they live?"],
            "P2283": [
                "Tell me how {sub} works under the hood.",
                "Do you know how {sub} works?",
                "How does {sub} work under the hood?",
                "How does {obj} run?",
            ],
            "P3063": ["So how long is their gestation period?", "How long do they stay pregnant?"],
            "P3373": [
                "List {pos} siblings please.",
                "Who are {pos} brothers and sisters?",
                "What are the names of {pos} siblings?",
                "Does {sub} have any siblings?",
                "Does {sub} have a brother or sister?",
                "How many siblings does {sub} have?",
                "How many brothers and sisters does {sub} have?",
            ],
            "P4511": ["Do you know if {sub} is really deep?", "Is {sub} deep?", "How deep is {sub} really?"],
            "P4733": ["Mimic {pos} sound!", "What sound does {sub} make?", "How does {sub} sound like?"],
            "P7767": ["Would you serve {obj} hot or cold?", "How are they usually served?"],
        }

        # single answer
        A = {
            "P6": ["{name}'s president is {a}.", "{name}'s prime minister is {a}."],
            "P17": ["{name} is located in {a}.", "{name} is found in the country of {a}."],
            "P19": ["{name} was born in {a}.", "{sub} was born in {a}.", "In {a}."],
            "P20": ["{name} died in {a}.", "{sub} died in {a}."],
            "P22": ["{name}'s father is {a}.", "{pos} father is called {a}.", "His name is {a}.", "It's {a}."],
            "P25": ["{name}'s mother is {a}.", "{pos} mother is called {a}.", "Her name is {a}.", "It's {a}."],
            "P27": ["{name} originates from {a}.", "{sub} is from {a}.", "{name} comes from {a}."],
            "P30": ["{name} is part of {a}.", "{name} is part of the continent of {a}."],
            "P36": ["{name}'s capital city is {a}.", "{pos} capital is called {a}.", "The capital of {name} is {a}."],
            "P37": ["The official language of {name} is {a}.", "The people in {name} speak {a}."],
            "P38": [
                "{name}'s currency is the {a}.",
                "{name} uses {a} as their currency.",
                "The currency of {name} is the {a}.",
            ],
            "P40": [
                "{name} has one child named {a}.",
                "{name} has a single child named {a}.",
                "{sub} has a child named {a}.",
            ],
            "P50": ["{name} was written by {a}.", "The author of {name} is {a}.", "{a} is {pos} author."],
            "P57": ["{name} was directed by {a}.", "{sub} was directed by {a}."],
            "P61": ["{name} was discovered by {a}.", "{sub} was discovered by {a}."],
            "P106": ["{name} works at {a}.", "{sub} works at {a}.", "{name} job title is {a}."],
            "P138": ["{name} was named after {a}.", "{name} got {pos} name from {a}.", "{pos} name comes from {a}."],
            "P169": ["{name} is the CEO of {a}.", "{sub} is the CEO of {a}."],
            "P170": [
                "{sub} was created by {a}.",
                "{a} created {name}.",
                "The creator of {name} is {a}.",
                "{a} made {obj}.",
                "{sub} was created by {a}.",
            ],
            "P225": ["{name} is called {a} in latin.", "The scientific term for {name} is {a}."],
            "P246": ["The element of {name} is {a}.", "The symbol for {name} is {a}."],
            "P274": ["The formula for {name} is {a}.", "The chemical formula of {name} is {a}."],
            "P275": [
                "{name} has the following license: {a}.",
                "{name} has a {a} license associated with {obj}.",
                "{name} was released under {a}.",
                "{sub} is licensed under {a}.",
            ],
            "P366": [
                "{name} is most commonly used for {a}.",
                "{sub} is used mostly for {a}.",
                "{name} is mostly known for {a}.",
            ],
            "P487": ["{a}", "The {name} emoji is {a}.", "The {a} character repesents {name}."],
            "P509": ["{name} died of {a}.", "The cause of {pos} death was {a}."],
            "P527": ["{name} are made of {a}.", "They are made of {a}."],
            "P569": ["{name} was born on {a}.", "{pos} birthday is on the {a}."],
            "P570": ["{name} died at {a}", "{sub} died in {a}."],
            "P571": [
                "{name} was created in {a}.",
                "The date of {pos} inception is {a}.",
                "{name} was first released in {a}.",
            ],
            "P575": ["{name} was invented at {a}.", "{name} was discovered in {a}."],
            "P576": [
                "{name} was discontinued after {a}.",
                "{name} was demolished by {a}.",
                "{sub} got dissolved at {a}.",
            ],
            "P580": ["{name} started in {a}.", "{name} first started at {a}."],
            "P582": ["{name} ended in {a}.", "{name} lasted until {a}."],
            "P625": ["{name} is lcoated at {a}.", "The coordinates for {name} are {a}.", "{pos} GPS location is {a}."],
            "P837": ["{name} is celebrated on {a}.", "{name} is on {a}."],
            "P856": [
                "The URL for {name} is: {a}",
                "See {a}",
                "The URL of {pos} webiste is {a}",
                "{pos} web address is: {a}",
            ],
            "P973": [
                "You can find out more at {a}",
                "Here's a link on {name}: {a}",
                "You can find out more about {obj} on {a}",
            ],
            "P1082": [
                "{name}'s population is {a}.",
                "Around {a} people live in {name}.",
                "{pos} population is estimated to be around {a}.",
            ],
            "P1120": [
                "The number of deaths was {a}.",
                "The number of fatalities was {a}.",
                "{a} died due to {name}.",
                "{name} has taken the lives of {a}.",
            ],
            "P2043": ["{name} is {a} long.", "{sub} has a length of {a}."],
            "P2044": ["{name} is {a} tall.", "{name} is {a} above sea level.", "{pos} elevation is {a}."],
            "P2046": ["{name}'s area is {a}", "{pos} area is {a}."],
            "P2049": ["{name}'s widht is {a}.", "{name} is {a} wide."],
            "P2250": ["{name} have a life expectancy of {a}.", "{pos} life expectancy is about {a}."],
            "P2283": [
                "{name} uses {a} to work.",
                "{sub} works via {a}.",
                "{name} works through {a}.",
                "{sub} makes use of {a}.",
            ],
            "P3063": [
                "The gestation period for {name}s is {a}.",
                "The amount of time needed for their gestation period is known to be {a}.",
            ],
            "P3373": ["{name} has a siblings called {a}.", "{sub} has a sibling named {a}."],
            "P4511": ["{name} has a depth of {a}.", "{name} can be as deep as {a}.", "{pos} vertical depth is {a}."],
            "P4733": ["{name} makes the following sound: {a}", "{name} makes a {a} sound.", "The {name} says {a}."],
            "P7767": ["{name}s are served {a}.", "{name} is usually served {a}."],
        }
        # plural / multiple answers
        Ap = {
            "P6": ["The governors of {name} are {a}.", "The ministers of {name} are {a}."],
            "P37": ["The official languages of {name} are {a}.", "They speak {a}."],
            "P38": [
                "{name} accepts {a}.",
                "{name} uses {a} as their countriy's currencies.",
                "The currencies of {name} are {a}.",
            ],
            "P40": [
                "{name} has {l} children: {a}.",
                "The number of children {name} has is {l}. Their names are {a}.",
                "{pos} {l} children are {a}.",
            ],
            "P50": ["{name} was co-written by {a}.", "The authors of {name} are {a}."],
            "P57": ["{name} was direcrted by the following people: {a}.", "{a} were the directors of {name}."],
            "P61": ["{pos} inventors are {a}.", "{name} was discovered by {a}."],
            "P106": ["{name} has multiple occupations: {a}.", "{name}'s job titles are: {a}."],
            "P169": ["{name} is the CEO of multiple companies, such as {a}.", "{sub} is the CEO at {a}."],
            "P225": ["The taxon names for {name} are {a}.", "The proper scientific terms for {name} are {a}."],
            "P246": ["The elements of {name} are {a}.", "The symbols for {name} are {a}."],
            "P274": ["The formulas for {name} are {a}.", "The chemical formulas of the compound {name} are {a}."],
            "P487": ["The {name} emojis are {a}.", "The characters {a} repesent {name}."],
            "P527": ["The ingredients of {name} are {a}.", "{a} are all parts needed for {name}."],
            "P575": [
                "Sources disagree on the exact date, it is said that {name} was invented in {a}.",
                "{name} was discovered multiple times at {a}.",
            ],
            "P856": ["The URLs for {name} are: {a}", "See {a}", "The URLs of {pos} webiste are {a}"],
            "P625": [
                "{name} can be found under the following GPS locations: {a}.",
                "The coordinates for {name} are {a}.",
            ],
            "P973": ["You can find out more at {a}", "You can find out more about {obj} at {a}"],
            "P1120": [
                "There are multiple sources on the number of fatalities: {a}",
                "{name} is know to take the lives of somewhere between {a}.",
            ],
            "P1082": [
                "There are multiple sources on {pos} population: {a}.",
                "There are different sources on {name}'s population: {a}.",
            ],
            "P2046": ["{name}'s area has changed over time: {a}", "{pos} area has altered over the ages to {a}."],
            "P3373": [
                "{name} has {l} siblings: {a}.",
                "The number of brothers and sisters {name} has is {l}. Their names are {a}.",
            ],
            "P4733": ["{name} makes sounds like {a}.", "The sounds {sub} often makes are {a}."],
        }

        assert len(Q.keys()) == len(A.keys())

        largest = int(sorted(self.cache[self.language][qid].keys())[-1])
        qs = [key for key in Q.keys() if key in self.cache[self.language][qid][largest]["graph"]]
        if not qs:
            return ""
        np.random.shuffle(qs)

        if np.random.choice([True, False]):
            if np.random.choice([True, False]):
                results = [f'Questions about {self.cache[self.language][qid][largest]["desc"]["label"]}:']
            else:
                results = [
                    f'Questions and Answers on {self.cache[self.language][qid][largest]["desc"]["label"]}, {self.cache[self.language][qid][largest]["desc"]["description"]}:'
                ]
        else:
            if np.random.choice([True, False]):
                results = [
                    f'Questions about {self.cache[self.language][qid][largest]["desc"]["label"]} ({self.cache[self.language][qid][largest]["desc"]["description"]}):'
                ]
            else:
                results = [
                    f'Questions and Answers on {self.cache[self.language][qid][largest]["desc"]["label"]} (also known as {", ".join(self.cache[self.language][qid][largest]["desc"]["aliases"])}):'
                ]

        for i, key in enumerate(qs):
            if np.random.choice([True, False]):
                name = self.cache[self.language][qid][largest]["desc"]["label"]
            else:
                name = np.random.choice(self.cache[self.language][qid][largest]["desc"]["aliases"])
            if not proper:
                name = _add_a(name)
            if i == 0 or np.random.choice([True, False]):
                question = np.random.choice(Q[key]).format(name=name, sub=sub, obj=obj, pos=pos)
            else:
                question = np.random.choice(Qp[key] if key in Qp else Q[key]).format(
                    name=name, sub=sub, obj=obj, pos=pos
                )
            if zalgo:
                question = self._zalgo(question)

            a = self.cache[self.language][qid][largest]["graph"][key]
            l = len(a)
            if key not in Ap or l <= 1:
                if l <= 1:
                    a = a[0]
                else:
                    a = ", ".join(a[:-1]) + f" and {a[-1]}"
                answer = np.random.choice(A[key]).format(q=question, name=name, sub=sub, obj=obj, pos=pos, a=a, l=l)
            else:
                a = ", ".join(a[:-1]) + f" and {a[-1]}"
                answer = np.random.choice(Ap[key]).format(q=question, name=name, sub=sub, obj=obj, pos=pos, a=a, l=l)

            results.append(f"Q: {question}\r\nA: {answer[0].upper()}{answer[1:]}")
        return "\n\n".join(results)

## Search for a concept and use its QID to generate question and answer pairs.

In [4]:
# Build the WikiGraph object that all following cells use as `wg`.
# Because `file` is set, the constructor calls `_load()` for that path and
# `_save()` writes the cache to it as CSV; `language` and `seed` keep their
# defaults ("en" and 12345678).
wg = WikiGraph(file="cache.tmp")  # will save cached graph to cache.tmp

In [5]:
# Search for QID: look up entities matching a free-text phrase.
# The displayed result is a list of dicts with `id` (the QID), `label` and
# `description`; pick the `id` of the intended entity for `wg.generate`.
# Left as a bare expression so the notebook renders the returned list.
wg.search("chatgpt")

[{'id': 'Q115564437',
  'label': 'ChatGPT',
  'description': 'pre-trained language model developed by OpenAI'},
 {'id': 'Q116786574',
  'label': 'ChatGPT: Na volta às aulas, experimentação precisa ser o caminho, defendem especialistas',
  'description': 'webpage'},
 {'id': 'Q116488506',
  'label': 'ChatGPT is fun, but not an author',
  'description': 'scientific article'},
 {'id': 'Q116294278',
  'label': 'ChatGPT listed as author on research papers: many scientists disapprove',
  'description': 'scientific article published on 18 January 2023'}]

In [6]:
# ChatGPT (QID Q115564437, the first hit of the search for "chatgpt").
# `generate` returns one string: a header line followed by "Q: ... / A: ..."
# pairs separated by blank lines, or "" when no question template matches
# the entity's graph. No optional arguments are passed here.
print(wg.generate(qid="Q115564437"))

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:10<00:00,  3.44it/s]

Questions about ChatGPT (pre-trained language model developed by OpenAI):

Q: Where can I read more abou GPT-3.5 online?
A: Here's a link on GPT-3.5: https://openai.com/blog/chatgpt/

Q: What's the URL for ChatGPT? Thanks!
A: The URL for ChatGPT is: https://chat.openai.com/chat

Q: When was Generative Pre-trained Transformer created?
A: Generative Pre-trained Transformer was created in 2022-11-30.

Q: What makes GPT-3.5 work in theory? Thanks for the answer!
A: It works via Reinforcement Learning from Human Feedback and Proximal Policy Optimization.

Q: Tell me who invented it!
A: ChatGPT was discovered by OpenAI.

Q: Why is GPT-3.5 named like that?
A: GPT-3.5 got its name from online chat and Generative Pre-trained Transformer.

Q: And its use cases are?
A: ChatGPT is most commonly used for natural language generation and machine translation.

Q: Who is reponsible for Generative Pre-trained Transformer?
A: OpenAI made it.

Q: What license is associated with Generative Pre-trai




In [7]:
# Bill Gates (QID Q5284).
# NOTE(review): `pronoun="he"` presumably selects the he/him/his forms used
# for the {sub}/{obj}/{pos} template placeholders -- the output uses
# "He"/"his"; confirm against the signature of `generate`.
print(wg.generate("Q5284", pronoun="he"))

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:55<00:00,  3.63it/s]

Questions about Bill Gates (American businessman and philanthropist (born 1955)):

Q: How many children does Bill Gates have?
A: His 3 children are Jennifer Katherine Gates, Phoebe Adele Gates and Rory John Gates.

Q: Who is his dad?
A: His name is Bill Gates Sr..

Q: Who is his mother, respond with her name.
A: It's Mary Maxwell Gates.

Q: Where is William Henry Gates III from?
A: He is from United States of America.

Q: What does William Henry, III Gates work in?
A: William Henry, III Gates has multiple occupations: entrepreneur, programmer, computer scientist, inventor, financier, bridge player, investor, actor, philanthropist, writer, international forum participant and business magnate.

Q: What's the URL for William Henry "Bill" Gates III? Thanks!
A: His web address is: http://www.thegatesnotes.com/

Q: his birthday is?
A: His birthday is on the 1955-10-28.

Q: List his siblings please.
A: The number of brothers and sisters William Gates has is 2. Their names are Kristian




In [8]:
# Budapest (QID Q1781).
# `zalgo=True` passes every generated question through `self._zalgo` before
# it is emitted; answers are not passed through it.
# NOTE(review): `_zalgo` presumably injects noise into the question text
# (e.g. the all-caps question in the output) -- confirm in its definition.
print(wg.generate("Q1781", zalgo=True))

100%|████████████████████████████████████████████████████████████████████████████████| 152/152 [01:41<00:00,  1.50it/s]

Questions and Answers on Budapest (also known as Buda Pest, Buda-Pest, Budapešť, Budapesta, Budapeszt, Buda, Ofen, Budín, Budim, Budon, Pest, Pešť, Pešta, Alt-Ofen, Budapest, Hungary):

Q: What is Budapest's location?
A: The coordinates for Budapest are 47.498333333333 19.040833333333.

Q: Do you know the date of its inception?
A: Budon was first released in 1873-11-17.

Q: WHERE IS BUDAPEST LOCATED IN THE WORLD
A: Budapest is located in Hungary, Kingdom of Hungary, Hungarian Republic, Hungarian Soviet Republic, First Hungarian Republic, Austria-Hungary, First Hungarian Republic, Republic of Hungary and People's Republic of Hungary.

Q: How did it get its name?
A: Its name comes from Buda and Pest.

Q: How big is Budim?
A: Budim's area is 52514 hectare

Q: What is its offical website?
A: The URLs for Budapest are: https://budapest.hu and https://budapest.hu/sites/english/

Q: Tell me who its governor is!
A: The governors of Budapest are István Tarlós, Gábor Demszky and Gergely K




In [9]:
# Hamburger (QID Q6663).
# `proper=False` makes `generate` pass each chosen label/alias through
# `_add_a` before filling the templates.
# NOTE(review): judging by the output ("a hamburger", "the hamburger"),
# `_add_a` prefixes an article for common nouns -- confirm in its definition.
print(wg.generate("Q6663", proper=False))

100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:13<00:00,  3.53it/s]

Questions and Answers on hamburger, American sandwich of ground beef patty:

Q: Describe what a Hamburgh ſauſage was named after.
A: A Hamburgh ſauſage got its name from sandwich and Hamburg.

Q: How is a hamburger used?
A: A hamburger is most commonly used for eating and burger eating contest.

Q: Say in emoji: the hamburger.
A: The 🍔 character repesents the hamburger.

Q: What are the ingredients of the Hamburgh ſauſage?
A: Patty, cheese, bread, lettuce, tomato and onion are all parts needed for the Hamburgh ſauſage.

Q: When did it start?
A: A hamburger started in 1758.

Q: How are the hamburgers served?
A: The hamburger is usually served hot.



