In [1]:
import os
import requests
import json
import pandas as pd
import time


def get_questions(previous_id, count=100):
    base_url = "https://rulesguru.net/api/questions/"
    parameters = {
        "count": count,
        "level": ["0", "1", "2", "3", "Corner Case"],
        "complexity": ["Simple", "Intermediate", "Complicated"],
        "legality": "All of Magic",
        "tags": [],
        "previousId": int(previous_id),
    }

    response = requests.get(base_url, params={"json": json.dumps(parameters)})
    response.raise_for_status()
    return response.json()


if os.path.exists("data/rules_guru/rules_guru_qa_dataset.csv"):
    df = pd.read_csv("data/rules_guru/rules_guru_qa_dataset.csv")
    previous_id = df["id"].max()
else:
    df = pd.DataFrame()
    previous_id = 1

count = 100
while True:
    time.sleep(3)
    questions = get_questions(previous_id, count=count)
    if questions["status"] != 200:
        if (
            questions["status"] == 400
            and questions["error"] == "Incorrectly formatted json."
        ):
            count = count // 2
            print(f"Reducing count ({count})...")

            if count < 10:
                previous_id += 1
                count = 100

            continue

        print("ERROR:")
        print(questions)
        break
    else:
        if count < 100:
            print(f"Restoring count to 100...")
            count = 100

    if len(df) == 0:
        df = pd.DataFrame(questions["questions"])
    else:
        df = pd.concat([df, pd.DataFrame(questions["questions"])])
        df = df.drop_duplicates(subset="id", keep="first")

    previous_id = df["id"].max()

    df.to_csv("data/rules_guru/rules_guru_qa_dataset.csv", index=False)

    print(f"Downloaded {len(df)} questions / ID {previous_id}")

Downloaded 1457 questions / ID 7357
ERROR:
{'status': 429, 'error': "Please don't send more than one request every 2 seconds."}


In [50]:
import pandas as pd
from html import unescape
import re


def clean_text(text):
    # Decodificar entidades HTML
    text = unescape(text)

    # Proteger nombres de cartas en doble corchetes
    card_names = re.findall(r"\[\[.*?\]\]", text)
    card_dict = {f"<<{i}>>": card_names[i] for i in range(len(card_names))}
    for key, value in card_dict.items():
        text = text.replace(value, key)

    # Eliminar URLs de Markdown
    text = re.sub(r"\[.*?\]\(.*?\)", "", text)

    # Eliminar URLs independientes
    text = re.sub(r"http[s]?://\S+", "", text)

    # Eliminar sintaxis de Markdown (negrita, cursiva)
    text = re.sub(r"\*{1,2}|_{1,2}", "", text)

    # Eliminar líneas de separadores de Markdown
    text = re.sub(r"^---\s*$", "", text, flags=re.MULTILINE)

    # Restaurar nombres de cartas
    for key, value in card_dict.items():
        text = text.replace(key, value)

    # Reemplazar tabulaciones y otros espacios por un solo espacio
    text = re.sub(r"[ \t]+", " ", text)

    # Eliminar múltiples saltos de línea dejando solo uno
    text = re.sub(r"\n+", "\n", text)

    # Eliminar espacios antes y después de saltos de línea
    text = re.sub(r" *\n *", "\n", text)

    # Eliminar múltiples espacios dejando solo uno
    text = re.sub(r" +", " ", text)

    text = re.sub(r"^\>", "", text, flags=re.MULTILINE)

    return text


df = pd.read_csv("../data/reddit/reddit_qa_dataset_with_context.csv").dropna()
df["html"] = df["question"].apply(clean_text)

for c in ["question", "answer", "context"]:
    print(c, df[c].apply(clean_text).str.contains("&gt").sum())

question 0
answer 0
context 0


In [54]:
aux = df[df["question"].str.contains("&gt")].iloc[9]
print(aux["question"])
print(aux["html"])

Najeela the Blade Blossom grants haste to creatures already attacking.
Hello, I was confused by looking at the card [[Najeela, the Blade-Blossom]] its activated ability says.

&gt;Untap all attacking creatures. They gain trample, lifelink, and haste until end of turn. After this phase, there is an additional combat phase. Activate this ability only during combat.

Could someone explain to a relatively new player what that would amount to functionally? If they are already attacking, then what is the point of giving them haste?
Najeela the Blade Blossom grants haste to creatures already attacking.
Hello, I was confused by looking at the card [[Najeela, the Blade-Blossom]] its activated ability says.
Untap all attacking creatures. They gain trample, lifelink, and haste until end of turn. After this phase, there is an additional combat phase. Activate this ability only during combat.
Could someone explain to a relatively new player what that would amount to functionally? If they are alread

In [19]:
df[df["question"].str.contains("---")].iloc[9]

question    2+ Academy Manufactors + Doubling Effects\nThe...
answer      &gt; Am I understanding the layering of the tr...
score                                                       5
context     \nExtracted documents:\nDocument 0:::\nName: A...
Name: 5274, dtype: object

In [None]:
5274