# Clean the Minecraft Q&A 700k Dataset

In [1]:
import pandas as pd

# df = pd.read_json("hf://datasets/naklecha/minecraft-question-answer-700k/train.json")
mc_qa = pd.read_json("../data/hf/minecraft-question-answer-700k.json")
print(f"The original table has {len(mc_qa)} rows.")

The original table has 694814 rows.


## Unify Source URL

In [2]:
def remove_common_root_url(s: str) -> str:
    return s[len("https://minecraft.wiki/w/"):]

mc_qa['source'] = mc_qa['source'].apply(remove_common_root_url)

In [3]:
import re

def capitalize_url(url):
    if isinstance(url, str):
        def capitalize_match(match):
            return match.group(0)[0] + match.group(0)[1].upper()

        return re.sub(r"_[a-z]", capitalize_match, url)
    return url

mc_qa['source'] = mc_qa['source'].apply(capitalize_url)

In [4]:
def remove_after_last_hash(s: str) -> str:
    index = s.rfind("#")  # Find the last occurrence of '#'
    return s[:index] if index != -1 else s  # Keep only the part before the last '#'

mc_qa['source'] = mc_qa['source'].apply(remove_after_last_hash)

## Drop duplicates

In [5]:
mc_qa = mc_qa.drop_duplicates()
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 678064 rows.


## Remove synthetic data leakage

In [6]:
question_counts = mc_qa['question'].value_counts()
# duplicate_questions = question_counts[question_counts > 100]
# sorted_duplicates = duplicate_questions.sort_values(ascending=False)
# sorted_duplicates

In [7]:
# This number is chosen by looking sorting the count descendingly 
# and I found everything above this is synthetic garbage
leaked_synthetic_questions = question_counts[question_counts > 120] 
mask = ~mc_qa['question'].isin(leaked_synthetic_questions.index.to_list()) 
mc_qa = mc_qa[mask]
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 658031 rows.


In [8]:
# question_counts = mc_qa['question'].value_counts()
# duplicate_questions = question_counts[question_counts > 10]
# sorted_duplicates = duplicate_questions.sort_values(ascending=False)

# questions = sorted_duplicates.index.to_list()
# with open("temp.txt", "w") as f:
#     f.write("\n".join(questions))

In [9]:
leaked_synthetic_questions2 = [
    "your question",
    "Question text here",
    "{prompt}",
    "YOUR_QUESTION_HERE",
    "",
    "{question_sentence}",
    "...?",
    "Question goes here",
    "#{question}",
    "{0}",
    " Your question here ",
    "{q}",
    " ?",
    "\" + question_sentence + \"",
    "Question about the topic",
    "{question_answer_pair['question']}",
    "\" + question.strip() + \"",
    "{question['question']}",
    "{sentence}",
    "\"\nanswer_tag = \"",
    "YOUR QUESTION",
    "' + question_sentence + '",
    "question_text",
    "Generated Question",
    "{question_text}",
    "{input_text}",
    "Q: ?",
    "Insert question here",
    "' + sentence + '",
    "\" + sentence + \""
    "{qa_pair['question']}",
    "??",
    "How many wheels does a car have?",
    "question text",
    "' + question.strip() + '",
    "{question_answer['question']}",
    "\" + q + \"",
    "your_question",
    "{question_template}",
    "${question}",
    "Question text goes here",
    "What is the purpose of the ",
    "\", question[\"question\"], \"",
    "Question here",
    "{synthetic_question}",
    " Your question ",
    "?????",
    "_______________",
    "`, `",
    chr(10),
    "What is the meaning of life?",
   "_____________",
   " Your Question ",
   "\" + prompt + \"",
   "question",
   "{random_question}",
   "{question}?",
   " and ",
   "\", question[0]['question'], \"",
   "` and `"
]
mask = ~mc_qa['question'].isin(leaked_synthetic_questions2) 
mc_qa = mc_qa[mask]
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 656131 rows.


In [10]:
mc_qa = mc_qa.drop_duplicates(subset=['question', 'source'])
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 630378 rows.


In [11]:
# question_counts = mc_qa['question'].value_counts()
# duplicate_questions = question_counts[question_counts > 10]
# sorted_duplicates = duplicate_questions.sort_values(ascending=False)

# questions = sorted_duplicates.index.to_list()
# with open("temp.txt", "w") as f:
#     f.write("|||\n".join(questions))

## Save V1

In [12]:
mc_qa.to_json("../data/hf/minecraft-question-answer-630k.jsonl", orient='records', lines=True)
from datasets import Dataset 
mc_qa_630k = Dataset.from_pandas(mc_qa, preserve_index=False)
mc_qa_630k.push_to_hub("minecraft-question-answer-630k")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/631 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/minhaozhang/minecraft-question-answer-630k/commit/96f1e8b1e6b3b022aa247182f8dd7201479c0b9d', commit_message='Upload dataset', commit_description='', oid='96f1e8b1e6b3b022aa247182f8dd7201479c0b9d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/minhaozhang/minecraft-question-answer-630k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='minhaozhang/minecraft-question-answer-630k'), pr_revision=None, pr_num=None)

## Remove platfrom history, Minecraft Earth & Dungeons, and some more

In [13]:
removal = ['Alpha_', 'Bedrock_', 'Beta_', 'Classic_', 'Console_', 
           'Education_Edition_', 'Element', 'Indev_', 'Infdev_', 
           'Java_Edition_', 'Launcher_', 'Legacy_Console_', 'Minecon_', 
           'MinecraftEdu_', 'Minecraft_Dungeons_', 'Minecraft_Earth_', 
           'Minecraft_Education_', 'Minecraft_Wiki/', 'New_Nintendo_', 
           'Nintendo_', 'PlayStation_', 'Pocket_', 'Programs_And_Editors', 
           'Realm_', 'Resource_Pack_', 'Resource_Location', 
           'Skin/', 'Village/', 'Wii_', 'XBox_', 'Xbox_']

# print(sorted(removal))

mask = ~mc_qa['source'].str.startswith(tuple(removal), na=False) # na=False to handle NaN values

mc_qa = mc_qa[mask]
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 563344 rows.


## Remove Java version history 

In [14]:
urls = sorted(set(mc_qa['source']))
amplified_index = urls.index("AMPLIFIED")
if amplified_index > 0:
    sources_to_remove = urls[:amplified_index]
else:
    sources_to_remove = []

mc_qa = mc_qa[~mc_qa['source'].isin(sources_to_remove)]
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 554828 rows.


## Drop duplicate questions

In [15]:
mc_qa = mc_qa.drop_duplicates(['question'])
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 504899 rows.


## Remove pages with less than 5 questions (too short of a page)

In [16]:
# Group by source, count rows, and sort by count
source_counts = mc_qa.groupby('source').size().sort_values()

# Filter groups with size less than 20
sources_to_remove = source_counts[source_counts <= 5].index.tolist()

# Remove rows with sources in the sources_to_remove list
mc_qa = mc_qa[~mc_qa['source'].isin(
    sources_to_remove)]
print(f"The resulting table has {len(mc_qa)} rows.")

The resulting table has 500441 rows.


## Save V2

In [17]:
mc_qa.to_json("../data/hf/minecraft-question-answer-500k.jsonl", orient='records', lines=True)
mc_qa_500k = Dataset.from_pandas(mc_qa, preserve_index=False)
mc_qa_500k.push_to_hub("minecraft-question-answer-500k")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/501 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/minhaozhang/minecraft-question-answer-500k/commit/70cfa228cd0d9617926e411d4c9b8eec8ac3e806', commit_message='Upload dataset', commit_description='', oid='70cfa228cd0d9617926e411d4c9b8eec8ac3e806', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/minhaozhang/minecraft-question-answer-500k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='minhaozhang/minecraft-question-answer-500k'), pr_revision=None, pr_num=None)