#### Stackoverflow QA

In [22]:
from stackapi import StackAPI
import json
import time

SITE = StackAPI('stackoverflow')
SITE.page_size = 100
SITE.max_pages = 50  # adjust based on how many you want
SITE.sleep_between_requests = 1

questions_data = []

for question in SITE.fetch('questions', tagged='unreal-engine4;unreal-engine5', filter='withbody')['items']:
    qid = question['question_id']
    title = question['title']
    body = question['body']
    answers = []

    try:
        a = SITE.fetch(f'questions/{qid}/answers', filter='withbody')
        for ans in a['items']:
            answers.append(ans['body'])
    except Exception as e:
        print(f"Error fetching answers for {qid}: {e}")
        continue

    if answers:
        questions_data.append({
            "question_id": qid,
            "question": title,
            "question_body": body,
            "answers": answers
        })

    time.sleep(0.5)  # prevent hitting rate limit

# save to file
with open('../data/ue_qa_raw.json', 'w', encoding='utf-8') as f:
    json.dump(questions_data, f, indent=2, ensure_ascii=False)


In [23]:
import json
from bs4 import BeautifulSoup

with open('../data/ue_qa_raw.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Number of questions: {len(data)}")

dataset = []

for item in data:
    question = BeautifulSoup(item['question_body'], 'html.parser').get_text()
    answers = [BeautifulSoup(a, 'html.parser').get_text() for a in item['answers']]

    for ans in answers:
        dataset.append({
            "instruction": item['question'],
            "input": question,
            "output": ans
        })

with open('../data/ue_finetune_dataset.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dataset, indent=2, ensure_ascii=False))
    """for d in dataset:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")"""


Number of questions: 135


#### Parse UE documentation

In [6]:
from bs4 import BeautifulSoup
from pathlib import Path
import json

DOC_PATH = Path("/path/to/UnrealEngine/Engine/Documentation/HTML/en")
output = []

for html_file in DOC_PATH.rglob("*.html"):
    try:
        with open(html_file, "r", encoding="utf-8", errors="ignore") as f:
            soup = BeautifulSoup(f, "html.parser")
        title = soup.title.string if soup.title else html_file.stem
        # Many Unreal docs wrap the actual text in <div class="doc-content"> or <article>
        article = soup.select_one(".doc-content") or soup.select_one("article") or soup.body
        text = article.get_text(separator="\n", strip=True) if article else ""
        output.append({"file": str(html_file), "title": title, "content": text})
        print("✅ Parsed", html_file)
    except Exception as e:
        print("⚠️ Error", html_file, e)

with open("ue_docs_local.jsonl", "w", encoding="utf-8") as f:
    for o in output:
        f.write(json.dumps(o, ensure_ascii=False) + "\n")
    f.close()
