#### Stack Overflow Q&A

In [37]:
import html
import json
import os
import re
import time

from stackapi import StackAPI

def clean_html(raw_html):
    """Remove HTML tags and unescape basic entities."""
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = cleantext.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
    return cleantext.strip()

# --- StackOverflow API setup ---
SITE = StackAPI('stackoverflow', key='rl_XSrg9NN8mZuYZNRcthYeLE1hk')
SITE.page_size = 100           # items per page
SITE.max_pages = 50            # max pages to fetch
SITE.sleep_between_requests = 1  # be kind to the API

# --- File setup ---
output_path = '../data/ue_qa_raw.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

questions_data = []
page = 1
has_more = True

# If the file exists, resume from where you left off
if os.path.exists(output_path):
    with open(output_path, 'r', encoding='utf-8') as f:
        questions_data = json.load(f)
    print(f"Resuming from {len(questions_data)} saved Q&A pairs.")

# --- Main loop ---
while has_more and page <= SITE.max_pages:
    print(f"\nüîπ Fetching page {page}...")
    resp = SITE.fetch(
        'questions',
        tagged='unreal-engine4',
        filter='withbody',
        page=page,
        sort='creation'
    )

    for question in resp['items']:
        qid = question['question_id']
        title = clean_html(question['title'])
        body = clean_html(question['body'])
        answers = []

        try:
            a = SITE.fetch(f'questions/{qid}/answers', filter='withbody')
            for ans in a['items']:
                answers.append(clean_html(ans['body']))
        except Exception as e:
            print(f"‚ö†Ô∏è Error fetching answers for {qid}: {e}")
            continue

        if answers:
            questions_data.append({
                "question_id": qid,
                "question": title,
                "question_body": body,
                "answers": answers
            })

    # Save progress every 5 pages
    if page % 5 == 0:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(questions_data, f, indent=2, ensure_ascii=False)
        print(f"üíæ Saved {len(questions_data)} Q&A pairs so far...")

    has_more = resp.get('has_more', False)
    page += 1
    time.sleep(1)

# --- Final save ---
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(questions_data, f, indent=2, ensure_ascii=False)

print(f"\n‚úÖ Done! Total collected: {len(questions_data)} Q&A pairs")
print(f"üìÅ Saved to: {output_path}")


Resuming from 661 saved Q&A pairs.

üîπ Fetching page 1...
‚ö†Ô∏è Error fetching answers for 58428978: ('https://api.stackexchange.com/2.3/questions/58428978/answers/?pagesize=100&page=1&filter=withbody&key=rl_XSrg9NN8mZuYZNRcthYeLE1hk&site=stackoverflow', 'Expecting value: line 1 column 1 (char 0)', 'Expecting value: line 1 column 1 (char 0)', 'Expecting value: line 1 column 1 (char 0)')

‚úÖ Done! Total collected: 2483 Q&A pairs
üìÅ Saved to: ../data/ue_qa_raw.json


In [38]:
import json
from bs4 import BeautifulSoup

with open('../data/ue_qa_raw.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


dataset = []

for item in data:
    question = BeautifulSoup(item['question_body'], 'html.parser').get_text()
    answers = [BeautifulSoup(a, 'html.parser').get_text() for a in item['answers']]

    for ans in answers:
        dataset.append({
            "instruction": item['question'],
            "input": question,
            "output": ans
        })

with open('../data/ue_finetune_dataset.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dataset, indent=2, ensure_ascii=False))
    """for d in dataset:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")"""



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  answers = [BeautifulSoup(a, 'html.parser').get_text() for a in item['answers']]

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  question = BeautifulSoup(item['question_body'], 'html.parser').get_text()


#### Parse UE documentation

In [6]:
from bs4 import BeautifulSoup
from pathlib import Path
import json

DOC_PATH = Path("/path/to/UnrealEngine/Engine/Documentation/HTML/en")
output = []

for html_file in DOC_PATH.rglob("*.html"):
    try:
        with open(html_file, "r", encoding="utf-8", errors="ignore") as f:
            soup = BeautifulSoup(f, "html.parser")
        title = soup.title.string if soup.title else html_file.stem
        # Many Unreal docs wrap the actual text in <div class="doc-content"> or <article>
        article = soup.select_one(".doc-content") or soup.select_one("article") or soup.body
        text = article.get_text(separator="\n", strip=True) if article else ""
        output.append({"file": str(html_file), "title": title, "content": text})
        print("‚úÖ Parsed", html_file)
    except Exception as e:
        print("‚ö†Ô∏è Error", html_file, e)

with open("ue_docs_local.jsonl", "w", encoding="utf-8") as f:
    for o in output:
        f.write(json.dumps(o, ensure_ascii=False) + "\n")
    f.close()
