In [1]:
import requests
import time
import html
from bs4 import BeautifulSoup
import json


In [2]:

def fetch_questions(tag, pages=2, pagesize=30, timeout=10, delay=1.5):
    """Fetch the top-voted Stack Overflow questions carrying ``tag``.

    Pages are requested one at a time from the Stack Exchange ``/questions``
    endpoint, sorted by votes (descending) with the ``withbody`` filter.

    Args:
        tag: Tag name sent as the ``tagged`` query parameter.
        pages: Maximum number of result pages to request (pages 1..pages).
        pagesize: Number of questions requested per page.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises; without it a stalled connection would block forever.
        delay: Minimum pause, in seconds, between consecutive requests.

    Returns:
        A list with the ``items`` of every successfully fetched page, in
        page order. Pages that answer with a non-200 status are reported on
        stdout and skipped.
    """
    base_url = "https://api.stackexchange.com/2.3/questions"
    all_items = []

    for page in range(1, pages + 1):
        params = {
            "order": "desc",
            "sort": "votes",
            "tagged": tag,
            "site": "stackoverflow",
            "filter": "withbody",
            "pagesize": pagesize,
            "page": page
        }

        response = requests.get(base_url, params=params, timeout=timeout)

        wait = delay
        has_more = True
        if response.status_code == 200:
            payload = response.json()
            all_items.extend(payload.get("items", []))
            # A missing flag is treated as "keep going" so a response without
            # it still walks all requested pages.
            has_more = payload.get("has_more", True)
            # When the API asks for a back-off, wait at least that long
            # before calling the same method again.
            wait = max(delay, payload.get("backoff") or 0)
        else:
            print(f"Failed to fetch for tag={tag}, page={page}")

        # Stop once the API reports no further pages, and do not sleep after
        # the final request.
        if not has_more or page == pages:
            break
        time.sleep(wait)

    return all_items

def extract_code_blocks(html_content):
    """Return the text of every ``<code>`` element in an HTML fragment.

    The markup is given to the parser unchanged. Unescaping it first would
    turn escaped code such as ``vector&lt;int&gt;`` into a real ``<int>`` tag,
    and the parser would then drop it from the snippet text. Entities inside
    the elements are decoded by the parser itself.

    Args:
        html_content: HTML string to scan; ``None`` or an empty string is
            treated as containing no code.

    Returns:
        A list of strings, one per ``<code>`` element, in document order.
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, "html.parser")
    return [code.text for code in soup.find_all("code")]

def build_dataset(tags, pages_per_tag=2):
    """Collect questions that contain code for each tag into a list of records.

    For every tag the questions are fetched with ``fetch_questions`` and their
    bodies scanned with ``extract_code_blocks``; questions without any code
    snippet are left out.

    Args:
        tags: Iterable of tag names to query, one fetch per tag.
        pages_per_tag: Number of result pages requested for each tag.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``tags`` and
        ``code_snippets``. Each question appears at most once, even when it
        is returned for several of the requested tags.
    """
    dataset = []
    seen_links = set()
    for tag in tags:
        print(f"Fetching questions for tag: {tag}")
        questions = fetch_questions(tag, pages=pages_per_tag)

        for q in questions:
            link = q["link"]
            # The same question can come back under more than one tag; keep
            # only its first occurrence.
            if link in seen_links:
                continue

            code_snippets = extract_code_blocks(q["body"])
            if not code_snippets:
                continue

            seen_links.add(link)
            dataset.append({
                # NOTE(review): the API is expected to return titles with
                # HTML entities encoded (e.g. &#39;); decode them so the
                # stored title is plain text.
                "title": html.unescape(q["title"]),
                "link": link,
                "tags": q["tags"],
                "code_snippets": code_snippets
            })

    return dataset

def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as 2-space-indented JSON.

    The file is opened for writing as UTF-8 (replacing any existing content),
    and a one-line summary with the entry count is printed afterwards.

    Args:
        data: JSON-serialisable object that supports ``len()``.
        filename: Path of the file to write.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, fp=out_file, indent=2)

    print(f"Saved {len(data)} entries to {filename}")


In [3]:
# Tags to harvest; build_dataset issues one batch of requests per tag.
tags = ["tensorflow", "pytorch", "pandas", "scikit-learn"]
# Network-bound: requests 3 pages per tag from the Stack Exchange API, and
# fetch_questions sleeps between requests, so this cell takes a while.
so_data = build_dataset(tags, pages_per_tag=3)
# Persist the collected records as indented JSON at this relative path.
save_to_json(so_data, "stackoverflow_apis.json")


Fetching questions for tag: tensorflow
Fetching questions for tag: pytorch
Fetching questions for tag: pandas
Fetching questions for tag: scikit-learn
Saved 309 entries to stackoverflow_apis.json
