In [3]:
from __future__ import annotations

import re
import time
from datetime import datetime
from html import unescape
from typing import Any, Dict, Iterable, List
from dotenv import load_dotenv, find_dotenv
import os
import pandas as pd
import requests

load_dotenv(find_dotenv())
API_KEY = os.getenv("API-KEY")

STACKEXCHANGE_API_BASE = "https://api.stackexchange.com/2.3"
STACKEXCHANGE_SITE = "stackoverflow"


In [4]:
def chunked(iterable: Iterable[Any], size: int) -> Iterable[List[Any]]:
    chunk: List[Any] = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def strip_html(raw_html: str) -> str:
    if not raw_html:
        return ""
    no_tags = re.sub(r"<[^>]+>", " ", raw_html)
    return unescape(no_tags)


def clean_text(raw_html: str) -> str:
    text = strip_html(raw_html)
    return " ".join(text.split())


def count_code_blocks(body: str) -> int:
    if not body:
        return 0
    return body.lower().count("<code>")


def compute_politeness_score(text: str) -> int:
    lowered = text.lower()
    return sum(keyword in lowered for keyword in ("please", "thank", "appreciate"))


def compute_clarity_metrics(text: str) -> Dict[str, float]:
    words = [token for token in text.split() if token.isalpha()]
    sentences = [segment for segment in text.replace("?", ".").replace("!", ".").split(".") if segment.strip()]
    avg_sentence_length = (len(words) / len(sentences)) if sentences else float(len(words))
    lexical_diversity = (len(set(map(str.lower, words))) / len(words)) if words else 0.0
    return {
        "avg_sentence_length": avg_sentence_length,
        "lexical_diversity": lexical_diversity,
    }


def fetch_questions(site: str, max_questions: int = 50) -> List[Dict[str, Any]]:
    collected: List[Dict[str, Any]] = []
    page = 1
    while len(collected) < max_questions:
        page_size = min(100, max_questions - len(collected))
        params = {
            "order": "desc",
            "sort": "activity",
            "site": site,
            "page": page,
            "pagesize": page_size,
            "filter": "withbody",
        }
        response = requests.get(f"{STACKEXCHANGE_API_BASE}/questions", params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()
        collected.extend(payload.get("items", []))
        backoff = payload.get("backoff")
        if backoff:
            time.sleep(backoff)
        if not payload.get("has_more"):
            break
        page += 1
    return collected


def fetch_answers(site: str, answer_ids: Iterable[int]) -> Dict[int, Dict[str, Any]]:
    answers: Dict[int, Dict[str, Any]] = {}
    for batch in chunked(answer_ids, 100):
        ids = ";".join(str(answer_id) for answer_id in batch)
        params = {
            "order": "desc",
            "sort": "activity",
            "site": site,
            "filter": "withbody",
        }
        response = requests.get(f"{STACKEXCHANGE_API_BASE}/answers/{ids}", params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()
        for item in payload.get("items", []):
            answers[item["answer_id"]] = item
        backoff = payload.get("backoff")
        if backoff:
            time.sleep(backoff)
    return answers


def build_stackexchange_dataset(site: str = STACKEXCHANGE_SITE, max_questions: int = 50) -> pd.DataFrame:
    questions = fetch_questions(site=site, max_questions=max_questions)
    accepted_answer_ids = [q["accepted_answer_id"] for q in questions if "accepted_answer_id" in q]
    answers_by_id = fetch_answers(site=site, answer_ids=accepted_answer_ids) if accepted_answer_ids else {}

    records: List[Dict[str, Any]] = []
    for question in questions:
        question_body = question.get("body", "")
        clean_body = clean_text(question_body)
        clarity = compute_clarity_metrics(clean_body)

        creation_timestamp = question.get("creation_date")
        creation_dt = datetime.utcfromtimestamp(creation_timestamp) if creation_timestamp else None
        answer = answers_by_id.get(question.get("accepted_answer_id"))
        answer_creation = datetime.utcfromtimestamp(answer["creation_date"]) if answer and answer.get("creation_date") else None
        time_to_answer_hours = None
        if creation_dt and answer_creation:
            delta_seconds = (answer_creation - creation_dt).total_seconds()
            time_to_answer_hours = delta_seconds / 3600.0 if delta_seconds >= 0 else None

        accepted_answer_score = answer.get("score") if answer else None
        
        record = {
            "question_id": question.get("question_id"),
            "title": question.get("title"),
            "has_accepted_answer": question.get("is_answered", False) and question.get("accepted_answer_id") is not None,
            "accepted_answer_score": accepted_answer_score,
            "time_to_accepted_answer_hours": time_to_answer_hours,
            "question_score": question.get("score"),
            "question_text": clean_body,
            "num_tags": len(question.get("tags", [])),
            "tags": question.get("tags", []),
        }

        if answer:
            record.update(
                {
                    "accepted_answer_id": answer.get("answer_id"),
                    "accepted_answer_length_chars": len(answer.get("body", "")),
                    "accepted_answer_length_tokens": len(clean_text(answer.get("body", "")).split()),
                }
            )
        records.append(record)

    df = pd.DataFrame(records)
    return df


In [5]:
df = build_stackexchange_dataset(site=STACKEXCHANGE_SITE, max_questions=1000)

  creation_dt = datetime.utcfromtimestamp(creation_timestamp) if creation_timestamp else None
  answer_creation = datetime.utcfromtimestamp(answer["creation_date"]) if answer and answer.get("creation_date") else None


In [6]:
df

Unnamed: 0,question_id,title,has_accepted_answer,accepted_answer_score,time_to_accepted_answer_hours,question_score,question_text,num_tags,tags,accepted_answer_id,accepted_answer_length_chars,accepted_answer_length_tokens
0,79784736,How to efficiently serve Grad-CAM heatmaps in ...,False,,,-6,I’m working on a web-based pneumonia detection...,5,"[python, django, deep-learning, computer-visio...",,,
1,75891,Algorithm for finding similar images,True,,,93,I need an algorithm that can determine whether...,3,"[algorithm, math, image-comparison]",,,
2,13407017,JavaFX source code not showing,True,19.0,3.410278,13,I downloaded the JavaFx source code from http:...,2,"[intellij-idea, javafx]",13409072.0,5636.0,628.0
3,79791639,Android downsize and compress GIF/WEBP and sav...,False,,,0,I've been trying to find a way to edit GIFs/WE...,5,"[android, compression, gif, animated-gif, webp]",,,
4,79791638,Dapper returning Nulls for KeyValuePair,False,,,0,We are trying to use Dapper to get a list of r...,1,[dapper],,,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,79790804,Are Flink&#39;s timer service &amp; Collector ...,False,,,0,Question Async operation & Future callback was...,2,"[apache-flink, flink-streaming]",,,
996,79772139,RealityKit different transform-related action ...,True,0.0,531.294167,0,I have this minimum reproducible code. This co...,2,"[ios, realitykit]",79790802.0,1606.0,164.0
997,79536277,Displaying VAT Inclusive and Exclusive on shop...,False,,,0,I am editing the file: product-template.liquid...,3,"[math, shopify, shopify-liquid]",,,
998,79787564,Win10 child process show it&#39;s window behin...,False,,,1,I have two processes - one starter app and one...,3,"[c++, winapi, windows-10]",,,


In [10]:
df.question_id.unique().shape

(1000,)