In [1]:
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Base URL of the API (questions endpoint, API version 2.3)
BASE_URL = "https://api.stackexchange.com/2.3/questions"
# Stack Exchange site to query
SITE = "stackoverflow"
# Number of questions per page
PAGESIZE = 50

In [3]:
# Optional Stack Exchange API key, read from the STACKEXCHANGE_API_KEY
# environment variable so the secret is never stored in the notebook itself.
# Stays None (requests are sent without a key) when the variable is unset or empty.
API_KEY = os.environ.get("STACKEXCHANGE_API_KEY") or None

In [4]:
# Fetch a single page of questions

def fetch_questions(page=1, pagesize=50, tagged=None, timeout=30):
    """Request one page of questions from the Stack Exchange API.

    Questions are requested for ``SITE``, sorted by votes in descending
    order, with the ``withbody`` filter so that each item includes its body.

    Args:
        page: Page number to request.
        pagesize: Number of questions requested per page.
        tagged: Optional tag filter, sent as the ``tagged`` query parameter.
            Omitted from the request when falsy.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    params = {
        "site": SITE,
        "pagesize": pagesize,
        "page": page,
        "order": "desc",
        "sort": "votes",
        "filter": "withbody",  # include the body of each question
    }
    if tagged:
        params["tagged"] = tagged
    # The key is only sent when one has been configured.
    if API_KEY:
        params["key"] = API_KEY

    # Send the request; fail loudly on HTTP errors instead of parsing an error page.
    response = requests.get(BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [5]:
# Collect several pages of questions

def collect_questions(n_pages=10, tagged=None, pagesize=None):
    """Collect questions from up to ``n_pages`` consecutive pages.

    Pages are fetched in order starting at page 1, with a progress bar.
    Collection stops early when the API reports (``has_more``) that no
    further pages exist. Between requests the function waits at least one
    second, or longer when the response carries a ``backoff`` value.

    Args:
        n_pages: Maximum number of pages to fetch.
        tagged: Optional tag filter forwarded to ``fetch_questions``.
        pagesize: Questions per page; defaults to the ``PAGESIZE`` constant.

    Returns:
        A list with the ``items`` of every fetched page, in fetch order.
    """
    if pagesize is None:
        pagesize = PAGESIZE

    all_items = []  # accumulates the questions of every page

    for page in tqdm(range(1, n_pages + 1)):
        data = fetch_questions(page=page, pagesize=pagesize, tagged=tagged)
        all_items.extend(data.get("items", []))

        # A missing "has_more" is treated as "keep going", so the loop
        # still runs for n_pages when the field is absent.
        is_last = page == n_pages or not data.get("has_more", True)
        backoff = data.get("backoff") or 0

        # Pause between requests; also honour an explicit backoff on the
        # final page so an immediate follow-up call does not violate it.
        if backoff or not is_last:
            time.sleep(max(1, backoff))
        if is_last:
            break

    return all_items

In [6]:
# Convert the raw question records into a pandas DataFrame

def questions_to_df(questions):
    """Build a DataFrame with one row per question.

    Only a fixed set of fields is kept; a field missing from a question
    becomes a missing value in that row.

    Args:
        questions: Iterable of dict-like question records (anything
            exposing ``.get``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``question_id``, ``title``,
        ``body``, ``tags``, ``creation_date``, ``score``, ``view_count``,
        ``answer_count`` and ``is_answered``, in that order. The columns are
        present even when ``questions`` is empty.
    """
    columns = [
        "question_id",
        "title",
        "body",
        "tags",
        "creation_date",
        "score",
        "view_count",
        "answer_count",
        "is_answered",
    ]
    records = [{col: q.get(col) for col in columns} for q in questions]

    # Passing the column list explicitly keeps the schema (and column order)
    # stable when there are no records at all.
    return pd.DataFrame(records, columns=columns)

In [7]:
# Example: fetch 250 top-voted questions (5 pages of 50 questions)
questions = collect_questions(n_pages=5)
df_questions = questions_to_df(questions)

# Preview of the DataFrame
df_questions.head()

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:07<00:00,  1.43s/it]


Unnamed: 0,question_id,title,body,tags,creation_date,score,view_count,answer_count,is_answered
0,11227809,Why is processing a sorted array faster than p...,"<p>In this C++ code, sorting the data (<em>bef...","[java, c++, performance, cpu-architecture, bra...",1340805096,27457,1941240,25,True
1,927358,How do I undo the most recent local commits in...,<p>I accidentally committed the wrong files to...,"[git, version-control, git-commit, undo]",1243620554,27061,16139015,103,True
2,2003505,How do I delete a Git branch locally and remot...,<p>Failed Attempts to Delete a Remote Branch:<...,"[git, version-control, git-branch, git-push, g...",1262653935,20365,12869902,41,True
3,292357,What is the difference between &#39;git pull&#...,"<p>What are the differences between <a href=""h...","[git, version-control, git-pull, git-fetch]",1226742669,13995,3674790,37,True
4,231767,What does the &quot;yield&quot; keyword do in ...,"<p>What functionality does the <a href=""https:...","[python, iterator, generator, yield]",1224800471,13086,3450087,52,True


In [8]:
# Save the collected questions to a CSV file, without the index column.
# NOTE(review): the "tags" column holds lists, which presumably end up in the
# CSV as their string form — confirm that downstream readers parse them back.
df_questions.to_csv("stack_questions_api.csv", index=False)