StackExchange App

https://stackapps.com/apps/oauth/view/30531#

Ref: https://huggingface.co/datasets/ymoslem/Law-StackExchange/blob/main/StackExchange.ipynb

In [1]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import ConnectionError
from tqdm.notebook import tqdm
import json
from time import sleep

In [2]:
# Read the Stack Exchange API credentials from a JSON file kept outside the notebook.
secret_path = '../ignore/secret.json'
with open(secret_path, mode='r', encoding='utf-8') as secret_file:
    secret_key = json.load(secret_file)


In [3]:
# Stack Exchange site whose questions are extracted.
# Examples of valid values: "law", "medicalsciences".
site = "law"

## Get Questions

In [4]:
# Endpoint and query parameters for listing questions, sorted by votes (descending),
# 100 per page, with the question body included via the "withbody" filter.
# NOTE(review): the 'secret ' and 'key ' lookups carry a trailing space — confirm
# that this matches the key names actually stored in secret.json.
url = "https://api.stackexchange.com/2.3/questions/"
params = dict(
    client_secret=secret_key['secret '],
    key=secret_key['key '],
    site=site,
    filter="withbody",
    order="desc",
    sort="votes",
    pagesize=100,
)

# Upper bound on the number of pages requested: 300 for law or 80 for medicalsciences.
max_pages = 300

# Retry policy: up to 5 attempts with a small backoff on the listed HTTP statuses.
retries = Retry(
    total=5,
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504, 429],
)
adapter = HTTPAdapter(max_retries=retries)

# Every request whose URL starts with this prefix goes through the retrying adapter.
session = requests.Session()
session.mount('https://api.stackexchange.com/', adapter)

# Accumulator for the question records collected page by page.
retrieved_data = []

In [5]:
# Page through the questions endpoint (pages are 1-based) and collect every item
# into retrieved_data. Stops on the first non-200 response or when the API
# reports that no further pages exist.
for page in tqdm(range(max_pages)):
  params["page"] = page + 1
  response = session.get(url, params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break

  current_page = response.json()
  retrieved_data.extend(current_page.get("items", []))

  if not current_page.get("has_more", False):
    print("No more pages")
    break

  # The Stack Exchange response wrapper may carry a "backoff" field: the number
  # of seconds the client must wait before calling the same method again.
  # Honouring it avoids being throttled part-way through the crawl.
  backoff_seconds = current_page.get("backoff")
  if backoff_seconds:
    sleep(backoff_seconds)

  0%|          | 0/300 [00:00<?, ?it/s]

In [6]:
# Total number of question records accumulated in retrieved_data.
len(retrieved_data)

30000

In [7]:
import json

# Pretty-print the last retrieved question record to inspect its fields.
last_question = retrieved_data[-1]
print(json.dumps(last_question, indent=4))

{
    "tags": [
        "england-and-wales",
        "legal-history"
    ],
    "owner": {
        "account_id": 27127305,
        "reputation": 1,
        "user_id": 48046,
        "user_type": "registered",
        "profile_image": "https://www.gravatar.com/avatar/2b42ab6e24f8294aba29c8440f7e8384?s=256&d=identicon&r=PG",
        "display_name": "TylerDurden",
        "link": "https://law.stackexchange.com/users/48046/tylerdurden"
    },
    "is_answered": true,
    "view_count": 72,
    "accepted_answer_id": 95976,
    "answer_count": 1,
    "score": -3,
    "last_activity_date": 1696453559,
    "creation_date": 1696452360,
    "last_edit_date": 1696453559,
    "question_id": 95973,
    "content_license": "CC BY-SA 4.0",
    "link": "https://law.stackexchange.com/questions/95973/was-affray-a-common-law-offence-before-it-was-codified-in-statute",
    "title": "Was affray a common law offence before it was codified in statute?",
    "body": "<p>Section 3 POA 1986 describes affray as a 

In [None]:
# Test save: write only the first ten records to a small sample file.
sample_records = retrieved_data[:10]
with open("example.json", mode="w") as sample_file:
  json.dump(sample_records, sample_file, indent=4)

In [None]:
# Persist every retrieved question record as one indented JSON file.
import os

# NOTE(review): `site` is set to "law" earlier in this notebook while the path
# below targets the medical dump — confirm both are switched together.
# For the Law site the file name would be "law.stackexchange.json".
directory = "/content/drive/MyDrive/data/StackExchange/MedicalSciences"
output_questions_file_name = "medical.stackexchange.json"
full_output_path = os.path.join(directory, output_questions_file_name)

with open(full_output_path, mode="w+") as questions_file:
  json.dump(retrieved_data, questions_file, indent=4)

## Get Answers

In [None]:
import os

# Location of the saved questions JSON.
# For the Law site use instead:
# file_path = "/content/drive/MyDrive/data/StackExchange/Law/"
# file_name = "law.stackexchange.json"

file_path = "/content/drive/MyDrive/data/StackExchange/MedicalSciences/"
file_name = "medical.stackexchange.json"

# Read inside a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(os.path.join(file_path, file_name)) as questions_file:
  data = json.load(questions_file)

# Keep only the questions whose "is_answered" flag is truthy; answers are
# fetched for these alone.
questions = [question for question in data if question["is_answered"]]

print("Questions total:", len(data))
print("Questions with answers:", len(questions))

In [None]:
# URL template for the answers of a single question; "{idx}" is filled in with
# the question id when the request is made.
url = {"url":"https://api.stackexchange.com/2.3/questions/{idx}/answers"}

# Credentials come from the secret_key dict loaded at the top of the notebook.
# The bare names `secret` and `key` used here previously are not defined
# anywhere, so this cell raised NameError on a fresh kernel.
# NOTE(review): the lookup keys carry a trailing space, mirroring the questions
# request above — confirm against the key names stored in secret.json.
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100}

# Retry policy: up to 5 attempts with a backoff factor close to one second.
# NOTE(review): 400 is in the retry list — presumably because the API reports
# throttling with HTTP 400; confirm this is intended.
retries = Retry(total=5,
                backoff_factor=0.99,
                status_forcelist=[400, 429, 500, 502, 503, 504])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

# Accumulator for the question records with their answers attached.
questions_data = []

# For every answered question, request its answers and store a trimmed record
# (selected question fields plus the answers whose score is not negative).
# Stops at the first non-200 response; the question being processed at that
# point is not stored.
for question_item in tqdm(questions):
  question_id = question_item["question_id"]

  question = {
      "question_id": question_id,
      "tags": question_item["tags"],
      "score": question_item["score"],
      # Fall back to an empty string when the record has no "content_license".
      "license": question_item.get("content_license", ""),
      "title": question_item["title"],
      "body": question_item["body"],
      "link": question_item["link"],
      "answers": [],
  }

  response = session.get(url["url"].format(idx=question_id), params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break

  current_page = response.json()
  question["answers"] = [
      {
          "answer_id": answer_item["answer_id"],
          "score": answer_item["score"],
          "body": answer_item["body"],
      }
      for answer_item in current_page.get("items", [])
      if answer_item["score"] >= 0
  ]

  questions_data.append(question)

  # Pause between requests. The response wrapper may carry a "backoff" field
  # (seconds to wait before calling the same method again); wait at least that
  # long, and never less than the baseline half second.
  sleep(max(0.5, current_page.get("backoff") or 0))

In [None]:
# Report how many records were collected, then pretty-print the last one.
collected_count = len(questions_data)
print(collected_count)

latest_record = questions_data[-1]
print(json.dumps(latest_record, indent=2))

In [None]:
# Persist all question-and-answer records next to the questions dump.
# For the Law site the file name would be "law.stackexchange-questions-answers.json".
output_file_name = "medical.stackexchange-questions-answers.json"
full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, mode="w+") as answers_file:
  json.dump(questions_data, answers_file, indent=4)

In [None]:
import os
import json

# Read the saved question-and-answer records back into memory.
# For the Law site the file name would be "law.stackexchange-questions-answers.json".
output_file_name = "medical.stackexchange-questions-answers.json"
full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, mode="r") as saved_file:
  questions_answers = json.load(saved_file)

In [None]:
# Number of records in questions_answers.
len(questions_answers)

In [None]:
# Pretty-print the first record to inspect its structure.
first_record = questions_answers[0]
print(json.dumps(first_record, indent=4))