StackExchange App

https://stackapps.com/apps/oauth/view/30531#

Ref: https://huggingface.co/datasets/ymoslem/Law-StackExchange/blob/main/StackExchange.ipynb

In [1]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import ConnectionError
from tqdm.notebook import tqdm
import json
from time import sleep

In [2]:
with open('../ignore/secret.json', 'r', encoding='utf-8') as file:
    secret_key = json.load(file)


In [3]:
# Which site to extract questions from

site = "law"  # "law" or "medicalsciences" etc.

## Get Questions

In [4]:
url = "https://api.stackexchange.com/2.3/questions/"
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100
          }

max_pages = 300 # 300 for law or 80 for medicalsciences

retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504, 429])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

retrieved_data = []

In [5]:
for page in tqdm(range(max_pages)):
  params["page"] = page+1
  response = session.get(url, params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break
  else:
    current_page = response.json()
    for item in current_page['items']:
      retrieved_data.append(item)
    if current_page["has_more"] == False:
      print("No more pages")
      break

  0%|          | 0/300 [00:00<?, ?it/s]

In [6]:
len(retrieved_data)

30000

In [7]:
import json

print(json.dumps(retrieved_data[-1], indent=4))

{
    "tags": [
        "england-and-wales",
        "discrimination",
        "is-x-legal",
        "sex-discrimination"
    ],
    "owner": {
        "account_id": 27127305,
        "reputation": 1,
        "user_id": 48046,
        "user_type": "registered",
        "profile_image": "https://www.gravatar.com/avatar/2b42ab6e24f8294aba29c8440f7e8384?s=256&d=identicon&r=PG",
        "display_name": "TylerDurden",
        "link": "https://law.stackexchange.com/users/48046/tylerdurden"
    },
    "is_answered": true,
    "view_count": 311,
    "closed_date": 1694809103,
    "answer_count": 2,
    "score": -3,
    "last_activity_date": 1694694618,
    "creation_date": 1694627391,
    "last_edit_date": 1694646497,
    "question_id": 95448,
    "link": "https://law.stackexchange.com/questions/95448/is-it-lawful-to-maintain-different-independent-dress-codes-for-male-and-female",
    "closed_reason": "Duplicate",
    "title": "Is it lawful to maintain different, independent dress codes for ma

In [8]:
# Save file - test
with open("example.json", "w") as output:
  json.dump(retrieved_data[:10], output, indent=4)

In [12]:
# Save the whole data
import os

# output_questions_file_name = "law.stackexchange.json"
output_questions_file_name = "law.stackexchange.json"
directory = "output_data/"
full_output_path = os.path.join(directory, output_questions_file_name)

with open(full_output_path, "w+") as output:
  json.dump(retrieved_data, output, indent=4)

## Get answers

In [13]:
import os

# file_path = "/content/drive/MyDrive/data/StackExchange/Law/"
# file_name = "law.stackexchange.json"

file_path = "output_data/"
file_name = "law.stackexchange.json"

data = json.load(open(os.path.join(file_path, file_name)))
questions = [question for question in data if question["is_answered"]]

print("Questions total:", len(data))
print("Questions with answers:", len(questions))

Questions total: 30000
Questions with answers: 26823


In [15]:
url = {"url":"https://api.stackexchange.com/2.3/questions/{idx}/answers"}
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100}

retries = Retry(total=5,
                backoff_factor=0.99,
                status_forcelist=[400, 429, 500, 502, 503, 504])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

questions_data = []

for question_item in tqdm(questions):
  question_id = question_item["question_id"]

  question = {}
  question["question_id"] = question_item["question_id"]
  question["tags"] = question_item["tags"]
  question["score"] = question_item["score"]
  question["license"] = question_item.get("content_license", "")
  question["title"] = question_item["title"]
  question["body"] = question_item["body"]
  question["link"] = question_item["link"]
  question["answers"] = []

  response = session.get(url["url"].format(idx=question_id), params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break
  else:
    current_page = response.json()
    for answer_item in current_page['items']:
      if answer_item["score"] >= 0:
        answer = {}
        answer["answer_id"] = answer_item["answer_id"]
        answer["score"] = answer_item["score"]
        answer["body"] = answer_item["body"]

        question["answers"].append(answer)

  questions_data.append(question)
  sleep(0.5)

  0%|          | 0/26823 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
print(len(questions_data))
print(json.dumps(questions_data[-1], indent=2))

118
{
  "question_id": 3519,
  "tags": [
    "police",
    "search-and-seizure"
  ],
  "score": 39,
  "license": "CC BY-SA 3.0",
  "title": "Consent to searches: Who wins &quot;my-word-against-yours?&quot;",
  "body": "<h3>Hypothetical</h3>\n\n<ul>\n<li>Officer testifies citizen <strong>consented</strong> to be searched.</li>\n<li>Citizen <strong>denies</strong> consenting to be searched.</li>\n<li>There is <strong>no other evidence</strong> (that weighs net in favor of either party).</li>\n</ul>\n\n<h1>Question</h1>\n\n<blockquote>\n  <p><strong>Whom does the judge believe?</strong></p>\n</blockquote>\n\n<hr>\n\n<h3>Clarifying Discussion</h3>\n\n<p><em>from previous comments...</em></p>\n\nPoint\n\n<p>The testimony of the officer and the citizen would be more complete than just the officer saying \"The defendant consented to be searched\" and the defendant saying \"I did not consent to be searched\". Each would give a narrative of the situation that could be cross-examined. This would

In [17]:
# Save the whole data

# output_file_name = "law.stackexchange-questions-answers.json"
output_file_name = "law.stackexchange-questions-answers.json"

full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, "w+") as output:
  json.dump(questions_data, output, indent=4)

In [18]:
import os
import json

# output_file_name = "law.stackexchange-questions-answers.json"
output_file_name = "law.stackexchange-questions-answers.json"

full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, "r") as output:
  questions_answers = json.load(output)

In [19]:
len(questions_answers)

118

In [20]:
print(json.dumps(questions_answers[0], indent=4))

{
    "question_id": 36259,
    "tags": [
        "contract-law",
        "employment"
    ],
    "score": 232,
    "license": "CC BY-SA 4.0",
    "title": "If a company agrees to pay travel cost for a job interview, is the promise binding and enforceable?",
    "body": "<p>There is a question on workplace.SE about a company which offered to pay travel costs for a job interview, but canceled the return ticket after ending the interview - <a href=\"https://workplace.stackexchange.com/questions/126565/potential-employer-cancels-return-flight\">Potential Employer Cancels Return Flight</a>.</p>\n\n<p>Generally, when arranging a job interview which requires the candidate to travel, it is common for the potential employer to pay travel cost and lodging for the candidate. In practice, the employer will ususally offer this, often in writing, but not write up a formal contract or similar document.</p>\n\n<p><strong>In that situation, can the employer later renege on the promise to pay costs?</s