StackExchange App

https://stackapps.com/apps/oauth/view/30531#

Ref: https://huggingface.co/datasets/ymoslem/Law-StackExchange/blob/main/StackExchange.ipynb

In [85]:
import pandas as pd
import re
import requests
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import ConnectionError
from tqdm.notebook import tqdm
import json
from time import sleep

In [86]:
with open('../ignore/secret.json', 'r', encoding='utf-8') as file:
    secret_key = json.load(file)


In [87]:
# Which site to extract questions from

site = "law"  # "law" or "medicalsciences" etc.

## Get Questions

In [88]:
url = "https://api.stackexchange.com/2.3/questions/"
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100
          }

max_pages = 300 # 300 for law or 80 for medicalsciences

retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504, 429])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

retrieved_data = []

In [89]:
for page in tqdm(range(max_pages)):
  params["page"] = page+1
  response = session.get(url, params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break
  else:
    current_page = response.json()
    for item in current_page['items']:
      retrieved_data.append(item)
    if current_page["has_more"] == False:
      print("No more pages")
      break

  0%|          | 0/300 [00:00<?, ?it/s]

In [90]:
len(retrieved_data)

30000

In [91]:
import json

print(json.dumps(retrieved_data[-1], indent=4))

{
    "tags": [
        "united-kingdom",
        "interpretation",
        "judiciary",
        "equality-act-2010"
    ],
    "owner": {
        "account_id": 27127305,
        "reputation": 1,
        "user_id": 48046,
        "user_type": "registered",
        "profile_image": "https://www.gravatar.com/avatar/2b42ab6e24f8294aba29c8440f7e8384?s=256&d=identicon&r=PG",
        "display_name": "TylerDurden",
        "link": "https://law.stackexchange.com/users/48046/tylerdurden"
    },
    "is_answered": true,
    "view_count": 364,
    "accepted_answer_id": 94472,
    "answer_count": 2,
    "score": -3,
    "last_activity_date": 1691150237,
    "creation_date": 1690661713,
    "last_edit_date": 1690830120,
    "question_id": 94292,
    "content_license": "CC BY-SA 4.0",
    "link": "https://law.stackexchange.com/questions/94292/what-is-the-judicial-rationale-for-introducing-the-grainger-test-to-qualify-beli",
    "title": "What is the judicial rationale for introducing the grainger te

In [92]:
# Save file - test
with open("example.json", "w") as output:
  json.dump(retrieved_data[:10], output, indent=4)

In [93]:
# Save the whole data
import os

# output_questions_file_name = "law.stackexchange.json"
output_questions_file_name = "law.stackexchange.json"
directory = "scrapping_data/"
full_output_path = os.path.join(directory, output_questions_file_name)

with open(full_output_path, "w+") as output:
  json.dump(retrieved_data, output, indent=4)

## Get answers

In [94]:
import os

# file_path = "/content/drive/MyDrive/scrapping_data/StackExchange/Law/"
# file_name = "law.stackexchange.json"

file_path = "scrapping_data/"
file_name = "law.stackexchange.json"

data = json.load(open(os.path.join(file_path, file_name)))
questions = [question for question in data if question["is_answered"]]

print("Questions total:", len(data))
print("Questions with answers:", len(questions))

Questions total: 30000
Questions with answers: 26826


In [96]:
url = {"url":"https://api.stackexchange.com/2.3/questions/{idx}/answers"}
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100}

retries = Retry(total=5,
                backoff_factor=0.99,
                status_forcelist=[400, 429, 500, 502, 503, 504])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

questions_data = []

for question_item in tqdm(questions):
  question_id = question_item["question_id"]

  question = {}
  question["question_id"] = question_item["question_id"]
  question["tags"] = question_item["tags"]
  question["score"] = question_item["score"]
  question["license"] = question_item.get("content_license", "")
  question["title"] = question_item["title"]
  question["body"] = question_item["body"]
  question["link"] = question_item["link"]
  question["answers"] = []

  response = session.get(url["url"].format(idx=question_id), params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break
  else:
    current_page = response.json()
    for answer_item in current_page['items']:
      if answer_item["score"] >= 0:
        answer = {}
        answer["answer_id"] = answer_item["answer_id"]
        answer["score"] = answer_item["score"]
        answer["body"] = answer_item["body"]

        question["answers"].append(answer)

  questions_data.append(question)
  sleep(0.3)

  0%|          | 0/26826 [00:00<?, ?it/s]

RetryError: HTTPSConnectionPool(host='api.stackexchange.com', port=443): Max retries exceeded with url: /2.3/questions/92991/answers?client_secret=NCec61zg6XY5DJ4v8UIoRw%28%28&key=rl_2doaMUY8b45faCZ9XLbbieHBX&site=law&filter=withbody&order=desc&sort=votes&pagesize=100 (Caused by ResponseError('too many 400 error responses'))

In [97]:
print(len(questions_data))
print(json.dumps(questions_data[-1], indent=2))

8680
{
  "question_id": 92959,
  "tags": [
    "trademark",
    "germany"
  ],
  "score": 3,
  "license": "CC BY-SA 4.0",
  "title": "Does the renaming of the German &#39;ebay Kleinanzeigen&#39; to &#39;Kleinanzeigen&#39; mean they lose any trademark to their name?",
  "body": "<p>So ebay Kleinanzeigen is a company that owns a website for classified ads. People can post classified ads there and search the website to buy things from classified ads. I suppose ebay has the exact same business model in various other countries. This is the German branch. Now the German word 'Kleinanzeige' means exactly classified ad.</p>\n<p>Recently they renamed both their website and the company itself to just 'Kleinanzeigen'. To me this looks like a classic example of a trademark for common words as in <a href=\"https://law.stackexchange.com/questions/18625/what-happens-if-a-brand-or-companys-name-is-a-common-everyday-word?rq=1\">this</a> or <a href=\"https://law.stackexchange.com/questions/76855/can-you

In [98]:
# Save the whole data

# output_file_name = "law.stackexchange-questions-answers.json"
output_file_name = "law.stackexchange-questions-answers.json"

full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, "w+") as output:
  json.dump(questions_data, output, indent=4)

In [99]:
import os
import json

# output_file_name = "law.stackexchange-questions-answers.json"
output_file_name = "law.stackexchange-questions-answers.json"

full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, "r") as output:
  questions_answers = json.load(output)

In [100]:
len(questions_answers)

8680

In [101]:
print(json.dumps(questions_answers[0], indent=4))

{
    "question_id": 36259,
    "tags": [
        "contract-law",
        "employment"
    ],
    "score": 232,
    "license": "CC BY-SA 4.0",
    "title": "If a company agrees to pay travel cost for a job interview, is the promise binding and enforceable?",
    "body": "<p>There is a question on workplace.SE about a company which offered to pay travel costs for a job interview, but canceled the return ticket after ending the interview - <a href=\"https://workplace.stackexchange.com/questions/126565/potential-employer-cancels-return-flight\">Potential Employer Cancels Return Flight</a>.</p>\n\n<p>Generally, when arranging a job interview which requires the candidate to travel, it is common for the potential employer to pay travel cost and lodging for the candidate. In practice, the employer will ususally offer this, often in writing, but not write up a formal contract or similar document.</p>\n\n<p><strong>In that situation, can the employer later renege on the promise to pay costs?</s

In [102]:
questions_answers[0]['answers'][0]['body']

'<p>This aspect (and many others) of contract law is applicable in the US and various countries of the EU.</p>\n<blockquote>\n<p>can they renege after the candidate has begun their journey, thus\nsaddling the candidate with the travel cost?</p>\n</blockquote>\n<p>No. The company would incur breach of contract.</p>\n<p>There is no need for a formal contract. The candidate only needs to prove that the company agreed (in writing, orally or clearly through its conduct) to cover or reimburse those expenses and that this elicited a <em><a href="https://en.wikipedia.org/wiki/Meeting_of_the_minds" rel="noreferrer">meeting of the minds</a></em>.</p>\n<p>The agreement would be void if the candidate incurred the expenses despite knowing (via timely notice) that the company changed its mind.</p>\n<p>Likewise, if the candidate lied on his CV, the contract (here, the company\'s agreement to cover the expenses) would be voidable by the company, since the candidate\'s intentional misrepresentations pr

In [103]:
questions[0]['body']

'<p>There is a question on workplace.SE about a company which offered to pay travel costs for a job interview, but canceled the return ticket after ending the interview - <a href="https://workplace.stackexchange.com/questions/126565/potential-employer-cancels-return-flight">Potential Employer Cancels Return Flight</a>.</p>\n\n<p>Generally, when arranging a job interview which requires the candidate to travel, it is common for the potential employer to pay travel cost and lodging for the candidate. In practice, the employer will ususally offer this, often in writing, but not write up a formal contract or similar document.</p>\n\n<p><strong>In that situation, can the employer later renege on the promise to pay costs?</strong> In particular, can they renege after the candidate has begun their journey, thus saddling the candidate with the travel cost?</p>\n\n<hr>\n\n<p>My thoughts:</p>\n\n<ul>\n<li>On the one hand, a simple one-sided promise is usually not binding, as a binding agreement r

In [104]:
list_questions = []

for q in questions:
    list_questions.append({
        "question_id": q['question_id'],
        "question_body": q['body']
    })

df_questions = pd.DataFrame(list_questions)

df_questions.head()

Unnamed: 0,question_id,question_body
0,36259,<p>There is a question on workplace.SE about a...
1,33212,<p>I am a high school Chemistry teacher. This ...
2,47456,<p>My youngest is a freshman &amp; signed up f...
3,98706,<p>Imagine sitting in an airplane when suddenl...
4,48515,"<p>As I understand, in general, <a href=""https..."


In [105]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    # Remove links (http, https, www)
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

list_questions_answers = []

for q in questions_answers:

    for a in q['answers']: 
        key = {
            'question_id': q['question_id'],
            'question_title': clean_text(q['title']),
            'question_body': clean_text(q['body']),
            'question_complete': 'Q: ' + clean_text(q['title']) + '. ' + clean_text(q['body']),
            'answer_id': a['answer_id'],
            'answer_score': a['score'],
            'answer_body': 'A: ' + clean_text(a['body'])
        }
        list_questions_answers.append(key)

df_qa = pd.DataFrame(list_questions_answers)
df_qa['row_number_score'] = df_qa.groupby(['question_id', 'question_title'])['answer_score'].rank(method='first', ascending=False).astype(int)
df_qa.drop(columns=['answer_score'], inplace=True)

In [106]:
display(df_qa.shape)
display(df_qa.head())

(16836, 7)

Unnamed: 0,question_id,question_title,question_body,question_complete,answer_id,answer_body,row_number_score
0,36259,If a company agrees to pay travel cost for a j...,There is a question on workplace.SE about a co...,Q: If a company agrees to pay travel cost for ...,36261,A: This aspect (and many others) of contract l...,1
1,36259,If a company agrees to pay travel cost for a j...,There is a question on workplace.SE about a co...,Q: If a company agrees to pay travel cost for ...,36278,A: Yes the promise is binding and enforceable ...,2
2,36259,If a company agrees to pay travel cost for a j...,There is a question on workplace.SE about a co...,Q: If a company agrees to pay travel cost for ...,36342,A: As the question specifically asks for diffe...,3
3,36259,If a company agrees to pay travel cost for a j...,There is a question on workplace.SE about a co...,Q: If a company agrees to pay travel cost for ...,36344,A: Under UK law you can recover any costs from...,4
4,36259,If a company agrees to pay travel cost for a j...,There is a question on workplace.SE about a co...,Q: If a company agrees to pay travel cost for ...,36372,"A: Under English Law to form a contract, the e...",5


In [109]:
df_output = df_qa.loc[df_qa['row_number_score'] <= 3, ['question_complete', 'answer_body']]
df_output = df_output.rename(columns={'question_complete': 'question', 'answer_body': 'answer'})

display(df_output.head(10))

df_output.to_csv('data/input_dataset.csv', index=False)

Unnamed: 0,question,answer
0,Q: If a company agrees to pay travel cost for ...,A: This aspect (and many others) of contract l...
1,Q: If a company agrees to pay travel cost for ...,A: Yes the promise is binding and enforceable ...
2,Q: If a company agrees to pay travel cost for ...,A: As the question specifically asks for diffe...
7,Q: A student slipped a drug into my coffee — w...,A: If the pill contained a harmful or noxious ...
8,Q: A student slipped a drug into my coffee — w...,A: I told the school resource officer that I w...
9,Q: A student slipped a drug into my coffee — w...,"A: Edit 11/13/2018 Yes, I'm aware this answer ..."
12,Q: Can a public school in the USA force a 14yr...,A: You say: the school expects him to create a...
13,Q: Can a public school in the USA force a 14yr...,A: I think some of the answers are good but ta...
14,Q: Can a public school in the USA force a 14yr...,A: I would suggest reaching out to the ACLU in...
18,Q: Is it insider trading if I bought Boeing pu...,A: Is it insider trading if I bought Boeing pu...


In [108]:
df_output.shape

(15180, 2)