StackExchange App

https://stackapps.com/apps/oauth/view/30531#

Ref: https://huggingface.co/datasets/ymoslem/Law-StackExchange/blob/main/StackExchange.ipynb

In [1]:
import pandas as pd
import re
import requests
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import ConnectionError
from tqdm.notebook import tqdm
import json
from time import sleep

In [2]:
with open('../ignore/secret.json', 'r', encoding='utf-8') as file:
    secret_key = json.load(file)

In [3]:
# Which site to extract questions from

site = "law"  # "law" or "medicalsciences" etc.

## Get Questions

In [4]:
url = "https://api.stackexchange.com/2.3/questions/"
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100
          }

max_pages = 300 # 300 for law or 80 for medicalsciences

retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504, 429])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

retrieved_data = []

In [None]:
for page in tqdm(range(max_pages)):
  params["page"] = page+1
  response = session.get(url, params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break
  else:
    current_page = response.json()
    for item in current_page['items']:
      retrieved_data.append(item)
    if current_page["has_more"] == False:
      print("No more pages")
      break

In [None]:
len(retrieved_data)

In [None]:
import json

print(json.dumps(retrieved_data[-1], indent=4))

In [8]:
# Save file - test
with open("scrapping_data/example.json", "w") as output:
  json.dump(retrieved_data[:10], output, indent=4)

In [9]:
# Save the whole data
import os

# output_questions_file_name = "law.stackexchange.json"
output_questions_file_name = "law.stackexchange.json"
directory = "scrapping_data/"
full_output_path = os.path.join(directory, output_questions_file_name)

with open(full_output_path, "w+") as output:
  json.dump(retrieved_data, output, indent=4)

## Get answers

In [12]:
import os

# file_path = "/content/drive/MyDrive/scrapping_data/StackExchange/Law/"
# file_name = "law.stackexchange.json"

file_path = "scrapping_data/"
file_name = "law.stackexchange.json"

data = json.load(open(os.path.join(file_path, file_name)))
questions = [question for question in data if question["is_answered"]]

print("Questions total:", len(data))
print("Questions with answers:", len(questions))

Questions total: 30000
Questions with answers: 26834


In [13]:
questions = questions[0:25]

## Using batch

In [14]:
batch_size = 5
batch_length = len(questions) // 5
batch_questions = []

for i in range(batch_size):
    batch_questions.append(questions[i * batch_length: i * batch_length + batch_length])


# output_questions_file_name = "law.stackexchange.json"
output_questions_file_name = "law.stackexchange.batched.json"
directory = "scrapping_data/"
full_output_path = os.path.join(directory, output_questions_file_name)

with open(full_output_path, "w+") as output:
  json.dump(batch_questions, output, indent=4)

In [15]:
url = {"url":"https://api.stackexchange.com/2.3/questions/{idx}/answers"}
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100}

retries = Retry(total=5,
                backoff_factor=0.99,
                status_forcelist=[400, 429, 500, 502, 503, 504])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

questions_data = []

for i in tqdm(range(len(batch_questions))):

    batch = batch_questions[i]

    for question_item in tqdm(batch):
        question_id = question_item["question_id"]

        question = {}
        question["question_id"] = question_item["question_id"]
        question["tags"] = question_item["tags"]
        question["score"] = question_item["score"]
        question["license"] = question_item.get("content_license", "")
        question["title"] = question_item["title"]
        question["body"] = question_item["body"]
        question["link"] = question_item["link"]
        question["answers"] = []

        response = session.get(url["url"].format(idx=question_id), params=params)

        if response.status_code != 200:
            print("Error:", response.status_code)
            break
        else:
            current_page = response.json()
            for answer_item in current_page['items']:
                if answer_item["score"] >= 0:
                    answer = {}
                    answer["answer_id"] = answer_item["answer_id"]
                    answer["score"] = answer_item["score"]
                    answer["body"] = answer_item["body"]

                    question["answers"].append(answer)

        questions_data.append(question)
        sleep(0.2)

        
        
        # Save the batched data
        output_file_name = f"law.stackexchange-questions-answers-batched-{i}.json"

        full_output_path = os.path.join(file_path, output_file_name)
        
        with open(full_output_path, "w+") as output:
            json.dump(questions_data, output, indent=4)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

RetryError: HTTPSConnectionPool(host='api.stackexchange.com', port=443): Max retries exceeded with url: /2.3/questions/36259/answers?client_secret=NCec61zg6XY5DJ4v8UIoRw%28%28&key=rl_2doaMUY8b45faCZ9XLbbieHBX&site=law&filter=withbody&order=desc&sort=votes&pagesize=100 (Caused by ResponseError('too many 400 error responses'))

## Without batch

In [None]:
url = {"url":"https://api.stackexchange.com/2.3/questions/{idx}/answers"}
params = {"client_secret": secret_key['secret '],
          "key": secret_key['key '],
          "site": site,
          "filter": "withbody",
          "order":"desc",
          "sort":"votes",
          "pagesize":100}

retries = Retry(total=5,
                backoff_factor=0.99,
                status_forcelist=[400, 429, 500, 502, 503, 504])

adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()

# Use the adapter for all requests to endpoints that start with this URL
session.mount('https://api.stackexchange.com/', adapter)

questions_data = []

for question_item in tqdm(questions):
  question_id = question_item["question_id"]

  question = {}
  question["question_id"] = question_item["question_id"]
  question["tags"] = question_item["tags"]
  question["score"] = question_item["score"]
  question["license"] = question_item.get("content_license", "")
  question["title"] = question_item["title"]
  question["body"] = question_item["body"]
  question["link"] = question_item["link"]
  question["answers"] = []

  response = session.get(url["url"].format(idx=question_id), params=params)

  if response.status_code != 200:
    print("Error:", response.status_code)
    break
  else:
    current_page = response.json()
    for answer_item in current_page['items']:
      if answer_item["score"] >= 0:
        answer = {}
        answer["answer_id"] = answer_item["answer_id"]
        answer["score"] = answer_item["score"]
        answer["body"] = answer_item["body"]

        question["answers"].append(answer)

  questions_data.append(question)
  sleep(0.2)

In [None]:
print(len(questions_data))
print(json.dumps(questions_data[-1], indent=2))

In [98]:
# Save the whole data

# output_file_name = "law.stackexchange-questions-answers.json"
output_file_name = "law.stackexchange-questions-answers.json"

full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, "w+") as output:
  json.dump(questions_data, output, indent=4)

In [99]:
import os
import json

# output_file_name = "law.stackexchange-questions-answers.json"
output_file_name = "law.stackexchange-questions-answers.json"

full_output_path = os.path.join(file_path, output_file_name)

with open(full_output_path, "r") as output:
  questions_answers = json.load(output)

In [None]:
len(questions_answers)

In [None]:
print(json.dumps(questions_answers[0], indent=4))

In [None]:
questions_answers[0]['answers'][0]['body']

In [None]:
questions[0]['body']

In [None]:
list_questions = []

for q in questions:
    list_questions.append({
        "question_id": q['question_id'],
        "question_body": q['body']
    })

df_questions = pd.DataFrame(list_questions)

df_questions.head()

In [105]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    # Remove links (http, https, www)
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

list_questions_answers = []

for q in questions_answers:

    for a in q['answers']: 
        key = {
            'question_id': q['question_id'],
            'question_title': clean_text(q['title']),
            'question_body': clean_text(q['body']),
            'question_complete': 'Q: ' + clean_text(q['title']) + '. ' + clean_text(q['body']),
            'answer_id': a['answer_id'],
            'answer_score': a['score'],
            'answer_body': 'A: ' + clean_text(a['body'])
        }
        list_questions_answers.append(key)

df_qa = pd.DataFrame(list_questions_answers)
df_qa['row_number_score'] = df_qa.groupby(['question_id', 'question_title'])['answer_score'].rank(method='first', ascending=False).astype(int)
df_qa.drop(columns=['answer_score'], inplace=True)

In [None]:
display(df_qa.shape)
display(df_qa.head())

In [None]:
df_output = df_qa.loc[df_qa['row_number_score'] <= 3, ['question_complete', 'answer_body']]
df_output = df_output.rename(columns={'question_complete': 'question', 'answer_body': 'answer'})

display(df_output.head(10))

df_output.to_csv('data/input_dataset.csv', index=False)

In [None]:
df_output.shape