In [1]:
import json
import logging
import os
import time

import requests

https://requests.readthedocs.io/en/latest/user/quickstart/

In [2]:
# Base endpoint; the numeric profile id is appended directly to this string.
URL = "https://api.personality-database.com/api/v1/profile/"

# Per-request timeout in seconds. The requests docs recommend a value slightly
# above a multiple of 3, the default TCP packet retransmission window.
# https://requests.readthedocs.io/en/latest/user/advanced/ -> Timeouts
TIMEOUT = 3.05

# Back-off in seconds (five minutes plus a small margin) used after a block,
# chosen because of the AWS WAF rate-based rule constraint.
# https://docs.aws.amazon.com/waf/latest/developerguide/waf-rule-statement-type-rate-based.html
DELAY = 5 * 60 + 5

# Profile-id range to crawl; consumed as range(LOWER_LIMIT, UPPER_LIMIT),
# so the upper value itself is not requested.
LOWER_LIMIT = 1
UPPER_LIMIT = 100_000

In [3]:
# Route log records to errorlog.log. With the root level at ERROR, only
# ERROR and CRITICAL records are written; INFO/DEBUG records are discarded.
logging.basicConfig(
    level=logging.ERROR,
    filename="errorlog.log",
    encoding="utf-8",
)

In [4]:
# Load the corpus template once at start-up. format_JSON iterates over this
# object to decide which top-level entries of an API response are kept.
with open('./corpus_template.json', encoding='UTF-8') as file:
    external_json_format = json.load(file)

def format_JSON(file, template=None):
    """Strip per-user fields from a profile payload and keep only template keys.

    Args:
        file: Decoded JSON object (a dict) for one profile. It is modified in
            place: the keys ``username``, ``user_pic_path``,
            ``user_personality_type`` and ``is_mod`` are removed from every
            entry of ``file['topic_info']['topic']['posts']['posts']``.
        template: Iterable of top-level keys to keep. When omitted, the
            module-level ``external_json_format`` is used.

    Returns:
        A new dict holding ``file[key]`` for every key in the template, in
        template order. Nested values are shared with ``file``, not copied.

    Raises:
        KeyError: If a template key is missing from ``file``.
    """
    if template is None:
        template = external_json_format

    stripped_keys = ('username', 'user_pic_path', 'user_personality_type', 'is_mod')

    try:
        posts = file['topic_info']['topic']['posts']['posts']
    except (KeyError, TypeError):
        # A level of the path is absent or not a mapping (e.g. null):
        # there is no post list to clean.
        posts = None

    if isinstance(posts, list):
        for element in posts:
            if not isinstance(element, dict):
                continue
            for key in stripped_keys:
                # pop with a default: a post lacking one of the keys must not
                # stop the remaining keys/posts from being stripped.
                element.pop(key, None)

    return {category: file[category] for category in template}

def save_JSON(file, file_name: int):
    """Write ``file`` as UTF-8 JSON to ``data/<file_name>.json``.

    Args:
        file: JSON-serialisable object to store.
        file_name: Identifier used as the file's base name.

    The ``data`` directory is created when it does not exist yet, so the first
    save on a fresh checkout does not fail with FileNotFoundError. Non-ASCII
    characters are written as-is (``ensure_ascii=False``). An existing file
    with the same name is overwritten.
    """
    os.makedirs("data", exist_ok=True)
    with open(f"data/{file_name}.json", "w", encoding="utf-8") as f:
        json.dump(file, f, ensure_ascii=False)
    # Only recorded when the logging level is INFO or lower.
    logging.info("JSON ready")

https://requests.readthedocs.io/en/latest/user/advanced/

In [5]:
# One shared Session for the whole crawl; it is bound as the default value of
# scrape()'s `session` parameter when that function is defined.
session = requests.Session()

In [6]:
def _api_error_message(response):
    """Describe an API error body, or return None if it is not an API error.

    Mirrors the crawl's classification rule: a body containing the text
    "message" is treated as an answer from the API itself; anything else is
    treated as a block page (logged as "Cloudfront").

    Returns the JSON ``message`` field when the body is a JSON object that has
    one, otherwise the first 200 characters of the raw body.
    """
    if response is None or "message" not in response.text:
        return None
    try:
        payload = response.json()
    except ValueError:
        # requests' JSONDecodeError is a ValueError subclass.
        payload = None
    if isinstance(payload, dict) and "message" in payload:
        return payload["message"]
    return response.text[:200]


def scrape(session: requests.Session = session, lower_bound: int = LOWER_LIMIT, upper_bound: int = UPPER_LIMIT):
    """Download, clean and store every profile id in ``[lower_bound, upper_bound)``.

    For each id the profile is fetched from ``URL``, passed through
    ``format_JSON`` and written with ``save_JSON``.

    Args:
        session: Session used for every request, including retries.
        lower_bound: First profile id to request.
        upper_bound: Exclusive end of the id range.

    Error handling per id:
        * HTTP error whose body contains "message": logged, id skipped.
        * Any other HTTP error: logged as a Cloudfront block, then the same id
          is retried after sleeping ``DELAY`` seconds. Retries are unbounded.
        * ConnectionError (this includes ConnectTimeout, which subclasses it):
          logged, crawl stops.
        * Other timeouts and undecodable JSON bodies: logged, id skipped.

    Exceptions raised by ``format_JSON`` / ``save_JSON`` are not caught here.
    """
    file_id = lower_bound
    while file_id < upper_bound:
        # Built up front so handlers can report it even when no response exists.
        url = f"{URL}{file_id}"
        try:
            response = session.get(url, timeout=TIMEOUT)
            response.raise_for_status()

            payload = response.json()
            save_JSON(format_JSON(payload), file_id)
        except requests.exceptions.HTTPError as http_err:
            api_message = _api_error_message(http_err.response)
            if api_message is None:
                logging.critical(f"{http_err} - Cloudfront")
                time.sleep(DELAY)
                # Retry the same id in this loop: keeps the caller's session
                # and upper_bound and avoids unbounded recursion.
                continue
            logging.error(f"{http_err} - {api_message}")
        except requests.exceptions.ConnectionError as conn_err:
            # No response object exists here, so only the exception and the
            # requested URL can be reported.
            logging.critical(f"{conn_err} - Connection failed for url: {url}")
            break
        except requests.exceptions.Timeout as time_err:
            logging.error(f"{time_err} - Timed out for url: {url}")
        except requests.JSONDecodeError as decode_err:
            logging.error(f"{decode_err} - Couldn't decode json from: {response.url}")
        file_id += 1

In [21]:
# Start the crawl with the default session and id range (LOWER_LIMIT up to,
# but not including, UPPER_LIMIT).
scrape()