In [1]:
import json
import logging
import time
from pathlib import Path

import requests

Reference — Requests quickstart: https://requests.readthedocs.io/en/latest/user/quickstart/

In [2]:
# Base endpoint; the numeric profile id is appended directly to it.
URL = "https://api.personality-database.com/api/v1/profile/"

# Timeout in seconds handed to requests for every call.
# The Requests docs recommend a connect timeout slightly larger than a multiple of 3,
# which is the default TCP packet retransmission window.
# https://requests.readthedocs.io/en/latest/user/advanced/ -> Timeouts
TIMEOUT = 3.05

# Back-off in seconds after a blocking error; motivated by the 5 minute block applied by AWS WAF.
# https://docs.aws.amazon.com/waf/latest/developerguide/waf-rule-statement-type-rate-based.html
# NOTE(review): 21 s is much shorter than the 5 minute block mentioned above — confirm the intended value.
DELAY_AFTER_ERROR = 21

# Pause in seconds before every request: more ethical, especially if scraping is ever parallelised.
# https://scrapeops.io/python-scrapy-playbook/scrapy-delay-between-requests/
DELAY_BETWEEN_REQUESTS = 3.05

# Inclusive range of profile ids to fetch.
LOWER_LIMIT = 1
UPPER_LIMIT = 100_000

In [5]:
# Dedicated "local" logger that appends ERROR-and-above records to error.log.
local_logger = logging.getLogger("local")
local_logger.setLevel(logging.ERROR)

# logging.getLogger() returns the same logger object on every call, so re-running
# this cell would otherwise stack another FileHandler on it and every record would
# be written to error.log once per handler. Drop (and close) any earlier handlers first.
for stale_handler in list(local_logger.handlers):
    local_logger.removeHandler(stale_handler)
    stale_handler.close()

local_handler = logging.FileHandler(filename="error.log", mode="a", encoding="utf-8")
local_formatter = logging.Formatter(
    fmt="%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    style="%",
)
local_handler.setFormatter(local_formatter)
local_logger.addHandler(local_handler)

In [6]:
def save_JSON(file, file_name: int):
    """Serialise ``file`` as UTF-8 JSON to ``data/<file_name>.json``.

    Args:
        file: The JSON-serialisable object to store (despite the name, this is
            the payload itself, not a file handle).
        file_name: Stem of the output file; the caller passes the profile id.

    Raises:
        TypeError: If ``file`` is not JSON-serialisable (raised by ``json.dump``).
        OSError: If the directory or file cannot be created or written.
    """
    target = Path("data") / f"{file_name}.json"
    # Create the output directory on demand so a fresh checkout does not fail
    # with FileNotFoundError on the very first save.
    target.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX escapes.
    with target.open("w", encoding="utf-8") as f:
        json.dump(file, f, ensure_ascii=False)

Reference — Requests advanced usage (Session objects, timeouts): https://requests.readthedocs.io/en/latest/user/advanced/

In [7]:
# One shared Session for every request in the scrape loop; per the Requests
# "Session Objects" documentation this reuses the underlying connection pool.
session = requests.Session()

In [None]:
# Fetch every profile id in [LOWER_LIMIT, UPPER_LIMIT] in order and store each
# JSON payload via save_JSON. An id is only advanced past after it was either
# saved or rejected by the API with an explicit error message; block pages,
# connection failures and timeouts retry the same id.
current_id = LOWER_LIMIT
while current_id <= UPPER_LIMIT:
    # Throttle every attempt, retries included.
    time.sleep(DELAY_BETWEEN_REQUESTS)
    request_url = f"{URL}{current_id}"
    try:
        with session.get(request_url, timeout=TIMEOUT) as r:
            r.raise_for_status()

            data = r.json()

        save_JSON(data, current_id)
        current_id += 1
    except requests.exceptions.HTTPError as HTTP_err:
        # raise_for_status() attaches the failing response to the exception; read
        # the body from there and parse it defensively, because a block page is
        # not guaranteed to be JSON even if it contains the word "message".
        error_response = HTTP_err.response
        try:
            error_body = error_response.json() if error_response is not None else None
        except ValueError:
            error_body = None

        if isinstance(error_body, dict) and "message" in error_body:
            # The API answered with its own error message for this id:
            # record it and move on to the next id.
            err_message = f"{HTTP_err}\t-\t{error_body['message']}"
            local_logger.error(err_message)
            current_id += 1
        else:
            # No API message: treated as a block page; back off and retry the same id.
            err_message = f"{HTTP_err}\t-\tCloudfront"
            local_logger.critical(err_message)
            time.sleep(DELAY_AFTER_ERROR)
    except requests.exceptions.ConnectionError as conn_err:
        # No response exists for a failed connection, so log the URL that was
        # requested instead of touching `r` (unbound on the first iteration,
        # stale from a previous id afterwards). ConnectTimeout is also a
        # ConnectionError and therefore lands here, before the Timeout handler.
        err_message = f"{conn_err}\t-\tConnection failed for url: {request_url}"
        local_logger.critical(err_message)
        time.sleep(DELAY_AFTER_ERROR)
    except requests.exceptions.Timeout as time_err:
        # Same reasoning as above: report the URL of this attempt, not `r.url`.
        err_message = f"{time_err}\t-\tTimed out for url: {request_url}"
        local_logger.critical(err_message)
    except Exception as exc:
        # Anything unexpected stops the run; keep the traceback so the exception
        # type and origin are not lost in the log.
        local_logger.critical(exc, exc_info=True)
        break