In [1]:
import csv
import json
import logging
import os
import time

import requests

Reference: [Requests quickstart](https://requests.readthedocs.io/en/latest/user/quickstart/)

In [2]:
# Base endpoint of the profiles API; the scraping loop appends "<id>/related".
URL = "https://api.personality-database.com/api/v2/profiles/"

# Request timeout in seconds. The Requests docs recommend a value slightly
# above a multiple of 3, the default TCP packet retransmission window.
# https://requests.readthedocs.io/en/latest/user/advanced/ -> Timeouts
TIMEOUT = 3.05

# Seconds to back off after a blocked or failed request. The API sits behind
# AWS WAF, whose rate-based rule blocks for about 5 minutes.
# https://docs.aws.amazon.com/waf/latest/developerguide/waf-rule-statement-type-rate-based.html
# NOTE(review): 21 s is far shorter than that block, so the loop presumably
# retries several times before the block lifts - confirm this is intended.
DELAY_AFTER_ERROR = 21

# Seconds to pause before every request, to stay polite towards the server,
# especially if scraping is ever run in parallel.
# https://scrapeops.io/python-scrapy-playbook/scrapy-delay-between-requests/
DELAY_BETWEEN_REQUESTS = 3.05

# Inclusive range of id_list positions processed by this run.
LOWER_LIMIT = 29_997
UPPER_LIMIT = 45_000

In [3]:
# File logger for the scraping run: INFO and above is appended to error.log.
local_logger = logging.getLogger("local")
local_logger.setLevel(logging.INFO)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise attach one more FileHandler each time
# and every record would be written to error.log multiple times.
if not local_logger.handlers:
    local_handler = logging.FileHandler(filename="error.log", mode="a", encoding="utf-8")
    local_formatter = logging.Formatter(fmt="%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s", datefmt="%Y-%m-%d %H:%M:%S", style="%")
    local_handler.setFormatter(local_formatter)
    local_logger.addHandler(local_handler)

In [4]:
# Profile ids (first column of output.csv, kept as strings), filled in by
# populate_id_list() and indexed by position in the scraping loop.
id_list = []

In [5]:
def populate_id_list(path="output.csv"):
    """Load the first column of a ';'-delimited CSV file into ``id_list``.

    The module-level ``id_list`` is replaced in place, so calling this
    function again (e.g. re-running the notebook cell) does not append the
    same ids a second time.

    Args:
        path: CSV file to read. Defaults to ``"output.csv"``.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        # csv.reader yields [] for blank lines; skip them instead of failing
        # on row[0]. Slice assignment keeps the same list object alive.
        id_list[:] = [row[0] for row in reader if row]

In [6]:
# Read the ids from output.csv into id_list; the scraping loop indexes into it.
populate_id_list()

In [7]:
def save_JSON(file, file_name: "int | str"):
    """Write ``file`` as UTF-8 JSON to ``data/<file_name>_related.json``.

    Args:
        file: JSON-serializable object to store.
        file_name: Identifier used as the file-name stem. The notebook passes
            the id strings read from the CSV; integers work as well.

    Raises:
        TypeError: if ``file`` is not JSON-serializable.
        OSError: if the directory or file cannot be created or written.
    """
    # The output directory may not exist yet on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    with open(f"data/{file_name}_related.json", "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(file, f, ensure_ascii=False)

Reference: [Requests advanced usage (session objects, timeouts)](https://requests.readthedocs.io/en/latest/user/advanced/)

In [8]:
# A single Session shared by every request in the scraping loop; per the
# Requests docs a Session reuses the underlying connection between requests.
session = requests.Session()

In [9]:
# Scrape the "/related" payload for id_list[LOWER_LIMIT..UPPER_LIMIT]
# (inclusive), saving each response as JSON and logging every outcome.
# id_index advances only on success or on an API-level error; blocked,
# connection and timeout failures retry the same index.

# Never index past the end of id_list: stop at its last entry if the
# configured upper limit is larger than the list.
last_index = min(UPPER_LIMIT, len(id_list) - 1)
if last_index < UPPER_LIMIT:
    local_logger.warning(f"id_list has only {len(id_list)} entries; stopping at index {last_index} instead of {UPPER_LIMIT}")

id_index = LOWER_LIMIT
while id_index <= last_index:
    time.sleep(DELAY_BETWEEN_REQUESTS)
    # Built before the request so that every handler can report the URL even
    # when no response object was ever created (e.g. on a timeout).
    profile_id = id_list[id_index]
    request_url = rf"{URL}{profile_id}/related"
    try:
        with session.get(request_url, timeout = TIMEOUT) as r:
            r.raise_for_status()

            data = r.json()

        save_JSON(data, profile_id)
        local_logger.info(f"Saved {profile_id} @ index {id_index}")
        id_index += 1
    except requests.exceptions.HTTPError as HTTP_err:
        # raise_for_status attaches the failing response to the exception.
        response = HTTP_err.response
        body = response.text if response is not None else ""
        if "error" in body:
            # The body mentions "error": treated as an answer from the API
            # itself for this id, so log it and move on to the next one.
            err_message = f"{HTTP_err}\t-\t@ index {id_index}"
            local_logger.error(err_message)
            id_index += 1
        else:
            # No "error" in the body: presumably a CDN/WAF block page rather
            # than an API answer, so back off and retry the same index.
            err_message = f"{HTTP_err}\t-\tCloudfront @ index {id_index}"
            local_logger.critical(err_message)
            time.sleep(DELAY_AFTER_ERROR)
            continue
    except requests.exceptions.ConnectionError as conn_err:
        # ConnectTimeout derives from both ConnectionError and Timeout, so
        # connect timeouts land here (with the back-off delay).
        err_message = f"{conn_err}\t-\t@ index {id_index}"
        local_logger.critical(err_message)
        time.sleep(DELAY_AFTER_ERROR)
        continue
    except requests.exceptions.Timeout as time_err:
        # Remaining timeouts (e.g. read timeouts): retry the same index after
        # the regular per-request delay. No response exists at this point, so
        # the URL comes from request_url rather than from a response object.
        err_message = f"{time_err}\t-\tTimed out for url: {request_url} @ index {id_index}"
        local_logger.critical(err_message)
        continue
    except Exception as exc:
        # Anything unexpected (bad JSON, disk error, ...) aborts the run.
        local_logger.critical(exc)
        break