In [1]:
import logging
import requests
import time
import json
import os
from logging.handlers import SysLogHandler

Reference: [Requests quickstart guide](https://requests.readthedocs.io/en/latest/user/quickstart/)

In [2]:
# Base endpoint of the profile API; the numeric profile ID is appended directly to it.
URL = "https://api.personality-database.com/api/v1/profile/"

# Request timeout in seconds. The Requests documentation recommends a connect
# timeout slightly larger than a multiple of 3, the default TCP packet
# retransmission window.
# https://requests.readthedocs.io/en/latest/user/advanced/ -> Timeouts
TIMEOUT = 3.05

# Pause in seconds before a blocked request is retried; it exists because of the
# AWS WAF rate-based rule in front of the API.
# https://docs.aws.amazon.com/waf/latest/developerguide/waf-rule-statement-type-rate-based.html
# NOTE(review): this was originally described as a "5 minute delay", but 103 s is
# well under five minutes - confirm which of the two is intended.
DELAY = 103

# Default range of profile IDs to crawl. The bounds are fed to range(), so
# UPPER_LIMIT itself is not requested.
LOWER_LIMIT = 1
UPPER_LIMIT = 100_000

In [3]:
# Papertrail syslog destination used by the remote logger. Both values can be
# overridden with the PAPERTRAIL_HOST / PAPERTRAIL_PORT environment variables so
# the endpoint does not have to be edited in the notebook; the defaults are the
# previously hard-coded values.
PAPERTRAIL_HOST = os.environ.get("PAPERTRAIL_HOST", "logs3.papertrailapp.com")
# Environment values are strings, so the port is always converted to int.
PAPERTRAIL_PORT = int(os.environ.get("PAPERTRAIL_PORT", "36753"))

In [4]:
# Timezone has to be set for the loggers to record correct local timestamps.
os.environ['TZ'] = 'Europe/Berlin'
# time.tzset() is only available on Unix; skip it on other platforms instead of
# failing the whole notebook with an AttributeError.
if hasattr(time, "tzset"):
    time.tzset()

In [5]:
remote_logger = logging.getLogger("remote")
local_logger = logging.getLogger("local")

# logging.getLogger() returns the same logger object for the same name, so
# re-running this cell would attach one more handler per run and every record
# would be emitted several times. Detach and close whatever is already attached
# to these two loggers before configuring them again.
for _logger in (remote_logger, local_logger):
    for _stale_handler in list(_logger.handlers):
        _logger.removeHandler(_stale_handler)
        _stale_handler.close()

# Remote logger: INFO and above, sent via syslog to the Papertrail endpoint.
# No timestamp in the format string; only name, level and message are sent.
remote_logger.setLevel(logging.INFO)
remote_handler = SysLogHandler(address=(PAPERTRAIL_HOST, PAPERTRAIL_PORT))
remote_formatter = logging.Formatter(fmt="%(name)s :: %(levelname)s :: %(message)s")
remote_handler.setFormatter(remote_formatter)
remote_logger.addHandler(remote_handler)

# Local logger: ERROR and above, appended to error.log with a local timestamp.
local_logger.setLevel(logging.ERROR)
local_handler = logging.FileHandler(filename="error.log", mode="a", encoding="utf-8")
local_formatter = logging.Formatter(fmt="%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s", datefmt="%Y-%m-%d %H:%M:%S", style="%")
local_handler.setFormatter(local_formatter)
local_logger.addHandler(local_handler)

In [6]:
#with open('./corpus_template.json', encoding='UTF-8') as file:
#    external_json_format = json.load(file)

#def format_JSON(file):
#    external_json_dict = {}
#    try:
#        for element in file['topic_info']['topic']['posts']['posts']:
#            for key in ['username', 'user_pic_path', 'user_personality_type', 'is_mod']:
#                element.pop(key)
#    except KeyError:
#        pass
#    for category in external_json_format:
#        external_json_dict[category] = file[category]
#    return external_json_dict

def save_JSON(file, file_name: int):
    """Write ``file`` as JSON to ``data/<file_name>.json``.

    Args:
        file: JSON-serialisable object to store.
        file_name: Value used as the file's base name (the profile ID when
            called from the scraper).

    The file is written as UTF-8 with ``ensure_ascii=False``, so non-ASCII
    characters are stored as-is rather than as ``\\uXXXX`` escapes. An existing
    file with the same name is overwritten.
    """
    # open() does not create missing directories, so make sure data/ exists
    # instead of failing with FileNotFoundError on the first save.
    os.makedirs("data", exist_ok=True)
    with open(f"data/{file_name}.json", "w", encoding="utf-8") as f:
        json.dump(file, f, ensure_ascii=False)

Reference: [Requests advanced usage (session objects, timeouts)](https://requests.readthedocs.io/en/latest/user/advanced/)

In [7]:
# Single Session instance for the crawl; scrape() uses it as the default value
# of its `session` parameter.
session = requests.Session()

In [8]:
def _api_error_message(response):
    """Return the ``message`` field of a JSON error body, or ``None``.

    ``None`` is returned when there is no response, when the body is not valid
    JSON, or when the decoded JSON is not an object carrying a ``message`` key.
    """
    if response is None:
        return None
    try:
        payload = response.json()
    except ValueError:  # JSON decoding errors are ValueError subclasses
        return None
    if isinstance(payload, dict):
        return payload.get("message")
    return None


def scrape(session: requests.Session = session, lower_bound: int = LOWER_LIMIT, upper_bound: int = UPPER_LIMIT):
    """Download the profiles with IDs in ``range(lower_bound, upper_bound)``.

    Each profile is requested from ``URL`` + ID, decoded as JSON and written
    with ``save_JSON``. Every failure is logged to both the remote and the local
    logger and then handled according to its type:

    * HTTP error whose body is JSON with a ``message`` field: logged as an
      error, the ID is skipped.
    * HTTP error without such a body: treated as a CloudFront/WAF block; the
      function sleeps ``DELAY`` seconds and retries the same ID.
    * Connection error: sleeps ``DELAY`` seconds and retries the same ID.
      Connect timeouts are ``ConnectionError`` subclasses in Requests and are
      matched by this handler because it comes first.
    * Other timeouts and undecodable response bodies: logged, the ID is skipped.

    Retries are not limited in number. They are performed in a loop rather than
    by recursion, so the caller's ``session`` and ``upper_bound`` stay in effect
    and a long block cannot exhaust the recursion limit.

    Args:
        session: Requests session used for every request.
        lower_bound: First profile ID to request.
        upper_bound: End of the ID range; this ID itself is not requested.
    """
    file_id = lower_bound
    while file_id < upper_bound:
        # Built up front so the handlers can name the URL even when no
        # response object exists (connection errors, timeouts).
        request_url = f"{URL}{file_id}"
        retry_same_id = False
        try:
            r = session.get(request_url, timeout=TIMEOUT)
            r.raise_for_status()

            data = r.json()
            #data = format_JSON(json)

            save_JSON(data, file_id)
            remote_logger.info(f"{r.url} saved")
        except requests.exceptions.HTTPError as http_err:
            api_message = _api_error_message(http_err.response)
            if api_message is not None:
                # The API itself answered (e.g. unknown profile): skip this ID.
                err_message = f"{http_err} - {api_message}"
                remote_logger.error(err_message)
                local_logger.error(err_message)
            else:
                # No API payload: assume the request was blocked upstream.
                err_message = f"{http_err} - Cloudfront"
                remote_logger.critical(err_message)
                local_logger.critical(err_message)
                retry_same_id = True
        except requests.exceptions.ConnectionError as conn_err:
            # There is no response to inspect here; report the requested URL.
            err_message = f"{conn_err} - Connection failed for url: {request_url}"
            remote_logger.critical(err_message)
            local_logger.critical(err_message)
            retry_same_id = True
        except requests.exceptions.Timeout as time_err:
            err_message = f"{time_err} - Timed out for url: {request_url}"
            remote_logger.critical(err_message)
            local_logger.critical(err_message)
        except requests.JSONDecodeError as decode_err:
            # Raised by r.json() above, so the response is available.
            err_message = f"{decode_err} - Couldn't decode json from: {r.url}"
            remote_logger.critical(err_message)
            local_logger.critical(err_message)

        if retry_same_id:
            # Back off, then request the same ID again.
            time.sleep(DELAY)
            continue
        file_id += 1

In [9]:
# Start the crawl with the defaults: the shared session and the profile IDs from
# LOWER_LIMIT up to, but not including, UPPER_LIMIT.
scrape()