# Idea of using BeautifulSoup taken from an earlier plugin
# (NOTE: the original attribution/link is missing from this file).
import json
from io import BytesIO
import os
import tempfile
from bs4 import BeautifulSoup
from PIL import Image
import requests
from pelican import signals
class ImageCache:
    """Persistent cache of image dimensions, keyed by image URL.

    Dimensions are obtained by downloading the image once; the result is
    stored in a JSON file so later builds do not need to fetch it again.
    """

    CACHE_FOLDER_NAME = "img_cache"
    CACHE_FILE_NAME = "cache.json"
    # TODO: Move to settings, but what default value to provide if not present?
    # NOTE(review): the original MAX_WIDTH value is not visible in this file.
    # None disables clamping; set it to a pixel count to restrain image width.
    MAX_WIDTH = None

    def __init__(self):
        # Maps image URL -> (width, height); lists after a JSON round trip.
        self.cache = {}
        # True when the in-memory cache has entries not yet written to disk.
        self.dirty = False

    def load(self, instance):
        """Load the cache file belonging to *instance*, if it exists."""
        cache_file = self._get_cache_file_path(instance)
        if os.path.exists(cache_file):
            with open(cache_file, "r") as file_handle:
                self.cache = json.load(file_handle)

    def save(self, instance):
        """Write the cache to disk, but only if it changed since loading."""
        if not self.dirty:
            return
        cache_file = self._get_cache_file_path(instance)
        with open(cache_file, "w") as file_handle:
            json.dump(self.cache, file_handle)
        self.dirty = False

    def get_image_width_and_height(self, image_path):
        """Return ``(width, height)`` for the image at *image_path*.

        The image is downloaded only when it is not cached yet. The cache
        always stores the real size; clamping to ``MAX_WIDTH`` (keeping the
        aspect ratio) is applied to the returned value only.
        """
        if image_path in self.cache:
            width, height = self.cache[image_path]
        else:
            print("Caching '{}'".format(image_path))
            request = requests.get(image_path)
            with Image.open(BytesIO(request.content)) as image:
                width, height = image.size
            self.cache[image_path] = (width, height)
            self.dirty = True

        max_width = self.MAX_WIDTH
        if max_width is not None and width > max_width:
            # recalculate proportions and restrain image size
            original_proportion = width / height
            width = max_width
            height = int(width / original_proportion)
        return (width, height)

    @classmethod
    def _get_cache_file_path(cls, instance):
        """Return the cache file path, creating its folder when missing.

        The cache folder lives next to the directory named by
        ``instance.settings["PATH"]``.
        """
        base_path = os.path.dirname(instance.settings["PATH"])
        cache_path = os.path.join(base_path, cls.CACHE_FOLDER_NAME)
        if not os.path.exists(cache_path):
            os.makedirs(cache_path, exist_ok=True)
            print("Created image cache folder '{}'".format(cache_path))
        return os.path.join(cache_path, cls.CACHE_FILE_NAME)
def content_object_init(instance):
    """Post-process the ``<img>`` tags of a content object's HTML.

    For every image with a non-embedded ``src`` this sets ``width`` and
    ``height`` (unless already present), enables native lazy loading,
    blanks an ``alt`` that merely repeats the URL, and removes the legacy
    ``mce_src`` attribute. The rewritten HTML is stored back on
    ``instance._content``.
    """
    if instance._content is None:
        return

    image_cache = ImageCache()
    image_cache.load(instance)

    soup = BeautifulSoup(instance._content, "html.parser")
    for img in soup(["img"]):
        src = img.get("src")
        # Text or similar containing a `<img>` tag
        if not src:
            continue
        # Don't touch base64 encoded images
        if src.startswith("data:image"):
            continue

        # Always define width & height
        width, height = image_cache.get_image_width_and_height(src)
        img["width"] = img.get("width", width)
        img["height"] = img.get("height", height)

        # Chrome's native lazy load
        img["loading"] = "lazy"

        # Good practice
        if img.get("alt", "") == src:
            img["alt"] = ""

        # Old MCE cleanup
        if img.get("mce_src"):
            del img["mce_src"]

    # Persist any newly fetched dimensions for the next build.
    image_cache.save(instance)
    instance._content = soup.decode()
def register():
    """Pelican plugin entry point: hook the image processing into Pelican."""
    signals.content_object_init.connect(content_object_init)