In [1]:
# All imports
import pandas as pd
from datasets import load_dataset
import os
import hashlib
from io import BytesIO
from PIL import Image
from PIL.TiffTags import TAGS
import cv2

In [2]:
# The upstream dataset is on Hugging Face: https://huggingface.co/datasets/johnbradley/Kydoimos
# To work from a local clone instead, run the clone command and switch dataset_path:
# !git clone https://huggingface.co/datasets/johnbradley/Kydoimos ../../Kydoimos
# dataset_path = "../../Kydoimos"
dataset_path = "johnbradley/Kydoimos"

# Pull the dataset and turn its 'train' split into a dataframe
kydoimos = load_dataset(dataset_path)
kydoimos_df = pd.DataFrame(kydoimos['train'])

# Load the metadata table that sits alongside the Shaggy images
shaggy_dir = '../Shaggy/'
shaggy_metadata_csv = os.path.join(shaggy_dir, 'metadata.csv')
shaggy_df = pd.read_csv(shaggy_metadata_csv, encoding='utf-8', low_memory=False)

# Prefix each file name with the directory so every row says how to reach its image from here
shaggy_df['rel_file_path'] = shaggy_dir + shaggy_df['file_name']

Resolving data files:   0%|          | 0/111 [00:00<?, ?it/s]

### A bit more on checksums

In [3]:
# Create function for MD5 checksums

# Below are a few implementations of an MD5 checksum that can be run on a file specified by a filepath.
# Depending on the size of the file you want to calculate a checksum on, you may want to stream the calculation in chunks
# to avoid using excessive memory. 

# Streams the file in fixed-size chunks, so memory use stays bounded even on a massive file
def file_md5_checksum(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the raw bytes stored in a file.

    The file is read in binary mode, `chunk_size` bytes at a time, and each
    chunk is fed to the hash, so the whole file is never held in memory.

    Args:
        file_path: Path of the file to hash (anything `open()` accepts).
        chunk_size: Number of bytes to read per iteration. Must be >= 1.
            The digest does not depend on this value.

    Returns:
        The MD5 digest as a 32-character lowercase hex string.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently hash "nothing";
    # reject non-positive sizes instead of returning a wrong digest.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter(callable, sentinel) keeps calling read() until it returns b"" (EOF)
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# # Straight-forward, memory safe
# def file_md5_checksum(file_path):
#     hash = hashlib.md5()
#     with open(file_path, "rb") as f:
#         while True:
#             chunk = f.read(4096)
#             if not chunk:
#                 break
#             hash.update(chunk)
#     checksum = hash.hexdigest()
#     return checksum
# 
# # Pythonic, memory safe
# def file_md5_checksum(file_path):
#     hash = hashlib.md5()
#     with open(file_path, "rb") as f:
#         for chunk in iter(lambda: f.read(4096), b""):
#             hash.update(chunk)
#     checksum = hash.hexdigest()
#     return checksum

# Also make one for a PIL object as before.
def pil_md5_checksum(image):
    """Return the hex MD5 digest of a PIL image re-encoded in its own format.

    The image is saved into an in-memory buffer using `image.format`, and the
    digest is computed over the bytes written to that buffer (not over the
    bytes of the file the image may have been opened from).
    """
    encoded = BytesIO()
    image.save(encoded, format=image.format)
    return hashlib.md5(encoded.getvalue()).hexdigest()

# And an additional function to use with OpenCV.
def cv2_md5_checksum(image):
    """Return the hex MD5 digest of an array-like image's `tobytes()` output.

    `image` is expected to expose a `tobytes()` method (as the object returned
    by `cv2.imread()` does); the digest covers exactly those bytes.
    """
    return hashlib.md5(image.tobytes()).hexdigest()

For a single given image file, the calculated checksum (such as MD5) can be sensitive to the way the image data is loaded into memory.
This is because the different image processing libraries (e.g., OpenCV, Pillow) may perform internal transformations or optimizations when loading the image data,
which can result in minor differences in the raw byte representation of the image.
These byte-level differences can then lead to different checksum values being calculated, even though the image content itself is the same.

In this example, we demonstrate this effect by calculating the MD5 checksum for a TIFF image file using three different approaches:
1. Loading the image using OpenCV's `cv2.imread()` function
2. Loading the image using the Pillow (PIL) library's `Image.open()` function
3. Directly reading the raw bytes of the image file

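The `cv2_md5_checksum()` function takes an image that has already been loaded with OpenCV, converts its NumPy array representation to bytes, and then calculates the MD5 checksum of those bytes.
The `pil_md5_checksum()` function takes an image that has already been loaded with the Pillow library, re-saves it to an in-memory buffer in its original format, and calculates the MD5 checksum of the bytes written to that buffer.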
The `file_md5_checksum()` function directly reads the bytes of the image file and calculates the MD5 checksum of those raw bytes.

When we print the resulting checksums, we see that the three different approaches produce different MD5 hash values, even though they are all operating on the same underlying image file.
This is because the internal data representations and transformations performed by the different image processing libraries can introduce minor differences in the raw byte content.

Understanding this sensitivity of checksums to the image loading method is important when working with image metadata, file integrity checks, or other applications where the calculated checksum needs to be consistent and reliable.

The best way to handle this is to standardize the image loading approach or to account for these potential discrepancies when comparing checksum values.

In [4]:
# One source file, reached three ways: as a plain path, via OpenCV, and via Pillow
img_raw = '../Shaggy/images/amalfreda_1.tif'
img_cv2 = cv2.imread(img_raw)
img_pil = Image.open(img_raw)

# Hash each representation with its matching helper
md5_cv2 = cv2_md5_checksum(img_cv2)
md5_pil = pil_md5_checksum(img_pil)
md5_raw = file_md5_checksum(img_raw)

# Report the digests side by side
for label, digest in (("cv2", md5_cv2), ("pil", md5_pil), ("raw", md5_raw)):
    print(f"{label}: {digest}")

cv2: a2b85e3545d8b525fe20d90ebe5d847e
pil: d0dd3bc05dead39e2177c59f2925cd2b
raw: baf9cc4fd984f80dea07ec93308c3a7b


One of the differences that can make their way into the bytes representation of this data is the image metadata.
We can inspect this on the raw image file as well as the PIL-loaded version. The OpenCV image is simply a NumPy array.

In [9]:
# Inspect the image metadata
def get_tiff_metadata(image_input):
    """Return the TIFF tag metadata for a file path or an already-open PIL image.

    Args:
        image_input: Either a file path (a `str` or any `os.PathLike`, such as
            `pathlib.Path`) or a PIL `Image.Image` object.

    Returns:
        The dictionary produced by `extract_metadata()` for the image.

    Raises:
        TypeError: If `image_input` is neither a path nor a PIL image.
    """
    # Accept path-like objects as well as plain strings
    if isinstance(image_input, (str, os.PathLike)):
        # Open only for the duration of the tag read so the file handle is released
        with Image.open(image_input) as img:
            return extract_metadata(img)
    if isinstance(image_input, Image.Image):
        # Caller owns this image; read its tags without closing it
        return extract_metadata(image_input)
    raise TypeError("Input must be a file path or a PIL Image object")

def extract_metadata(img):
    """Build a dict of the image's TIFF tags from `img.tag`.

    Each numeric tag id is translated through `TAGS`; ids that `TAGS` does not
    know are kept as-is and used as the key.
    """
    return {
        TAGS.get(tag_id, tag_id): tag_value
        for tag_id, tag_value in img.tag.items()
    }

# Load the same file through each route: plain path, OpenCV, and Pillow
img_raw = '../Shaggy/images/amalfreda_1.tif'
img_cv2 = cv2.imread(img_raw)
img_pil = Image.open(img_raw)

# Print the tags for the two representations get_tiff_metadata() accepts
for img in (img_pil, img_raw):
    print(f"Metadata for {img}:")
    meta = get_tiff_metadata(img)
    for tag, value in meta.items():
        print(f" {tag}: {value}")

    print()


Metadata for <PIL.TiffImagePlugin.TiffImageFile image mode=RGB size=124x64 at 0x1D96F23E2A0>:
 ImageWidth: (124,)
 ImageLength: (64,)
 BitsPerSample: (8, 8, 8)
 Compression: (1,)
 PhotometricInterpretation: (2,)
 ResolutionUnit: (2,)
 StripOffsets: (8, 8192, 16376)
 Orientation: (1,)
 SamplesPerPixel: (3,)
 RowsPerStrip: (22,)
 StripByteCounts: (8184, 8184, 7440)
 XResolution: ((72, 1),)
 YResolution: ((72, 1),)
 PlanarConfiguration: (1,)

Metadata for ../Shaggy/images/amalfreda_1.tif:
 ImageWidth: (124,)
 ImageLength: (64,)
 BitsPerSample: (8, 8, 8)
 Compression: (1,)
 PhotometricInterpretation: (2,)
 ResolutionUnit: (2,)
 StripOffsets: (8, 8192, 16376)
 Orientation: (1,)
 SamplesPerPixel: (3,)
 RowsPerStrip: (22,)
 StripByteCounts: (8184, 8184, 7440)
 XResolution: ((72, 1),)
 YResolution: ((72, 1),)
 PlanarConfiguration: (1,)



It's surprising that the metadata shown for these two representations of the image is identical when we can see their MD5 checksums are different.

To see the difference, we need to save the PIL image to disk and load it, since in our PIL MD5 function we save to a buffer.

In [6]:
# Demonstrating the issue: inspect the image metadata

# Write the PIL image to disk with a "_pil" suffix, then read that copy back
pil_copy_path = './amalfreda_1_pil.tif'
img_pil.save(pil_copy_path)
img_pil_saved = Image.open(pil_copy_path)

# Tags of the copy written by PIL
print("Metadata for saved PIL image:")
meta_pil_saved = get_tiff_metadata(img_pil_saved)
for tag, value in meta_pil_saved.items():
    print(f" {tag}: {value}")
print()

# Tags of the original file, for comparison
print("Metadata for saved raw image:")
meta_raw = get_tiff_metadata(img_raw)
for tag, value in meta_raw.items():
    print(f" {tag}: {value}")

Metadata for saved PIL image:
 ImageWidth: (124,)
 ImageLength: (64,)
 BitsPerSample: (8, 8, 8)
 Compression: (1,)
 PhotometricInterpretation: (2,)
 ResolutionUnit: (2,)
 StripOffsets: (192,)
 SamplesPerPixel: (3,)
 RowsPerStrip: (64,)
 StripByteCounts: (23808,)
 XResolution: ((72, 1),)
 YResolution: ((72, 1),)
 PlanarConfiguration: (1,)

Metadata for saved raw image:
 ImageWidth: (124,)
 ImageLength: (64,)
 BitsPerSample: (8, 8, 8)
 Compression: (1,)
 PhotometricInterpretation: (2,)
 ResolutionUnit: (2,)
 StripOffsets: (8, 8192, 16376)
 Orientation: (1,)
 SamplesPerPixel: (3,)
 RowsPerStrip: (22,)
 StripByteCounts: (8184, 8184, 7440)
 XResolution: ((72, 1),)
 YResolution: ((72, 1),)
 PlanarConfiguration: (1,)


This demonstrates the change in metadata when the PIL object is saved, which causes the MD5 difference.

However, there are further nuances to be aware of. Using a different method in the function to calculate the checksum of a PIL object yields further differences.

In [7]:
def pil_md5_checksum2(image):
    """Return the hex MD5 digest of the bytes given by `image.tobytes()`.

    Unlike `pil_md5_checksum`, nothing is saved to a buffer first; the digest
    is taken directly over whatever `tobytes()` returns.
    """
    return hashlib.md5(image.tobytes()).hexdigest()

In [8]:
# Recompute every digest, now including the tobytes()-based PIL hash and the re-saved copy
md5_cv2 = cv2_md5_checksum(img_cv2)
md5_pil = pil_md5_checksum(img_pil)
md5_pil2 = pil_md5_checksum2(img_pil)
md5_pil_saved = pil_md5_checksum(img_pil_saved)
md5_raw = file_md5_checksum(img_raw)

# Report them in a fixed order so the outputs are easy to compare
labelled_digests = (
    ("cv2", md5_cv2),
    ("pil", md5_pil),
    ("pil2", md5_pil2),
    ("pil_saved", md5_pil_saved),
    ("raw", md5_raw),
)
for label, digest in labelled_digests:
    print(f"{label}: {digest}")

cv2: a2b85e3545d8b525fe20d90ebe5d847e
pil: d0dd3bc05dead39e2177c59f2925cd2b
pil2: 6c2fa1935394b7489494647bb6ee35b3
pil_saved: d0dd3bc05dead39e2177c59f2925cd2b
raw: baf9cc4fd984f80dea07ec93308c3a7b


The key is to be sure that data you know to be identical yields matching checksums in small scale tests prior to doing large duplicate searches or other analysis, and to make sure images are loaded using matching methods.