# 01_joblib_parallel_image_stats -- 2022-08-08-unlabeled yale fossils dataset.ipynb

Calculate basic image statistics for each yale fossil image & store results to disk for downstream analysis

Created on: Monday August 8th, 2022  
Created by: Jacob A Rose

- using `torchshow`

In [1]:
#!pip3 install torchshow

In [2]:
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.float_format', "{:,.2f}".format)

In [3]:
%load_ext autoreload
%autoreload 2


# from omegaconf import DictConfig, OmegaConf
import os
from rich import print as pp

import numpy as np
from typing import *
import inspect
from tqdm.auto import tqdm
from pathlib import Path
import logging
# import meerkat as mk

# import dask.dataframe as dd
from PIL import Image
import PIL
from PIL.ImageStat import Stat

In [4]:
import cv2
import glob
from joblib import Parallel, delayed

In [5]:
# dir(mk.config.DisplayOptions)
# display_res = 512
# # print(mk.config.DisplayOptions.max_image_width)
# mk.config.DisplayOptions.max_image_width = display_res
# mk.config.DisplayOptions.max_image_height = display_res
# print(f"{mk.config.DisplayOptions.max_image_width=}")

## yale fossils

In [6]:
# yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_fossil/outblur_yale"

### pd.DataFrame function definitions

In [7]:
%%time

from typing import *
# fpaths = {}
# dps = {}

def extract_file_list_from_directory(parent_dir) -> List[str]:
    
    blacklist = [".ipynb_checkpoints"]
    return sorted([os.path.join(parent_dir, p) for p in os.listdir(parent_dir) if p not in blacklist])


def extract_file_ids_from_file_list(fpaths: List[str]) -> Dict[str, Any]:
    return {
        "paths" : fpaths,
        "file_ids" : [Path(p).stem for p in fpaths]
    }


def make_file_info_dataframe(file_info: Dict[str, Any], **kwargs) -> pd.DataFrame:
    return pd.DataFrame(file_info, **kwargs)

CPU times: user 36 µs, sys: 17 µs, total: 53 µs
Wall time: 58.9 µs


### Image IO function definitions

In [8]:
from PIL import ImageFile
import cv2

def load_image_PIL(file_path: str,
                   mode: str="RGB"):
    img = PIL.Image.open(file_path)
    if mode == "RGB":
        return img
    if mode == "HSV":
        return img.convert("HSV")
    else:
        raise Exception(f"Invalid value for {mode=}")
    

def load_image_cv2(file_path: str,
                   mode: str="RGB"):
    img = cv2.imread(file_path)
    if mode == "BGR":
        return img
    if mode == "RGB":
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if mode == "HSV":
        return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    else:
        raise Exception(f"Invalid value for {mode=}")

def load_image(file_path: str,
               mode: str="RGB",
               backend: str="PIL",
               lazy_load: bool=False):
    error = None

    try:
        if backend == "PIL":
            img = load_image_PIL(file_path=file_path,
                                 mode=mode)
            if not lazy_load:
                img.load()
        elif backend == "cv2":
            img = load_image_cv2(file_path=file_path,
                                  mode=mode)
        else:
            raise Exception(f"Invalid value for {backend=}")

    except OSError as e:
        error = str(e)
        
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        img = load_image_PIL(file_path=file_path,
                                 mode=mode)
        img.load()
        ImageFile.LOAD_TRUNCATED_IMAGES = False
        
    return img, error

### Image stats analysis function definitions

In [9]:

def calc_img_shape(img: PIL.Image.Image) -> Tuple:
    # img = PIL.Image.open(path)
    h, w = img.height, img.width
    ratio = h/w
    return {
        "height": h, 
        "width": w,
        "aspect_ratio": ratio
    }


def calc_rgb_stats(img: PIL.Image.Image) -> Tuple:
    r, g, b = Stat(img).mean
    return {
        "r": r,
        "g": g,
        "b": b
    }


def calc_hsv_stats(img: PIL.Image.Image) -> Tuple:
    img = img.convert("HSV")
    h, s, v = Stat(img).mean
    return {
        "h": h,
        "s": s,
        "v": v
    }


def analyze_image_from_file(path: str):

    img, error = load_image(path, 
                            mode="RGB",
                            backend="PIL",
                            lazy_load=False)
    
    return {
        "path": path,
        **calc_img_shape(img),
        **calc_rgb_stats(img),
        **calc_hsv_stats(img),
        "error": error
    }


## Main: Process images or load previous results from disk

In [10]:
yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_full"

In [11]:
analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/"
results_filename = "image_stats_df"

parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

csv_dir = os.path.join(analysis_results_root_dir, "csv")
csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")

In [12]:
n_jobs = 16

if os.path.exists(parquet_file_path):
    print(f"Found pre-computed image statistics analysis, skipping expensive parallel processing job & loading from disk")
    analysis_df = pd.read_parquet(parquet_file_path)
    
else:
    yale_file_list = extract_file_list_from_directory(parent_dir=yale_fossil_dir)
    yale_file_info_list = extract_file_ids_from_file_list(fpaths=yale_file_list)
    yale_info_df = make_file_info_dataframe(file_info=yale_file_info_list)
    df = yale_info_df

    total_rows = df.shape[0]
    file_paths = df["paths"].values

    analysis_records = Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(analyze_image_from_file)(
            path) for path in tqdm(file_paths, total=total_rows)
    )

    analysis_df = pd.DataFrame.from_records(analysis_records)


analysis_df.describe(include='all')

Found pre-computed image statistics analysis, skipping expensive parallel processing job & loading from disk


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,16444,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,4
unique,16444,,,,,,,,,,2
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:5d464fc1-5c07-43c5-a942-31de8e57508b.png,,,,,,,,,,image file is truncated (0 bytes not processed)
freq,1,,,,,,,,,,3
mean,,2676.11,2876.15,1.0,139.91,113.34,80.43,31.61,117.67,140.32,
std,,1087.97,1198.44,0.44,36.26,35.27,37.16,18.28,45.56,36.45,
min,,198.0,284.0,0.11,18.64,18.24,9.78,4.31,4.36,19.68,
25%,,1988.0,2053.0,0.67,116.13,90.47,56.28,21.89,87.46,116.48,
50%,,2612.0,2990.0,0.85,141.51,111.54,72.13,25.4,125.38,141.89,
75%,,2990.0,2990.0,1.24,163.86,132.14,93.02,31.77,151.09,164.33,


### Output any new analysis to disk

In [13]:
# analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/"
# results_filename = "image_stats_df"

# parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
# parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

# csv_dir = os.path.join(analysis_results_root_dir, "csv")
# csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")

In [14]:
%%time

# parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
# parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

# csv_dir = os.path.join(analysis_results_root_dir, "csv")
# csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")


if os.path.exists(parquet_file_path):
    print(f"Skipping write to parquet after finding pre-existing parquet file at: {parquet_file_path}" + "\n" + "Manually delete pre-existing parquet file in order to allow write operation.")
else:
    os.makedirs(parquet_dir, exist_ok=True)
    analysis_df.to_parquet(parquet_file_path)




if os.path.exists(csv_file_path):
    print(f"Skipping write to csv after finding pre-existing csv file at: {csv_file_path}" + "\n" + "Manually delete pre-existing csv file in order to allow write operation.")
else:
    os.makedirs(csv_dir, exist_ok=True)
    analysis_df.to_csv(csv_file_path)

print(f"Finished analysis results can be found at either:")
print(parquet_file_path)
print("or")
print(csv_file_path)

Skipping write to parquet after finding pre-existing parquet file at: /media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/parquet/image_stats_df.parquet
Manually delete pre-existing parquet file in order to allow write operation.
Skipping write to csv after finding pre-existing csv file at: /media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/csv/image_stats_df.csv
Manually delete pre-existing csv file in order to allow write operation.
Finished analysis results can be found at either:
/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/parquet/image_stats_df.parquet
or
/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/csv/image_stats_df.csv
CPU times: user 2.63 ms, sys: 1.18 ms, total: 3.81 ms
Wal

In [15]:
pp("DONE")

### Misc analysis

In [40]:
%%time

new_df = pd.read_parquet(parquet_file_path)
new_df

CPU times: user 24.9 ms, sys: 23.5 ms, total: 48.4 ms
Wall time: 88.3 ms


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
0,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000012b6-2c07-4df6-941c-8f2d0915391c.png,4000,6000,0.67,242.49,240.56,233.72,32.94,12.03,242.74,
1,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000162dc-43bd-4129-9081-0024b8868cac.png,6000,3728,1.61,139.02,105.91,67.84,20.96,112.18,139.72,
2,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:00016467-aeaa-45fe-a040-9c8550d0d3cf.png,1549,2990,0.52,142.83,107.33,62.47,23.30,153.75,142.83,
3,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:0008efe0-a5d3-4683-ab85-9a6612ee9b97.png,2400,2990,0.80,124.11,97.51,64.46,22.11,137.23,124.11,
4,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000b2947-cf62-4228-a2ea-67cf9fcb8d99.png,4468,1407,3.18,150.26,120.17,75.54,26.35,136.71,150.64,
...,...,...,...,...,...,...,...,...,...,...,...
16439,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff2afdf-b056-4ff3-a9fa-38858995908e.png,1944,2592,0.75,132.91,122.69,121.05,51.90,22.77,133.41,
16440,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff4c2b0-168c-462b-a516-032b086c2703.png,5072,2928,1.73,112.81,88.63,51.19,23.08,128.98,113.30,
16441,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff70057-4228-4a7d-9146-58f31755fa77.png,3792,4688,0.81,98.17,55.98,37.02,21.99,127.88,98.34,
16442,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff96ceb-60b8-491d-a7c4-4a5aae2e255a.png,2020,2990,0.68,114.29,97.63,71.85,29.15,79.24,114.36,


In [35]:
%%time

new_df = pd.read_csv("data/csv/image_stats_df.csv")
new_df

CPU times: user 74.8 ms, sys: 7.97 ms, total: 82.8 ms
Wall time: 126 ms


Unnamed: 0.1,Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
0,0,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000012b6-2c07-4df6-941c-8f2d0915391c.png,4000,6000,0.67,242.49,240.56,233.72,32.94,12.03,242.74,
1,1,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000162dc-43bd-4129-9081-0024b8868cac.png,6000,3728,1.61,139.02,105.91,67.84,20.96,112.18,139.72,
2,2,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:00016467-aeaa-45fe-a040-9c8550d0d3cf.png,1549,2990,0.52,142.83,107.33,62.47,23.30,153.75,142.83,
3,3,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:0008efe0-a5d3-4683-ab85-9a6612ee9b97.png,2400,2990,0.80,124.11,97.51,64.46,22.11,137.23,124.11,
4,4,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000b2947-cf62-4228-a2ea-67cf9fcb8d99.png,4468,1407,3.18,150.26,120.17,75.54,26.35,136.71,150.64,
...,...,...,...,...,...,...,...,...,...,...,...,...
16439,16439,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff2afdf-b056-4ff3-a9fa-38858995908e.png,1944,2592,0.75,132.91,122.69,121.05,51.90,22.77,133.41,
16440,16440,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff4c2b0-168c-462b-a516-032b086c2703.png,5072,2928,1.73,112.81,88.63,51.19,23.08,128.98,113.30,
16441,16441,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff70057-4228-4a7d-9146-58f31755fa77.png,3792,4688,0.81,98.17,55.98,37.02,21.99,127.88,98.34,
16442,16442,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff96ceb-60b8-491d-a7c4-4a5aae2e255a.png,2020,2990,0.68,114.29,97.63,71.85,29.15,79.24,114.36,


In [33]:
new_df.describe(include='all')

Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,16444,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,4
unique,16444,,,,,,,,,,2
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:b500c8bd-4e7c-4984-97eb-f2606adb9fb6.png,,,,,,,,,,image file is truncated (0 bytes not processed)
freq,1,,,,,,,,,,3
mean,,2676.11,2876.15,1.0,139.91,113.34,80.43,31.61,117.67,140.32,
std,,1087.97,1198.44,0.44,36.26,35.27,37.16,18.28,45.56,36.45,
min,,198.0,284.0,0.11,18.64,18.24,9.78,4.31,4.36,19.68,
25%,,1988.0,2053.0,0.67,116.13,90.47,56.28,21.89,87.46,116.48,
50%,,2612.0,2990.0,0.85,141.51,111.54,72.13,25.4,125.38,141.89,
75%,,2990.0,2990.0,1.24,163.86,132.14,93.02,31.77,151.09,164.33,


In [25]:
error_types = analysis_df.value_counts("error").to_dict().keys()

for k in error_types:
    print(k)
    error_df = analysis_df[analysis_df.error==k]
    error_df.describe(include="all")

image file is truncated (0 bytes not processed)


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3
unique,3,,,,,,,,,,1
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:f4241b3a-9b1c-4856-bed5-75dc03004c08.png,,,,,,,,,,image file is truncated (0 bytes not processed)
freq,1,,,,,,,,,,3
mean,,3312.0,3176.0,0.98,118.07,100.9,65.72,28.15,83.29,118.27,
std,,2321.12,683.66,0.47,51.41,52.34,32.95,10.81,60.87,51.23,
min,,1944.0,2592.0,0.66,72.02,66.51,33.17,15.93,14.08,72.51,
25%,,1972.0,2800.0,0.71,90.34,70.78,49.06,24.01,60.69,90.59,
50%,,2000.0,3008.0,0.75,108.67,75.05,64.96,32.09,107.31,108.67,
75%,,3996.0,3468.0,1.14,141.1,118.09,82.0,34.27,117.89,141.14,


image file is truncated


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
unique,1,,,,,,,,,,1
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:2369295c-e89b-4799-9ed6-6c5605576cb1.png,,,,,,,,,,image file is truncated
freq,1,,,,,,,,,,1
mean,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
std,,,,,,,,,,,
min,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
25%,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
50%,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
75%,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,


In [24]:
k = "image file is truncated (0 bytes not processed)"
analysis_df[analysis_df.error==k]

Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
10332,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:a1be3282-5869-4abe-99c5-e9998ed70f8b.png,2000,3008,0.66,108.67,75.05,33.17,15.93,128.48,108.67,image file is truncated (0 bytes not processed)
15644,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:f4241b3a-9b1c-4856-bed5-75dc03004c08.png,5992,3928,1.53,173.54,161.13,99.05,32.09,107.31,173.61,image file is truncated (0 bytes not processed)
16007,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:f9714b38-d33c-4f27-b5c0-28f313533afc.png,1944,2592,0.75,72.02,66.51,64.96,36.44,14.08,72.51,image file is truncated (0 bytes not processed)
