# 02a+b_imagestats_EDA_plots -- 2022-08-08-unlabeled yale fossils dataset.ipynb

Perform Exploratory Data Analysis on image dataset statistics previously computed upstream

Created on: Monday August 8th, 2022  
Created by: Jacob A Rose

- using `torchshow`

In [None]:
#!pip3 install torchshow

In [None]:
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.float_format', "{:,.2f}".format)


import meerkat as mk
display_res = 1024
# print(mk.config.DisplayOptions.max_image_width)
mk.config.DisplayOptions.max_image_width = display_res
mk.config.DisplayOptions.max_image_height = display_res

mk.config.DisplayOptions.max_rows = 100
print(f"{mk.config.DisplayOptions.max_image_width=}")

In [None]:
dir(mk.config.DisplayOptions)

In [None]:
%load_ext autoreload
%autoreload 2


# from omegaconf import DictConfig, OmegaConf
import os
from rich import print as pp

import numpy as np
from typing import *
import inspect
from tqdm.auto import tqdm
from pathlib import Path
import logging
# import meerkat as mk

# import dask.dataframe as dd
from PIL import Image
import PIL
from PIL.ImageStat import Stat

In [None]:
import cv2
import glob
from joblib import Parallel, delayed

In [None]:
# dir(mk.config.DisplayOptions)
# display_res = 512
# # print(mk.config.DisplayOptions.max_image_width)
# mk.config.DisplayOptions.max_image_width = display_res
# mk.config.DisplayOptions.max_image_height = display_res
# print(f"{mk.config.DisplayOptions.max_image_width=}")

### Define key file info & metadata

In [None]:
yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_full"

analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/"
results_filename = "01_image_stats_df"

parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

csv_dir = os.path.join(analysis_results_root_dir, "csv")
csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")

In [None]:
# csv_dir = os.path.join(analysis_results_root_dir, "csv")
# csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")

# df = pd.read_csv(csv_file_path, index_col=0)
# df

### Load previously computed image stats from parquet file

In [None]:
if os.path.exists(parquet_file_path):
    print(f"Found pre-computed image statistics analysis, loading from file on disk at location: {parquet_file_path}")
    df = pd.read_parquet(parquet_file_path)
    
else:
    raise IOError(f"Couldn't find required parquet file at specified location: {parquet_file_path}")

df = df.assign(identifier = df.path.apply(lambda x: Path(x).stem))
df

In [None]:

occurrence_catalog_path = "/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata-clean/occurrence-clean.txt"
multimedia_catalog_path = "/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata-clean/multimedia-clean.txt"


occurrence_catalog = pd.read_csv(occurrence_catalog_path,
                                 sep="\t")
occurrence_catalog.columns.values

multimedia_catalog = pd.read_csv(multimedia_catalog_path,
                                 sep="\t")
multimedia_catalog.columns.values

### Image IO function definitions

In [None]:
from PIL import ImageFile
import cv2

def load_image_PIL(file_path: str,
                   mode: str="RGB"):
    """Open an image with PIL and return it in the requested channel layout.

    Args:
        file_path: Path of the image file to open.
        mode: One of "RGB", "BGR" or "HSV".

    Returns:
        A ``PIL.Image.Image`` for "RGB" and "HSV"; for "BGR" a numpy array
        built from the RGB image with its channel axis reversed.

    Raises:
        Exception: If ``mode`` is not one of the supported values.
    """
    # Reject bad modes before touching the file so no handle is opened for nothing.
    if mode not in ("RGB", "BGR", "HSV"):
        raise Exception(f"Invalid value for {mode=}")

    img = PIL.Image.open(file_path)
    # Grayscale / palette / RGBA files would otherwise be returned as-is for
    # "RGB" and break the 3-channel slicing for "BGR". Only convert when
    # needed so files that are already RGB are not decoded here.
    if img.mode != "RGB":
        img = img.convert("RGB")

    if mode == "BGR":
        return np.array(img)[:, :, ::-1]
    if mode == "HSV":
        return img.convert("HSV")
    return img
    

def load_image_cv2(file_path: str,
                   mode: str="RGB"):
    """Read an image with OpenCV and return it in the requested channel layout.

    Args:
        file_path: Path of the image file to read.
        mode: One of "RGB", "BGR" or "HSV".

    Returns:
        The image as a numpy array in the requested colour space.

    Raises:
        Exception: If ``mode`` is not one of the supported values.
        OSError: If OpenCV could not read the file.
    """
    if mode not in ("RGB", "BGR", "HSV"):
        raise Exception(f"Invalid value for {mode=}")

    img = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising; turn
    # that into an OSError so callers that catch OSError can react to it.
    if img is None:
        raise OSError(f"cv2.imread could not read image file: {file_path}")

    if mode == "BGR":
        return img
    if mode == "RGB":
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)


def load_image(file_path: str,
               mode: str="RGB",
               backend: str="PIL",
               lazy_load: bool=False):
    """Load an image, retrying with truncated-image support on ``OSError``.

    Args:
        file_path: Path of the image file.
        mode: Channel layout forwarded to the backend loader.
        backend: "PIL" or "cv2".
        lazy_load: With the PIL backend, skip the explicit ``img.load()`` call
            when True.

    Returns:
        A ``(img, error)`` tuple. ``error`` is None when the first attempt
        succeeded, otherwise the message of the ``OSError`` that triggered the
        retry.

    Raises:
        Exception: If ``backend`` is not a supported value.
        OSError: If the retry fails as well.
    """
    error = None

    try:
        if backend == "PIL":
            img = load_image_PIL(file_path=file_path,
                                 mode=mode)
            if (not lazy_load) and isinstance(img, PIL.Image.Image):
                img.load()

        elif backend == "cv2":
            img = load_image_cv2(file_path=file_path,
                                  mode=mode)
        else:
            raise Exception(f"Invalid value for {backend=}")

    except OSError as e:
        error = str(e)

        # The retry always goes through PIL, whichever backend failed.
        # LOAD_TRUNCATED_IMAGES is a module-level attribute (shared by every
        # thread), so restore its previous value even if the retry raises.
        previous_flag = ImageFile.LOAD_TRUNCATED_IMAGES
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        try:
            img = load_image_PIL(file_path=file_path,
                                 mode=mode)
            if isinstance(img, PIL.Image.Image):
                img.load()
        finally:
            ImageFile.LOAD_TRUNCATED_IMAGES = previous_flag

    return img, error

In [None]:
def print_file_size(path):
    """Print the size of the file at ``path`` in MB, rounded to two decimals."""
    n_bytes = os.path.getsize(path)
    megabytes = round(n_bytes / 1024 / 1024, 2)
    print(f"Image File Size is {megabytes}MB")


def rescale_image(img: np.ndarray, max_size: int=512) -> np.ndarray:
    """Resize ``img`` so its longer side equals ``max_size``, keeping the aspect ratio.

    Args:
        img: Image array whose first two axes are height and width; a trailing
            channel axis is optional.
        max_size: Target length, in pixels, of the longer side.

    Returns:
        The image resized with ``cv2.INTER_AREA`` interpolation.
    """
    # Only the first two axes are needed, so 2-D grayscale arrays work too.
    h, w = img.shape[:2]
    scale = min(max_size / h, max_size / w)
    # round() avoids losing a pixel to float error, and the floor of 1 keeps
    # extreme aspect ratios from requesting a 0-pixel side.
    output_size = max(1, round(scale * w)), max(1, round(scale * h))

    return cv2.resize(img, output_size, interpolation=cv2.INTER_AREA)



def create_image_thumbnail(image_path, output_dir, max_size: int, ext: str="jpg"):
    """Write a down-scaled copy of ``image_path`` into ``output_dir``.

    The thumbnail is named after the source file's stem plus ``ext``. An
    existing thumbnail file is left untouched.

    Args:
        image_path: Path of the source image.
        output_dir: Directory that receives the thumbnail.
        max_size: Longer-side length passed to ``rescale_image``.
        ext: File extension (without the dot) of the thumbnail.

    Returns:
        A dict with keys "source_path" and "target_path".

    Raises:
        OSError: If OpenCV reports that the thumbnail could not be written.
    """
    file_path = os.path.join(output_dir, Path(image_path).stem + f".{ext}")

    if not os.path.isfile(file_path):
        os.makedirs(output_dir, exist_ok=True)

        img, _ = load_image(file_path=image_path,
                            mode="BGR", backend="PIL")
        img = rescale_image(img=img, max_size=max_size)
        # cv2.imwrite returns False on failure instead of raising; surface it
        # so a missing thumbnail is not reported as a valid target path.
        # The JPEG quality flag is passed whatever `ext` is.
        written = cv2.imwrite(file_path, img, [cv2.IMWRITE_JPEG_QUALITY, 100])
        if not written:
            raise OSError(f"Failed to write thumbnail to: {file_path}")

    return {"source_path":image_path,
            "target_path":file_path}


## Initial EDA

In [None]:
# import numpy as np
# import pandas as pd

# df = pd.DataFrame({'percentage': abs(np.random.normal(loc=50, scale=30, size=100)),
#                    'var1': np.random.rand(100),
#                    'var2': np.random.rand(100),
#                    'var3': np.random.rand(100)})

# # Find out percentiles
# lower = np.percentile(df['percentage'], 10)
# upper = np.percentile(df['percentage'], 90)

# # Select data between
# trimmed = df[df.percentage.between(lower, upper)]

In [None]:
# !mamba install -q -y -c pyviz panel
# import panel as pn
# pn.extension()


# # n_jobs = 16

# if os.path.exists(parquet_file_path):
#     print(f"Found pre-computed image statistics analysis, skipping expensive parallel processing job & loading from disk")
#     analysis_df = pd.read_parquet(parquet_file_path)
    
# else:
#     yale_file_list = extract_file_list_from_directory(parent_dir=yale_fossil_dir)
#     yale_file_info_list = extract_file_ids_from_file_list(fpaths=yale_file_list)
#     yale_info_df = make_file_info_dataframe(file_info=yale_file_info_list)
#     df = yale_info_df

#     total_rows = df.shape[0]
#     file_paths = df["paths"].values

#     analysis_records = Parallel(n_jobs=n_jobs, backend='threading')(
#         delayed(analyze_image_from_file)(
#             path) for path in tqdm(file_paths, total=total_rows)
#     )

#     analysis_df = pd.DataFrame.from_records(analysis_records)


# analysis_df.describe(include='all')

In [None]:
tmp_dir = "/dev/shm"

thumbnail_resolution = 512

thumbnail_dir = os.path.join(tmp_dir, f"jrose3/2022-yale_fossils/image_thumbnails/res={thumbnail_resolution}")
os.makedirs(thumbnail_dir, exist_ok=True)

# sample_path = '/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000012b6-2c07-4df6-941c-8f2d0915391c.png'
# sample_thumbnail_path = os.path.join(thumbnail_dir, "urn:uuid:000012b6-2c07-4df6-941c-8f2d0915391c.jpg")

In [None]:
from pqdm.threads import pqdm
from functools import partial


copy_img_func = partial(create_image_thumbnail,
                        output_dir=thumbnail_dir,
                        max_size=thumbnail_resolution,
                        ext="jpg")

source_paths = df.path.values.tolist()
n_jobs = 8
inputs = source_paths

thumbnail_file_paths = pqdm(inputs, copy_img_func, n_jobs=n_jobs)
# errors = [p for p in thumbnail_file_paths if not isinstance(p, dict)]

errors = [not isinstance(p, dict) for p in thumbnail_file_paths]
errors_idx = np.where(errors)
df.iloc[errors_idx]

In [None]:
# Keep only successful results: failed jobs show up as non-dict entries
# (the `errors` check in the previous cell relies on the same convention).
thumbnail_records = [rec for rec in thumbnail_file_paths if isinstance(rec, dict)]
thumbnail_df = pd.DataFrame.from_records(
    thumbnail_records,
    columns=["source_path", "target_path"]  # keeps the columns present even with zero records
).rename(columns={"source_path":"path",
                  "target_path":"thumb_path"})
# Inner join: rows of `df` without a thumbnail record are dropped.
df = df.merge(thumbnail_df, on="path", how="inner")

In [None]:
occurrence_catalog.info()
# multimedia_catalog.columns.values

## Remove low value columns

### Drop columns with all null values

In [None]:
num_cols = len(occurrence_catalog.columns)
null_cols = occurrence_catalog.isnull().sum().sort_values(ascending=False)
null_cols2drop_all = null_cols[null_cols == len(occurrence_catalog)].index
null_cols2drop_all

occurrence_catalog = occurrence_catalog.drop(columns=null_cols2drop_all)

In [None]:
print(f"Dropping {len(null_cols2drop_all)} cols out of {num_cols}")

### Drop columns with more than `73,000` null values

In [None]:
num_cols = len(occurrence_catalog.columns)
null_cols = occurrence_catalog.isnull().sum().sort_values(ascending=False)
null_cols2drop_threshold = null_cols[null_cols > 73000].index
null_cols2drop_threshold
occurrence_catalog = occurrence_catalog.drop(columns=null_cols2drop_threshold)

In [None]:
# num_cols = len(occurrence_catalog.columns)
print(f"Dropping {len(null_cols2drop_threshold)} cols out of {num_cols}")

### Drop columns with 5 or fewer unique values

In [None]:
num_cols = len(occurrence_catalog.columns)
col_cardinalities = occurrence_catalog.nunique().sort_values()
low_cardinality_cols = col_cardinalities[col_cardinalities <= 5].index
low_cardinality_cols

occurrence_catalog = occurrence_catalog.drop(columns=low_cardinality_cols)


print(f"Dropping {len(low_cardinality_cols)} cols out of {num_cols}")
occurrence_catalog.info()
occurrence_catalog.describe(include='all')

In [None]:
len(occurrence_catalog.columns)

### Merge our 16,444 rows of image file info with our more comprehensive occurrence catalog with 40 remaining columns

In [None]:
# df.describe(include='all')
# multimedia_catalog.describe(include='all')
multimedia_catalog.columns

intermediate_df = df.merge(multimedia_catalog[['id', 'identifier']], on="identifier") #.describe(include='all')

intermediate_df = intermediate_df.merge(occurrence_catalog, on="id") #.describe(include='all')

### Manually drop 2 more columns based on low value

In [None]:
custom_cols2drop = [
    'eventDate',
    'year'
]
num_cols = len(intermediate_df.columns)
intermediate_df = intermediate_df.drop(columns=custom_cols2drop)
print(f"Dropping {len(custom_cols2drop)} cols out of {num_cols}")


intermediate_df.info()
intermediate_df.describe(include='all')
intermediate_df.head()

In [None]:
final_clean_cols = [
    'thumb_path',
    'path',
    'r', 'g', 'b',
    'h', 's', 'v',
    'height', 
    'width', 
    'aspect_ratio',
    'kingdom', 'phylum', 'class', 'order', 'family',
    'genus', 'specificEpithet', 'taxonRank', 'vernacularName',
    'continent', 'country', 'stateProvince', 'county', 'municipality', 'locality',
    'higherGeography', 'formation', 'scientificName', 'higherClassification',
    'earliestPeriodOrLowestSystem', 'earliestEpochOrLowestSeries', 'earliestAgeOrLowestStage',
    'bibliographicCitation', 
    'references', 
    'decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters',
    'occurrenceID', 
    'catalogNumber', 
    'occurrenceRemarks', 
    'recordedBy',
    'identifier', 'id',
    'modified',
    'previousIdentifications',
    'georeferencedBy', 'georeferencedDate', 'georeferenceSources',
    'dynamicProperties',
    'error'
    ]

# Keep only the curated columns listed in `final_clean_cols`.
df = intermediate_df[final_clean_cols]
# NOTE(review): convert_dtypes() returns a new DataFrame and the result is not
# assigned, so `df` keeps its original dtypes; this line only produces the
# converted frame as cell output. Assign it back if the conversion was intended.
df.convert_dtypes()

In [None]:
df.info()

In [None]:
%%time

analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results"
results_filename = "02a_rich_metadata_full_catalog"

parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

csv_dir = os.path.join(analysis_results_root_dir, "csv")
csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")



if os.path.exists(parquet_file_path):
    print(f"Skipping write to parquet after finding pre-existing parquet file at: {parquet_file_path}" + "\n" + "Manually delete pre-existing parquet file in order to allow write operation.")
else:
    os.makedirs(parquet_dir, exist_ok=True)
    df.to_parquet(parquet_file_path)


if os.path.exists(csv_file_path):
    print(f"Skipping write to csv after finding pre-existing csv file at: {csv_file_path}" + "\n" + "Manually delete pre-existing csv file in order to allow write operation.")
else:
    os.makedirs(csv_dir, exist_ok=True)
    df.to_csv(csv_file_path)


In [None]:
import seaborn as sns

# intermediate_df[['decimalLatitude', 'decimalLongitude']].plot(kind="hist", x='decimalLatitude', y='decimalLongitude')
# sns.histplot(stat="density",x='decimalLatitude', y='decimalLongitude', data=intermediate_df[['decimalLatitude', 'decimalLongitude']])
# intermediate_df[['decimalLatitude', 'decimalLongitude', "coordinateUncertaintyInMeters"]].isna().sum()
# intermediate_df.describe(include='all')
# intermediate_df.columns


# occurrence_catalog.georeferenceProtocol.value_counts()
# occurrence_catalog.georeferenceSources.value_counts()
# occurrence_catalog.geodeticDatum.value_counts()
# occurrence_catalog.disposition.value_counts()
# occurrence_catalog.continent.value_counts()

# print(len(occurrence_catalog.columns))
# occurrence_catalog.describe(include='all')

# sns.heatmap(occurrence_catalog.isnull(), cmap="viridis")

In [None]:
# cols = [
#     'thumb_path',
#     'path',
#     'r',
#     'g',
#     'b',
#     'h',
#     's',
#     'v',
#     'height',
#     'width',
#     'aspect_ratio',
#     'error',
#     'identifier'
# ]

# df = df[cols]

## Embed Images in a mk.DataPanel & cache to disk

In [None]:
import meerkat as mk

dp = mk.DataPanel.from_pandas(df)
dp

In [None]:
%%time

dp["thumbnail"] = mk.ImageColumn.from_filepaths(dp["thumb_path"])

In [None]:
analysis_cols = [
    'thumbnail',
    'thumb_path',
    'path',
    'r',
    'g',
    'b',
    'h',
    's',
    'v',
    'height',
    'width',
    'aspect_ratio',
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'specificEpithet',
    'taxonRank',
    'vernacularName',
    'continent',
    'country',
    'stateProvince',
    'county',
    'municipality',
    'locality',
    'higherGeography',
    'formation',
    'scientificName',
    'higherClassification',
    'earliestPeriodOrLowestSystem',
    'earliestEpochOrLowestSeries',
    'earliestAgeOrLowestStage',
    'bibliographicCitation',
    'references',
    'decimalLatitude',
    'decimalLongitude',
    'coordinateUncertaintyInMeters',
    'occurrenceID',
    'catalogNumber',
    'occurrenceRemarks',
    'recordedBy',
    'identifier',
    'id',
    'modified',
    'previousIdentifications',
    'georeferencedBy',
    'georeferencedDate',
    'georeferenceSources',
    'dynamicProperties',
    'error']

dp = dp[analysis_cols]

In [None]:
# !rm -r "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/02b_rich_metadata_embedded_images_meerkat_datapanel"

In [None]:
analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results"

meerkat_dir = os.path.join(analysis_results_root_dir, "meerkat")
embedded_images_meerkat_datapanel_filename = "02b_rich_metadata_embedded_images_meerkat_datapanel"

mk_datapanel_path = os.path.join(meerkat_dir, embedded_images_meerkat_datapanel_filename)

dp.write(mk_datapanel_path)
########### Fully formatted datapanel can now be reloaded in another notebook by uncommenting the following:
# reloaded_dp = mk.DataPanel.read(mk_datapanel_path)
# reloaded_dp

## Sort by HSV value `v` and browse images

In [None]:
df = dp.to_pandas()
df = df.sort_values(
    "v",
    ascending=False,
    ignore_index=True
)

sorted_dp = mk.DataPanel.from_pandas(df)

sorted_dp["thumbnail"] = mk.ImageColumn.from_filepaths(sorted_dp["thumb_path"])
sorted_dp = sorted_dp[analysis_cols]
sorted_dp.head(100)

## Sort by HSV saturation `s` and browse images

In [None]:
df = dp.to_pandas()
df = df.sort_values(
    "s",
    ascending=False,
    ignore_index=True
)

sorted_dp = mk.DataPanel.from_pandas(df)

sorted_dp["thumbnail"] = mk.ImageColumn.from_filepaths(sorted_dp["thumb_path"])
sorted_dp = sorted_dp[analysis_cols]
sorted_dp.head(100)

In [None]:
sorted_dp.tail(100)

In [None]:
from joblib import Parallel, delayed
n_jobs = 16
thumbnail_file_paths = Parallel(n_jobs=n_jobs)(delayed(copy_img_func)(path) for path in source_paths)

%%time

# The only visible assignment of `sample_path` in this notebook is commented
# out, so fall back to the first dataset image.
sample_path = source_paths[0]

# create_image_thumbnail returns a dict; the thumbnail location is its
# "target_path" entry.
thumbnail_info = create_image_thumbnail(image_path=sample_path,
                                        output_dir=thumbnail_dir,
                                        max_size=thumbnail_resolution,
                                        ext="jpg")
file_path = thumbnail_info["target_path"]

print(os.path.isfile(file_path))

# Compare the on-disk size of the source image and of its thumbnail.
print_file_size(sample_path)
print_file_size(file_path)

# image_path = sample_path
# max_size = thumbnail_resolution

import time

from random import randrange
from multiprocessing.pool import ThreadPool

from tqdm.auto import tqdm


def func_call(position, total):
    """Drive a demo tqdm bar on row ``position`` up to ``total`` in steps of 5.

    After each step the function sleeps for ``randrange(3)`` seconds.
    """
    step = 5
    label = f'progressbar #{position}'
    with tqdm(total=total, position=position, desc=label) as bar:
        for _ in range(0, total, step):
            bar.update(step)
            time.sleep(randrange(3))


# Run five `func_call` progress-bar jobs on a pool of 10 worker threads.
pool = ThreadPool(10)
tasks = range(5)
# `i` (starting at 1) is passed as the bar's `position`; `url` is not used.
for i, url in enumerate(tasks, 1):
    pool.apply_async(func_call, args=(i, 100))
# Stop accepting new jobs, then block until every submitted job has finished.
pool.close()
pool.join()

## Seaborn image attributes facet grid plots

In [None]:
g = sns.PairGrid(df[["r","g","b", "h", "s", "v"]])#, hue="species")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)
# `fill` is the current name of the deprecated `shade` argument. Drawing on
# an explicit axes with labels lets the three overlaid curves be told apart.
for channel in ("h", "s", "v"):
    sns.kdeplot(df[channel], fill=True, ax=ax, label=channel)
ax.set_xlabel("h / s / v")
ax.legend()
# plt.show()

In [None]:
# Attach multimedia metadata by `identifier`, then occurrence metadata by `id`.
# NOTE(review): if `df` already carries an `id` column (it does after the
# `final_clean_cols` selection earlier in this notebook), the first merge
# suffixes the clashing columns to `id_x`/`id_y` and the second merge on "id"
# will fail -- confirm which version of `df` this cell expects.
full_df = df.merge(multimedia_catalog, on="identifier")
full_df = full_df.merge(occurrence_catalog, on="id")
full_df

In [None]:
df.shape
full_df.shape
multimedia_catalog.shape

In [None]:
# Rows of `df` whose identifier does not appear in `full_df`. The vectorised
# isin() replaces a per-row scan of the whole identifier array.
df[~df["identifier"].isin(full_df["identifier"])]

In [None]:
# missing_df = multimedia_catalog[multimedia_catalog.apply(lambda x: x.identifier not in full_df.identifier.values, axis=1)]

# from PIL import Image
# import requests
# import IPython


# def load_remote_image(url):
#     # return Image.open(requests.get(url, stream=True).raw)
#     try:
#         return IPython.display.Image(url, width = 250)
#     except:
#         return "Image not found"



# # url = 'https://newevolutiondesigns.com/images/freebies/colorful-background-14.jpg'
# # IPython.display.Image(url, width = 250)

# img_col = missing_df.assign(img = missing_df.accessURI.apply(load_remote_image))
# img_col

In [None]:
df = df.sort_values(
    "v",
    ascending=True, # False,
    ignore_index=True
)

df

In [None]:
bins = [0.0, 0.25, 0.5, 0.75, 1.0] #[:-1]

# Split `v` into len(bins) == 5 equal-frequency groups and reuse the entries
# of `bins` as the group labels; `o_bins` receives the edges computed by qcut.
# NOTE(review): five groups are quintiles, yet the labels read like quartile
# boundaries (0.0, 0.25, ..., 1.0) -- confirm whether 4 groups labelled with
# bins[:-1] were intended.
df["quantiles"], o_bins = pd.qcut(
    df["v"],
    len(bins),
    labels=bins,
    precision=2,
    retbins=True
)

df.describe(include="all")

In [None]:
largest_idx = df.groupby("quantiles")["v"].nlargest(10).reset_index(level=0).index
smallest_idx = df.groupby("quantiles")["v"].nsmallest(10).reset_index(level=0).index


largest_idx
smallest_idx

In [None]:
from more_itertools import unzip

In [None]:
smallest = df.loc[smallest_idx,:]
largest = df.loc[largest_idx,:]

In [None]:

i, paths, quantiles, v_list = [
    list(c) for c in unzip(
        smallest[["path", "quantiles", "v"]].to_records()
    )
]


# i, paths, quantiles, v_list = [
#     list(c) for c in unzip(
#         largest[["path", "quantiles", "v"]].to_records()
#     )
# ]


In [None]:
import ipyplot

In [None]:
print(len(paths))

In [None]:
tabs_order=sorted(set(quantiles))

In [None]:
tabs_order


quantiles

In [None]:
# Tab names must be spelled exactly like the labels: format the sorted unique
# quantile values with the same "{:.2%}" pattern instead of passing raw floats.
quantile_labels = [f"{q:.2%}" for q in quantiles]
quantile_tabs = [f"{q:.2%}" for q in sorted(set(quantiles))]

ipyplot.plot_class_tabs(paths,
                        labels=quantile_labels,
                        custom_texts=[f"{v=}" for v in v_list],
                        tabs_order=quantile_tabs
                    )

In [None]:
records = df.to_records()
records

records[0]



In [None]:
ipyplot.plot_class_tabs

ipyplot.plot_class_tabs?

In [None]:
from functools import partial

In [None]:
import pandas as pd
from IPython.core.display import HTML


def generate_file_path_dataframe_fixture() -> pd.DataFrame:
    """Build a small demo frame: five rows of integer counts plus a flag-image URL.

    Returns:
        A DataFrame with the five numeric columns followed by a "Country"
        column holding image URLs.
    """
    numeric_columns = ['Total Cases', 'Total Deaths', 'Total Recovered', 'Total Tests', 'Population']
    numeric_rows = [
        [2768571, 130655, 1155027, 34713051, 331002277],
        [1448753, 60632, 790040, 3070447, 212558178],
        [654405, 9536, 422931, 19852167, 145934619],
        [605216, 17848, 359891, 8826585, 1379974505],
        [288477, 9860, 178245, 1699369, 32969875],
    ]
    flag_urls = [
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-United-States-of-America.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-Brazil.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-Russia.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-India.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-Peru.png',
    ]

    fixture = pd.DataFrame(numeric_rows, columns=numeric_columns)
    return fixture.assign(Country=flag_urls)






def path_to_image_html(path: str,
                       width: int=128):
    """Return an HTML ``<img>`` tag whose source is ``path``, shown at ``width``."""
    return '<img src="{}" width="{}" >'.format(path, width)


def display_image_df(df: pd.DataFrame,
                     formatters: Dict[str,Callable]
                    ):
    """Render ``df`` as an HTML table wrapped in an ``HTML`` display object.

    Cell contents are not escaped, so markup produced by ``formatters``
    is emitted as-is.

    Args:
        df: Frame to render.
        formatters: Mapping from column name to a per-cell formatting callable.

    Returns:
        The ``HTML`` object built from the table markup.
    """
    table_markup = df.to_html(formatters=formatters, escape=False)
    return HTML(table_markup)

In [None]:
formatters = {
    "img": 
    partial(
        path_to_image_html#, width-50
    )
}



df_html = display_image_df(
    df=df.assign(img=df.path.values),
    formatters=formatters
)
df_html

In [None]:
# Rendering the dataframe as HTML table
# df.to_html(escape=False, formatters=dict(Country=path_to_image_html))


# Rendering the images in the dataframe using the HTML method.
# HTML(df.to_html(escape=False,formatters=dict(Country=path_to_image_html)))



# Saving the dataframe as a webpage
# df.to_html('webpage.html',escape=False, formatters=dict(Country=path_to_image_html))

In [None]:
type(HTML)

In [None]:
df.to_html(

In [None]:
ipyplot()

In [None]:
quantiles

In [None]:
pd.qcut?

dir(df.v)

In [None]:
#!pip3 install ipyplot



In [None]:
dir(ipyplot)

ipyplot.plot_images?

ipyplot.plot_class_representations?

In [None]:
df_brightness = df.sort_values(by="v",
                               ascending=False,
                               ignore_index=True)

In [None]:
%%time

df_brightness = df_brightness.assign(imgs = df_brightness.path.apply(PIL.Image.open))

df_brightness

### pd.DataFrame function definitions

In [None]:
%%time

from typing import *
# fpaths = {}
# dps = {}

def extract_file_list_from_directory(parent_dir) -> List[str]:
    """List the entries of ``parent_dir`` as sorted paths joined with ``parent_dir``.

    The ``.ipynb_checkpoints`` entry is skipped.
    """
    skipped_names = [".ipynb_checkpoints"]

    collected = []
    for entry in os.listdir(parent_dir):
        if entry in skipped_names:
            continue
        collected.append(os.path.join(parent_dir, entry))

    collected.sort()
    return collected


def extract_file_ids_from_file_list(fpaths: List[str]) -> Dict[str, Any]:
    """Pair each path with its file stem (file name without the last suffix).

    Returns:
        A dict with "paths" (the input list itself) and "file_ids" (one stem
        per path, in the same order).
    """
    stems = []
    for fpath in fpaths:
        stems.append(Path(fpath).stem)

    return {"paths": fpaths, "file_ids": stems}


def make_file_info_dataframe(file_info: Dict[str, Any], **kwargs) -> pd.DataFrame:
    """Build a DataFrame from ``file_info``, forwarding ``kwargs`` to ``pd.DataFrame``."""
    frame = pd.DataFrame(data=file_info, **kwargs)
    return frame

### Image stats analysis function definitions

In [None]:

def calc_img_shape(img: PIL.Image.Image) -> Dict[str, float]:
    """Return the pixel dimensions of ``img`` and its height/width ratio.

    Returns:
        A dict with keys "height", "width" and "aspect_ratio"
        (``height / width``).
    """
    h, w = img.height, img.width
    ratio = h/w
    return {
        "height": h,
        "width": w,
        "aspect_ratio": ratio
    }


def calc_rgb_stats(img: PIL.Image.Image) -> Dict[str, float]:
    """Return the per-channel mean of ``img`` in RGB.

    Returns:
        A dict with keys "r", "g" and "b".
    """
    # Stat(...).mean has one entry per band, so the 3-way unpack below only
    # works on 3-band images; bring other modes to RGB first.
    if img.mode != "RGB":
        img = img.convert("RGB")
    r, g, b = Stat(img).mean
    return {
        "r": r,
        "g": g,
        "b": b
    }


def calc_hsv_stats(img: PIL.Image.Image) -> Dict[str, float]:
    """Return the per-channel mean of ``img`` after conversion to HSV.

    Returns:
        A dict with keys "h", "s" and "v".
    """
    # Go through RGB first so the HSV conversion always starts from the same
    # mode, whatever mode the file was stored in.
    if img.mode != "RGB":
        img = img.convert("RGB")
    img = img.convert("HSV")
    h, s, v = Stat(img).mean
    return {
        "h": h,
        "s": s,
        "v": v
    }


def analyze_image_from_file(path: str):
    """Load the image at ``path`` and collect its shape, RGB and HSV statistics.

    Returns:
        A flat dict holding "path", the entries produced by
        ``calc_img_shape``, ``calc_rgb_stats`` and ``calc_hsv_stats`` (in that
        order), and "error" as reported by ``load_image``.
    """
    img, error = load_image(path, mode="RGB", backend="PIL", lazy_load=False)

    record = {"path": path}
    record.update(calc_img_shape(img))
    record.update(calc_rgb_stats(img))
    record.update(calc_hsv_stats(img))
    record["error"] = error
    return record


## Main: Process images or load previous results from disk

In [None]:
# yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_full"

# analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/"
# results_filename = "image_stats_df"

# parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
# parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

# csv_dir = os.path.join(analysis_results_root_dir, "csv")
# csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")

# # n_jobs = 16

# if os.path.exists(parquet_file_path):
#     print(f"Found pre-computed image statistics analysis, skipping expensive parallel processing job & loading from disk")
#     analysis_df = pd.read_parquet(parquet_file_path)
    
# else:
#     yale_file_list = extract_file_list_from_directory(parent_dir=yale_fossil_dir)
#     yale_file_info_list = extract_file_ids_from_file_list(fpaths=yale_file_list)
#     yale_info_df = make_file_info_dataframe(file_info=yale_file_info_list)
#     df = yale_info_df

#     total_rows = df.shape[0]
#     file_paths = df["paths"].values

#     analysis_records = Parallel(n_jobs=n_jobs, backend='threading')(
#         delayed(analyze_image_from_file)(
#             path) for path in tqdm(file_paths, total=total_rows)
#     )

#     analysis_df = pd.DataFrame.from_records(analysis_records)


# analysis_df.describe(include='all')

### Output any new analysis to disk

In [None]:
# analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/"
# results_filename = "image_stats_df"

# parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
# parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

# csv_dir = os.path.join(analysis_results_root_dir, "csv")
# csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")

In [None]:
%%time

# parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
# parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

# csv_dir = os.path.join(analysis_results_root_dir, "csv")
# csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")


if os.path.exists(parquet_file_path):
    print(f"Skipping write to parquet after finding pre-existing parquet file at: {parquet_file_path}" + "\n" + "Manually delete pre-existing parquet file in order to allow write operation.")
else:
    os.makedirs(parquet_dir, exist_ok=True)
    analysis_df.to_parquet(parquet_file_path)




if os.path.exists(csv_file_path):
    print(f"Skipping write to csv after finding pre-existing csv file at: {csv_file_path}" + "\n" + "Manually delete pre-existing csv file in order to allow write operation.")
else:
    os.makedirs(csv_dir, exist_ok=True)
    analysis_df.to_csv(csv_file_path)

print(f"Finished analysis results can be found at either:")
print(parquet_file_path)
print("or")
print(csv_file_path)

In [None]:
pp("DONE")

### Misc analysis

In [None]:
%%time

new_df = pd.read_parquet(parquet_file_path)
new_df

In [None]:
%%time

new_df = pd.read_csv("data/csv/image_stats_df.csv")
new_df

In [None]:
new_df.describe(include='all')

In [None]:
# Distinct error messages, most frequent first (value_counts skips missing values).
error_types = analysis_df["error"].value_counts().index

for k in error_types:
    print(k)
    error_df = analysis_df[analysis_df.error==k]
    # A bare expression inside a loop body is never rendered as cell output,
    # so the summary has to be displayed explicitly.
    display(error_df.describe(include="all"))

In [None]:
k = "image file is truncated (0 bytes not processed)"
analysis_df[analysis_df.error==k]