# 2022-07-16 -- `unlabeled yale fossils dataset` -- playground notebook

Created on: Saturday July 16th, 2022  
Created by: Jacob A Rose

- using `torchshow`

In [1]:
#!pip3 install torchshow

In [2]:
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.float_format', "{:,.2f}".format)

In [3]:
%load_ext autoreload
%autoreload 2


# from omegaconf import DictConfig, OmegaConf
import os
from rich import print as pp

import numpy as np
from typing import *
import inspect
from tqdm.auto import tqdm
from pathlib import Path
import logging
# import meerkat as mk

# import dask.dataframe as dd
from PIL import Image
import PIL
from PIL.ImageStat import Stat

In [14]:
import cv2
import glob
from joblib import Parallel, delayed

In [7]:
# dir(mk.config.DisplayOptions)
# display_res = 512
# # print(mk.config.DisplayOptions.max_image_width)
# mk.config.DisplayOptions.max_image_width = display_res
# mk.config.DisplayOptions.max_image_height = display_res
# print(f"{mk.config.DisplayOptions.max_image_width=}")

mk.config.DisplayOptions.max_image_width=512


## yale fossils

In [6]:
# yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_fossil/outblur_yale"

yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_full"

### pd.DataFrame function definitions

In [9]:
%%time

from typing import *
# fpaths = {}
# dps = {}

def extract_file_list_from_directory(parent_dir) -> List[str]:
    
    blacklist = [".ipynb_checkpoints"]
    return sorted([os.path.join(parent_dir, p) for p in os.listdir(parent_dir) if p not in blacklist])


def extract_file_ids_from_file_list(fpaths: List[str]) -> Dict[str, Any]:
    return {
        "paths" : fpaths,
        "file_ids" : [Path(p).stem for p in fpaths]
    }


def make_file_info_dataframe(file_info: Dict[str, Any], **kwargs) -> pd.DataFrame:
    return pd.DataFrame(file_info, **kwargs)

CPU times: user 39 µs, sys: 14 µs, total: 53 µs
Wall time: 58.2 µs


### Image IO function definitions

In [8]:
from PIL import ImageFile
import cv2

def load_image_PIL(file_path: str,
                   mode: str="RGB"):
    img = PIL.Image.open(file_path)
    if mode == "RGB":
        return img
    if mode == "HSV":
        return img.convert("HSV")
    else:
        raise Exception(f"Invalid value for {mode=}")
    

def load_image_cv2(file_path: str,
                   mode: str="RGB"):
    img = cv2.imread(file_path)
    if mode == "BGR":
        return img
    if mode == "RGB":
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if mode == "HSV":
        return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    else:
        raise Exception(f"Invalid value for {mode=}")

def load_image(file_path: str,
               mode: str="RGB",
               backend: str="PIL",
               lazy_load: bool=False):
    error = None

    try:
        if backend == "PIL":
            img = load_image_PIL(file_path=file_path,
                                 mode=mode)
            if not lazy_load:
                img.load()
        elif backend == "cv2":
            img = load_image_cv2(file_path=file_path,
                                  mode=mode)
        else:
            raise Exception(f"Invalid value for {backend=}")

    except OSError as e:
        error = str(e)
        
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        img = load_image_PIL(file_path=file_path,
                                 mode=mode)
        img.load()
        ImageFile.LOAD_TRUNCATED_IMAGES = False
        
    return img, error

### Image stats analysis function definitions

In [13]:

def calc_img_shape(img: PIL.Image.Image) -> Tuple:
    # img = PIL.Image.open(path)
    h, w = img.height, img.width
    ratio = h/w
    return {
        "height": h, 
        "width": w,
        "aspect_ratio": ratio
    }


def calc_rgb_stats(img: PIL.Image.Image) -> Tuple:
    r, g, b = Stat(img).mean
    return {
        "r": r,
        "g": g,
        "b": b
    }


def calc_hsv_stats(img: PIL.Image.Image) -> Tuple:
    img = img.convert("HSV")
    h, s, v = Stat(img).mean
    return {
        "h": h,
        "s": s,
        "v": v
    }


def analyze_image_from_file(path: str):

    img, error = load_image(path, 
                            mode="RGB",
                            backend="PIL",
                            lazy_load=False)
    
    return {
        "path": path,
        **calc_img_shape(img),
        **calc_rgb_stats(img),
        **calc_hsv_stats(img),
        "error": error
    }


In [15]:
yale_file_list = extract_file_list_from_directory(parent_dir=yale_fossil_dir)
yale_file_info_list = extract_file_ids_from_file_list(fpaths=yale_file_list)
yale_info_df = make_file_info_dataframe(file_info=yale_file_info_list)

# df = yale_info_df
# df

df = yale_info_df #.iloc[2260:2280,:]
df.shape

(16444, 2)

In [16]:
%%time


# process the train images
total_rows = df.shape[0]
file_paths = df["paths"].values #[:total_rows]


analysis_records = Parallel(n_jobs=16, backend='threading')(
    delayed(analyze_image_from_file)(
        path) for path in tqdm(file_paths, total=total_rows)
)

  0%|          | 0/16444 [00:00<?, ?it/s]

CPU times: user 4h 11min 7s, sys: 8min 51s, total: 4h 19min 59s
Wall time: 34min 10s


In [17]:
analysis_df = pd.DataFrame.from_records(analysis_records)
# analysis_df

analysis_df.describe(include='all')

## Output analysis results to disk

In [36]:
analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/"
results_filename = "image_stats_df"

In [37]:
%%time

parquet_dir = os.path.join(analysis_results_root_dir, "parquet")
parquet_file_path = os.path.join(parquet_dir, f"{results_filename}.parquet")

os.makedirs(parquet_dir, exist_ok=True)
analysis_df.to_parquet(parquet_file_path) #"data/parquet/image_stats_df.parquet")

CPU times: user 34.9 ms, sys: 55.8 ms, total: 90.6 ms
Wall time: 178 ms


In [38]:
%%time

csv_dir = os.path.join(analysis_results_root_dir, "csv")
csv_file_path = os.path.join(csv_dir, f"{results_filename}.csv")


os.makedirs(csv_dir, exist_ok=True)
analysis_df.to_csv(csv_file_path)

CPU times: user 390 ms, sys: 111 ms, total: 501 ms
Wall time: 741 ms


In [39]:
print(f"Finished analysis results can be found at either:")
print(parquet_file_path)
print("or")
print(csv_file_path)

Finished analysis results can be found at either:
/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/parquet/image_stats_df.parquet
or
/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/analysis_results/csv/image_stats_df.csv


In [40]:
%%time

new_df = pd.read_parquet(parquet_file_path)
new_df

CPU times: user 24.9 ms, sys: 23.5 ms, total: 48.4 ms
Wall time: 88.3 ms


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
0,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000012b6-2c07-4df6-941c-8f2d0915391c.png,4000,6000,0.67,242.49,240.56,233.72,32.94,12.03,242.74,
1,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000162dc-43bd-4129-9081-0024b8868cac.png,6000,3728,1.61,139.02,105.91,67.84,20.96,112.18,139.72,
2,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:00016467-aeaa-45fe-a040-9c8550d0d3cf.png,1549,2990,0.52,142.83,107.33,62.47,23.30,153.75,142.83,
3,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:0008efe0-a5d3-4683-ab85-9a6612ee9b97.png,2400,2990,0.80,124.11,97.51,64.46,22.11,137.23,124.11,
4,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000b2947-cf62-4228-a2ea-67cf9fcb8d99.png,4468,1407,3.18,150.26,120.17,75.54,26.35,136.71,150.64,
...,...,...,...,...,...,...,...,...,...,...,...
16439,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff2afdf-b056-4ff3-a9fa-38858995908e.png,1944,2592,0.75,132.91,122.69,121.05,51.90,22.77,133.41,
16440,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff4c2b0-168c-462b-a516-032b086c2703.png,5072,2928,1.73,112.81,88.63,51.19,23.08,128.98,113.30,
16441,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff70057-4228-4a7d-9146-58f31755fa77.png,3792,4688,0.81,98.17,55.98,37.02,21.99,127.88,98.34,
16442,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff96ceb-60b8-491d-a7c4-4a5aae2e255a.png,2020,2990,0.68,114.29,97.63,71.85,29.15,79.24,114.36,


In [35]:
%%time

new_df = pd.read_csv("data/csv/image_stats_df.csv")
new_df

CPU times: user 74.8 ms, sys: 7.97 ms, total: 82.8 ms
Wall time: 126 ms


Unnamed: 0.1,Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
0,0,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000012b6-2c07-4df6-941c-8f2d0915391c.png,4000,6000,0.67,242.49,240.56,233.72,32.94,12.03,242.74,
1,1,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000162dc-43bd-4129-9081-0024b8868cac.png,6000,3728,1.61,139.02,105.91,67.84,20.96,112.18,139.72,
2,2,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:00016467-aeaa-45fe-a040-9c8550d0d3cf.png,1549,2990,0.52,142.83,107.33,62.47,23.30,153.75,142.83,
3,3,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:0008efe0-a5d3-4683-ab85-9a6612ee9b97.png,2400,2990,0.80,124.11,97.51,64.46,22.11,137.23,124.11,
4,4,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:000b2947-cf62-4228-a2ea-67cf9fcb8d99.png,4468,1407,3.18,150.26,120.17,75.54,26.35,136.71,150.64,
...,...,...,...,...,...,...,...,...,...,...,...,...
16439,16439,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff2afdf-b056-4ff3-a9fa-38858995908e.png,1944,2592,0.75,132.91,122.69,121.05,51.90,22.77,133.41,
16440,16440,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff4c2b0-168c-462b-a516-032b086c2703.png,5072,2928,1.73,112.81,88.63,51.19,23.08,128.98,113.30,
16441,16441,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff70057-4228-4a7d-9146-58f31755fa77.png,3792,4688,0.81,98.17,55.98,37.02,21.99,127.88,98.34,
16442,16442,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:fff96ceb-60b8-491d-a7c4-4a5aae2e255a.png,2020,2990,0.68,114.29,97.63,71.85,29.15,79.24,114.36,


In [33]:
new_df.describe(include='all')

Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,16444,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,16444.0,4
unique,16444,,,,,,,,,,2
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:b500c8bd-4e7c-4984-97eb-f2606adb9fb6.png,,,,,,,,,,image file is truncated (0 bytes not processed)
freq,1,,,,,,,,,,3
mean,,2676.11,2876.15,1.0,139.91,113.34,80.43,31.61,117.67,140.32,
std,,1087.97,1198.44,0.44,36.26,35.27,37.16,18.28,45.56,36.45,
min,,198.0,284.0,0.11,18.64,18.24,9.78,4.31,4.36,19.68,
25%,,1988.0,2053.0,0.67,116.13,90.47,56.28,21.89,87.46,116.48,
50%,,2612.0,2990.0,0.85,141.51,111.54,72.13,25.4,125.38,141.89,
75%,,2990.0,2990.0,1.24,163.86,132.14,93.02,31.77,151.09,164.33,


In [25]:
error_types = analysis_df.value_counts("error").to_dict().keys()

for k in error_types:
    print(k)
    error_df = analysis_df[analysis_df.error==k]
    error_df.describe(include="all")

image file is truncated (0 bytes not processed)


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3
unique,3,,,,,,,,,,1
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:f4241b3a-9b1c-4856-bed5-75dc03004c08.png,,,,,,,,,,image file is truncated (0 bytes not processed)
freq,1,,,,,,,,,,3
mean,,3312.0,3176.0,0.98,118.07,100.9,65.72,28.15,83.29,118.27,
std,,2321.12,683.66,0.47,51.41,52.34,32.95,10.81,60.87,51.23,
min,,1944.0,2592.0,0.66,72.02,66.51,33.17,15.93,14.08,72.51,
25%,,1972.0,2800.0,0.71,90.34,70.78,49.06,24.01,60.69,90.59,
50%,,2000.0,3008.0,0.75,108.67,75.05,64.96,32.09,107.31,108.67,
75%,,3996.0,3468.0,1.14,141.1,118.09,82.0,34.27,117.89,141.14,


image file is truncated


Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
count,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
unique,1,,,,,,,,,,1
top,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:2369295c-e89b-4799-9ed6-6c5605576cb1.png,,,,,,,,,,image file is truncated
freq,1,,,,,,,,,,1
mean,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
std,,,,,,,,,,,
min,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
25%,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
50%,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,
75%,,2990.0,2304.0,1.3,72.29,53.77,37.52,17.14,105.51,72.29,


In [24]:
k = "image file is truncated (0 bytes not processed)"
analysis_df[analysis_df.error==k]

Unnamed: 0,path,height,width,aspect_ratio,r,g,b,h,s,v,error
10332,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:a1be3282-5869-4abe-99c5-e9998ed70f8b.png,2000,3008,0.66,108.67,75.05,33.17,15.93,128.48,108.67,image file is truncated (0 bytes not processed)
15644,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:f4241b3a-9b1c-4856-bed5-75dc03004c08.png,5992,3928,1.53,173.54,161.13,99.05,32.09,107.31,173.61,image file is truncated (0 bytes not processed)
16007,/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:f9714b38-d33c-4f27-b5c0-28f313533afc.png,1944,2592,0.75,72.02,66.51,64.96,36.44,14.08,72.51,image file is truncated (0 bytes not processed)


In [30]:

# img, error = load_image(analysis_df[~analysis_df["error"].isna()].path.iloc[0])
# img

# path = "/media/data_cifs/projects/prj_fossils/data/yale_full/urn:uuid:2369295c-e89b-4799-9ed6-6c5605576cb1.png"
# img = PIL.Image.open(path)

# img

In [None]:
def add_image_rgb_stats2datapanel(dp: mk.DataPanel) -> mk.DataPanel:
    mean_dp = dp["imgs"].to_lambda(fn=lambda x: Stat(x).mean)
    
    dp["r"], dp["g"], dp["b"] = (
        mean_dp.to_lambda(lambda x: x[0]), 
        mean_dp.to_lambda(lambda x: x[1]),
        mean_dp.to_lambda(lambda x: x[2])
    )
    # dp["rms_r"], dp["rms_g"], dp["rms_b"] = dp["imgs"].to_lambda(fn=lambda x: Stat(x).rms)
    return dp




def add_image_hsv_stats2datapanel(dp: mk.DataPanel) -> mk.DataPanel:
    """
    
    Update 1:
        - slightly faster (hopefully) version than before, since I now take the mean across all pixels for each channel in the 1st step, rather than calculating 3 separate means in series.
    Update 2:
        - Also reduced process once more by utilizing the `outputs` kwarg to map output column names to a mapped function's output tuple.
    """
    cv2_img_dp = dp["paths"].to_lambda(
        fn=lambda x: np.mean(
            load_image(x, "HSV"), axis=(0, 1)
        )
        # outputs=("h", "s", "v")
    )
    
    # dp["h"], dp["s"], dp["v"] = cv2_img_dp

    dp["h"], dp["s"], dp["v"] = (
        cv2_img_dp.to_lambda(lambda x: x[0]),
        cv2_img_dp.to_lambda(lambda x: x[1]),
        cv2_img_dp.to_lambda(lambda x: x[2])
                       )
    return dp

In [None]:
yale_file_list = extract_file_list_from_directory(parent_dir=yale_fossil_dir)
yale_file_info_list = extract_file_ids_from_file_list(fpaths=yale_file_list)
yale_info_df = make_file_info_dataframe(file_info=yale_file_info_list)

In [None]:
yale_info_df

In [None]:
dp = make_image_datapanel(dp=yale_info_dp,
                          img_col_only=False)

### mk.DataPanel

In [None]:
def make_file_info_datapanel(file_info: Dict[str, Any]) -> mk.DataPanel:
    return mk.DataPanel(file_info)


def make_image_datapanel(dp: mk.DataPanel,
                         img_col_only: bool=False
                        ) -> mk.DataPanel:
    dp["imgs"] = mk.ImageColumn.from_filepaths(dp["paths"])
    
    if not img_col_only:
        dp["height"] = dp["imgs"].to_lambda(fn=lambda x: x.height)
        dp["width"] = dp["imgs"].to_lambda(fn=lambda x: x.width)
        dp["aspect_ratio"] = dp[["height", "width"]].to_lambda(fn=lambda x: x["height"] / x["width"])

    return dp


def add_image_rgb_stats2datapanel(dp: mk.DataPanel) -> mk.DataPanel:
    mean_dp = dp["imgs"].to_lambda(fn=lambda x: Stat(x).mean)
    
    dp["r"], dp["g"], dp["b"] = (
        mean_dp.to_lambda(lambda x: x[0]), 
        mean_dp.to_lambda(lambda x: x[1]),
        mean_dp.to_lambda(lambda x: x[2])
    )
    # dp["rms_r"], dp["rms_g"], dp["rms_b"] = dp["imgs"].to_lambda(fn=lambda x: Stat(x).rms)
    return dp




def add_image_hsv_stats2datapanel(dp: mk.DataPanel) -> mk.DataPanel:
    """
    
    Update 1:
        - slightly faster (hopefully) version than before, since I now take the mean across all pixels for each channel in the 1st step, rather than calculating 3 separate means in series.
    Update 2:
        - Also reduced process once more by utilizing the `outputs` kwarg to map output column names to a mapped function's output tuple.
    """
    cv2_img_dp = dp["paths"].to_lambda(
        fn=lambda x: np.mean(
            load_image(x, "HSV"), axis=(0, 1)
        )
        # outputs=("h", "s", "v")
    )
    
    # dp["h"], dp["s"], dp["v"] = cv2_img_dp

    dp["h"], dp["s"], dp["v"] = (
        cv2_img_dp.to_lambda(lambda x: x[0]),
        cv2_img_dp.to_lambda(lambda x: x[1]),
        cv2_img_dp.to_lambda(lambda x: x[2])
                       )
    return dp

In [None]:
    # return cv2_img_dp
    
    
#     dp["h"], dp["s"], dp["v"] = (
#         cv2_img_dp["h"],
#         cv2_img_dp["s"],
#         cv2_img_dp["v"]
#                        )
    
#     return dp

###############################
###############################


# def add_image_hsv_stats2datapanel(dp: mk.DataPanel) -> mk.DataPanel:
#     """
    
#     Update 1:
#         - slightly faster (hopefully) version than before, since I now take the mean across all pixels for each channel in the 1st step, rather than calculating 3 separate means in series.
#     """
#     cv2_img_dp = dp["paths"].to_lambda(
#         fn=lambda x: np.mean(
#             load_image(x, "HSV"), axis=(0, 1)
#         )
#     )

    # dp["h"], dp["s"], dp["v"] = (
    #     cv2_img_dp.to_lambda(lambda x: x[:,:,0]),
    #     cv2_img_dp.to_lambda(lambda x: x[:,:,1]),
    #     cv2_img_dp.to_lambda(lambda x: x[:,:,2])
    #                    )
    
#     return dp


##############################
##############################

# def add_image_hsv_stats2datapanel(dp: mk.DataPanel) -> mk.DataPanel:

#     cv2_img_dp = dp["paths"].to_lambda(fn=lambda x: load_image(x, "HSV"))

#     dp["h"], dp["s"], dp["v"] = (
#         cv2_img_dp.to_lambda(lambda x: np.mean(x[:,:,0])),
#         cv2_img_dp.to_lambda(lambda x: np.mean(x[:,:,1])),
#         cv2_img_dp.to_lambda(lambda x: np.mean(x[:,:,2]))
#                        )
    
#     return dp

In [None]:
yale_file_list = extract_file_list_from_directory(parent_dir=yale_fossil_dir)
yale_file_info_list = extract_file_ids_from_file_list(fpaths=yale_file_list)
yale_info_dp = make_file_info_datapanel(file_info=yale_file_info_list)

In [None]:
dp = make_image_datapanel(dp=yale_info_dp,
                          img_col_only=False)

In [None]:
# dp = add_image_rgb_stats2datapanel(dp)
dp2 = add_image_hsv_stats2datapanel(dp)

In [None]:
batch_loader = dp2.lz[:128].batch(
    batch_size=64,
    num_workers=16
)

batch_loader

In [None]:
a = tqdm(iter(batch_loader))
a

In [None]:
%%time

b = list(a)

b
# a = next(batch_loader)

In [None]:
dir(batch_loader)

In [None]:
dir(batch_loader)

In [None]:
dp2.head(5)

In [None]:
b = rows.map(lambda x: x, materialize=True, batch_size=1024)

In [None]:
dp.head(1)

In [None]:
x = np.ones((100,100,3))

In [None]:
# x_mean = np.mean(x[:,:,0])

x_mean = np.mean(x, axis=(0,1))
print(x.shape, x_mean.shape)

In [None]:
x_mean

In [None]:
dir(x_mean)

x_mean.item()

x_mean.take()

In [None]:
# h = dp["h"].copy()

# h = dp["h"][np.arange(len(hp))]

# h = dp["h"][np.arange(10)]

h = dp["h"][np.arange(10)].to_pandas()
print(type(h))
h

In [None]:
type(h[0])

In [None]:
type(h)

In [None]:
h_df = h.to_pandas()

In [None]:
h_df = h.to_tensor()

In [None]:
h_df

In [None]:
# h_df.values

from itertools import islice



In [None]:
rows = dp.lz[:4]
rows.shape

dir(b['b'])

In [None]:
b['b'].to_pandas()

In [None]:
rows['b'][np.arange(len(rows))].to_pandas()

In [None]:
b = rows.map(lambda x: x, materialize=True, batch_size=4)

In [None]:
dir(b)

In [None]:
b.to_pandas()

In [None]:
dir(dp["h"])

In [None]:
stats_cols = ['file_ids',
              'height',
              'width',
              'aspect_ratio',
              'r', 'g', 'b',
              'h', 's', 'v']

stats_cols = [1,3,4,5,6,7,8,9,10]

stats_df = dp[stats_cols,0].to_pandas()

stats_df.head()

In [None]:
stats_df.iloc[0,:].get()

In [None]:
dir(stats_df.iloc[0,1])

In [None]:
dp.columns

In [None]:
path = dp["paths"][0]

img = load_image(path, "HSV")

import matplotlib.pyplot as plt

plt.imshow(img)

In [None]:
# hist = cv.calcHist([img],[0],None,[256],[0,256])

In [None]:
dp.head()

In [None]:
dir(dp)

In [None]:
dp.__format__?

In [None]:
# import inspect
# print(inspect.getsource(yale_info_dp.add_column))

In [None]:
yale_info_dp['paths']

In [None]:
yale_full = extract_dataset_from_directory(parent_dir=yale_fossil_dirs["yale_full"])
dp = yale_full
dp.lz[:3]

In [None]:
# dir(dp["height"])

# %%time

# # dp["height"] = mk.PandasSeriesColumn(dp["height"].copy())
# height = dp.lz[:10]["height"].copy()

# height

# %%time

# # dp["height"] = mk.PandasSeriesColumn(dp["height"].copy())
# height = dp["height"].lz[:10].copy()
# height

# %%time

# # dp["height"] = mk.PandasSeriesColumn(dp["height"].copy())
# height = dp["height"].copy().lz[:10]
# height

# height._get_state()

# height._get

# %%time

# # dp["height"] = mk.PandasSeriesColumn(dp["height"].copy())
# height = dp["height"].copy()

# height

# %%time
# height = dp["height"].view()
# height
# dp["height"] = dp["height"].copy()
# dp["height"] = mk.PandasSeriesColumn(dp["height"].copy())
# dp.head()

# height = dp["height"].data
# height

### Export parsed file ids catalog to tsv

In [None]:
dp[["paths", "file_ids"]].to_pandas().to_csv("/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata-merged/0_image_file_ids.tsv", sep="\t", na_rep="NaN")

### extracting yale metadata

In [None]:
# metadata_dir = "/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata/"
metadata_dir = "/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata-clean"
occurrence_df = pd.read_csv(Path(metadata_dir, "occurrence-clean.txt"), delimiter="\t")#, nrows=9449)
# occurrence_df = pd.read_json(Path(metadata_dir, "occurrence.txt"), skiprows)
occurrence_df.head()

In [None]:
multimedia_df = pd.read_csv(Path(metadata_dir, "multimedia-clean.txt"), delimiter="\t", nrows=9449)
# occurrence_df = pd.read_json(Path(metadata_dir, "occurrence.txt"), skiprows)

multimedia_df.head()

In [None]:
multimedia_df.columns

occurrence_df.columns

In [None]:
from matplotlib_venn import venn2
# venn2(subsets = (3, 2, 1))
subsets = (set(multimedia_df.columns.values.tolist()),
           set(occurrence_df.columns.values.tolist()))
venn2(subsets = subsets)

In [None]:
subsets[0].intersection(subsets[1])

In [None]:
from supervenn import supervenn
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%%time

merged = multimedia_df.merge(occurrence_df, how="inner", on="id")
print(f"{multimedia_df.shape=}, {occurrence_df.shape=}, {merged.shape=}")

merged = merged[sorted(merged.columns)]

merged.describe(include='all')

In [None]:
merged_cols = ['id',
               'identifier',
               'catalogNumber',
               'year',
               'datasetID',
               'datasetName',
               'dateIdentified',
               'class',
               'kingdom',
               'order',
               'family',
               'genus',
               'subgenus',
               'specificEpithet',
               'phylum',
               'scientificName',
               'taxonRank',
               'institutionID',
               'locality',
               'accessURI',
               'associatedMedia',
               'associatedOccurrences',
               'associatedReferences',
               'associatedSequences',
               'associatedTaxa',
               'MetadataDate',
               'WebStatement',
 'accessRights',
 'basisOfRecord',
 'behavior',
 'bibliographicCitation',
 'collectionCode',
 'collectionID',
 'comments',
 'continent',
 'coordinateUncertaintyInMeters',
 'country',
 'county',
 'creator',
 'creator.1',
 'dataGeneralizations',
 'day',
 'decimalLatitude',
 'decimalLongitude',
 'digitizationDate',
 'disposition',
 'dynamicProperties',
 'earliestAgeOrLowestStage',
 'earliestEpochOrLowestSeries',
 'earliestPeriodOrLowestSystem',
 'eventDate',
 'eventTime',
 'fieldNumber',
 'format',
 'format.1',
 'formation',
 'geodeticDatum',
 'georeferenceProtocol',
 'georeferenceRemarks',
 'georeferenceSources',
 'georeferencedBy',
 'georeferencedDate',
 'group',
 'habitat',
 'higherClassification',
 'higherGeography',
 'identificationQualifier',
 'identificationReferences',
 'identificationRemarks',
 'identifiedBy',
 'individualCount',
 'informationWithheld',
 'infraspecificEpithet',
 'institutionCode',
 'language',
 'license',
 'lifeStage',
 'lowestBiostratigraphicZone',
 'maximumDepthInMeters',
 'maximumElevationInMeters',
 'member',
 'metadataLanguage',
 'minimumDepthInMeters',
 'minimumElevationInMeters',
 'modified',
 'month',
 'municipality',
 'nomenclaturalCode',
 'occurrenceID',
 'occurrenceRemarks',
 'otherCatalogNumbers',
 'ownerInstitutionCode',
 'preparations',
 'previousIdentifications',
 'providerManagedID',
 'recordNumber',
 'recordedBy',
 'references',
 'reproductiveCondition',
 'rights',
 'rights.1',
 'rightsHolder',
 'scientificNameAuthorship',
 'sex',
 'stateProvince',
 'taxonRemarks',
 'title',
 'type.1',
 'typeStatus',
 'type_x',
 'type_y',
 'verbatimDepth',
 'verbatimElevation',
 'verbatimEventDate',
 'verbatimLatitude',
 'verbatimLongitude',
 'vernacularName',
 'waterBody']


merged = merged[merged_cols]

merged = merged.convert_dtypes()

merged = merged.assign(digitizationDate = merged.digitizationDate.astype(pd.StringDtype()))


merged.describe(include='all')

In [None]:
merged.head()

In [None]:
merged.info(verbose=True)

In [None]:
output_dir = "/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata-merged"
output_filename = "1_multimedia_merged_with_occurrences_metadata_on_id.tsv"

merged.to_csv(Path(output_dir, output_filename), sep="\t", na_rep="NaN")

In [None]:
loaded = pd.read_csv(Path(output_dir, output_filename), sep="\t", index_col=0) #, na_rep="NaN")

loaded = loaded.convert_dtypes()
loaded = loaded.assign(digitizationDate = loaded.digitizationDate.astype(pd.StringDtype()))

assert loaded.equals(merged)
# loaded.info(verbose=True)

In [None]:
merged_multimedia_w_occurrences_df = merged

In [None]:
%%time

# merged = dp.merge(mk.DataPanel.from_pandas(multimedia_df), how="left", left_on="file_ids", right_on="identifier")
df = dp.view().to_pandas().drop(columns=["imgs", "height", "width", "aspect_ratio"])

merged = df.merge(merged_multimedia_w_occurrences_df, how="left", left_on="file_ids", right_on="identifier")
print(f"{df.shape=}, {dp.shape=}, {merged.shape=}, {merged_multimedia_w_occurrences_df.shape=}")

In [None]:
output_dir = "/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/metadata-merged"
output_filename = "2_image_file_ids_matched_with_multimedia_identifier_column.tsv"

merged.to_csv(Path(output_dir, output_filename), sep="\t", na_rep="NaN")

In [None]:
merged.head()

In [None]:
merged.describe(include='all')

In [None]:
columns = ['paths',
 'file_ids',
 'identifier',
 'id',
 'imgs',
 'height',
 'width',
 'aspect_ratio',
 'type',
 'type.1',
 'title',
 'MetadataDate',
 'metadataLanguage',
 'providerManagedID',
 'comments',
 'rights',
 'rights.1',
 'WebStatement',
 'creator',
 'creator.1',
 'digitizationDate',
 'accessURI',
 'format',
 'format.1']

merged[columns]

In [None]:
dp

In [None]:
merged.digitizationDate.value_counts()#dtype

In [None]:
loaded.digitizationDate.value_counts()#dtype

In [None]:
loaded.describe(include='all')

In [None]:
a = merged
b = loaded

comparison_result = (a == b) | ((a != a) & (b != b))

comparison_result

# a[a.digitizationDate != b.digitizationDate]


ab = a.merge(b, on="id")#, indicator
ab = ab[sorted(ab.columns)]

ab[ab.digitizationDate_x != ab.digitizationDate_y]
# b.digitizationDate

ab = ab.convert_dtypes()
# b.digitizationDate


ab.info(verbose=True)
# b.digitizationDate

comparison_result.all()

In [None]:
list(merged.columns)

In [None]:
occurrence_df[occurrence_df.id.apply(lambda x: x in merged.id)]

In [None]:
%%time

# merged = dp.merge(mk.DataPanel.from_pandas(multimedia_df), how="left", left_on="file_ids", right_on="identifier")
df = dp.to_pandas()

merged = df.merge(multimedia_df, how="left", left_on="file_ids", right_on="identifier")
print(f"{dp.shape=}, {merged.shape=}")

merged.describe(include='all')

In [None]:
columns = ['paths',
 'file_ids',
 'identifier',
 'id',
 'imgs',
 'height',
 'width',
 'aspect_ratio',
 'type',
 'type.1',
 'title',
 'MetadataDate',
 'metadataLanguage',
 'providerManagedID',
 'comments',
 'rights',
 'rights.1',
 'WebStatement',
 'creator',
 'creator.1',
 'digitizationDate',
 'accessURI',
 'format',
 'format.1']

merged[columns]

In [None]:
# print(occurrence_df.shape)
# occurrence_df.describe(include='all')

### plots

In [None]:
#!pip3 install streamlit-aggrid
# !pip3 install streamlit-pandas-profiling

In [None]:
from streamlit_pandas_profiling import st_profile_report

In [None]:
st_profile_report

In [None]:
#Create two columns with different width
col1, col2 = st.columns( [0.8, 0.2])
with col1:               # To display the header text using css style
    st.markdown(""" <style> .font {
    font-size:35px ; font-family: 'Cooper Black'; color: #FF9633;} 
    </style> """, unsafe_allow_html=True)
    st.markdown('<p class="font">Upload your photo here...</p>', unsafe_allow_html=True)
    
with col2:               # To display brand logo
    st.image(image,  width=150)

In [None]:
import missingno as msno
%matplotlib inline

In [None]:
%%time

msno.matrix(occurrence_df, sort='descending') #.sample(2000))

In [None]:
%%time
import matplotlib.pyplot as plt

plt.figure(figsize=(35,30))
msno.heatmap(occurrence_df, sort='descending', ax=plt.gca())#.sample(2000))

In [None]:
%%time
import matplotlib.pyplot as plt

# plt.figure(figsize=(35,30))
msno.dendrogram(occurrence_df, orientation="top", figsize=(40,20))

In [None]:
%%time

msno.bar(occurrence_df, sort='descending')#.sample(2000))

In [None]:
dir(msno)

In [None]:




dir(mk.DataPanel)

In [None]:
mk.DataPanel
merged = dp.merge(mk.DataPanel.from_pandas(occurrence_df), how="left", left_on="file_ids", right_on="id")

print(f"{dp.shape=}, {merged.shape=}")

In [None]:
%%time

merged_df = merged.to_pandas()

merged_df.describe(include='all')

In [None]:
merged.to_pandas().describe(include='all')

In [None]:
merged.columns

### Fuzzy matching

In [None]:
#!pip3 install fuzzy_pandas
!pip3 install fuzzymatcher

In [None]:
import pandas as pd
# import fuzzy_pandas as fpd

df1 = df.iloc[:10,:]
# df2 = occurrence_df

df2 = multimedia_df

import fuzzymatcher

In [None]:
merged = fuzzymatcher.fuzzy_left_join(df1, df2,
                             left_on = "file_ids",
                             right_on = "identifier")
                             # right_on = "id")

merged

In [None]:
merged.columns

In [None]:
merged.sort_values("best_match_score", ascending=False)

In [None]:
merged = fpd.fuzzy_merge(df1, df2,
                         left_on=['file_ids'],
                         right_on=['id'],
                         method="levenshtein",
                         ignore_case=True,
                         keep='match')
merged

In [None]:
import pandas as pd
import fuzzy_pandas as fpd

df1 = df.iloc[:10,:]
df2 = occurrence_df

merged = fpd.fuzzy_merge(df1, df2,
                         left_on=['file_ids'],
                         right_on=['id'],
                         method="levenshtein",
                         ignore_case=True,
                         keep='match')
merged

In [None]:
!head -n 1 {metadata_dir}/occurrence.txt

In [None]:
multimedia_df = pd.read_csv(Path(metadata_dir, "multimedia.txt"), delimiter="\t", nrows=9449)
# occurrence_df = pd.read_json(Path(metadata_dir, "occurrence.txt"), skiprows)

multimedia_df.head()

In [None]:
multimedia_dp = extract_dataset_from_file_list(fpaths = multimedia_df.accessURI)
multimedia_dp.lz



In [None]:
occurrence_df = pd.read_csv(Path(metadata_dir, "occurrence.txt"), delimiter="\t", nrows=9449)
# occurrence_df = pd.read_json(Path(metadata_dir, "occurrence.txt"), skiprows)

occurrence_df.head()

In [None]:
occurrence_df.describe(include='all')

In [None]:
occurrence_df.columns.values

In [None]:
# dir(Path(fpaths[k][0]))
# Path(fpaths[k][0]).stem
# Path(fpaths[k][0]).name









# Misc

In [None]:
# from supervenn import supervenn
# import matplotlib.pyplot as plt
# import seaborn as sns

# style_list = ['default', 'classic'] + sorted(
#     style for style in plt.style.available if style != 'classic'
# )

# pp(style_list)
# style_label = "seaborn-notebook"
# plt.style.context(style_label)



# Plot a demonstration figure for every available style sheet.
# for style_label in style_list:
    # with plt.rc_context({"figure.max_open_warning": len(style_list)}):
        # with plt.style.context(style_label)





    
    
    
# for k,v in yale_fossil_dirs.items():
#     fpaths[k] = [os.path.join(v, p) for p in os.listdir(v)]
#     dps[k] = mk.DataPanel({
#         "paths" : fpaths[k],
#         "file_ids" : [Path(p).stem for p in fpaths[k]],
#         "imgs" : mk.ImageColumn.from_filepaths(fpaths[k])
#     })
#     dps[k]["height"] = dps[k]["imgs"].to_lambda(fn=lambda x: x.height)
#     # dps[k]["mean"] = dps[k]["imgs"].to_lambda(fn=lambda x: Stat(x).mean)
#     dps[k]["width"] = dps[k]["imgs"].to_lambda(fn=lambda x: x.width)