# 04_image deduplication through clustering -- 2022-09-21-unlabeled yale fossils dataset.ipynb

Inputs: version 1 of the cleaned 2022 unlabeled yale fossils dataset, having identified & removed **irrelevant outlier images**

Outputs: version 2 of the cleaned 2022 unlabeled yale fossils dataset, having identified & removed **duplicate images** as well

Created by: Jacob A Rose  
Created on: Tuesday September 20th, 2022  
<!-- Updated on:  -->

In [None]:
#!pip3 install torchshow

from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.float_format', "{:,.2f}".format)

from rich import print as pp
import meerkat as mk
display_res = 1024
# print(mk.config.DisplayOptions.max_image_width)
mk.config.display.max_rows = 100
mk.config.display.max_image_width = display_res
mk.config.display.max_image_height = display_res

# mk.config.DisplayOptions.max_rows = 100
# mk.config.DisplayOptions.max_image_width = display_res
# mk.config.DisplayOptions.max_image_height = display_res

# print(f"{mk.config.DisplayOptions.max_image_width=}")
print("mk.config.display=")
pp(dict(vars(mk.config.display)))

mk.config

In [None]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
from typing import *
import inspect
from tqdm.auto import tqdm
from pathlib import Path
import logging

from PIL import Image
import PIL
from PIL.ImageStat import Stat

import cv2
import glob
from joblib import Parallel, delayed

In [None]:
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)

## Functions

In [None]:
import cv2
import numpy as np
import PIL
import PIL.Image


def rescale_image(img: np.ndarray, max_size: int=512) -> np.ndarray:
    """
    Resize `img` so that it fits inside a `max_size` x `max_size` box while preserving its aspect ratio.

    img: np.ndarray
        Image array whose first two axes are (height, width). Both 2-D (grayscale) arrays and 3-D arrays with a trailing channel axis are accepted.
    max_size: int, default=512
        Upper bound, in pixels, on both the output height and the output width.

    Returns the array produced by `cv2.resize` using `cv2.INTER_AREA` interpolation.
    """
    # Only the first two axes are needed; unpacking 3 values would raise on 2-D (grayscale) input.
    h, w = img.shape[:2]
    scale = min(max_size/h, max_size/w)
    # cv2.resize expects (width, height). Clamp to 1px so an extreme aspect ratio never truncates to a 0-sized output.
    output_size = max(1, int(scale*w)), max(1, int(scale*h))

    return cv2.resize(img, output_size, interpolation=cv2.INTER_AREA)

def rescale_image_PIL(img: PIL.Image.Image, max_size: int=512) -> PIL.Image.Image:
    """
    PIL counterpart of `rescale_image`: resize `img` so that it fits inside a `max_size` x `max_size` box while preserving its aspect ratio.

    img: PIL.Image.Image
        Source image; only its `width`, `height` and `resize` are used.
    max_size: int, default=512
        Upper bound, in pixels, on both the output width and the output height.

    Returns a new image produced by `img.resize` with bicubic resampling.
    """
    w, h = img.width, img.height
    # `min` (not `max`) so the LONGER side lands on max_size, matching `rescale_image`;
    # with `max` the shorter side was pinned to max_size and the longer side overshot it.
    scale = min(max_size/w, max_size/h)
    # Clamp to 1px so an extreme aspect ratio never truncates to a 0-sized output.
    output_size = max(1, int(scale*w)), max(1, int(scale*h))

    return img.resize(output_size, resample=PIL.Image.Resampling.BICUBIC)



import cv2


def hconcat_resize_min(im_list, interpolation=cv2.INTER_CUBIC):
    """
    Concatenate images side by side after resizing each one to the height of the shortest image.

    im_list:
        Sequence of image arrays; axis 0 is read as height and axis 1 as width.
    interpolation:
        Interpolation flag forwarded to `cv2.resize`.

    Each image keeps its aspect ratio (the new width is truncated to an int). Returns the result of `cv2.hconcat`.
    """
    target_h = min(im.shape[0] for im in im_list)

    resized = []
    for im in im_list:
        src_h, src_w = im.shape[0], im.shape[1]
        new_w = int(src_w * target_h / src_h)
        # cv2.resize takes the output size as (width, height)
        resized.append(cv2.resize(im, (new_w, target_h), interpolation=interpolation))

    return cv2.hconcat(resized)

import math
from pathlib import Path

def image_grid(image_paths, 
               col: int=5,
               max_imgs: int=-1,
               include_filenames_as_titles=False):
    """
    Plot the images found at `image_paths` in a grid and return the matplotlib figure.

    image_paths:
        Sliceable sequence of image file paths (str or Path).
    col: int, default=5
        Number of grid columns; the number of rows is derived from the image count.
    max_imgs: int, default=-1
        If positive, only the first `max_imgs` paths are plotted. Any non-positive value plots everything.
    include_filenames_as_titles: bool, default=False
        If True, each subplot is titled with the file name of its image.

    Every cell is 4x4 inches, axes are hidden and the spacing between cells is removed.
    """
    # Imported here because nothing else in this notebook brings `plt` into scope.
    import matplotlib.pyplot as plt

    if max_imgs > 0:
        image_paths = image_paths[:max_imgs]

    image_count = len(image_paths)
    # Keep at least one row so that an empty input still produces a valid (blank) figure
    # instead of a zero-height one.
    row = max(1, math.ceil(image_count/col))
    fig = plt.figure(figsize=(col*4,row*4))

    for i, img_path in enumerate(image_paths):
        img_path = str(img_path)

        ax = fig.add_subplot(row, col, i + 1)
        ax.imshow(plt.imread(img_path))
        if include_filenames_as_titles:
            ax.set_title(Path(img_path).name)

        ax.axis("off")

    fig.subplots_adjust(wspace=0, hspace=0, top=0.97)
    return fig



In [None]:
def is_empty(path):
    """
    Report whether the directory at `path` has no entries.

    Returns True when `os.listdir(path)` yields nothing and False otherwise.
    Whatever `os.listdir` raises (e.g. when `path` is not a directory) propagates to the caller.
    """
    entries = os.listdir(path)
    return not entries

def get_version_from_path(path: str) -> int:
    """
    Parse the integer version out of a path whose last component looks like "<prefix>_<N>".

    The file extension (if any) is dropped first, then the text between the first and second underscore
    is converted with `int`. Raises IndexError when the name has no underscore and ValueError when that
    field is not an integer.
    """
    name = Path(path).stem
    fields = name.split("_")
    version_field = fields[1]
    return int(version_field)



def get_latest_version(root_dir: str,
                       skip_version_if_exists: bool=False) -> Path:
    """
    
    Input a root dir, and this function will return (creating it if needed) the "version_<N>" subdir to work in.
    
    Should run once in an experiment & save in a variable if need to reference version elsewhere in script.
    
    root_dir: str
        Location in which multiple version subdirs will be located (e.g. "./version_{0,1,2,3...}").
        Created if it does not exist yet, in which case "version_0" is returned.
    skip_version_if_exists: bool, default=False
        If False, return the highest existing version subdir (so previous annotations found on disk can be loaded).
        If True, any non-empty version subdir pushes the result to the version number right after it,
        so the returned subdir is never one that already holds files.
    
    Only subdirectories named exactly "version_<integer>" are considered; any other file or directory
    inside `root_dir` is ignored.
    
    Returns the Path of the selected version subdir, which is guaranteed to exist on return.
    """
    import re

    root = Path(root_dir)
    # Create the root up front so a first run on a fresh location yields "version_0" instead of raising.
    root.mkdir(parents=True, exist_ok=True)

    v = 0
    for entry in root.iterdir():
        match = re.fullmatch(r"version_(\d+)", entry.name)
        # Skip stray entries (.DS_Store, .ipynb_checkpoints, loose files) rather than crashing on them.
        if match is None or not entry.is_dir():
            continue

        candidate = int(match.group(1))
        if skip_version_if_exists and any(entry.iterdir()):
            candidate += 1
        # Always take the max: directory listings are not in numeric order ("version_10" sorts before
        # "version_2"), so a plain assignment could move `v` backwards.
        v = max(v, candidate)

    save_dir = root / f"version_{v}"
    os.makedirs(save_dir, exist_ok=True)
    
    return save_dir

### Functions for caching annotations

* Functions to load (`load_cached_annotations`) and save (`cache_annotations`) versioned catalogs of annotated/labeled datasets, so that a large set can be worked through iteratively in many small increments.

* Saves an `annotated_df` and a `non_annotated_df` containing the same columns, with the latter having NaN for all values of the `label` column.

In [None]:
from typing import *


def cache_annotations(save_dir: str,
                      annotated_df: pd.DataFrame,
                      non_annotated_df: pd.DataFrame
                     ) -> None:
    """
    Write both annotation dataframes into `<save_dir>/annotations_cache`, creating the directory if needed.

    save_dir: str
        Version directory under which the "annotations_cache" folder lives.
    annotated_df: pd.DataFrame
        Rows that have been manually annotated at least once.
    non_annotated_df: pd.DataFrame
        Rows that have never been annotated.

    Each dataframe is stored three times: as parquet, as csv, and as a csv of its `describe(include='all')` summary.
    """
    cache_root = Path(save_dir, "annotations_cache")
    os.makedirs(cache_root, exist_ok=True)

    # (frame, parquet name, csv name, summary name) -- annotated first, same write order as always.
    write_plan = (
        (annotated_df, "annotated.parquet", "annotated.csv", "annotated_summary.csv"),
        (non_annotated_df, "non_annotated.parquet", "non_annotated.csv", "non_annotated_summary.csv"),
    )
    for frame, parquet_name, csv_name, summary_name in write_plan:
        frame.to_parquet(cache_root / parquet_name)
        frame.to_csv(cache_root / csv_name)
        summary = frame.describe(include='all')
        summary.to_csv(cache_root / summary_name)
    
    
def load_cached_annotations(
    save_dir: str
) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Load the 2 annotation dataframes from `<save_dir>/annotations_cache`, from either parquet or csv files.
    
    The 1st contains only rows that have been manually annotated at least once.
    The 2nd contains only rows that have never been annotated.
    
    save_dir: str
        Version directory under which the "annotations_cache" folder lives.
    
    Returns `(annotated_df, non_annotated_df)`. For each frame the parquet file is preferred and the csv
    file (read with its first column as the index) is the fallback. An entry is None when the cache
    directory does not exist or when neither file for that frame is present, so callers can detect a
    missing cache with an `isinstance(..., pd.DataFrame)` check instead of catching an exception.
    """
    annotations_cache_dir = Path(save_dir, "annotations_cache")
    if not os.path.isdir(annotations_cache_dir):
        return None, None

    def _read_cached_frame(stem: str) -> Optional[pd.DataFrame]:
        # Prefer parquet, fall back to csv, and report None when neither was written.
        parquet_path = annotations_cache_dir / f"{stem}.parquet"
        if parquet_path.is_file():
            return pd.read_parquet(parquet_path)

        csv_path = annotations_cache_dir / f"{stem}.csv"
        if csv_path.is_file():
            return pd.read_csv(csv_path, index_col=0)

        return None

    return _read_cached_frame("annotated"), _read_cached_frame("non_annotated")

## Specify & Load dataset

### Define key file info & metadata

In [None]:
yale_fossil_dir = "/media/data_cifs/projects/prj_fossils/data/yale_full"

analysis_results_root_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/analysis_results/"
results_filename = "01_image_stats_df"

meerkat_dir = os.path.join(analysis_results_root_dir, "meerkat")
meerkat_path = os.path.join(meerkat_dir, "02b_rich_metadata_embedded_images_meerkat_datapanel")

In [None]:
# Resolve the versioned annotation directory, then try to restore a previous annotation session from it.
annotations_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/manual_annotations"
save_dir = get_latest_version(root_dir=annotations_dir)
print(f"Checking for cached annotations in {save_dir}")
pp(f'Loading from: {Path(save_dir).stem.replace("_", " ")}')
# cache_annotations(save_dir=save_dir,
#                   annotated_df=annotated_df,
#                   non_annotated_df=non_annotated_df)


annotated_df, non_annotated_df = load_cached_annotations(save_dir=save_dir)
# Both frames must be real DataFrames for the cache to count as usable.
if (
    isinstance(annotated_df, pd.DataFrame) 
    and isinstance(non_annotated_df, pd.DataFrame)
):
    # Rebuild the full catalog: never-annotated rows first, annotated rows after.
    df = pd.concat([
        non_annotated_df, annotated_df
    ])
    dp = mk.DataPanel.from_pandas(df)
    print(f"Successfully loaded from cache")
else:
    # NOTE(review): on this path `df` is not defined and `annotated_df` / `non_annotated_df` stay None,
    # yet later cells call `.describe()` / `.sort_values()` on them -- confirm a cache is always expected here.
    dp = mk.DataPanel.read(meerkat_path)
    print(f"No cache exists, loading raw data")

In [None]:
annotated_df.describe(include='all')
non_annotated_df.describe(include='all')

In [None]:
cols = dp.columns
dp = dp.sort(by="v", ascending=False)

In [None]:
image_record_cols = [
    'thumbnail',
    'thumb_path',
    'path'
]


stats_cols = [    
    # 'thumbnail',
    'thumb_path',
    'path',
    'r',
    'g',
    'b',
    'h',
    's',
    'v',
    'height',
    'width',
    'aspect_ratio'
]

geo_cols = [
    # 'thumbnail',
    'thumb_path',
    'path',
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'specificEpithet',
    'taxonRank',
    'vernacularName',
    'continent',
    'country',
    'stateProvince',
    'county',
    'municipality',
    'locality'
]

column_groups = [
    "image_record_cols",
    "stats_cols",
    "geo_cols"
]



options=[
    'Fossil Leaf', 
    'Cleared Leaf',
    'other', 
    'unknown',
    ''
    ]

## Main Interface: Annotation Widget

In [None]:
df = non_annotated_df.sort_values("v", ascending=False)

pp(f"Skipping a total of {len(annotated_df)} previously annotated samples distributed as follows:")
annotated_df.value_counts("label")

pp(f"Beginning the continued annotation process on the remaining {len(non_annotated_df)} samples")

In [None]:
# root_path = "/media/data_cifs/projects/prj_fossils/data/yale_full"
root_path = "/dev/shm/jrose3/2022-yale_fossils/image_thumbnails/res=512"

paths = [Path(root_path, p) for p in os.listdir(root_path)]
print(len(paths))

In [None]:
%%time


from difPy import dif
# search = dif(root_path)

In [None]:
help(dif)

In [None]:
def display_fn(filename: str, **kwargs):
    """
    Build an ipywidgets `Image` widget from the raw bytes of the image file at `filename`.

    filename: str
        Path of the image file. Its extension (without the dot) is passed to the widget as `format`.
    **kwargs:
        Forwarded unchanged to the `ipywidgets.Image` constructor.
    """
    # Imported locally: at the point this function is defined, the notebook-level name `Image` is the
    # PIL.Image *module* (`from PIL import Image`), which is not callable; only a later cell rebinds it
    # to the ipywidgets class.
    from ipywidgets import Image as ImageWidget

    # Context manager so the file handle is closed as soon as the bytes are read.
    with open(filename, "rb") as f:
        data = f.read()
    return ImageWidget(value=data, format=Path(filename).suffix.strip("."), **kwargs)

In [None]:
# df.value_counts("label")
# import pandas as pd
# import pigeonXT as pixt

from IPython.display import display#, Image
from ipywidgets import Image

# df = dp.to_pandas()
# df.index.name = "idx"
# df = df.reset_index()

# ddf = annotated_df
# ddf.columns
# ddf.value_counts('label')
# ddf.value_counts('aspect_ratio')
# non_annotated_df.value_counts('aspect_ratio')

In [None]:
# import seaborn as sns
# sns.histplot(non_annotated_df, x='aspect_ratio')
# sns.histplot(annotated_df, x='aspect_ratio')

In [None]:
df = pd.concat([annotated_df, non_annotated_df])
df.describe(include='all')

In [None]:
# sns.histplot(df, x='stateProvince', kde=True)

# rb_df = df.sort_values("recordedBy")
# rb_df.value_counts("recordedBy")

# import ipyplot
# ipyplot.plot_class_representations(images, labels, img_width=150)
# rb_df.columns

# images = rb_df.thumb_path.values.tolist()
# labels = rb_df.

# I. EDA & metadata-guided analysis based on `recordedBy` and `country` columns

## Inspect `recordedBy` column

    * Sort df by value of `recordedBy` column

### A. Create na-contribution, single-contribution & multi-contribution partitions of the dataset
- by dividing samples between  
    i. those with NaN values for `recordedBy`  
    ii. those from contributors with only 1 included specimen  
    iii. and those from contributors with 2 or more,  
respectively.

* **Goal**: Looking for unique patterns that often result from a single source
* **Note**: rows with NaN values in the `recordedBy` column are kept in their own `na_contrib` partition; they are excluded from both the `single_contrib` and `multi_contrib` collections

In [None]:
import ipyplot

In [None]:
# Order rows by contributor so that specimens from the same source sit next to each other.
rb_df = df.sort_values("recordedBy")

# Specimen counts per contributor, narrowed down to contributors with 2 or more specimens.
v_counts = rb_df.value_counts("recordedBy")
v_counts = v_counts[v_counts>1]

# `values` are the recordedBy names that satisfy the test condition: having more than 1 specimen.
values = v_counts.index.values

# Boolean row masks shared by the three partitions below.
has_contributor = rb_df.recordedBy.notna()
is_repeat_contributor = rb_df.recordedBy.isin(values)

multi_contrib = rb_df[is_repeat_contributor & has_contributor]
single_contrib = rb_df[~is_repeat_contributor & has_contributor]
na_contrib = rb_df[~has_contributor]

print(f"{multi_contrib.shape=}, {single_contrib.shape=}, {na_contrib.shape=}")

# Sanity checks: NaN contributors live only in `na_contrib`.
assert single_contrib.recordedBy.isna().sum() == 0
assert multi_contrib.recordedBy.isna().sum() == 0
assert na_contrib.recordedBy.isna().sum() == na_contrib.shape[0]


In [None]:
12263 + 47 + 4134

In [None]:
# single_contrib[single_contrib.recordedBy.isna()].describe(include='all')
# multi_contrib[multi_contrib.recordedBy.isna()]

# labels[labels=="None"]

# labels.isna().sum()#.info()

# labels.iloc[labels.isna()] = "NA"

# labels.value_counts()

# II. Visual Inspection of image groups

## A. Plotting single contributor images in a grid

<!-- * Replace NaN values with a string placeholder of "NA" -->

### (A.0) - display plots

In [None]:
imgs, labels = single_contrib.thumb_path, single_contrib.recordedBy
# labels.iloc[labels.isna()] = "NA"
labels.value_counts().T

ipyplot.plot_class_representations(imgs.values, labels.values, img_width=200)

In [None]:
labels.shape

### (A.1) - Conclusion: 

`0` undesired images out of `47`, all Fossils

In [None]:
## B. Plotting na-contributor images in tabs, grouped by `country`

# na_contrib.shape

# Plotting images by class between tabs

## Looking closely at all rows with NaN values for `recordedBy`, group them by `country`

1. `4,134` out of `16,444` specimens have `NaN` values for `recordedBy`  
2. `92%` of these have either `NaN` (50%) or `USA` (42%) for `country`
3. After manually inspecting all images grouped by country below, all of the countries except `NA`, `USA`, and `Brazil` contain valid entries

In [None]:
## Instantiate a list to keep track of our annotations

marked_for_removal = []

### (B.0) - Inspect low-contribution groups
    * Manually inspect rows with both  
        a. NaN in `recordedBy` column, and  
        b. column `country` value with fewer than 120 rows in total

* Sort values by HSV `h` (hue) channel value (the alternative sort by `v` is left commented out in the cell below)

In [None]:
# na_contrib = na_contrib.sort_values("v", ascending=False)
na_contrib = na_contrib.sort_values("h", ascending=False)


In [None]:
## 1. Select all rows with `recordedBy` == NaN
# na_recordedBy = rb_df[rb_df.recordedBy.isna()]
# print(f"{rb_df.shape=}, {na_recordedBy.shape=}")

print(f"{rb_df.shape=}, {na_contrib.shape=}")

na_contrib.columns
# na_contrib.describe(include='all')

In [None]:
# Group the NaN-`recordedBy` rows by country: pull out the parallel Series used by the plotting cells.
label_key = 'country'

# imgs, labels = na_contrib.thumb_path, na_contrib.loc[:,label_key]
imgs, labels = na_contrib.thumb_path, na_contrib[label_key]
ids = na_contrib["identifier"]
h, s, v = na_contrib["h"], na_contrib["s"], na_contrib["v"]
# Replace NaN values with string placeholder "NA" so that rows without a country get their own tab
# labels.iloc[labels.isna()] = "NA"
labels = labels.fillna("NA")

# labels.value_counts()

# Every row must carry a distinct identifier, and every row must carry a (non-NaN) label.
assert ids.value_counts().shape[0] == labels.shape[0]
assert labels.value_counts().sum() == labels.shape[0]

# Side-by-side table of absolute and normalized counts per country.
vc_norm = labels.value_counts(True)
vc = labels.value_counts()
pd.concat([vc, vc_norm], axis=1)

# value_counts() is ordered most-frequent first; reversing puts the smallest countries first in the tabs.
tabs_order = labels.value_counts().index.values[::-1]

#### Plot max=120 images per country's tab to quickly weed out the smaller countries

In [None]:
ipyplot.plot_class_tabs(imgs.values, labels.values, custom_texts=ids.values, max_imgs_per_tab=120, img_width=150, tabs_order=tabs_order)

### (B.0) - Conclusion:
* Saving `USA` and `NA` for the next step, the only low-contribution group with undesirable specimens is `1` row with `recordedBy=NaN` & `country="Brazil"`

In [None]:
idx_to_drop = labels[labels=="Brazil"].index
thumb_paths_to_drop = imgs[idx_to_drop].values.tolist()

thumb_paths_to_drop
assert len(thumb_paths_to_drop) == 1

marked_for_removal.append({
    "thumb_path": thumb_paths_to_drop[0],
    "reason": "Image does not contain a Fossil"
})

In [None]:
marked_for_removal

### (B.1) - Inspect high-contribution groups
    * Narrow down to only the 3 countries of interest

1. `NA` - `2,051` specimens need to be reviewed
2. `USA` - `1,720` specimens need to be reviewed
3. `Brazil` - `1` specimen -- patently undesired image based upon manual inspection

In [None]:
# labels.value_counts()

In [None]:
# # Select only specimens from countries with at least `thresh` entries
# thresh = 25
# in_thresh = labels.value_counts() >= thresh
# keep_idx = labels.apply(lambda x: in_thresh[x])
# imgs = imgs[keep_idx.index]
# labels = labels[keep_idx]

#######################
# # Select only specimens from countries in our manually constructed search query
# Keep only the rows whose country label is one of the 3 groups still under review.
search_query = ["NA", "USA", "Brazil"]
in_query = labels.apply(lambda x: x in search_query)
# NOTE: despite its name, `keep_idx` is the filtered label Series; only its `.index` is used below.
keep_idx = labels[in_query]

# Apply the same index selection to every parallel Series so they stay aligned.
imgs = imgs[keep_idx.index]
labels = labels[keep_idx.index]
ids = ids[keep_idx.index]

h = h[keep_idx.index]
s = s[keep_idx.index]
v = v[keep_idx.index]
###############
###############

imgs.shape
labels.shape
labels.value_counts()#label_key)

### sort by `v`

In [None]:
sorted_idx = v.sort_values(ascending=False).index.values

imgs = imgs[sorted_idx]
labels = labels[sorted_idx]
ids = ids[sorted_idx]
h = h[sorted_idx]
s = s[sorted_idx]
v = v[sorted_idx]

#### Display the 3 countries of interest

In [None]:
# Display the selected specimens in one tab per country label, captioned with their identifiers.
# To page through a sub-range instead, set e.g. start_idx = 400 and end_idx = 1025.
start_idx = 0
# `None` (not -1) so the slice runs through the final element: `[0:-1]` silently dropped the last specimen.
end_idx = None

# All three entries are plain arrays sliced identically, so they stay positionally aligned.
kwargs = {
    "images": imgs.values[start_idx:end_idx],
    "labels": labels.values[start_idx:end_idx],
    "custom_texts": ids.values[start_idx:end_idx]
}
ipyplot.plot_class_tabs(**kwargs,
                        max_imgs_per_tab=2500, # 1025,
                        img_width=200)

In [None]:
# import cv2
# import matplotlib.pyplot as plt

# img = cv2.imread(file_paths_to_drop[0])
# plt.imshow(img[:,:,::-1])

# rows2drop = rb_df[rb_df.thumb_path.apply(lambda x: x in file_paths_to_drop)]
# identifier2drop = rows2drop.identifier
# identifier2drop

In [None]:
# start_idx = 0
# end_idx = -1

idx = labels[labels=="NA"].index.values

kwargs = {
    "images": imgs[idx].values,
    "labels": labels[idx].values,
    "custom_texts": ids[idx].values
}

kwargs["images"].shape

In [None]:
selected_idx = [
2,
3,
7,
8,
13,
15,
19,
22,
24,
25,
27,
33,
34,
38,
42,
52,
53,
55,
56,
57,
58,
62,
67,
68,
87,
91,
93,
99,
106,
108,
111,
115,
119,
121,
122,
124,
126,
127,
129,
130,
131,
132,
139,
140,
143,
144,
145,
146,
147,
148,
153,
154,
156,
157,
158,
161,
162,
163,
164,
165,
166,
170,
171,
172,
174,
175,
176,
177,
178,
181,
183,
188,
189,
190,
191,
192,
193,
194,
195,
196,
197,
198,
199,
204,
208,
209,
210,
213,
225,
231,
235,
260,
263,
289,
318,
550,
664,
1561
]

In [None]:
kwargs = {k: v[selected_idx] for k, v in kwargs.items()}
kwargs['images'].shape

In [None]:
marked_for_removal.extend(
    [
        {
            "thumb_path": path,
            "reason": "Specimen image appears not to be a Fossil, found by manual inspection after filtering for rows with NaN values in `recordedBy`, then rows with NaN values in `country`, then sorting by `v`"
        }
        for path in kwargs["images"]
    ]
)

In [None]:
len(marked_for_removal)

### (B.1) - Conclusion:
* Manually reviewed the top 2 `country` values (after filtering for recordedBy=NaN) for rows to remove
Found:  
1. `USA` has `0` rows to remove
2. `NA` has `98` rows to remove

## Outcome of our 1st exploration procedure (I.(B.0) and I.(B.1)):
    1. Filtered to include only those with NaN values in the `recordedBy` column ( `4,134` out of `16,444` specimens)
    2. Grouped by each row's value in the `country` column (max: NaN with 2,051 | min: 4-way tie with 1)
    
    
Found a total of:  
    1. `1` row that needs to be removed with `country="Brazil"`.  
    2. `0` rows that need to be removed with `country="USA"`.  
    3. `98` rows that need to be removed with `country=NaN`.  

In [None]:
# removal_df = rb_df[rb_df.identifier.apply(lambda x: x in kwargs["custom_texts"])]
# removal_df


# dp = mk.DataPanel.from_pandas(removal_df.sort_values(["recordedBy", "country"], ascending=False))

# _cols = dp.columns
# _cols.remove("recordedBy")
# dp["thumbnail"] = mk.ImageColumn.from_filepaths(dp["thumb_path"])

In [None]:
16444-4082-99

## Summary of Section I.

We've so far found:

* `99` rows out of `16,444` to remove
* `4082` (`47` + `4035`) rows out of `16,444` to keep

What remains to be reviewed:

* `12,263` rows out of `16,444` remain to be seen

In [None]:
16444 - 4134

# II. Double check entries with valid `recordedBy` values

In [None]:
# imgs, labels = multi_contrib.thumb_path, multi_contrib.recordedBy
# # labels.iloc[labels.isna()] = "NA"
# labels.value_counts()

## II.a) -- 1st check those with contributors with < 50 contributions

1. `1,196` out of `12,263`/`16,444` (remaining/total) specimens have values for `recordedBy` corresponding to contributors with more than 1 but less than 50 contributions  

In [None]:
## 1. Select all rows with `recordedBy` values from contributors with more than 1 but less than thresh # of contributions

label_key = "recordedBy"
thresh = 50

# Count specimens per contributor (value_counts ignores NaN) and keep the names with 2..thresh-1 specimens.
labels = rb_df.recordedBy
vc = labels.value_counts()
included_labels = vc[(vc < thresh) & (vc > 1)].index.values


# Rows whose contributor is in the selected name set.
valid_recordedBy = rb_df[rb_df.recordedBy.apply(lambda x: x in included_labels)]
print(f"{rb_df.shape=}, {valid_recordedBy.shape=}")
print(f"{rb_df.shape[0] - valid_recordedBy.shape[0]=}")


# label_key = 'country'
label_key = 'recordedBy'
imgs, labels = valid_recordedBy.thumb_path, valid_recordedBy.loc[:,label_key]
# The selection above must not have let any NaN contributor through.
assert labels.isna().sum() == 0

# Side-by-side table of absolute and normalized counts per contributor (recomputed on the filtered rows).
vc_norm = labels.value_counts(True)
vc = labels.value_counts()
pd.concat([vc, vc_norm], axis=1)

# Tabs ordered from the most to the least prolific contributor.
tabs_order = labels.value_counts().index.values

In [None]:
vc.shape
vc.sum()

In [None]:
12310 - 1196

16444 - 12310

4134 + 1196

In [None]:
ipyplot.plot_class_tabs(imgs.values, labels.values, max_imgs_per_tab=55, img_width=200, tabs_order=tabs_order)#, force_b64=True)

In [None]:
16444 - (4082 + 1196 + 99)

In [None]:
rb_df.value_counts("recordedBy").shape

### II.a) - Conclusion:
* Manually reviewed the rows with valid `recordedBy` values from contributors with # of contributions between 1 and 50
Found:  
1. `0` out of `1,196` rows to remove from any of the `117` of `195` total unique contributors

So far, I've reviewed `5,377` of `16,444` total, leaving `11,067` left to review

## 2nd, check those with contributors with >= 50 contributions

1. `11,067` out of `16,444` specimens have `recordedBy` values belonging to contributors with >= 50 contributions each (`31` out of `195` known contributors)

In [None]:
rb_df.value_counts('recordedBy').shape

In [None]:
vc[(vc >= thresh)].sum() +vc[(vc < thresh)].sum()

In [None]:
## 2. Select all rows with `recordedBy` values from contributors with more than or equal to thresh # of contributions

thresh = 50

labels = rb_df.recordedBy
vc = labels.value_counts()
included_labels = vc[(vc >= thresh)].index.values


valid_recordedBy = rb_df[rb_df.recordedBy.apply(lambda x: x in included_labels)]
print(f"{rb_df.shape=}, {valid_recordedBy.shape=}")
print(f"{rb_df.shape[0] - valid_recordedBy.shape[0]=}")


# label_key = 'country'
label_key = 'recordedBy'
imgs, labels = valid_recordedBy.thumb_path, valid_recordedBy.loc[:,label_key]
assert labels.isna().sum() == 0

In [None]:
vc_norm = labels.value_counts(True)
vc = labels.value_counts()
pd.concat([vc, vc_norm], axis=1)

tabs_order = labels.value_counts().index.values

vc.sum()

In [None]:
vc.iloc[1:].sum() + 4774

In [None]:
inspection_dict = {
    "contains many valid Fossils with misleadingly bright color/saturation values":
        ["H. F. Wells",
         "George R. Wieland"]
}

### Since it's such an outlier in terms of specimens-per-contributor, let's display the top 1 contributor separately

In [None]:
imgs.shape

In [None]:
ipyplot.plot_class_tabs(imgs.values, labels.values, max_imgs_per_tab=1000, img_width=150, tabs_order=tabs_order[1:][::-1])

In [None]:
### Found `0` rows to remove out of `4,774` total contributed by `"Samuel S. Strong"`

In [None]:
675

In [None]:
# ipyplot.plot_class_tabs(imgs.values, labels.values, max_imgs_per_tab=1000, img_width=150, tabs_order=tabs_order[:1])

# labels_select = [l for l in labels if l in tabs_order[:1]]
labels_select = labels[labels.apply(lambda x: x in tabs_order[:1])]
idx_select = labels_select.index.values

imgs_select = imgs.loc[idx_select]

In [None]:
labels_select.shape
imgs_select.head()

### Found `0` rows to remove out of `4,774` total contributed by `"Samuel S. Strong"`

In [None]:
# imgs_sel, 

ipyplot.plot_class_tabs(imgs_select.values, labels_select.values,
                    max_imgs_per_tab=5000, img_width=150)#, tabs_order=tabs_order[:1])

In [None]:
# mark_for_inspection = {"recordedBy":
#                            ["Leo J. Hickey"]
#                       }
# Found 1 possible mistake in "Leo J. Hickey"

In [None]:
_imgs = mk.ImageColumn.from_filepaths(rb_df[rb_df.recordedBy=="Leo J. Hickey"].thumb_path.values)
_imgs

In [None]:
## 1. Select all rows with `recordedBy` values from contributors with more than thresh # of contributions

# thresh = 1
# labels = rb_df.recordedBy

# vc = labels.value_counts()
# included_labels = vc[vc > thresh].index.values

# valid_recordedBy = rb_df[rb_df.recordedBy.apply(lambda x: x in included_labels)]
# print(f"{rb_df.shape=}, {valid_recordedBy.shape=}")
# print(f"{rb_df.shape[0] - valid_recordedBy.shape[0]=}")

# thresh = 25
# in_thresh = labels.value_counts() >= thresh
# keep_idx = labels.apply(lambda x: in_thresh[x])
# imgs = imgs[keep_idx.index]
# labels = labels[keep_idx]


# label_key = 'country'
# label_key = 'recordedBy'
# imgs, labels = valid_recordedBy.thumb_path, valid_recordedBy.loc[:,label_key]

# assert labels.isna().sum() == 0
#Replace NaN values with string placeholder "NA"
# labels.iloc[labels.isna()] = "NA"

# vc_norm = labels.value_counts(True)
# vc = labels.value_counts()
# pd.concat([vc, vc_norm], axis=1)

# tabs_order = labels.value_counts().index.values

# ipyplot.plot_class_tabs(imgs.values, labels.values, max_imgs_per_tab=50, img_width=200, tabs_order=tabs_order)

* (1:35 AM Monday September 19th, 2022) -- Inspected all `recordedBy` values for contributors with < 50 included specimens, found no issues except for possible herbarium twigs & stems included in 1 contributor's entries (`Jeffrey B. Doran`)


* (4:29 AM Monday September 19th, 2022) -- Inspected all `recordedBy` values for contributors with >= 50 included specimens (except for `Samuel S. Strong` who contributed the most out of anyone, with a total of `4,774` records), found only 1 problem specimen contributed by `Leo J. Hickey`

### Below, we visualize the specimens selected for possible removal

In [None]:
# marked_for_removal = []

In [None]:
dp = mk.DataPanel.from_pandas(rb_df.sort_values("recordedBy", ascending=False))

_cols = dp.columns
_cols.remove("recordedBy")
dp["thumbnail"] = mk.ImageColumn.from_filepaths(dp["thumb_path"])
dp = dp[["thumbnail", "recordedBy", *_cols]]
# dp

In [None]:
selected = dp[dp["recordedBy"] == "Leo J. Hickey"].lz[22:23]

In [None]:
selected
len(marked_for_removal)

marked_for_removal.append({
    "thumb_path": selected["thumb_path"][0],
    "reason": "Image does not contain a Fossil"
})

len(marked_for_removal)

Some samples contributed by Jeffrey B. Doran were marked as being possibly non-Fossils

In [None]:
selected = dp[dp["recordedBy"] == "Jeffrey B. Doran"]# .lz[22:23]
selected

Based on visual inspection of this small subset (19 rows), it becomes clear that simply sorting the subset in order of increasing value for `v` column (`value` in HSV image color formats) allows us to cleanly divide into 2 categories, valid vs. invalid specimens 

In [None]:
selected = selected.sort(by="v") 
#[selected.columns[:5]]

to_remove_dp = selected[12:]
to_keep_dp = selected[:12]

In [None]:
marked_for_removal.extend(
    [
        {
            "thumb_path": path,
            "reason": "Image does not contain a Fossil, but what appears to be a collection of stems or roots on bright white backgrounds."
        }
        for path in to_remove_dp["thumb_path"].to_pandas().values.tolist()
    ]
)
# marked_for_removal

len(rb_df)
len(marked_for_removal)
len(rb_df) - len(marked_for_removal)

In [None]:
rb_df.describe(include='all')

In [None]:
# marked_for_removal[0]["thumb_path"] = marked_for_removal[0]["thumb_path"][0]

In [None]:
marked_for_removal_dict = {}
for k in marked_for_removal[0].keys():
    marked_for_removal_dict[k] = [i[k] for i in marked_for_removal]
    
len(marked_for_removal_dict)
    

In [None]:
# Turn the list of {"thumb_path", "reason"} records into a dataframe ...
removal_df = pd.DataFrame.from_records(marked_for_removal)

# ... then attach the full catalog row of each flagged image (inner join on `thumb_path`),
# rename the free-text reason column and drop the annotation label column.
removal_df = (
    removal_df
    .merge(rb_df, on="thumb_path")
    .rename(columns={"reason":"reason_removed"})
    .drop(columns=["label"])
)
# removal_df
# Hard-coded expectation for this dataset snapshot: 107 flagged rows x 52 columns.
assert removal_df.shape == (107,52)

# Identifiers of the flagged rows, used to filter the catalog in the next step.
ids2remove = removal_df.identifier.values.tolist()

# rb_df[rb_df.identifier.apply(lambda x: x in ids2remove)].shape

In [None]:
removal_df.describe(include='all')

In [None]:
# Keep every catalog row whose identifier was NOT flagged for removal.
# NOTE(review): this is a per-row membership test against a Python list; `~rb_df.identifier.isin(ids2remove)`
# looks like an equivalent vectorized form -- confirm before switching.
final_df2keep = rb_df[rb_df.identifier.apply(lambda x: x not in ids2remove)]

# Drop the annotation label column, mirroring what was done for `removal_df`.
final_df2keep = (
    final_df2keep.drop(columns=["label"])
)
# Hard-coded expectation for this dataset snapshot: 16,444 - 107 = 16,337 rows, one column fewer than `removal_df`.
assert final_df2keep.shape == (16337,51)

final_df2keep.describe(include='all')

# III. Save Versions 0 (inputs to this notebook) and 1 (outputs of this notebook) as csv catalogs for sharing with others

In [None]:
# save_dir = Path('/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/official_releases/version_0')
# Version 0 release: the untouched input catalog plus a short README describing it.
save_dir = Path("/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/official_releases/version_0")
os.makedirs(save_dir, exist_ok=True)

rb_df.to_csv(Path(save_dir, "original_catalog.csv"))

# The specimen count is interpolated from `rb_df` (as the version 1 README does for its counts)
# so the README cannot drift from the catalog that was just written.
with open(Path(save_dir, "README.md"), "w") as f:
    f.write(
        f"""
# 2022 yale fossil dataset
## version 0

Created on: Monday Sept 19th, 2022  
Created by: Jacob A Rose, working on data provided by Peter Wilf  

Contains a total of `{len(rb_df):,}` specimens without removing any of the many duplicates and non-Fossil images.  
See versions 1+ for cleaner versions of the catalog.
        """
           )


In [None]:
# save_dir = Path('/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/notebooks/fossil dataset preprocessing/2022-yale_fossil/official_releases/version_1')
save_dir = Path("/media/data_cifs/projects/prj_fossils/data/raw_data/2022-yale_fossil/official_releases/version_1")
os.makedirs(save_dir, exist_ok=True)

save_dir

In [None]:
# removal_df.to_pandas().drop(columns=["thumbnail"]).to_csv(Path(save_dir, "catalog_marked_for_removal.csv"))

removal_df.to_csv(Path(save_dir, "catalog_marked_for_removal.csv"))
final_df2keep.to_csv(Path(save_dir, "catalog_marked_to_keep.csv"))

removal_df.to_parquet(Path(save_dir, "catalog_marked_for_removal.parquet"))
final_df2keep.to_parquet(Path(save_dir, "catalog_marked_to_keep.parquet"))

with open(Path(save_dir, "README.md"), "w") as f:
    f.write(
        f"""
# 2022 yale fossil dataset
## version 1

Created on: Monday Sept 19th, 2022  
Created by: Jacob A Rose, working on data provided by Peter Wilf  

Contains 2 separate catalogs containing respectively:  
1. `catalog_marked_for_removal`: `{removal_df.shape[0]}` specimens for removal, and  
2. `catalog_marked_to_keep`: `{final_df2keep.shape[0]}` specimens for keeping  

from an original total of `16,444` specimens.  

------------  
* The `catalog_marked_for_removal` contains an extra column describing the reason for removal  
* Version 2 will deal with removing the still included sequence of duplicate images
        """
           )


In [None]:
# mark_for_inspection = {"recordedBy":
#                            ["Jeffrey B. Doran"]
#                       }

## Misc extra functions

In [None]:
df = df.sort_values(
    "v",
    ascending=True, # False,
    ignore_index=True
)

df

bins = [0.0, 0.25, 0.5, 0.75, 1.0] #[:-1]

df["quantiles"], o_bins = pd.qcut(
    df["v"],
    len(bins),
    labels=bins,
    precision=2,
    retbins=True
)

df.describe(include="all")

largest_idx = df.groupby("quantiles")["v"].nlargest(10).reset_index(level=0).index
smallest_idx = df.groupby("quantiles")["v"].nsmallest(10).reset_index(level=0).index


largest_idx
smallest_idx

from more_itertools import unzip

smallest = df.loc[smallest_idx,:]
largest = df.loc[largest_idx,:]


i, paths, quantiles, v_list = [
    list(c) for c in unzip(
        smallest[["path", "quantiles", "v"]].to_records()
    )
]


# i, paths, quantiles, v_list = [
#     list(c) for c in unzip(
#         largest[["path", "quantiles", "v"]].to_records()
#     )
# ]


import ipyplot

# One tab per quantile bin. The tab labels are the *formatted* strings, so `tabs_order` has to be built
# with the same formatting: passing the raw float bin values gives tab names that match no label.
# Sort numerically before formatting; sorting the strings would put "100.00%" ahead of "25.00%".
quantile_labels = [f"{q:.2%}" for q in quantiles]
tabs_order = [f"{q:.2%}" for q in sorted(set(quantiles))]

ipyplot.plot_class_tabs(paths,
                        labels=quantile_labels,
                        custom_texts=[f"{v=}" for v in v_list],
                        tabs_order=tabs_order
                    )

records = df.to_records()
records

records[0]



In [None]:
from functools import partial

import pandas as pd
from IPython.core.display import HTML


def generate_file_path_dataframe_fixture() -> pd.DataFrame:
    """
    Build a small 5-row demo dataframe for trying out image rendering inside dataframes.

    The frame has five integer statistics columns plus a 'Country' column that holds the URL of a flag
    image for each row.
    """
    stat_columns = ['Total Cases', 'Total Deaths', 'Total Recovered', 'Total Tests', 'Population']
    stat_rows = [
        [2768571, 130655, 1155027, 34713051, 331002277],
        [1448753, 60632, 790040, 3070447, 212558178],
        [654405, 9536, 422931, 19852167, 145934619],
        [605216, 17848, 359891, 8826585, 1379974505],
        [288477, 9860, 178245, 1699369, 32969875]
    ]
    flag_urls = [
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-United-States-of-America.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-Brazil.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-Russia.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-India.png',
        'https://www.countries-ofthe-world.com/flags-normal/flag-of-Peru.png'
    ]

    fixture = pd.DataFrame(stat_rows, columns=stat_columns)
    fixture['Country'] = flag_urls
    return fixture






def path_to_image_html(path: str,
                       width: int=128) -> str:
    """
    Return an HTML `<img>` tag that displays the image at `path`.

    path: str
        File path or URL used as the tag's `src`. It is HTML-escaped (quotes included) because the result
        is meant to be rendered with escaping turned off, so a raw `"`, `&` or `<` in the path would
        otherwise break out of the attribute.
    width: int, default=128
        Value of the tag's `width` attribute.
    """
    import html

    # str() keeps pathlib.Path inputs working, as they did with plain f-string interpolation.
    src = html.escape(str(path), quote=True)
    return f'<img src="{src}" width="{width}" >'


def display_image_df(df: pd.DataFrame,
                     formatters: Dict[str,Callable]
                    ):
    """
    Render `df` as an HTML table wrapped in an IPython `HTML` display object.

    df: pd.DataFrame
        Dataframe to render.
    formatters: Dict[str,Callable]
        Per-column formatter functions, passed straight to `DataFrame.to_html`.

    Cell escaping is turned off so that markup returned by the formatters (e.g. `<img>` tags) is rendered
    rather than shown as text.
    """
    table_markup = df.to_html(formatters=formatters, escape=False)
    return HTML(table_markup)

formatters = {
    "img": 
    partial(
        path_to_image_html#, width-50
    )
}



df_html = display_image_df(
    df=df.assign(img=df.path.values),
    formatters=formatters
)
df_html