In [1]:
import pandas as pd

# Update Beetle Image Metadata files for viewer

The viewer likely needs unique rows per image for the full-sized images (`group_images`, assuming it's not truly limited by their size) and we want to add more info to the file for the resized images so they can be viewed by `genus`, `species`, `NEON_sampleID`, and `siteID`

In [2]:
# get metadata from HF
df = pd.read_csv("https://huggingface.co/datasets/imageomics/2018-NEON-beetles/resolve/0420eb8c5d582b83220f16aa2f11f36e2e832674/BeetleMeasurements_resized.csv", low_memory = False)
df.head()

Unnamed: 0,pictureID,file_name
0,A00000046175.jpg,beetle_images_resized/A00000046175.jpg
1,A00000022197.jpg,beetle_images_resized/A00000022197.jpg
2,A00000051179.jpg,beetle_images_resized/A00000051179.jpg
3,A00000040712.jpg,beetle_images_resized/A00000040712.jpg
4,A00000041430.jpg,beetle_images_resized/A00000041430.jpg


In [3]:
# get metadata from HF
df_detail = pd.read_csv("https://huggingface.co/datasets/imageomics/2018-NEON-beetles/resolve/54c160e18d3032e4f13003691bb514db4eef4ece/BeetleMeasurements.csv", low_memory = False)
df_detail.head(2)

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,siteID,site_name,plotID,user_name,workflowID,genus,species,combinedID,measureID,file_name
0,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraLength,Yes,"{""x1"": 1055, ""y1"": 154, ""x2"": 1163, ""y2"": 149}",108.115679,1.4041,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_1,581c1309-6b06-4445-9ed5-55ebe366f6ed,group_images/A00000032929.jpg
1,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraWidth,Yes,"{""x1"": 1053, ""y1"": 129, ""x2"": 1057, ""y2"": 179}",50.159745,0.651425,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_1,464836fd-853e-40d5-861c-8c279aec6a55,group_images/A00000032929.jpg


In [4]:
print(df["pictureID"].nunique(), df.shape)

577 (577, 2)


In [5]:
cols_to_add = ["scientificName", "genus", "species", "NEON_sampleID", "siteID"]

In [6]:
for pic_id in list(df["pictureID"]):
    temp = df_detail.loc[df_detail["pictureID"] == pic_id].copy()
    for col in cols_to_add:
        df.loc[df["pictureID"] == pic_id, col] = temp[col].values[0]

df.head()

Unnamed: 0,pictureID,file_name,scientificName,genus,species,NEON_sampleID,siteID
0,A00000046175.jpg,beetle_images_resized/A00000046175.jpg,Pterostichus lachrymosus,Pterostichus,lachrymosus,MLBS_005.20180814.PTELAC2.01,MLBS
1,A00000022197.jpg,beetle_images_resized/A00000022197.jpg,Apristus sp.,Apristus,,SRER_008.S.20180523.APRSP.01,SRER
2,A00000051179.jpg,beetle_images_resized/A00000051179.jpg,Euryderus grossus,Euryderus,grossus,CPER_003.20180719.EURGRO.01,CPER
3,A00000040712.jpg,beetle_images_resized/A00000040712.jpg,Bembidion muscicola,Bembidion,muscicola,TREE_005.W.20180828.BEMMUS.01,TREE
4,A00000041430.jpg,beetle_images_resized/A00000041430.jpg,Pasimachus depressus,Pasimachus,depressus,KONZ_002.W.20180730.PASDEP.01,KONZ


In [7]:
df.to_csv("../data/BeetleMeasurements_resized.csv", index = False)

## Update `group_images` and `group_images_masks` Metadata

Run [sum-buddy](https://github.com/Imageomics/sum-buddy) for folder image contents information (run at root of repo, relative path to local copy of [HF repo](https://huggingface.co/datasets/imageomics/2018-NEON-beetles)).
```console
sum-buddy --output-file metadata/group_images_sb.csv ../2018-NEON-beetles/group_images
sum-buddy --output-file metadata/group_images_masks_sb.csv ../2018-NEON-beetles/group_images_masks
```

In [2]:
# Get metadata for images
meta_df = pd.read_csv("https://huggingface.co/datasets/imageomics/2018-NEON-beetles/resolve/10f6ed40764864e1edc0c0022f66642367161606/BeetleMeasurements.csv", low_memory=False)
meta_df.head(2)

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,user_name,workflowID,genus,species,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up
0,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraLength,Yes,"{""x1"": 1055, ""y1"": 154, ""x2"": 1163, ""y2"": 149}",108.115679,1.4041,Carabus goryi,...,IsaFluck,21652,Carabus,goryi,A00000032929_1,581c1309-6b06-4445-9ed5-55ebe366f6ed,group_images/A00000032929.jpg,"(3712, 5568, 3)","(1299, 1949, 3)","{'x1': 3014, 'y1': 439, 'x2': 3323, 'y2': 425}"
1,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraWidth,Yes,"{""x1"": 1053, ""y1"": 129, ""x2"": 1057, ""y2"": 179}",50.159745,0.651425,Carabus goryi,...,IsaFluck,21652,Carabus,goryi,A00000032929_1,464836fd-853e-40d5-861c-8c279aec6a55,group_images/A00000032929.jpg,"(3712, 5568, 3)","(1299, 1949, 3)","{'x1': 3009, 'y1': 368, 'x2': 3020, 'y2': 511}"


In [3]:
gp_df = pd.read_csv("../metadata/group_images_sb.csv", low_memory=False)
gp_df.head()

Unnamed: 0,filepath,filename,md5
0,../2018-NEON-beetles/group_images/A00000046175...,A00000046175.jpg,ea3cbb9cef140de9b2926650f89710b0
1,../2018-NEON-beetles/group_images/A00000022197...,A00000022197.jpg,d8674743b1edf7a97d133da029835ff9
2,../2018-NEON-beetles/group_images/A00000051179...,A00000051179.jpg,0c2e8c229e231311a1f3c4d34bb7926c
3,../2018-NEON-beetles/group_images/A00000040712...,A00000040712.jpg,86a97b6932627fada9f7461ff961b751
4,../2018-NEON-beetles/group_images/A00000041430...,A00000041430.jpg,b1eb585e813d304837ab579e727e91ae


In [4]:
print(meta_df.shape, gp_df.shape)

(39064, 24) (577, 3)


In [9]:
gp_meta = pd.merge(meta_df, gp_df[["filename", "md5"]],
                   left_on = "pictureID",
                   right_on = "filename",
                   how = "right")
gp_meta.shape

(39063, 26)

In [12]:
gp_meta["dupes"] = gp_meta.duplicated(subset = ["pictureID", "filename", "md5"], keep = "first")
gp_meta["dupes"].value_counts()

dupes
True     38486
False      577
Name: count, dtype: int64

In [13]:
gp_meta_cleaned = gp_meta.loc[~gp_meta["dupes"]].copy()
gp_meta_cleaned.shape

(577, 27)

In [15]:
gp_meta[["pictureID", "filename", "md5"]].nunique()

pictureID    576
filename     577
md5          577
dtype: int64

We have one `pictureID` duplicated, though the `filename` and `md5` are unique. It should be a unique identifier, so why is that happening?

In [16]:
gp_meta_cleaned["double-picID"] = gp_meta_cleaned.duplicated("pictureID", keep = False)
gp_meta_cleaned.loc[gp_meta_cleaned["double-picID"]]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up,filename,md5,dupes,double-picID


Or it's missing...

In [17]:
gp_meta_cleaned[["pictureID", "filename", "md5"]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 577 entries, 0 to 39003
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pictureID  576 non-null    object
 1   filename   577 non-null    object
 2   md5        577 non-null    object
dtypes: object(3)
memory usage: 18.0+ KB


In [18]:
gp_meta_cleaned.loc[gp_meta_cleaned["pictureID"].isna()]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up,filename,md5,dupes,double-picID
25164,,,,,,,,,,,...,,,,,,,A00000006924.jpg,4ee862a3969b432c296ffc7de937afac,False,False


It looks like this has an extra `0` added.

In [21]:
meta_df.loc[meta_df["pictureID"] == "A0000006924.jpg"]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,user_name,workflowID,genus,species,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up


Actually, found the image in the folder. Opened the image, its ID should be `A00000069245.jpg`, as the ID comes from the tube the beetles were in and `A00000069245` is the code on the tube label. We'll rename this image and update the file.

In [22]:
gp_df.loc[gp_df["filename"] == "A00000006924.jpg"] = "A00000069245.jpg"

gp_meta = pd.merge(meta_df, gp_df[["filename", "md5"]],
                   left_on = "pictureID",
                   right_on = "filename",
                   how = "inner")
print(gp_meta.shape)

gp_meta["dupes"] = gp_meta.duplicated(subset = ["pictureID", "filename", "md5"], keep = "first")
print(gp_meta["dupes"].value_counts())

gp_meta_cleaned = gp_meta.loc[~gp_meta["dupes"]].copy()
gp_meta_cleaned.shape

(39064, 26)
dupes
True     38487
False      577
Name: count, dtype: int64


(577, 27)

Now we just adjust the `filename` column to be `file_name` for HF dataset viewer and drop the `dupes` column.

In [24]:
gp_meta_cleaned.rename(columns={"filename": "file_name"}, inplace=True)
gp_meta_cleaned.drop(columns="dupes", inplace=True)
gp_meta_cleaned.head(2)

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,genus,species,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up,file_name.1,md5
0,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraLength,Yes,"{""x1"": 1055, ""y1"": 154, ""x2"": 1163, ""y2"": 149}",108.115679,1.4041,Carabus goryi,...,Carabus,goryi,A00000032929_1,581c1309-6b06-4445-9ed5-55ebe366f6ed,group_images/A00000032929.jpg,"(3712, 5568, 3)","(1299, 1949, 3)","{'x1': 3014, 'y1': 439, 'x2': 3323, 'y2': 425}",A00000032929.jpg,e2110ecefbf13d48f20fb4a51c6ff5a9
78,A00000033585.jpg,"{""x1"": 781, ""y1"": 253, ""x2"": 891, ""y2"": 254}",110.004545,1,ElytraLength,Yes,"{""x1"": 1010, ""y1"": 172, ""x2"": 1029, ""y2"": 229}",60.083276,0.546189,Synuchus impunctatus,...,Synuchus,impunctatus,A00000033585_1,ea4658bb-c7fe-49b5-b3e7-c4c42aa8e084,group_images/A00000033585.jpg,"(3712, 5568, 3)","(1336, 2004, 3)","{'x1': 2806, 'y1': 477, 'x2': 2859, 'y2': 636}",A00000033585.jpg,37a7fb236e3e557fad52d3d5c1f6f054


In [25]:
# Add subset Column
gp_meta_cleaned["subset"] = "group_images"

### Save `group_images` metadata

Use relative path to local copy of [HF repo](https://huggingface.co/datasets/imageomics/2018-NEON-beetles).

In [26]:
gp_meta_cleaned.to_csv("../../2018-NEON-beetles/group_images/metadata.csv", index = False)

### Update Masks Subset

Check for image that was mislabeled.

In [27]:
gp_m_df = pd.read_csv("../metadata/group_images_masks_sb.csv", low_memory=False)

gp_m_df.loc[gp_m_df["filename"] == "A00000006924.jpg"]

Unnamed: 0,filepath,filename,md5


In [29]:
gp_m_df.head()

Unnamed: 0,filepath,filename,md5
0,../2018-NEON-beetles/group_images_masks/A00000...,A00000022093_mask.png,e5e9d762534da4f9cb13752cb9ebf925
1,../2018-NEON-beetles/group_images_masks/BART_0...,BART_068.E.20180627.SYNIMP.01_mask.png,269a237c22d1614d8302fed14dbec9b8
2,../2018-NEON-beetles/group_images_masks/A00000...,A00000034553_mask.png,14a67fceb0171e48592693324f1adf86
3,../2018-NEON-beetles/group_images_masks/A00000...,A00000008976_mask.png,2d41c0c3e36fb0e2d9698a23a2814eae
4,../2018-NEON-beetles/group_images_masks/A00000...,A00000044094_mask.png,fea43f94b5910e10b55f343c57febea6


In [31]:
gp_m_df.loc[gp_m_df["filename"] == "A00000006924_mask.png"]

Unnamed: 0,filepath,filename,md5
419,../2018-NEON-beetles/group_images_masks/A00000...,A00000006924_mask.png,e133b77cc3b12f56b6b9f2613c9194a4


Okay, let's rename this and then we'll create a `pictureID` column to merge with larger dataframe.

In [32]:
gp_m_df.loc[gp_m_df["filename"] == "A00000006924_mask.png", "filename"] = "A00000069245_mask.png"

In [36]:
for pic in list(gp_m_df["filename"]):
    gp_m_df.loc[gp_m_df["filename"] == pic, "pictureID"] = pic.split("_mask")[0] + ".jpg"

gp_m_df.head()

Unnamed: 0,filepath,filename,md5,pictureID
0,../2018-NEON-beetles/group_images_masks/A00000...,A00000022093_mask.png,e5e9d762534da4f9cb13752cb9ebf925,A00000022093.jpg
1,../2018-NEON-beetles/group_images_masks/BART_0...,BART_068.E.20180627.SYNIMP.01_mask.png,269a237c22d1614d8302fed14dbec9b8,BART_068.E.20180627.SYNIMP.01.jpg
2,../2018-NEON-beetles/group_images_masks/A00000...,A00000034553_mask.png,14a67fceb0171e48592693324f1adf86,A00000034553.jpg
3,../2018-NEON-beetles/group_images_masks/A00000...,A00000008976_mask.png,2d41c0c3e36fb0e2d9698a23a2814eae,A00000008976.jpg
4,../2018-NEON-beetles/group_images_masks/A00000...,A00000044094_mask.png,fea43f94b5910e10b55f343c57febea6,A00000044094.jpg


In [37]:
print(gp_m_df.shape)

gp_m_meta = pd.merge(meta_df, gp_m_df[["filename", "md5", "pictureID"]],
                   on = "pictureID",
                   how = "inner")
print(gp_m_meta.shape)

gp_m_meta["dupes"] = gp_m_meta.duplicated(subset = ["pictureID", "filename", "md5"], keep = "first")
print(gp_m_meta["dupes"].value_counts())

gp_m_meta_cleaned = gp_m_meta.loc[~gp_m_meta["dupes"]].copy()
gp_m_meta_cleaned.shape

(577, 4)
(39064, 26)
dupes
True     38487
False      577
Name: count, dtype: int64


(577, 27)

In [39]:
gp_m_meta_cleaned.head(2)

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,species,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up,filename,md5,dupes
0,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraLength,Yes,"{""x1"": 1055, ""y1"": 154, ""x2"": 1163, ""y2"": 149}",108.115679,1.4041,Carabus goryi,...,goryi,A00000032929_1,581c1309-6b06-4445-9ed5-55ebe366f6ed,group_images/A00000032929.jpg,"(3712, 5568, 3)","(1299, 1949, 3)","{'x1': 3014, 'y1': 439, 'x2': 3323, 'y2': 425}",A00000032929_mask.png,af0c4ca9ba88e7011d90a2cdec0e264b,False
78,A00000033585.jpg,"{""x1"": 781, ""y1"": 253, ""x2"": 891, ""y2"": 254}",110.004545,1,ElytraLength,Yes,"{""x1"": 1010, ""y1"": 172, ""x2"": 1029, ""y2"": 229}",60.083276,0.546189,Synuchus impunctatus,...,impunctatus,A00000033585_1,ea4658bb-c7fe-49b5-b3e7-c4c42aa8e084,group_images/A00000033585.jpg,"(3712, 5568, 3)","(1336, 2004, 3)","{'x1': 2806, 'y1': 477, 'x2': 2859, 'y2': 636}",A00000033585_mask.png,f40800c61227a884e6ae608d9decb911,False


Now we just adjust the `filename` column to be `file_name` for HF dataset viewer and drop the `dupes` column.

In [40]:
gp_m_meta_cleaned.rename(columns={"filename": "file_name"}, inplace=True)
gp_m_meta_cleaned.drop(columns="dupes", inplace=True)
gp_m_meta_cleaned.head(2)

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,genus,species,combinedID,measureID,file_name,image_dim,resized_image_dim,coords_pix_scaled_up,file_name.1,md5
0,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraLength,Yes,"{""x1"": 1055, ""y1"": 154, ""x2"": 1163, ""y2"": 149}",108.115679,1.4041,Carabus goryi,...,Carabus,goryi,A00000032929_1,581c1309-6b06-4445-9ed5-55ebe366f6ed,group_images/A00000032929.jpg,"(3712, 5568, 3)","(1299, 1949, 3)","{'x1': 3014, 'y1': 439, 'x2': 3323, 'y2': 425}",A00000032929_mask.png,af0c4ca9ba88e7011d90a2cdec0e264b
78,A00000033585.jpg,"{""x1"": 781, ""y1"": 253, ""x2"": 891, ""y2"": 254}",110.004545,1,ElytraLength,Yes,"{""x1"": 1010, ""y1"": 172, ""x2"": 1029, ""y2"": 229}",60.083276,0.546189,Synuchus impunctatus,...,Synuchus,impunctatus,A00000033585_1,ea4658bb-c7fe-49b5-b3e7-c4c42aa8e084,group_images/A00000033585.jpg,"(3712, 5568, 3)","(1336, 2004, 3)","{'x1': 2806, 'y1': 477, 'x2': 2859, 'y2': 636}",A00000033585_mask.png,f40800c61227a884e6ae608d9decb911


In [41]:
# Add subset Column
gp_m_meta_cleaned["subset"] = "group_images_masks"

#### Save `group_images_masks` metadata file

Use relative path to local copy of [HF repo](https://huggingface.co/datasets/imageomics/2018-NEON-beetles).

In [42]:
gp_m_meta_cleaned.to_csv("../../2018-NEON-beetles/group_images_masks/metadata.csv", index = False)