In [1]:
import pandas as pd
import seaborn as sns

# BeetlePalooza Beetle Measurement Data

In [2]:
# Alternative source (pinned Hugging Face revision), left disabled:
#df = pd.read_csv("https://huggingface.co/datasets/imageomics/BeetlePalooza/resolve/15a82c862588b2e7b709b1aa982161d8c3a7c75f/BeetleMeasurements.csv", low_memory = False)

# Load the local copy of the measurements; low_memory=False makes pandas read
# the file in one pass instead of inferring dtypes chunk by chunk.
measurements_csv = "../data/BeetleMeasurements.csv"
df = pd.read_csv(measurements_csv, low_memory=False)

# Preview the first few rows.
df.head()

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,siteID,site_name,plotID,user_name,workflowID,genus,species,combinedID,measureID,file_name
0,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraLength,Yes,"{""x1"": 1055, ""y1"": 154, ""x2"": 1163, ""y2"": 149}",108.115679,1.4041,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_1,581c1309-6b06-4445-9ed5-55ebe366f6ed,group_images/A00000032929.jpg
1,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,1,ElytraWidth,Yes,"{""x1"": 1053, ""y1"": 129, ""x2"": 1057, ""y2"": 179}",50.159745,0.651425,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_1,464836fd-853e-40d5-861c-8c279aec6a55,group_images/A00000032929.jpg
2,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,2,ElytraLength,Yes,"{""x1"": 1390, ""y1"": 150, ""x2"": 1314, ""y2"": 241}",118.56222,1.539769,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_2,80d48e56-c274-4ca9-854e-07605a62e140,group_images/A00000032929.jpg
3,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,2,ElytraWidth,Yes,"{""x1"": 1369, ""y1"": 136, ""x2"": 1407, ""y2"": 169}",50.328918,0.653622,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_2,839d9bde-1972-49d6-b31c-8aa81c84c0a2,group_images/A00000032929.jpg
4,A00000032929.jpg,"{""x1"": 815, ""y1"": 244, ""x2"": 892, ""y2"": 244}",77.0,3,ElytraLength,Yes,"{""x1"": 507, ""y1"": 378, ""x2"": 501, ""y2"": 487}",109.165013,1.417727,Carabus goryi,...,HARV,Harvard Forest & Quabbin Watershed NEON,HARV_001,IsaFluck,21652,Carabus,goryi,A00000032929_3,d24c06fa-2779-45f9-8985-71c8e6e9418e,group_images/A00000032929.jpg


In [3]:
# Summarize each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39064 entries, 0 to 39063
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pictureID       39064 non-null  object 
 1   scalebar        39064 non-null  object 
 2   cm_pix          39064 non-null  float64
 3   individual      39064 non-null  int64  
 4   structure       39064 non-null  object 
 5   lying_flat      39064 non-null  object 
 6   coords_pix      39064 non-null  object 
 7   dist_pix        39064 non-null  float64
 8   dist_cm         39064 non-null  float64
 9   scientificName  39030 non-null  object 
 10  NEON_sampleID   39064 non-null  object 
 11  siteID          39064 non-null  object 
 12  site_name       39064 non-null  object 
 13  plotID          39064 non-null  object 
 14  user_name       39064 non-null  object 
 15  workflowID      39064 non-null  int64  
 16  genus           39030 non-null  object 
 17  species         39002 non-null 

In [4]:
# Count the distinct values in every column.
df.nunique()

pictureID           577
scalebar           1045
cm_pix              327
individual           99
structure             2
lying_flat            2
coords_pix        38724
dist_pix           4198
dist_cm           31566
scientificName       85
NEON_sampleID       576
siteID               30
site_name            43
plotID              144
user_name             3
workflowID            5
genus                36
species              78
combinedID        11104
measureID         39064
file_name           577
dtype: int64

### Fix Outliers for BeetleMeasurements.csv

We saw in EDA-0-2 that we have 2 outliers. Looking at the pictures, the first one (`A00000046078_10`, annotated by `IsaFluck`) is missing half the elytra (length-wise cut). The second one (`A00000046104_10`, annotated by `rileywolcheski`) is just at an angle; its length is definitely more than its width. We'll have to adjust both of these to be labeled correctly.

We'll get the `measureID` for each so we can switch the length/width labels, then save the updated `BeetleMeasurements.csv` (this will be the final version). From there we'll update `all_measurement.csv` and create `individual_metadata.csv` with just Isadora's annotations so that we should have one row/pair of measurements per beetle.

In [5]:
# Select every row IsaFluck recorded for beetle A00000046078_10.
first_outlier_rows = (df["user_name"] == "IsaFluck") & (df["combinedID"] == "A00000046078_10")
df.loc[first_outlier_rows]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,siteID,site_name,plotID,user_name,workflowID,genus,species,combinedID,measureID,file_name
14542,A00000046078.jpg,"{""x1"": 1242, ""y1"": 307, ""x2"": 1335, ""y2"": 305}",93.021503,10,ElytraLength,Yes,"{""x1"": 1575, ""y1"": 797, ""x2"": 1570, ""y2"": 830}",33.376639,0.358806,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,IsaFluck,21840,Pterostichus,lachrymosus,A00000046078_10,c5347b8e-5789-41b6-82f9-5d44228ad48d,group_images/A00000046078.jpg
14543,A00000046078.jpg,"{""x1"": 1242, ""y1"": 307, ""x2"": 1335, ""y2"": 305}",93.021503,10,ElytraWidth,Yes,"{""x1"": 1574, ""y1"": 812, ""x2"": 1649, ""y2"": 824}",75.953933,0.81652,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,IsaFluck,21840,Pterostichus,lachrymosus,A00000046078_10,6046bedc-d727-457e-89dc-8bb73183aa32,group_images/A00000046078.jpg


Thankfully there is just one individual with this `combinedID`, so we can save the length and width `measureID`s and reassign those labels in the `structure` column.

In [6]:
# Both lookups share the same beetle/annotator filter, so build it once and
# keep only the two columns needed to map a structure label to its measureID.
first_outlier = (df["combinedID"] == "A00000046078_10") & (df["user_name"] == "IsaFluck")
first_outlier_ids = df.loc[first_outlier, ["structure", "measureID"]]

# measureID of the row currently labelled as the length / width measurement.
len_meas_id = first_outlier_ids.loc[first_outlier_ids["structure"] == "ElytraLength", "measureID"].values[0]
w_meas_id = first_outlier_ids.loc[first_outlier_ids["structure"] == "ElytraWidth", "measureID"].values[0]

swap_message = f"We will reasign the measure {len_meas_id} to be structure 'ElytraWidth', and measure {w_meas_id} to be structure 'ElytraLength"
print(swap_message)

We will reasign the measure c5347b8e-5789-41b6-82f9-5d44228ad48d to be structure 'ElytraWidth', and measure 6046bedc-d727-457e-89dc-8bb73183aa32 to be structure 'ElytraLength


In [7]:
# Swap the two structure labels for beetle A00000046078_10.
#
# The swap is guarded: it only happens while the row labelled as the length is
# the shorter of the two measurements. This notebook reads and later overwrites
# the same CSV, so without the guard a second run would pick up the already
# corrected rows and swap them back to the wrong labels.
len_row = df["measureID"] == len_meas_id
w_row = df["measureID"] == w_meas_id

# measureID is unique per row, so each mask selects exactly one distance.
len_cm = df.loc[len_row, "dist_cm"].iloc[0]
w_cm = df.loc[w_row, "dist_cm"].iloc[0]

if len_cm < w_cm:
    df.loc[len_row, "structure"] = "ElytraWidth"
    df.loc[w_row, "structure"] = "ElytraLength"

# Display the pair again to confirm the longer measurement is the length.
df.loc[(df["combinedID"] == "A00000046078_10") & (df["user_name"] == "IsaFluck")]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,siteID,site_name,plotID,user_name,workflowID,genus,species,combinedID,measureID,file_name
14542,A00000046078.jpg,"{""x1"": 1242, ""y1"": 307, ""x2"": 1335, ""y2"": 305}",93.021503,10,ElytraWidth,Yes,"{""x1"": 1575, ""y1"": 797, ""x2"": 1570, ""y2"": 830}",33.376639,0.358806,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,IsaFluck,21840,Pterostichus,lachrymosus,A00000046078_10,c5347b8e-5789-41b6-82f9-5d44228ad48d,group_images/A00000046078.jpg
14543,A00000046078.jpg,"{""x1"": 1242, ""y1"": 307, ""x2"": 1335, ""y2"": 305}",93.021503,10,ElytraLength,Yes,"{""x1"": 1574, ""y1"": 812, ""x2"": 1649, ""y2"": 824}",75.953933,0.81652,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,IsaFluck,21840,Pterostichus,lachrymosus,A00000046078_10,6046bedc-d727-457e-89dc-8bb73183aa32,group_images/A00000046078.jpg


Perfect!

Now let's fix the other measurements from `A00000046104_10`. There should also be just the two measurements (length & width), but we'll double check before proceeding.

In [8]:
# Select every row rileywolcheski recorded for beetle A00000046104_10.
second_outlier_rows = (df["user_name"] == "rileywolcheski") & (df["combinedID"] == "A00000046104_10")
df.loc[second_outlier_rows]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,siteID,site_name,plotID,user_name,workflowID,genus,species,combinedID,measureID,file_name
20866,A00000046104.jpg,"{""x1"": 943, ""y1"": 305, ""x2"": 1037, ""y2"": 304}",94.005319,10,ElytraLength,Yes,"{""x1"": 1710, ""y1"": 421, ""x2"": 1744, ""y2"": 402}",38.948684,0.414324,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,rileywolcheski,21840,Pterostichus,lachrymosus,A00000046104_10,3c7b6409-60c8-4006-b25e-69bdcc4a37b8,group_images/A00000046104.jpg
20867,A00000046104.jpg,"{""x1"": 943, ""y1"": 305, ""x2"": 1037, ""y2"": 304}",94.005319,10,ElytraWidth,Yes,"{""x1"": 1727, ""y1"": 413, ""x2"": 1766, ""y2"": 473}",71.561163,0.761246,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,rileywolcheski,21840,Pterostichus,lachrymosus,A00000046104_10,42156d07-59af-49cb-a77f-9bd6636c2890,group_images/A00000046104.jpg


Yep, just the two!

In [9]:
# Both lookups share the same beetle/annotator filter, so build it once and
# keep only the two columns needed to map a structure label to its measureID.
second_outlier = (df["combinedID"] == "A00000046104_10") & (df["user_name"] == "rileywolcheski")
second_outlier_ids = df.loc[second_outlier, ["structure", "measureID"]]

# measureID of the row currently labelled as the length / width measurement.
len_meas_id = second_outlier_ids.loc[second_outlier_ids["structure"] == "ElytraLength", "measureID"].values[0]
w_meas_id = second_outlier_ids.loc[second_outlier_ids["structure"] == "ElytraWidth", "measureID"].values[0]

swap_message = f"We will reasign the measure {len_meas_id} to be structure 'ElytraWidth', and measure {w_meas_id} to be structure 'ElytraLength"
print(swap_message)

We will reasign the measure 3c7b6409-60c8-4006-b25e-69bdcc4a37b8 to be structure 'ElytraWidth', and measure 42156d07-59af-49cb-a77f-9bd6636c2890 to be structure 'ElytraLength


In [10]:
# Swap the two structure labels for beetle A00000046104_10.
#
# The swap is guarded: it only happens while the row labelled as the length is
# the shorter of the two measurements. This notebook reads and later overwrites
# the same CSV, so without the guard a second run would pick up the already
# corrected rows and swap them back to the wrong labels.
len_row = df["measureID"] == len_meas_id
w_row = df["measureID"] == w_meas_id

# measureID is unique per row, so each mask selects exactly one distance.
len_cm = df.loc[len_row, "dist_cm"].iloc[0]
w_cm = df.loc[w_row, "dist_cm"].iloc[0]

if len_cm < w_cm:
    df.loc[len_row, "structure"] = "ElytraWidth"
    df.loc[w_row, "structure"] = "ElytraLength"

# Display the pair again to confirm the longer measurement is the length.
df.loc[(df["combinedID"] == "A00000046104_10") & (df["user_name"] == "rileywolcheski")]

Unnamed: 0,pictureID,scalebar,cm_pix,individual,structure,lying_flat,coords_pix,dist_pix,dist_cm,scientificName,...,siteID,site_name,plotID,user_name,workflowID,genus,species,combinedID,measureID,file_name
20866,A00000046104.jpg,"{""x1"": 943, ""y1"": 305, ""x2"": 1037, ""y2"": 304}",94.005319,10,ElytraWidth,Yes,"{""x1"": 1710, ""y1"": 421, ""x2"": 1744, ""y2"": 402}",38.948684,0.414324,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,rileywolcheski,21840,Pterostichus,lachrymosus,A00000046104_10,3c7b6409-60c8-4006-b25e-69bdcc4a37b8,group_images/A00000046104.jpg
20867,A00000046104.jpg,"{""x1"": 943, ""y1"": 305, ""x2"": 1037, ""y2"": 304}",94.005319,10,ElytraLength,Yes,"{""x1"": 1727, ""y1"": 413, ""x2"": 1766, ""y2"": 473}",71.561163,0.761246,Pterostichus lachrymosus,...,MLBS,Mountain Lake Biological Station NEON,MLBS_009,rileywolcheski,21840,Pterostichus,lachrymosus,A00000046104_10,42156d07-59af-49cb-a77f-9bd6636c2890,group_images/A00000046104.jpg


### Save Updated Beetle Measurement CSV

In [11]:
# Write the corrected table back out without the positional index column.
# NOTE: this is the same path the notebook loaded from, so the input file is
# overwritten in place.
corrected_csv = "../data/BeetleMeasurements.csv"
df.to_csv(corrected_csv, index=False)