In [1]:
import os

import pandas as pd
import numpy as np
import seaborn as sb
import shutil
import sklearn.metrics as skm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
def find_files_with_extension(directory, extension):
    # List to store the full paths of the files
    file_path_ls = []
    
    # Walk through the directory
    for dirpath, dirnames, filenames in os.walk(directory):
        # Iterate over each file in the current directory
        for filename in filenames:
            # Check if the file ends with the given extension
            if filename.endswith(extension):
                # Construct the full path of the file
                full_path = os.path.join(dirpath, filename)
                # Append the full path to the list
                file_path_ls.append(full_path)

    return file_path_ls

# Wing

In [3]:
DIR_PATH = "datasets/raw/wing_cut/"
file_path_ls = find_files_with_extension(DIR_PATH, ".png")
file_ls = [[path.split(os.sep)[-1],
            path.split(os.sep)[-2].split("_")[0],
            path.split(os.sep)[-1].split("_")[1][:2],
            path.split(os.sep)[-1].split("_")[1][:5],
            path.split(os.sep)[-1].split("_")[2].lower(),
            path.split(os.sep)[-1].split("_")[0]] for path in file_path_ls]

df = pd.DataFrame(file_ls, columns=["File_Name","Depiction", "Species", "ID", "Wing", "Device", ])
df["ID_Wing"] = df["ID"] + "_" + df["Wing"]
df

Unnamed: 0,File_Name,Depiction,Species,ID,Wing,Device,ID_Wing
0,Micro_KOa17_r_.png,wing,KO,KOa17,r,Micro,KOa17_r
1,Macro_JAb04_l_.png,wing,JA,JAb04,l,Macro,JAb04_l
2,Micro_JAb49_r_.png,wing,JA,JAb49,r,Micro,JAb49_r
3,Micro_ALa53_r_.png,wing,AL,ALa53,r,Micro,ALa53_r
4,Macro_AEa19_r_.png,wing,AE,AEa19,r,Macro,AEa19_r
...,...,...,...,...,...,...,...
3145,Macro_KOa06_l_.png,wing,KO,KOa06,l,Macro,KOa06_l
3146,Macro_JAb58_l_.png,wing,JA,JAb58,l,Macro,JAb58_l
3147,Micro_AEa08_l_.png,wing,AE,AEa08,l,Micro,AEa08_l
3148,Macro_ALa42_l_.png,wing,AL,ALa42,l,Macro,ALa42_l


In [4]:
# Find Wing with lowest damage label
df_ref = pd.read_pickle("old/wing_dataframe_dmg_label.pkl")
df_ref = df_ref.loc[df_ref["device"] == "Micro"]
df_ref = df_ref.groupby(["img_id", "orientation", "total_dmg"])["img_name"].count().reset_index()
df_ref = df_ref.sort_values(["img_id", "total_dmg"], ascending=True)
df_ref = df_ref.drop_duplicates(subset='img_id', keep='first')
df_ref["ID_Wing"] = df_ref["img_id"] + "_" + df_ref["orientation"]

# Mark the image with the lowest damage
df["Less_DMG"] = 0
df["Less_DMG"].loc[df["ID_Wing"].isin(df_ref["ID_Wing"])] = 1

# Take only imags with lowest damage value
df = df.loc[df["Less_DMG"] == 1]
df = df.sort_values("ID")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Less_DMG"].loc[df["ID_Wing"].isin(df_ref["ID_Wing"])] = 1


### Split

In [5]:
train, valtest = train_test_split(list(df["ID"].unique()), test_size=0.3, random_state=19, stratify=[x[:2] for x in list(df["ID"].unique())])
val, test = train_test_split(valtest, test_size=0.5, random_state=19, stratify=[x[:2] for x in valtest])

In [6]:
df["Datasplit"] = np.nan
df["Datasplit"].loc[df["ID"].isin(train)] = "train"
df["Datasplit"].loc[df["ID"].isin(val)] = "val"
df["Datasplit"].loc[df["ID"].isin(test)] = "test"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Datasplit"].loc[df["ID"].isin(train)] = "train"
  df["Datasplit"].loc[df["ID"].isin(train)] = "train"


In [7]:
df.groupby(["Datasplit", "Species"])["Device"].value_counts().reset_index()

Unnamed: 0,Datasplit,Species,Device,count
0,test,AE,Macro,30
1,test,AE,Micro,30
2,test,AL,Micro,30
3,test,AL,Macro,30
4,test,JA,Micro,30
5,test,JA,Macro,30
6,test,KO,Micro,30
7,test,KO,Macro,30
8,train,AE,Macro,139
9,train,AE,Micro,139


In [348]:
#df.to_pickle("wing_dataset.pkl")

## move

In [356]:
for img in df["File_Name"]:
    reference = df.loc[df["File_Name"] == img] 
    
    src_path = os.path.join("datasets", "raw", "wing_cut", img)
    dir_path = os.path.join("datasets", "train_ready", "DepictionDataset", "wing",
                            reference["Datasplit"].item(), reference["Species"].item(), img)

    #shutil.copy2(src_path, dir_path)

    if reference["Datasplit"].item() != "train":
        dir_path = os.path.join("datasets", "train_ready", "DeviceDataset", "wing",
                                reference["Datasplit"].item(), reference["Species"].item(), img)
    else:
        dir_path = os.path.join("datasets", "train_ready", "DeviceDataset", "wing",
                                reference["Datasplit"].item(), reference["Device"].item().lower(),reference["Species"].item(), img)

    #shutil.copy2(src_path, dir_path)

# Body

In [8]:
DIR_PATH = "datasets/raw/body_cut/"
file_path_ls = find_files_with_extension(DIR_PATH, ".png")
file_ls = [[path.split(os.sep)[-1],
            path.split(os.sep)[-2].split("_")[0],
            path.split(os.sep)[-1].split("_")[1][:2],
            path.split(os.sep)[-1].split("_")[1][:5],
            path.split(os.sep)[-1].split("_")[0]] for path in file_path_ls]

df = pd.DataFrame(file_ls, columns=["File_Name","Depiction", "Species", "ID", "Device", ])

df["Datasplit"] = np.nan
df["Datasplit"].loc[df["ID"].isin(train)] = "train"
df["Datasplit"].loc[df["ID"].isin(val)] = "val"
df["Datasplit"].loc[df["ID"].isin(test)] = "test"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Datasplit"].loc[df["ID"].isin(train)] = "train"
  df["Datasplit"].loc[df["ID"].isin(train)] = "train"


In [9]:
df

Unnamed: 0,File_Name,Depiction,Species,ID,Device,Datasplit
0,Micro_ALa34_.png,body,AL,ALa34,Micro,test
1,Macro_KOa70_2977_.png,body,KO,KOa70,Macro,train
2,Macro_ALb80_3122_.png,body,AL,ALb80,Macro,train
3,Phone_KOa02_2610_.png,body,KO,KOa02,Phone,train
4,Phone_AEb79_3554_.png,body,AE,AEb79,Phone,train
...,...,...,...,...,...,...
2392,Phone_ALa50_2202_.png,body,AL,ALa50,Phone,test
2393,Phone_KOb84_6860_.png,body,KO,KOb84,Phone,val
2394,Phone_JAa76_6619_.png,body,JA,JAa76,Phone,train
2395,Phone_AEa96_2599_.png,body,AE,AEa96,Phone,train


In [10]:
df.groupby(["Datasplit", "Species"])["Device"].value_counts().reset_index()

Unnamed: 0,Datasplit,Species,Device,count
0,test,AE,Macro,30
1,test,AE,Micro,30
2,test,AE,Phone,30
3,test,AL,Phone,30
4,test,AL,Micro,30
5,test,AL,Macro,30
6,test,JA,Phone,30
7,test,JA,Micro,30
8,test,JA,Macro,30
9,test,KO,Phone,30


In [11]:
for img in df.loc[df["Device"] != "Phone", "File_Name"]:
    try: 
        reference = df.loc[df["File_Name"] == img] 
        
        src_path = os.path.join("datasets", "raw", "body_cut", img)
        dir_path = os.path.join("datasets", "train_ready", "DepictionDataset", "body",
                                reference["Datasplit"].item(), reference["Species"].item(), img)

        #shutil.copy2(src_path, dir_path)
    except TypeError:
        print(img)
        
for img in df["File_Name"]:
    try: 
        reference = df.loc[df["File_Name"] == img]
        src_path = os.path.join("datasets", "raw", "body_cut", img)
        
        if reference["Datasplit"].item() != "train":
            dir_path = os.path.join("datasets", "train_ready", "DeviceDataset", "body",
                                    reference["Datasplit"].item(), reference["Species"].item(), img)
        else:
            dir_path = os.path.join("datasets", "train_ready", "DeviceDataset", "body",
                                    reference["Datasplit"].item(), reference["Device"].item().lower(),reference["Species"].item(), img)
        
        shutil.copy2(src_path, dir_path)
        
    except TypeError:
        print(img)

Macro_ALa67_2268_.png
Macro_KOb78_4034_.png
Micro_AEa67_.png
Micro_ALa67_.png
Micro_KOb78_.png
Macro_AEa67_2536_.png
Macro_ALa67_2268_.png
Phone_AEa67_2534_.png
Macro_KOb78_4034_.png
Phone_KOb78_4035_.png
Micro_AEa67_.png
Micro_ALa67_.png
Phone_ALa67_2266_.png
Micro_KOb78_.png
Macro_AEa67_2536_.png
