In [1]:
import numpy as np
import pandas as pd
import shutil
import os
import math

In [2]:
# Configuration used by the cells below.
# NOTE(review): these are absolute paths on one specific machine — adjust them before running elsewhere.
# Root folder of the source images; files are looked up under <IMAGES_DIR>/<class>/.
IMAGES_DIR = '/home/luchocode/projects/tesis/data/ExoNet_Images_curated'

# Directories for storing training, testing and validation images
TRAIN_SELECTED_IMAGES_OUTPUT_DIR = '/home/luchocode/projects/tesis/data/selected_exoimages/train'
TEST_SELECTED_IMAGES_OUTPUT_DIR = '/home/luchocode/projects/tesis/data/selected_exoimages/test'
VAL_SELECTED_IMAGES_OUTPUT_DIR = '/home/luchocode/projects/tesis/data/selected_exoimages/val'

# Train/val/test data proportions (0.7 + 0.2 + 0.1 = 1)
TRAIN_IMAGE_RATIO = 0.7
VAL_IMAGE_RATIO = 0.2
TEST_IMAGE_RATIO = 0.1

In [3]:
# Load the label table for the curated ExoNet images from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
RAW_LABELS_FILEPATH = "/mnt/f/Datasets/Tesis/labels_df.pkl"
df_images = pd.read_pickle(RAW_LABELS_FILEPATH)
# Print the column names, dtypes and non-null counts of the loaded table.
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922782 entries, 0 to 922781
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   video   922782 non-null  object
 1   frame   922782 non-null  int64 
 2   class   922782 non-null  object
 3   exist   922782 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 22.0+ MB


In [4]:
def check_image_existence(row, base_dir=IMAGES_DIR):
    """
    Report whether the image referenced by a label row is present on disk.

    The path that is probed is
    ``<base_dir>/<class>/['<video>'] frame <frame>.jpg``; the square brackets
    and single quotes are literally part of the file name.

    Args:
        row: Record supporting ``row[key]`` lookups for the ``video``,
            ``frame`` and ``class`` entries (e.g. a DataFrame row).
        base_dir: Root folder under which the class sub-folders are searched.

    Returns:
        bool: ``True`` when the constructed path exists, ``False`` otherwise.
    """
    video, frame, label = row['video'], row['frame'], row['class']
    candidate = os.path.join(base_dir, label, f"['{video}'] frame {frame}.jpg")
    return os.path.exists(candidate)

# Recompute the "exist" flag by probing the disk once per labelled row.
df_images['exist'] = df_images.apply(check_image_existence, axis=1, base_dir=IMAGES_DIR)

# Keep only the rows whose image was found, renumber them from 0 and
# print a summary of the filtered table.
df_images_check = df_images.loc[df_images["exist"]].reset_index(drop=True)
df_images_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876549 entries, 0 to 876548
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   video   876549 non-null  object
 1   frame   876549 non-null  int64 
 2   class   876549 non-null  object
 3   exist   876549 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 20.9+ MB


In [5]:
# Factory for a seeded NumPy random generator (a new generator on every call).
def get_np_random(seed: int = 42) -> np.random.Generator:
    """
    Build a fresh NumPy random generator with a fixed seed.

    Each call returns a new, independent generator, so two generators created
    with the same seed produce the same sequence of draws.

    Args:
        seed: Seed forwarded to ``np.random.default_rng``. Defaults to 42.

    Returns:
        np.random.Generator: The newly created generator.
    """
    return np.random.default_rng(seed)

In [6]:
# Split at the video level: every frame of a given video ends up in exactly
# one of the train / val / test dataframes.
unique_name_videos = df_images_check["video"].unique()
n_videos = len(unique_name_videos)
print(f"Number of videos contained in the dataset: {n_videos}")

# The three proportions are expected to describe a full partition.
assert math.isclose(TRAIN_IMAGE_RATIO + VAL_IMAGE_RATIO + TEST_IMAGE_RATIO, 1.0), \
    "TRAIN/VAL/TEST image ratios must sum to 1"

# Train and val counts are rounded down; test receives the remaining videos,
# so the three counts always add up to n_videos.
n_train_videos = math.floor(n_videos * TRAIN_IMAGE_RATIO)
n_val_videos = math.floor(n_videos * VAL_IMAGE_RATIO)
n_test_videos = n_videos - n_train_videos - n_val_videos

print(f"Number of train videos: {n_train_videos}")
print(f"Number of test videos: {n_test_videos}")
print(f"Number of val videos: {n_val_videos}")

# Shuffle the video names with the seeded generator. Drawing all names without
# replacement yields a random ordering; the call is kept as `choice` because a
# different shuffling routine may produce another order for the same seed.
np_random = get_np_random()
video_names = np_random.choice(unique_name_videos, size=n_videos, replace=False)

# Consecutive, non-overlapping slices of the shuffled names: train | val | test.
train_video_names = video_names[:n_train_videos]
val_video_names = video_names[n_train_videos:n_train_videos+n_val_videos]
test_video_names = video_names[n_train_videos+n_val_videos:]

# Split the global dataframe into train/val/test dataframes by video name.
df_train = df_images_check[df_images_check["video"].isin(train_video_names)].reset_index(drop=True)
df_test = df_images_check[df_images_check["video"].isin(test_video_names)].reset_index(drop=True)
df_val = df_images_check[df_images_check["video"].isin(val_video_names)].reset_index(drop=True)

# Sanity check: no row was lost or duplicated across the three subsets.
assert len(df_train) + len(df_val) + len(df_test) == len(df_images_check), \
    "train/val/test rows do not add up to the full dataset"

print(f"Rows in train: {len(df_train)}")
print(f"Rows in test: {len(df_test)}")
print(f"Rows in val: {len(df_val)}")

Number of videos contained in the dataset: 56
Number of train videos: 39
Number of test videos: 6
Number of val videos: 11
Rows in train: 599202
Rows in test: 96260
Rows in val: 181087


In [7]:
# Persist the three split dataframes under the relative "pickle" folder.
# The folder is created first because to_pickle does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('pickle', exist_ok=True)
df_train.to_pickle('pickle/df_train.pkl')
df_test.to_pickle('pickle/df_test.pkl')
df_val.to_pickle('pickle/df_val.pkl')

In [32]:
def move_images_per_dataframe(
    dataframe_name,
    dataframe,
    input_images_dir,
    output_images_dir
):
    """
    Copy every image listed in ``dataframe`` into a per-class output tree.

    Despite the function name, files are copied with ``shutil.copy`` and the
    source images are left in place. Each image is read from
    ``<input_images_dir>/<class>/['<video>'] frame <frame>.jpg`` and written
    to the same relative location under ``output_images_dir``. An existing
    destination file is overwritten.

    Args:
        dataframe_name: Label used only in the progress message.
        dataframe: DataFrame with ``video``, ``frame`` and ``class`` columns,
            one row per image to copy.
        input_images_dir: Root folder containing the source class folders.
        output_images_dir: Root folder that receives the copies; missing class
            folders are created on demand.

    Returns:
        None

    Raises:
        FileNotFoundError: Propagated from ``shutil.copy`` when a source image
            does not exist.
    """
    print(f"Copying {dataframe_name}...")

    # Destination folders already ensured during this call, so the directory
    # is created once per class instead of being checked for every row.
    prepared_folders = set()

    # Iterate the three needed columns directly; this avoids building a Series
    # object for every row as iterrows() does.
    for video, frame, label in zip(
        dataframe["video"], dataframe["frame"], dataframe["class"]
    ):
        image_name = f"['{video}'] frame {frame}.jpg"

        to_class_folder = os.path.join(output_images_dir, label)
        if to_class_folder not in prepared_folders:
            # exist_ok avoids the separate existence check before creating.
            os.makedirs(to_class_folder, exist_ok=True)
            prepared_folders.add(to_class_folder)

        # copy image
        shutil.copy(
            os.path.join(input_images_dir, label, image_name),
            os.path.join(to_class_folder, image_name),
        )

    print("Complete!")

In [33]:
# Copy the images of each split into its own output directory,
# in the order train, val, test.
for split_name, split_df, split_output_dir in (
    ("train", df_train, TRAIN_SELECTED_IMAGES_OUTPUT_DIR),
    ("val", df_val, VAL_SELECTED_IMAGES_OUTPUT_DIR),
    ("test", df_test, TEST_SELECTED_IMAGES_OUTPUT_DIR),
):
    move_images_per_dataframe(split_name, split_df, IMAGES_DIR, split_output_dir)

Copying train...
Complete!
Copying val...
Complete!
Copying test...
Complete!
