In [2]:
import cv2
import os
import shutil
import numpy as np
import pandas as pd

from pathlib import Path

In [4]:
# read labels dataframe
labels_path = Path("/mnt/f/Datasets/Tesis/labels_df.pkl")
df_images = pd.read_pickle(labels_path)
df_images.head()

Unnamed: 0,video,frame,class,exist
0,IMG_03_1,0,LG-T-DW,True
1,IMG_03_1,6,LG-T-O,True
2,IMG_03_1,12,LG-T-O,True
3,IMG_03_1,18,LG-T-O,True
4,IMG_03_1,24,LG-T-O,True


In [33]:
def sampling_frames_per_video(
    df_video, 
    base_folder, 
    output_folder, 
    threshold=15.0, 
    blurry_score=100
):
    # create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    count = 0
    saved_count = 0
    last_saved_frame = None

    # sort dataframe by frames
    _df_video = df_video.sort_values(by=['frame'])
    _df_video.info()

    # iterate over each frame of the video group
    for idx, row in _df_video.iterrows():
        image_name = f"['{row['video']}'] frame {row['frame']}.jpg"
        image_path = base_folder / row["class"] / image_name
        gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        
        if last_saved_frame is None:
            is_distinct = True
        else:
            # compute laplacian variance and adaptive similarity sampling
            diff = np.mean(np.abs(gray - last_saved_frame))
            is_distinct = diff > threshold
            if not is_distinct:
                laplacian = cv2.Laplacian(gray, cv2.CV_64F)
                score = laplacian.var()
                is_distinct = score > blurry_score

        # copy image to the output dir if it is distinct from the last frame read
        if is_distinct:
             # create the subfolder in the output folder
            target_subfolder_path = Path(output_folder, row["class"])
            
            if not os.path.exists(target_subfolder_path):
                os.makedirs(target_subfolder_path)
            
            copy_image_path = Path(target_subfolder_path, image_name)
            shutil.copy(image_path, copy_image_path)
            
            last_saved_frame = gray
            saved_count += 1
            if saved_count % 100 == 0:
                print(f"Saved {saved_count} frames...")
        
        count += 1

    print(f"Finished. Scanned {count} frames, saved {saved_count} distinct images.")

In [34]:
# sampling frames per videos based on similarity measurements:
# adaptive similarity sampling and laplacian variance
df_images.groupby('video', group_keys=False).apply(
    lambda df_video: sampling_frames_per_video(
        df_video=df_video,
        base_folder=Path("F:\\ExoNet_Images\\ExoNet_Images"),
        output_folder=Path("F:\\ExoNet_Images_curated"),
        threshold=80.0, 
        blurry_score=300
    ), include_groups=True)

<class 'pandas.core.frame.DataFrame'>
Index: 19436 entries, 417712 to 437147
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   video   19436 non-null  object
 1   frame   19436 non-null  int64 
 2   class   19436 non-null  object
 3   exist   19436 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 626.4+ KB
Saved 100 frames...
Saved 200 frames...
Saved 300 frames...
Saved 400 frames...
Saved 500 frames...
Saved 600 frames...
Saved 700 frames...
Saved 800 frames...
Saved 900 frames...
Saved 1000 frames...
Saved 1100 frames...
Saved 1200 frames...
Saved 1300 frames...
Saved 1400 frames...
Saved 1500 frames...
Saved 1600 frames...
Saved 1700 frames...
Saved 1800 frames...
Saved 1900 frames...
Saved 2000 frames...
Saved 2100 frames...
Saved 2200 frames...
Saved 2300 frames...
Saved 2400 frames...
Saved 2500 frames...
Saved 2600 frames...
Saved 2700 frames...
Saved 2800 frames...
Saved 2900 frames...
Saved 3000 fra

  df_images.groupby('video', group_keys=False).apply(
