# Pre-processing

In [1]:
# Filter existing datasets
import os
import pandas as pd
import numpy as np

In [2]:
# Set directories and file paths
# NOTE(review): these are absolute, machine-specific paths — adjust them before
# running the notebook on another machine.
# Root folder of the ExoNet images; first component of every image path.
DATASET_DIR = "/media/luchocode/Extra vol/ExoNet_Images/ExoNet_Images"
# CSV with the frame annotations loaded by pd.read_csv in the next cell.
LABELS_FILE = "/home/luchocode/Downloads/Labels.csv"
# Output folder where the generated train/val/test DataFrames are pickled.
FINAL_DATASETS_DIR = "/media/luchocode/Extra vol/Datasets/Tesis/"
# Image path template, filled positionally as (DATASET_DIR, class, video, frame).
IMAGE_PATH_FORMAT = "{}/{}/['{}'] frame {}.jpg"

In [3]:
# Load the frame annotations and keep only the rows whose image exists on disk.
labels = pd.read_csv(LABELS_FILE)

# Images are stored as <DATASET_DIR>/<class>/['<video>'] frame <frame>.jpg.
# Columns are addressed by name so the check does not depend on the CSV column
# order, and a single zip pass avoids building one Series per row as
# DataFrame.apply(axis=1) does.
labels["exist"] = [
    os.path.exists(IMAGE_PATH_FORMAT.format(DATASET_DIR, cls, video, frame))
    for video, frame, cls in zip(labels["video"], labels["frame"], labels["class"])
]

# "exist" is already boolean, so it can be used directly as the row mask.
labels = labels[labels["exist"]].reset_index(drop=True)
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922782 entries, 0 to 922781
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   video   922782 non-null  object
 1   frame   922782 non-null  int64 
 2   class   922782 non-null  object
 3   exist   922782 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 22.0+ MB


In [4]:
# Generate train, validation and test datasets from the original one.
# The split is made over video names, so every frame of a video lands in a
# single subset.
np.random.seed(42)  # fixed seed keeps the shuffled video order reproducible
video_names = labels["video"].unique()
video_names = np.random.choice(video_names, replace=False, size=len(video_names))

# Despite the "_pct" suffix these hold video counts: 70% for training, 20% for
# validation and whatever remains for testing.
n_videos = len(video_names)
train_pct = int(n_videos * 0.7)
val_pct = int(n_videos * 0.2)
test_pct = n_videos - train_pct - val_pct
print(f"Proportions: {train_pct} - {val_pct} - {test_pct}")

# Cut the shuffled names at the two cumulative boundaries.
train_videos, val_videos, test_videos = np.split(
    video_names, [train_pct, train_pct + val_pct]
)
print(f"Shapes: train-{train_videos.shape}, val-{val_videos.shape}, test-{test_videos.shape}")

Proportions: 39 - 11 - 6
Shapes: train-(39,), val-(11,), test-(6,)


In [5]:
# Route every labelled frame to the subset that owns its video.
train_df, val_df, test_df = (
    labels.loc[labels["video"].isin(subset_videos)]
    for subset_videos in (train_videos, val_videos, test_videos)
)
print(f"Train records: {len(train_df)}, Val records: {len(val_df)}, Test records: {len(test_df)}")

Train records: 639667, Val records: 174840, Test records: 108275


According to the article that introduced the ExoNet database, each video was downsampled to 5 frames/s to minimize the human annotation effort. Therefore, every 5 consecutive frames of a video represent 1 second of recording.

The training dataset must be split into fixed-length sequences. Each element of a sequence represents one past time step (frame) of the video recording. The validation dataset is split into sequences in the same way as the training dataset. The frames in the test dataset are left unchanged, since that dataset simulates a real-time video stream passed to the network.

In [24]:
# Number of consecutive frames in each training/validation sequence.
# At the 5 frames/s sampling rate described above, 100 frames mean that the
# past 20 seconds of video are considered to classify the current frame.
SEQUENCE_LENGTH = 100

def group_frames_per_sequence(df, seq_len):
    """Split the frames of one video into fixed-length, labelled chunks.

    The rows of ``df`` are cut, in their current order, into consecutive
    chunks of ``seq_len`` rows. Each chunk is tagged through a new
    ``video_chunk`` column holding ``"chunk_part<i>"``, where ``i`` is the
    position of the chunk within this video.

    When the number of rows is not a multiple of ``seq_len``, the leftover
    frames are kept by appending one extra chunk made of the *last*
    ``seq_len`` rows of ``df``; that chunk therefore overlaps the previous
    one. When the number of rows is an exact multiple, no extra chunk is
    added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frames of a single video, already in the desired temporal order.
    seq_len : int
        Number of rows in every returned chunk.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows plus the ``video_chunk`` column. Original
        index labels are preserved, so they repeat for the overlapping tail
        chunk. If ``df`` has fewer than ``seq_len`` rows no full chunk can be
        built and an empty frame with the same columns is returned.
    """
    n_frames = len(df)
    n_full, leftover = divmod(n_frames, seq_len)

    # Row offset at which every chunk starts.
    starts = [i * seq_len for i in range(n_full)]
    if leftover and n_full:
        # Cover the trailing frames with a window aligned to the end of the video.
        starts.append(n_frames - seq_len)

    if not starts:
        # Too few frames for a single fixed-length sequence.
        return df.iloc[:0].assign(video_chunk=pd.Series(dtype="object"))

    chunks = []
    for chunk_idx, start in enumerate(starts):
        chunk = df.iloc[start:start + seq_len].copy()
        chunk["video_chunk"] = f"chunk_part{chunk_idx}"
        chunks.append(chunk)

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(chunks, axis=0)

def _split_into_sequences(split_df):
    """Chunk every video of ``split_df`` into SEQUENCE_LENGTH-frame sequences."""
    per_video = split_df.groupby('video', group_keys=False)[['video','frame','class']]
    chunked = per_video.apply(
        group_frames_per_sequence,
        seq_len=SEQUENCE_LENGTH,
        include_groups=False,
    )
    # Drop the original index labels in favour of a fresh 0..n-1 index.
    return chunked.reset_index(drop=True)


# Training and validation go through the same per-video chunking.
seq_train_df = _split_into_sequences(train_df)
seq_val_df = _split_into_sequences(val_df)

print(f"[Training] Number of rows obtained after gruping frames per sequence: {len(seq_train_df)}")
print(f"[Validation] Number of rows obtained after gruping frames per sequence: {len(seq_val_df)}")

[Training] Number of rows obtained after gruping frames per sequence: 641800
[Validation] Number of rows obtained after gruping frames per sequence: 175400


In [26]:
# Save dataframes: write each split as a pickle inside FINAL_DATASETS_DIR.
for pickle_name, split_frame in (
    ("seq_train_df.pkl", seq_train_df),
    ("seq_val_df.pkl", seq_val_df),
    ("test_df.pkl", test_df),
):
    split_frame.to_pickle(os.path.join(FINAL_DATASETS_DIR, pickle_name))