# Pre-processing

In [1]:
# Filter existing datasets
import os
import pandas as pd
import numpy as np

In [2]:
# Set directories and file paths
# NOTE(review): these are absolute paths on the author's machine; adjust them before running elsewhere.
DATASET_DIR = "/mnt/f/ExoNet_Images/ExoNet_Images"  # root folder that contains the image files
LABELS_FILE = "/mnt/f/Descargas/Labels.csv"  # CSV with the video / frame / class labels
FINAL_DATASETS_DIR = "/mnt/f/Datasets/Tesis/"  # folder where the resulting dataframes are saved
# Filled, in this order, with (DATASET_DIR, class, video, frame) to build the path of one image.
IMAGE_PATH_FORMAT = "{}/{}/['{}'] frame {}.jpg"

In [3]:
# Load the frame-level labels and keep only the rows whose image file exists on disk.
labels = pd.read_csv(LABELS_FILE)
# Columns are addressed by name (video, frame, class) rather than by position, so the
# check does not silently break if the CSV column order changes. Iterating over the
# columns directly also avoids the per-row Series that DataFrame.apply(axis=1) builds.
labels["exist"] = [
    os.path.exists(IMAGE_PATH_FORMAT.format(DATASET_DIR, label, video, frame))
    for video, frame, label in zip(labels["video"], labels["frame"], labels["class"])
]
# "exist" is already boolean, so it can be used as the mask directly.
labels = labels[labels["exist"]].reset_index(drop=True)
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922782 entries, 0 to 922781
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   video   922782 non-null  object
 1   frame   922782 non-null  int64 
 2   class   922782 non-null  object
 3   exist   922782 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 22.0+ MB


Los frames de cada video tienen que ser agrupados en secuencias, de tal manera que la clasificación del último frame considere los frames previos de la secuencia. La longitud de secuencia elegida es de 15 frames (el frame actual más los 14 previos), tal como se define en `SEQUENCE_LENGTH` más abajo.

In [4]:
# Generate train, validation and test subsets from the original dataset.
# The split is done on video names rather than on individual frames.
np.random.seed(42)  # fixed seed keeps the shuffled order, and therefore the split, reproducible
video_names = labels["video"].unique()
# Drawing every name without replacement amounts to a random shuffle of the names.
video_names = np.random.choice(video_names, replace=False, size=len(video_names))

# Despite the *_pct suffix, these hold numbers of videos (70 % / 20 % / remainder).
n_videos = len(video_names)
train_pct = int(n_videos * 0.7)
val_pct = int(n_videos * 0.2)
test_pct = int(n_videos - (train_pct + val_pct))
print(f"Proportions: {train_pct} - {val_pct} - {test_pct}")

# Consecutive slices of the shuffled names: [0, train) | [train, val_end) | [val_end, end)
val_end = train_pct + val_pct
train_videos = video_names[:train_pct]
val_videos = video_names[train_pct:val_end]
test_videos = video_names[val_end:]
print(f"Shapes: train-{train_videos.shape}, val-{val_videos.shape}, test-{test_videos.shape}")

Proportions: 39 - 11 - 6
Shapes: train-(39,), val-(11,), test-(6,)


In [42]:
# Build one frame-level table per subset by selecting the rows of its videos;
# each table gets a fresh 0..n-1 index.
train_df, val_df, test_df = (
    labels[labels["video"].isin(subset)].reset_index(drop=True)
    for subset in (train_videos, val_videos, test_videos)
)
print(f"Train records: {len(train_df)}, Val records: {len(val_df)}, Test records: {len(test_df)}")

Train records: 639667, Val records: 174840, Test records: 108275


In [43]:
# Create a column that contains the path to each image
def generate_filepath(row):
    """Return the image path for one labels row.

    The path is IMAGE_PATH_FORMAT filled with DATASET_DIR and the row's
    "class", "video" and "frame" values, in that order.
    """
    label, video, frame = row["class"], row["video"], row["frame"]
    return IMAGE_PATH_FORMAT.format(DATASET_DIR, label, video, frame)
    
# Same row-wise path construction for the three subsets, in train / val / test order.
for split_df in (train_df, val_df, test_df):
    split_df.loc[:,'path'] = split_df.apply(generate_filepath, axis=1)

In [44]:
# Add an empty 'sequence' column to the training and validation subsets only;
# the empty string marks rows that have no sequence assigned yet.
for split_df in (train_df, val_df):
    split_df.loc[:,'sequence'] = ''

In [45]:
# Show the columns, dtypes and non-null counts of the training subset.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639667 entries, 0 to 639666
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   video     639667 non-null  object
 1   frame     639667 non-null  int64 
 2   class     639667 non-null  object
 3   exist     639667 non-null  bool  
 4   path      639667 non-null  object
 5   sequence  639667 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 25.0+ MB


According to the article that introduced the ExoNet database, each video was downsampled to 5 frames/s to reduce the manual annotation effort. Therefore, every 5 consecutive frames of a video represent 1 second of recording.

The training dataset must be split into fixed-length sequences. Each sequence contains the current frame together with the frames that immediately precede it in the same video, and it is labelled with the class of its last frame. The validation dataset is split into sequences in the same way as the training dataset. The frames in the test dataset are left unchanged, since that dataset simulates a real-time video stream passed to the network.

In [49]:
# Each sequence holds 15 frames: the frame being classified plus the 14 frames before it.
# At 5 frames/s this covers 3 seconds of video.
SEQUENCE_LENGTH = 15

def group_frames_per_sequence(df, seq_len):
    """
    Attach to each row the sliding window of image paths that ends at that row.

    For the row at position n (0-based) with n >= seq_len - 1, the "sequence"
    cell becomes the comma-joined "path" values of rows n - seq_len + 1 .. n:

        [path_1, path_2, ..., path_n] -> y_n

    Rows that do not have enough predecessors keep their current "sequence"
    value, or get "" when ``df`` has no "sequence" column.

    NOTE(review): windows follow the row order of ``df``; rows are not sorted
    by frame here and gaps between frames are not detected.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to window over (the notebook passes the rows of one video).
        Must contain a "path" column of strings. It is not modified.
    seq_len : int
        Number of paths in every window; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a fresh 0..n-1 index and the "sequence" column filled.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        # A zero-length window used to append a spurious row at index -1.
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    _df = df.reset_index(drop=True)
    paths = _df["path"].tolist()

    # Start from the existing values so that rows without a full window are left untouched.
    if "sequence" in _df.columns:
        sequences = _df["sequence"].tolist()
    else:
        sequences = [""] * len(paths)

    # Work on plain lists and assign the column once: slicing and copying a
    # DataFrame for every row is very slow on hundreds of thousands of frames.
    for end in range(seq_len, len(paths) + 1):
        sequences[end - 1] = ",".join(paths[end - seq_len:end])

    _df["sequence"] = sequences
    return _df

def _sequences_per_video(split_df):
    """Apply group_frames_per_sequence to each video of ``split_df`` and return the result with a fresh index."""
    per_video = split_df.groupby('video', group_keys=False)[['video','frame','class','path','sequence']]
    windows = per_video.apply(lambda x: group_frames_per_sequence(x, SEQUENCE_LENGTH), include_groups=False)
    return windows.reset_index(drop=True)

# Windows are built video by video, so a sequence never mixes frames of two videos.
seq_train_df = _sequences_per_video(train_df)
seq_val_df = _sequences_per_video(val_df)

print(f"[Training] Number of rows obtained after gruping frames per sequence: {len(seq_train_df)}")
print(f"[Validation] Number of rows obtained after gruping frames per sequence: {len(seq_val_df)}")

[Training] Number of rows obtained after gruping frames per sequence: 639667
[Validation] Number of rows obtained after gruping frames per sequence: 174840


In [62]:
# Keep only the rows whose 'sequence' is not the empty string, then renumber the index.
seq_train_df = seq_train_df.loc[seq_train_df["sequence"].ne("")].reset_index(drop=True)
seq_val_df = seq_val_df.loc[seq_val_df["sequence"].ne("")].reset_index(drop=True)

In [63]:
# Show the columns, dtypes and non-null counts of the training sequences.
seq_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639121 entries, 0 to 639120
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   video     639121 non-null  object
 1   frame     639121 non-null  int64 
 2   class     639121 non-null  object
 3   path      639121 non-null  object
 4   sequence  639121 non-null  object
dtypes: int64(1), object(4)
memory usage: 24.4+ MB


In [64]:
# Show the columns, dtypes and non-null counts of the validation sequences.
seq_val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174686 entries, 0 to 174685
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   video     174686 non-null  object
 1   frame     174686 non-null  int64 
 2   class     174686 non-null  object
 3   path      174686 non-null  object
 4   sequence  174686 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.7+ MB


In [65]:
# Save dataframes
# Create the output folder first: to_pickle does not create missing directories,
# so the export would otherwise fail on a machine where the folder does not exist yet.
os.makedirs(FINAL_DATASETS_DIR, exist_ok=True)
for file_name, frame in (
    ("seq_train_df.pkl", seq_train_df),
    ("seq_val_df.pkl", seq_val_df),
    ("test_df.pkl", test_df),
):
    frame.to_pickle(os.path.join(FINAL_DATASETS_DIR, file_name))