In [1]:
import torch
import pandas as pd
import numpy as np

In [2]:
# load datasets
df_val = pd.read_pickle('pickle/df_val.pkl')
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181087 entries, 0 to 181086
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   video   181087 non-null  object
 1   frame   181087 non-null  int64 
 2   class   181087 non-null  object
 3   exist   181087 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 4.3+ MB


In [3]:
# define paths
TRAIN_DATASET_PATH = '/home/luchocode/projects/tesis/data/selected_exoimages/features/train'
TEST_DATASET_PATH = '/home/luchocode/projects/tesis/data/selected_exoimages/features/test'
VAL_DATASET_PATH = '/home/luchocode/projects/tesis/data/selected_exoimages/features/val'
SEQ_LEN = 12

In [4]:
def set_path_to_df(row, base_dir):
    """
    Set a Path's column for each row in a dataset
    """
    image_name = f"{base_dir}/{row['class']}/['{row['video']}'] frame {row['frame']}.npy"
    return image_name

def group_frames_per_sequence(df, seq_len):
    """
    Generar secuencias de imÃ¡genes de cada video.
    [x1,x2,x3,...,xn] -> yn
    Esto se traduce a:
    [path1,path2,path3,...,path_n] -> yn
    """
    balanced_df = None
    _df = df.reset_index(drop=True)
    for i in range(seq_len, len(df)+1):
        tmp_df = _df.iloc[i-seq_len:i].copy()
        _df.at[i-1,"sequence"] = ",".join(tmp_df["path"].to_list())
    
    return _df

In [5]:
df_val['path'] = df_val.apply(set_path_to_df, axis=1, base_dir=VAL_DATASET_PATH)

In [65]:
seq_val_df = df_val.groupby('video', group_keys=False).apply(lambda x: group_frames_per_sequence(x, SEQ_LEN), include_groups=True)
seq_val_df = seq_val_df.reset_index(drop=True)
seq_val_df.tail()

  seq_val_df = df_val.groupby('video', group_keys=False).apply(lambda x: group_frames_per_sequence(x, SEQ_LEN), include_groups=True)


Unnamed: 0,video,frame,class,exist,path,sequence
181082,IMG_28_1,113670,DW-T-O,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
181083,IMG_28_1,113676,DW-T-O,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
181084,IMG_28_1,113682,DW-T-O,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
181085,IMG_28_1,113688,DW-T-O,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
181086,IMG_28_1,113694,DW-T-O,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...


In [66]:
seq_val_df = seq_val_df.dropna(ignore_index=True)

In [67]:
seq_val_df.to_pickle('pickle/seq_val_df.pkl')

In [7]:
# load dataset
df_test = pd.read_pickle('pickle/df_test.pkl')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96260 entries, 0 to 96259
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   video   96260 non-null  object
 1   frame   96260 non-null  int64 
 2   class   96260 non-null  object
 3   exist   96260 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 2.3+ MB


In [8]:
# define paths
df_test['path'] = df_test.apply(set_path_to_df, axis=1, base_dir=TEST_DATASET_PATH)
df_test.tail()

Unnamed: 0,video,frame,class,exist,path
96255,IMG_14_1,11790,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...
96256,IMG_14_1,11796,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...
96257,IMG_14_1,11802,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...
96258,IMG_14_1,11808,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...
96259,IMG_14_1,11814,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...


In [11]:
# define sequence
seq_test_df = df_test.groupby('video', group_keys=False).apply(lambda x: group_frames_per_sequence(x, 1), include_groups=True)
seq_test_df = seq_test_df.reset_index(drop=True)
seq_test_df.tail()

  seq_test_df = df_test.groupby('video', group_keys=False).apply(lambda x: group_frames_per_sequence(x, 1), include_groups=True)


Unnamed: 0,video,frame,class,exist,path,sequence
96255,IMG_27_2,145524,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
96256,IMG_27_2,145530,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
96257,IMG_27_2,145536,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
96258,IMG_27_2,145542,LG-T-DW,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...
96259,IMG_27_2,145566,LG-T-DS,True,/home/luchocode/projects/tesis/data/selected_e...,/home/luchocode/projects/tesis/data/selected_e...


In [14]:
# write dataframe to disk
seq_test_df.to_pickle('pickle/seq_test_df.pkl')