In [2]:
import pandas as pd

In [4]:
df = pd.read_csv('MABe-mouse-behavior-detection/train.csv')
df

Unnamed: 0,lab_id,video_id,mouse1_strain,mouse1_color,mouse1_sex,mouse1_id,mouse1_age,mouse1_condition,mouse2_strain,mouse2_color,...,pix_per_cm_approx,video_width_pix,video_height_pix,arena_width_cm,arena_height_cm,arena_shape,arena_type,body_parts_tracked,behaviors_labeled,tracking_method
0,AdaptableSnail,44566106,CD-1 (ICR),white,male,10.0,8-12 weeks,wireless device,CD-1 (ICR),white,...,16.0,1228,1068,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""head...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
1,AdaptableSnail,143861384,CD-1 (ICR),white,male,3.0,8-12 weeks,,CD-1 (ICR),white,...,9.7,968,608,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""late...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
2,AdaptableSnail,209576908,CD-1 (ICR),white,male,7.0,8-12 weeks,,CD-1 (ICR),white,...,16.0,1266,1100,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""late...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
3,AdaptableSnail,278643799,CD-1 (ICR),white,male,11.0,8-12 weeks,wireless device,CD-1 (ICR),white,...,16.0,1224,1100,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""head...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
4,AdaptableSnail,351967631,CD-1 (ICR),white,male,14.0,8-12 weeks,,CD-1 (ICR),white,...,16.0,1204,1068,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""late...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8785,UppityFerret,1610683375,C57Bl/6N,black,male,8.0,8 weeks,csds,CD1,white,...,13.0,664,572,38.0,38.0,circular,neutral,"[""body_center"", ""ear_left"", ""ear_right"", ""hip_...","[""mouse1,mouse2,reciprocalsniff"", ""mouse1,mous...",DeepLabCut
8786,UppityFerret,1725214092,C57Bl/6N,black,male,27.0,8 weeks,,CD1,white,...,12.8,608,576,38.0,38.0,circular,neutral,"[""body_center"", ""ear_left"", ""ear_right"", ""hip_...","[""mouse1,mouse2,intromit"", ""mouse1,mouse2,moun...",DeepLabCut
8787,UppityFerret,1960237444,C57Bl/6N,black,male,14.0,8 weeks,csds,CD1,white,...,12.5,592,576,38.0,38.0,circular,neutral,"[""body_center"", ""ear_left"", ""ear_right"", ""hip_...","[""mouse1,mouse2,reciprocalsniff"", ""mouse1,mous...",DeepLabCut
8788,UppityFerret,2045808129,C57Bl/6N,black,male,12.0,8 weeks,csds,CD1,white,...,12.5,624,572,38.0,38.0,circular,neutral,"[""body_center"", ""ear_left"", ""ear_right"", ""hip_...","[""mouse1,mouse2,reciprocalsniff"", ""mouse1,mous...",DeepLabCut


In [5]:
df.columns

Index(['lab_id', 'video_id', 'mouse1_strain', 'mouse1_color', 'mouse1_sex',
       'mouse1_id', 'mouse1_age', 'mouse1_condition', 'mouse2_strain',
       'mouse2_color', 'mouse2_sex', 'mouse2_id', 'mouse2_age',
       'mouse2_condition', 'mouse3_strain', 'mouse3_color', 'mouse3_sex',
       'mouse3_id', 'mouse3_age', 'mouse3_condition', 'mouse4_strain',
       'mouse4_color', 'mouse4_sex', 'mouse4_id', 'mouse4_age',
       'mouse4_condition', 'frames_per_second', 'video_duration_sec',
       'pix_per_cm_approx', 'video_width_pix', 'video_height_pix',
       'arena_width_cm', 'arena_height_cm', 'arena_shape', 'arena_type',
       'body_parts_tracked', 'behaviors_labeled', 'tracking_method'],
      dtype='object')

In [None]:
import pandas as pd
from pathlib import Path

# =========================================================
# Funções utilitárias
# =========================================================
def load_parquet(file_path: Path) -> pd.DataFrame:
    return pd.read_parquet(file_path)

def extract_video_id(file_path: Path) -> int:
    # extrai o nome do arquivo sem extensão
    return int(file_path.stem)

# =========================================================
# Pastas de tracking e anotação
# =========================================================
tracking_root = Path("MABe-mouse-behavior-detection/train_tracking")
annotations_root = Path("MABe-mouse-behavior-detection/train_annotation")

tracking_files = list(tracking_root.rglob("*.parquet"))
annotation_files = list(annotations_root.rglob("*.parquet"))

print(f"{len(tracking_files)} arquivos de tracking encontrados")
print(f"{len(annotation_files)} arquivos de anotação encontrados")

# =========================================================
# Criar dicionário video_id -> annotation
# =========================================================
videoid_to_annotation = {}
for f in annotation_files:
    vid = extract_video_id(f)
    df_ann = load_parquet(f)
    videoid_to_annotation[vid] = df_ann

# =========================================================
# Carregar cada arquivo de tracking e associar comportamento
# =========================================================
all_videos = []

for f in tracking_files:
    vid = extract_video_id(f)
    df_track = load_parquet(f)

    if vid not in videoid_to_annotation:
        print(f"Aviso: não há anotação para vídeo {vid}, pulando...")
        continue

    df_ann = videoid_to_annotation[vid]

    # Ordenar para merge_asof
    df_track_sorted = df_track.sort_values('video_frame').reset_index(drop=True)
    df_ann_sorted = df_ann[['start_frame', 'stop_frame', 'action']].sort_values('start_frame').reset_index(drop=True)

    # merge_asof para associar último start_frame <= video_frame
    df_merged = pd.merge_asof(
        df_track_sorted,
        df_ann_sorted,
        left_on='video_frame',
        right_on='start_frame',
        direction='backward'
    )

    # Preencher apenas se estiver dentro do intervalo [start_frame, stop_frame]
    mask = (df_merged['video_frame'] >= df_merged['start_frame']) & \
           (df_merged['video_frame'] <= df_merged['stop_frame'])
    
    df_merged['behavior'] = None
    df_merged.loc[mask, 'behavior'] = df_merged.loc[mask, 'action']

    # Remover colunas temporárias
    df_merged = df_merged.drop(columns=['start_frame', 'stop_frame', 'action'])

    # Adicionar metadata do df original, se existir
    if 'df' in globals() and 'video_id' in df.columns:
        meta_row = df[df['video_id'] == vid]
        if not meta_row.empty:
            for col in df.columns:
                if col not in df_merged.columns:
                    df_merged[col] = meta_row.iloc[0][col]

    all_videos.append(df_merged)

# =========================================================
# DataFrame final
# =========================================================
df_final = pd.concat(all_videos, ignore_index=True)
print("DataFrame final pronto para treino:", df_final.shape)
print(df_final.head())


8790 arquivos de tracking encontrados
863 arquivos de anotação encontrados


In [5]:
import pandas as pd

# Cria IntervalIndex
intervals = pd.IntervalIndex.from_arrays(df_annotations['start_frame'],
                                         df_annotations['stop_frame'],
                                         closed='both')

# Inicializa contagem de frames com anotação
frames_with_annotation = set()

# Itera pelas anotações (eficiente mesmo para milhões de frames)
for i, row in df_annotations.iterrows():
    frames_in_row = df_tracking.loc[
        (df_tracking['video_frame'] >= row['start_frame']) & 
        (df_tracking['video_frame'] <= row['stop_frame']),
        'video_frame'
    ]
    frames_with_annotation.update(frames_in_row)

# Todos os frames do tracking
all_frames = set(df_tracking['video_frame'])

# Frames sem anotação
frames_no_annotation = all_frames - frames_with_annotation

print(f"Total frames: {len(df_tracking)}")
print(f"Frames sem anotação: {len(frames_no_annotation)} "
      f"({len(frames_no_annotation)/len(all_frames)*100:.2f}%)")


Total frames: 3447062
Frames sem anotação: 4198 (9.90%)


In [8]:
df_merged

Unnamed: 0,video_frame,mouse_id,bodypart,x,y,agent_id,target_id,action,start_frame,stop_frame
0,947,1,ear_right,199.076096,149.289795,,,,,
1,947,1,head,197.037170,127.386330,,,,,
2,947,1,tail_base,147.379089,159.329346,,,,,
3,947,2,ear_left,417.112823,97.994583,,,,,
4,947,2,ear_right,399.308228,96.032570,,,,,
...,...,...,...,...,...,...,...,...,...,...
3447057,36168,1,tail_base,395.470306,234.921738,1,1,rest,35905,36168
3447058,36168,2,ear_left,159.047134,181.406097,1,1,rest,35905,36168
3447059,36168,2,ear_right,138.765213,175.730286,1,1,rest,35905,36168
3447060,36168,2,head,136.938126,203.618652,1,1,rest,35905,36168


In [None]:
df_merged['action'].c

array([<NA>, <NA>, <NA>, ..., 'rest', 'rest', 'rest'], dtype=object)