In [None]:
import os
import glob
import shutil
from tqdm import tqdm
from datetime import datetime

import numpy as np
import pandas as pd

import tator
import panoptes_client

In [None]:
def get_now():
    """Return the current local time as a ``YYYY-MM-DD_HH-MM-SS`` string."""
    timestamp_format = "%Y-%m-%d_%H-%M-%S"
    current_time = datetime.now()
    return current_time.strftime(timestamp_format)

### Configs

In [None]:
# Zooniverse credentials are read from the environment rather than stored in the notebook
username = os.getenv('ZOONIVERSE_USERNAME')
password = os.getenv('ZOONIVERSE_PASSWORD')

zoon_project_id = 21853

# Fail fast when either credential is absent.
# NOTE(review): connect() may not raise when given None credentials, which would make the
# "Authentication ... successful" message below misleading -- confirm against panoptes_client.
if not username or not password:
    raise Exception("ERROR: ZOONIVERSE_USERNAME and ZOONIVERSE_PASSWORD must be set in the environment")

try:
    # Login to panoptes using username and password
    panoptes_client.Panoptes.connect(username=username, password=password)
    print(f"NOTE: Authentication to Zooniverse successful for {username}")
except Exception as e:
    # Chain the original error so the root cause stays visible in the traceback
    raise Exception(f"ERROR: Could not login to Panoptes for {username}\n{e}") from e

try:
    # Get access to the Zooniverse project given the provided credentials
    project = panoptes_client.Project.find(id=zoon_project_id)
    print(f"NOTE: Connected to Zooniverse project '{project.title}' successfully")
except Exception as e:
    raise Exception(f"ERROR: Could not access project {zoon_project_id}.\n{e}") from e

In [None]:
# TATOR access token is read from the environment
token = os.getenv('TATOR_TOKEN')
project_id = 70

try:
    # Build the TATOR API client from the token
    api = tator.get_api(host='https://cloud.tator.io', token=token)
    # Project and state-type identifiers used by the later state queries
    tator_project_id, state_type_id = project_id, 288  # 288: State Type (ROV)
    # Asking the server who we are doubles as an authentication check
    tator_user = api.whoami()
    print(f"NOTE: Authentication to TATOR successful for {tator_user.username}")
except Exception as e:
    raise Exception(f"ERROR: Could not obtain needed information from TATOR.\n{e}")

### Make Curated Dataframe

In [None]:
# Collect every curated frame image stored under the per-season media folders
reduced_root = os.path.abspath('../data/reduced/')
frame_pattern = f"{reduced_root}/Season_*/media/**/frames/*.jpg"
curated_data = glob.glob(frame_pattern, recursive=True)

print(f"Found {len(curated_data)} curated media files")

In [None]:
# Maps each media ID to the list of frame numbers curated for it
curated_media = {}

for media_file in tqdm(curated_data, desc="Processing curated media files"):

    # The media ID is the folder two levels above the image (<media_id>/frames/<frame>.jpg)
    frames_dir = os.path.dirname(media_file)
    media_id = os.path.basename(os.path.dirname(frames_dir))

    # The frame number is the image file name without its extension
    frame_number = os.path.splitext(os.path.basename(media_file))[0]

    # Start a new list on first sight of a media ID, then record the frame
    curated_media.setdefault(media_id, []).append(frame_number)


In [None]:
# Rows are collected as dicts keyed by column name, so a state with a missing or extra
# attribute cannot shift values into the wrong column. `columns` records the order in
# which column names were first seen and is consumed when the DataFrame is built.
dataframe = []
columns = []


def _empty_metadata():
    """Return placeholder metadata with the same keys, in the same order, as a successful lookup."""
    return {
        "cruise_id": "",
        "dive_id": "",
        "original_filename": "",
        "name": "",
        "fps": 0,
        "height": 0,
        "width": 0,
        "num_frames": 0,
        "created_datetime": None,
        "camera": "",
        "video_part": "",
    }


def _get_media_metadata(media_id):
    """Fetch metadata for one media from TATOR.

    Returns a dict with the keys of `_empty_metadata()`. If the lookup fails for any
    reason the error is printed and the placeholder values are returned, so the caller
    can still emit rows for that media.
    """
    metadata = _empty_metadata()
    try:
        # Get the media object from TATOR
        media = api.get_media(media_id)

        # Attributes may be absent or None; treat both as "no attributes"
        media_attributes = getattr(media, "attributes", None) or {}

        metadata.update({
            "cruise_id": media_attributes.get("CruiseID", ""),
            "dive_id": media_attributes.get("DiveID", ""),
            "original_filename": media_attributes.get("Original Filename", ""),
            "name": media.name,
            "fps": media.fps,
            "height": media.height,
            "width": media.width,
            "num_frames": media.num_frames,
            "created_datetime": media.created_datetime,
            "camera": media_attributes.get("Camera", ""),
            "video_part": media_attributes.get("VideoPart", ""),
        })
    except Exception as e:
        print(f"ERROR: Could not get media attributes for media ID {media_id}.\n{e}")
    return metadata


for media_id, frame_ids in tqdm(curated_media.items(), desc="Processing media IDs"):

    # Get the metadata for the media ID (placeholders on failure)
    metadata = _get_media_metadata(media_id)

    # Set membership keeps the per-state frame check O(1)
    wanted_frames = set(frame_ids)

    try:
        # Get the Navigation data for the media ID
        nav_data = api.get_state_list(project=tator_project_id,
                                      media_id=[int(media_id)],
                                      type=state_type_id)
    except Exception as e:
        print(f"ERROR: Could not get state list for media ID {media_id}.\n{e}")
        # Skip only this media; one failure should not drop every remaining media
        continue

    for item in nav_data:
        # Keep only states whose frame was curated (frame IDs are stored as strings)
        if str(item.frame) not in wanted_frames:
            continue

        # NOTE(review): a state attribute named like a metadata key would overwrite it here
        row = {
            'frame_id': item.frame,
            'media_id': media_id,
            **metadata,
            **(item.attributes or {}),
        }
        dataframe.append(row)

        # Remember column order as names are first encountered
        for key in row:
            if key not in columns:
                columns.append(key)
    
    

In [None]:
# Assemble the collected state rows into a DataFrame.
# (Optional export, currently disabled: df.to_csv(f"../data/curated/curated_states_{get_now()}.csv", index=False))
df = pd.DataFrame(data=dataframe, columns=columns)

In [None]:
# Display the curated state DataFrame built in the previous cell
df

In [None]:
# Locate every reduced-annotation CSV beneath ../data/reduced, searching recursively.
# os.path.abspath drops the trailing slash, so the directory and the "**" wildcard are
# joined explicitly; otherwise the pattern also matches sibling folders whose names
# merely start with "reduced". Results are sorted because glob order is arbitrary.
reduced_dir = os.path.abspath('../data/reduced/')
csv_files = sorted(glob.glob(os.path.join(reduced_dir, "**", "reduced_annotations*.csv"), recursive=True))
csv_files

In [None]:
def _season_from_path(csv_path, fallback):
    """Return N for the first ``Season_N`` folder in `csv_path`, or `fallback` if none is found."""
    for part in os.path.normpath(csv_path).split(os.sep):
        prefix, _, number = part.partition("_")
        if prefix == "Season" and number.isdigit():
            return int(number)
    return fallback


# Read each season's annotations and tag the rows with their season number.
# The number is taken from the file's Season_N folder when present.
# NOTE(review): assumes each CSV sits under a Season_N folder -- confirm. Otherwise the
# 1-based position in the sorted file list is used, which keeps numbering deterministic.
season_frames = []
for position, csv_file in enumerate(sorted(csv_files), start=1):
    season_df = pd.read_csv(csv_file)
    season_df['season'] = _season_from_path(csv_file, fallback=position)
    season_frames.append(season_df)

# Concatenate once; pd.concat rejects an empty list, so keep the empty-frame result
seasons_df = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

seasons_df.columns

In [None]:
# Keep only the annotation columns needed downstream, mapped to snake_case names
column_renames = {
    'season': 'season',
    'Media ID': 'media_id',
    'Frame ID': 'frame_id',
    'Subject ID': 'subject_id',
    'label': 'label',
    'x': 'box_x',
    'y': 'box_y',
    'w': 'box_w',
    'h': 'box_h',
}
subset_seasons_df = seasons_df[list(column_renames)].rename(columns=column_renames)

In [None]:
# Create a copy of the DataFrame to avoid SettingWithCopyWarning
subset_seasons_df = subset_seasons_df.copy()

# Cast the join keys to strings on both frames so they compare equal when matched.
# The cast is unconditional: it is a no-op for columns that already hold strings,
# works on empty frames, and also fixes columns holding a mix of types.
for key_column in ('media_id', 'frame_id'):
    subset_seasons_df[key_column] = subset_seasons_df[key_column].astype(str)
    df[key_column] = df[key_column].astype(str)


In [None]:
# Attach the state attributes in `df` to every annotation row that has a matching
# (media_id, frame_id). Unmatched rows are skipped and counted rather than ending
# the loop, so one missing frame cannot truncate the result.
new_dataframe = []
unmatched_keys = []

# Columns copied over from df (the annotation row supplies the rest)
passthrough_cols = [col for col in df.columns if col not in ['season', 'media_id', 'frame_id', 'label']]

# Position of the first df row for each key, built once to avoid rescanning df per annotation
first_match_pos = {}
for pos, key in enumerate(zip(df['media_id'], df['frame_id'])):
    first_match_pos.setdefault(key, pos)

for i, r in subset_seasons_df.iterrows():
    # Find the matching row in df
    key = (str(r['media_id']), str(r['frame_id']))
    pos = first_match_pos.get(key)
    if pos is None:
        unmatched_keys.append(key)
        continue

    match = df.iloc[pos]
    new_row = {
        'season': r['season'],
        'media_id': r['media_id'],
        'frame_id': r['frame_id'],
        'label': r['label']
    }
    # Add all the attributes from df to the new row
    for col in passthrough_cols:
        new_row[col] = match[col]
    new_dataframe.append(new_row)

if unmatched_keys:
    print(f"WARNING: {len(unmatched_keys)} annotation rows had no matching (media_id, frame_id) in df")


In [None]:
# Debug: show the keys of `r`, the row last assigned by the loop over subset_seasons_df.
# Relies on that loop variable still being defined in the kernel.
r['media_id'], r['frame_id']

In [None]:
# Debug: list the df rows that share the media ID of the leftover loop row `r`
df[(df['media_id'] == str(r['media_id']))]

In [None]:
# Debug: list the df rows for one hard-coded media ID
df[df['media_id'] == '4346978']

In [None]:
# Turn the accumulated joined rows into a DataFrame (the name is rebound to the new object)
new_dataframe = pd.DataFrame(data=new_dataframe)

In [None]:
# Display the joined annotation/state DataFrame
new_dataframe