In [7]:
import os
import json
import glob

import pandas as pd

In [8]:
def get_shortcode_longcode_dict(file_path):
    """
    Build a short-code -> full-name lookup from a Q-classes JSON file.

    The JSON document is expected to hold a top-level ``classlist`` entry
    whose items are three-element sequences of the form
    ``[id_number, short_code, full_name]``. The id number is ignored.

    :param file_path: Path to the Q-classes JSON file
    :return: Dictionary mapping each short code to its full class name; when
        a short code occurs more than once, the last occurrence wins
    :raises KeyError: If the top-level object has no ``classlist`` key
    :raises ValueError: If an entry does not contain exactly three items
    """
    # JSON is UTF-8 by specification; relying on the platform default
    # encoding would mis-decode non-ASCII class names on Windows.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Unpacking three names per entry keeps the strict "exactly three items"
    # check that a plain index lookup would silently skip.
    return {
        short_code: full_name
        for _id_number, short_code, full_name in data['classlist']
    }

In [9]:
def filter_points_dataframe(
    df: pd.DataFrame,
    qclasses_dict: dict,
    rand_sub_ceil: float = 1.0,
    reprojection_error: float = 0.01,
    view_index: int = 10,
    view_count: int = 5
) -> pd.DataFrame:
    """
    Filters and cleans a dataframe containing point data.

    Rows with a missing value in any column are dropped, ``Name`` is reduced
    to its base file name, ``Row`` and ``Column`` are cast to ``int``, the
    quality thresholds are applied, and finally ``Label`` is translated from
    its short code to the long label. The input frame is never modified and
    the surviving rows keep their original index labels.

    :param df: Input DataFrame containing point data; must provide the columns
        Name, Row, Column, Label, RandSubCeil, ReprojectionError, ViewIndex
        and ViewCount
    :param qclasses_dict: Dictionary containing Q-classes names (short --> long label)
    :param rand_sub_ceil: Maximum value for RandSubCeil filter (default: 1.0)
    :param reprojection_error: Maximum value for ReprojectionError filter (default: 0.01)
    :param view_index: Maximum value for ViewIndex filter (default: 10)
    :param view_count: Minimum value for ViewCount filter (default: 5)
    :return: Filtered and cleaned DataFrame; labels that have no entry in
        ``qclasses_dict`` come back as NaN
    """

    # Clean and preprocess data. The explicit copy detaches the result from
    # the caller's frame so the column assignments below cannot touch it or
    # raise a SettingWithCopyWarning.
    cleaned = df.dropna(how='any').copy()
    cleaned['Name'] = cleaned['Name'].apply(os.path.basename)
    cleaned['Row'] = cleaned['Row'].astype(int)
    cleaned['Column'] = cleaned['Column'].astype(int)

    # Apply filters
    keep = (
        (cleaned['RandSubCeil'] <= rand_sub_ceil) &
        (cleaned['ReprojectionError'] <= reprojection_error) &
        (cleaned['ViewIndex'] <= view_index) &
        (cleaned['ViewCount'] >= view_count)
    )
    # Copy again: a boolean-indexed selection is flagged as a potential view
    # of `cleaned`, and writing into it is what triggered the warning before.
    result = cleaned.loc[keep].copy()

    # Map the short to long
    result['Label'] = result['Label'].map(qclasses_dict)

    return result

In [10]:
# Root of the shared project drive; every other location hangs off it.
w_drive = "W:\\MIR_AI"
w_image_paths = w_drive + "\\images"
w_point_paths = w_drive + "\\raw_points"

# Destination for the filtered point files; created up front when missing.
output_dir = w_drive + "\\filtered_points"
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Load the short-code -> long-label lookup from the master Q-classes file.
qclasses_json = w_drive + "\\master_qclasses.json"
qclasses_dict = get_shortcode_longcode_dict(qclasses_json)

# Display the distinct long labels as a quick sanity check.
set(qclasses_dict.values())

{'Acropora_cervicornis',
 'Acropora_palmata',
 'Acropora_prolifera',
 'Agaricia_agaricites',
 'Agaricia_fragilis',
 'Agaricia_grahamae',
 'Agaricia_humilis',
 'Agaricia_lamarcki',
 'Agaricia_sp',
 'Agaricia_tenuifolia',
 'Agaricia_undata',
 'Anemone',
 'Antillogorgia',
 'Ascidians',
 'Basalt',
 'Bivalve',
 'Black_coral',
 'Branching_Octocorals',
 'Briareum_branch',
 'Bryozoan',
 'CCA',
 'Calcareous_Algae',
 'Chondrilla',
 'Cladocora_arbuscula',
 'Clionid',
 'Colpophyllia_breviserialis',
 'Colpophyllia_natans',
 'Colpophyllia_sp',
 'Coral_skeleton',
 'Corallimorph',
 'Cyanobacteria',
 'Dendrogyra_cylindrus',
 'Dichocoenia_stokesii',
 'Dictyota',
 'Diploria_labyrinthiformis',
 'Encrusting_macroalgae',
 'Erythropodium',
 'Eunicea',
 'Eusmilia_fastigiata',
 'Favia_fragum',
 'Fine_sediment',
 'Gorgonia_sp',
 'Green_Diplosoma',
 'Halimeda',
 'Helioseris_cucculata',
 'Hydroid',
 'Icilligorgia',
 'Isophyllia_rigida',
 'Isophyllia_sinuosa',
 'Isophyllia_sp',
 'Leather_coral',
 'Lobophora',
 'Ma

In [7]:
# Collect every raw point CSV sitting directly in the raw-points folder.
w_point_files = glob.glob(f"{w_point_paths}\\*.csv")
print(f"Found {len(w_point_files)} point files.")

Found 93 point files.


In [8]:
# Filter each raw point file and write the result under the same file name.
for w_point_file in w_point_files:

    df = pd.read_csv(w_point_file)

    # Thresholds are spelled out explicitly; they match the function defaults.
    filtered_df = filter_points_dataframe(
        df,
        qclasses_dict,
        rand_sub_ceil=1.0,
        reprojection_error=0.01,
        view_index=10,
        view_count=5,
    )

    # Output: mirror the input file name inside the output directory
    output_file = output_dir + "\\" + os.path.basename(w_point_file)
    filtered_df.to_csv(output_file, index=False)
    assert os.path.exists(output_file), f"Output file {output_file} was not created"
