In [None]:
import glob
import json
import os
import shutil

import pandas as pd

In [None]:
def get_shortcode_longcode_dict(file_path):
    """
    Build a short-code --> full-name lookup from a Q-classes JSON file.

    The file must contain a JSON object with a 'classlist' key whose value is a
    sequence of three-element entries. Each entry is unpacked as
    (id_number, short_code, full_name); the id is ignored.

    :param file_path: Path to the JSON file to read
    :return: Dictionary mapping each short code to its full name. If a short
        code appears more than once, the last entry wins.
    :raises KeyError: If the top-level JSON object has no 'classlist' key
    :raises ValueError: If an entry does not unpack into exactly three items
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding, open()
    # uses the locale default (a legacy code page on Windows), which can corrupt
    # or reject non-ASCII class names.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return {
        short_code: full_name
        for _id_number, short_code, full_name in data['classlist']
    }

In [None]:
def filter_points_dataframe(
    df: pd.DataFrame,
    qclasses_dict: dict,
    rand_sub_ceil: float = 1.0,
    reprojection_error: float = 0.01,
    view_index: int = 10,
    view_count: int = 5
) -> pd.DataFrame:
    """
    Filters and cleans a dataframe containing point data.

    Processing order:
      1. Rows with a missing value in any column are dropped.
      2. 'Name' is reduced to its base file name; 'Row' and 'Column' are cast to int.
      3. Rows are kept only if they satisfy all four threshold filters.
      4. 'Label' is translated through qclasses_dict.
      5. Rows whose translated label is 'Review', 'N/A' or 'Unknown' are dropped.

    The input DataFrame is not modified, and the result keeps the index labels
    of the surviving rows. Labels that are not keys of qclasses_dict become NaN
    and are retained (NaN is not in the drop list).

    :param df: Input DataFrame containing point data; must provide the columns
        'Name', 'Row', 'Column', 'RandSubCeil', 'ReprojectionError', 'ViewIndex',
        'ViewCount' and 'Label'
    :param qclasses_dict: Dictionary containing Q-classes names (short --> long label)
    :param rand_sub_ceil: Maximum value for RandSubCeil filter (default: 1.0)
    :param reprojection_error: Maximum value for ReprojectionError filter (default: 0.01)
    :param view_index: Maximum value for ViewIndex filter (default: 10)
    :param view_count: Minimum value for ViewCount filter (default: 5)
    :return: Filtered and cleaned DataFrame
    :raises KeyError: If one of the required columns is missing from df
    """

    # Clean and preprocess data. dropna() returns a new frame, and the explicit
    # copy detaches it from the caller's data before columns are reassigned.
    cleaned = df.dropna(how='any').copy()
    cleaned['Name'] = cleaned['Name'].apply(os.path.basename)
    cleaned['Row'] = cleaned['Row'].astype(int)
    cleaned['Column'] = cleaned['Column'].astype(int)

    # Apply filters
    keep = (
        # Already randomly sampled
        (cleaned['RandSubCeil'] <= rand_sub_ceil) &
        # Reprojection error for point reprojected to dot (distance difference)
        (cleaned['ReprojectionError'] <= reprojection_error) &
        # The image's index in VPI view (includes a form pre-filtering)
        (cleaned['ViewIndex'] <= view_index) &
        # The number of views the dot has
        (cleaned['ViewCount'] >= view_count)
    )

    # Copy the selection before writing to it: assigning into the result of a
    # boolean selection is what triggers pandas' SettingWithCopyWarning.
    filtered_df = cleaned.loc[keep].copy()

    # Map the short to long
    filtered_df['Label'] = filtered_df['Label'].map(qclasses_dict)

    # Drop rows with the following Label
    to_drop = ['Review', 'N/A', 'Unknown']
    filtered_df = filtered_df[~filtered_df['Label'].isin(to_drop)]

    return filtered_df

In [None]:
# Root folder on the W: drive, and the input folders located beneath it.
w_drive = "W:\\MIR_AI"
w_image_paths = w_drive + "\\images"
w_point_paths = w_drive + "\\raw_points"

# Destination for the filtered CSVs; created up front, no error if it already exists.
output_dir = w_drive + "\\filtered_points"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the short --> long label lookup from the master Q-classes file under w_drive.
qclasses_json = w_drive + "\\master_qclasses.json"
qclasses_dict = get_shortcode_longcode_dict(qclasses_json)

# Show the distinct long labels the lookup can produce.
{*qclasses_dict.values()}

In [None]:
# Point CSVs, identified by file name without the extension.
w_point_files = glob.glob(w_point_paths + "\\*.csv")
# splitext strips only the trailing extension. Splitting on ".csv" would cut a
# name at the first ".csv" it contains, and would leave an upper-case ".CSV"
# (which Windows glob also matches) in place.
point_file_names = [os.path.splitext(os.path.basename(f))[0] for f in w_point_files]
print(f"Found {len(point_file_names)} point files.")

# Image folders, identified by folder name without a trailing "_JPEG".
w_image_folders = glob.glob(w_image_paths + "\\*JPEG")
jpeg_suffix = "_JPEG"
# Remove the suffix only when it really ends the name, so a name that contains
# "_JPEG" earlier on is not truncated.
image_folder_names = [
    name[:-len(jpeg_suffix)] if name.endswith(jpeg_suffix) else name
    for name in map(os.path.basename, w_image_folders)
]
print(f"Found {len(image_folder_names)} image folders.")

# Patch folders already present locally; their names are compared with the point file names.
patch_folders = glob.glob("B:\\CoralNet-Toolbox\\Data\\MIR_AI\\patches\\*")
patch_names = [os.path.basename(p) for p in patch_folders]
print(f"Found {len(patch_folders)} patch folders")

In [None]:
# Point files for which no patch folder of the same name exists yet.
names_without_patches = set(point_file_names) - set(patch_names)
to_download = list(names_without_patches)

# Display the count together with the names.
len(to_download), to_download

In [None]:
# Thresholds passed to filter_points_dataframe for every point file.
filter_settings = {
    "rand_sub_ceil": 1.0,
    "reprojection_error": 0.01,
    "view_index": 10,
    "view_count": 5,
}

for w_point_file in w_point_files:
    # Read one raw point file and filter it.
    df = pd.read_csv(w_point_file)
    filtered_df = filter_points_dataframe(df, qclasses_dict, **filter_settings)

    # Write the result under the same file name inside output_dir.
    output_file = f"{output_dir}\\{os.path.basename(w_point_file)}"
    filtered_df.to_csv(output_file, index=False)

    # Stop immediately if the file did not land on disk.
    assert os.path.exists(output_file), f"Output file {output_file} was not created"


In [None]:
# Re-scan the local patch folders so the list reflects their current state.
patch_folders = glob.glob("B:\\CoralNet-Toolbox\\Data\\MIR_AI\\patches\\*")

# Image folder that corresponds to each patch folder: <w_image_paths>\<patch name>_JPEG.
# A dedicated name is used on purpose: rebinding `patch_names` to the suffixed
# names would make the to_download cell compare against the wrong list if it
# is re-run after this one.
patched_image_folders = [
    f"{w_image_paths}\\{os.path.basename(p)}_JPEG" for p in patch_folders
]

# Keep only the folders that are actually present. isdir (not exists) is used
# because these paths are later handed to shutil.rmtree, which needs a directory.
to_delete = [folder for folder in patched_image_folders if os.path.isdir(folder)]

len(to_delete), to_delete

In [None]:
# Permanently remove every image folder queued in `to_delete`.
# This cannot be undone, so check the contents of `to_delete` before running.
for folder in to_delete:
    print(f"Found {folder}")
    # Re-check at deletion time: the folder may have been removed since the list
    # was built, and rmtree raises on anything that is not a directory.
    if os.path.isdir(folder):
        print("Deleting ", folder)
        shutil.rmtree(folder)
    