In [1]:
import pandas as pd

def transform_scores(input_file, output_file):
    """
    Collapse the raw score table into one averaged row per joint.

    The ``expert_id`` column is discarded, ``score`` is averaged over all rows
    that share the same joint keys and disease, the mean is rounded, and the
    diseases are spread into columns (``erosion`` -> ``erosion_score``,
    ``JSN`` -> ``jsn_score``). Joint/disease combinations without any score
    are written as 0.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to save the transformed CSV file.
    """
    joint_keys = ["joint_id", "patient_id", "hand", "joint", "finger"]

    raw = pd.read_csv(input_file).drop(columns="expert_id")
    # Whatever column comes first once expert_id is gone is treated as the joint id.
    raw = raw.rename(columns={raw.columns[0]: "joint_id"})

    # Missing fingers become the literal "none"; rows without a score are ignored.
    scored = raw.assign(finger=raw["finger"].fillna("none")).dropna(subset=["score"])

    mean_scores = (
        scored.groupby(joint_keys + ["disease"], dropna=False)["score"]
        .mean()
        .round()
        .reset_index()
    )

    wide = mean_scores.pivot_table(
        values="score",
        index=joint_keys,
        columns="disease",
        fill_value=0,
    ).reset_index()
    wide.columns.name = None

    wide = wide.rename(
        columns={"erosion": "erosion_score", "JSN": "jsn_score"}
    ).sort_values(by=["patient_id", "joint_id"], ascending=[False, True])

    wide.to_csv(output_file, index=False)
    print(f"Transformed data saved to {output_file}")

# Build dataset/avg_scores.csv (one row per joint) from the raw scores in dataset/scores.csv.
transform_scores('dataset/scores.csv', 'dataset/avg_scores.csv')

Transformed data saved to dataset/avg_scores.csv


In [3]:
def merge(bbox_file, score_file, output_file, joint_id_modulus=42):
    """
    Join per-joint scores with their bounding boxes and write the result as CSV.

    The bbox file's unnamed first column (read by pandas as ``"Unnamed: 0"``)
    is reduced modulo ``joint_id_modulus`` to obtain ``joint_id``. The
    ``finger`` column is dropped from both inputs, and the tables are
    inner-joined on ``joint_id``, ``patient_id``, ``hand`` and ``joint``.

    Args:
        bbox_file (str): Path to the bounding-box CSV file.
        score_file (str): Path to the averaged-score CSV file.
        output_file (str): Path to save the merged CSV file.
        joint_id_modulus (int): Modulus applied to the bbox row index to get
            ``joint_id``. Presumably the number of joints annotated per
            patient -- TODO confirm against the bbox file.

    Raises:
        KeyError: If the bbox file has no ``"Unnamed: 0"`` (or ``joint_id``)
            column, or if either file lacks a ``finger`` column.
    """
    join_keys = ['joint_id', 'patient_id', 'hand', 'joint']

    bbox_df = (
        pd.read_csv(bbox_file)
        .drop(columns=["finger"])
        .rename(columns={"Unnamed: 0": "joint_id"})
    )
    score_df = pd.read_csv(score_file).drop(columns=["finger"])

    if "joint_id" not in bbox_df.columns:
        raise KeyError("joint_id: bbox file has no 'Unnamed: 0' index column to derive it from")

    # Vectorised modulo instead of a per-element lambda.
    bbox_df["joint_id"] = bbox_df["joint_id"] % joint_id_modulus

    merged_df = pd.merge(score_df, bbox_df, on=join_keys, how='inner')

    merged_df.to_csv(output_file, index=False)
    print('Merged file saves successfully')

# Join the averaged scores with the bounding boxes and write dataset/merged_df.csv.
merge('dataset/bboxes.csv', 'dataset/avg_scores.csv', 'dataset/merged_df.csv')

Merged file saves successfully


In [None]:
import os
import cv2
import json
import pandas as pd

# Function to crop and save images
def crop_and_save_images(merged_df, image_dir, output_dir, normalized=False, split_subsets_by_id=None):
    """
    Crop each joint's bounding box out of its image and save it once per label.

    Every crop is written twice, to
    ``<output_dir>/[<split>/]<joint>_erosion/<erosion_score>/`` and
    ``<output_dir>/[<split>/]<joint>_jsn/<jsn_score>/``, under the file name
    ``<patient_id>_<joint_id>_<erosion_score>_<jsn_score>.jpg``.

    Args:
        merged_df (pd.DataFrame): One row per joint with the columns
            ``patient_id``, ``joint``, ``joint_id``, ``erosion_score``,
            ``jsn_score``, ``xcenter``, ``ycenter``, ``dx`` and ``dy``.
        image_dir (str): Directory containing the ``<patient_id>.jpeg`` images.
        output_dir (str): Root directory for the cropped images.
        normalized (bool): Whether the bounding box values are fractions of
            the image size (True) or pixels (False).
        split_subsets_by_id (str | None): Path to a JSON file mapping each
            split name to a list of patient ids. When given, crops go into a
            sub-directory named after the patient's split. When None, crops
            are written directly under ``output_dir``.

    Returns:
        None when every row was processed; 0 if writing a crop raised an
        exception (processing stops at that row).

    Raises:
        KeyError: If a split file is given and a row's ``patient_id`` is not
            listed in it.
    """
    # Defined unconditionally so the default (no split file) no longer hits a NameError.
    id_to_split = {}
    if split_subsets_by_id is not None:
        with open(split_subsets_by_id, "r") as f:
            split_file = json.load(f)
        for split_name, patient_ids in split_file.items():
            os.makedirs(os.path.join(output_dir, split_name), exist_ok=True)
            for patient_id in patient_ids:
                id_to_split[patient_id] = split_name

    os.makedirs(output_dir, exist_ok=True)

    # Rows of one patient share an image; remember the last successful load
    # so it is not re-read from disk for every joint.
    cached_path, image = None, None

    for _, row in merged_df.iterrows():
        # Images are named after the patient id.
        image_name = str(row["patient_id"]) + ".jpeg"
        if split_subsets_by_id is None:
            base_dir = output_dir
        else:
            base_dir = os.path.join(output_dir, id_to_split[row["patient_id"]])
        joint_type = row['joint']
        erosion_score, jsn_score = int(row['erosion_score']), int(row['jsn_score'])
        label = f"{erosion_score}_{jsn_score}"
        x_center, y_center, width, height = row["xcenter"], row["ycenter"], row["dx"], row["dy"]

        erosion_dir = os.path.join(base_dir, f'{joint_type}_erosion', str(erosion_score))
        jsn_dir = os.path.join(base_dir, f'{joint_type}_jsn', str(jsn_score))
        os.makedirs(erosion_dir, exist_ok=True)
        os.makedirs(jsn_dir, exist_ok=True)

        # Load the image (or reuse the one loaded for the previous row)
        image_path = os.path.join(image_dir, image_name)
        if image_path != cached_path:
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                continue

            loaded = cv2.imread(image_path)
            if loaded is None:
                print(f"Failed to load image: {image_path}")
                continue
            cached_path, image = image_path, loaded

        # Get image dimensions
        img_height, img_width = image.shape[:2]

        # Convert normalized coordinates to pixel coordinates if necessary
        if normalized:
            x_center *= img_width
            y_center *= img_height
            width *= img_width
            height *= img_height

        # Calculate bounding box corners, clipped to the image
        x1 = max(int(x_center - width / 2), 0)
        y1 = max(int(y_center - height / 2), 0)
        x2 = min(int(x_center + width / 2), img_width)
        y2 = min(int(y_center + height / 2), img_height)

        # Crop the image
        cropped_image = image[y1:y2, x1:x2]

        # Save the cropped image under both label trees
        output_name = f"{os.path.splitext(image_name)[0]}_{row['joint_id']}_{label}.jpg"
        output_path_1 = os.path.join(erosion_dir, output_name)
        output_path_2 = os.path.join(jsn_dir, output_name)
        try:
            cv2.imwrite(output_path_1, cropped_image)
            cv2.imwrite(output_path_2, cropped_image)
        except Exception as exc:  # narrower than a bare except: Ctrl-C still interrupts
            print(f"Failed to write crop: {exc}")
            print(cropped_image.shape)
            print(row)
            return 0
    print('Saved cropped images successfully')

# Load the CSV files
label_df = pd.read_csv('dataset/merged_df.csv')

# Set bounding box sizes equal for each joint type within a patient.
# The maxima are written back to label_df itself: assigning to the frames
# yielded by iterating a groupby only changes temporary copies and would
# leave label_df untouched.
max_box_sizes = label_df.groupby(['patient_id', 'joint'])[['dx', 'dy']].transform('max')
label_df[['dx', 'dy']] = max_box_sizes

# Crop and save images
crop_and_save_images(label_df, os.path.join('dataset','jpeg'), 'dataset', False, os.path.join('dataset','train_val_split.json'))

Saved cropped images successfully


In [9]:
# custom data split

import os
import cv2
import json
import pandas as pd

# Function to crop and save images
def crop_and_save_images(merged_df, image_dir, output_dir, normalized=False):
    """
    Crop each joint's bounding box out of its image and save it once per label.

    Every crop is written twice, to
    ``<output_dir>/<joint>_erosion/<erosion_score>/`` and
    ``<output_dir>/<joint>_jsn/<jsn_score>/``, under the file name
    ``<patient_id>_<joint_id>_<erosion_score>_<jsn_score>.jpg``.

    Args:
        merged_df (pd.DataFrame): One row per joint with the columns
            ``patient_id``, ``joint``, ``joint_id``, ``erosion_score``,
            ``jsn_score``, ``xcenter``, ``ycenter``, ``dx`` and ``dy``.
        image_dir (str): Directory containing the ``<patient_id>.jpeg`` images.
        output_dir (str): Root directory for the cropped images.
        normalized (bool): Whether the bounding box values are fractions of
            the image size (True) or pixels (False).

    Returns:
        None when every row was processed; 0 if writing a crop raised an
        exception (processing stops at that row).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Rows of one patient share an image; remember the last successful load
    # so it is not re-read from disk for every joint.
    cached_path, image = None, None

    for _, row in merged_df.iterrows():
        # Images are named after the patient id.
        image_name = str(row["patient_id"]) + ".jpeg"
        joint_type = row['joint']
        erosion_score, jsn_score = int(row['erosion_score']), int(row['jsn_score'])
        label = f"{erosion_score}_{jsn_score}"
        x_center, y_center, width, height = row["xcenter"], row["ycenter"], row["dx"], row["dy"]

        erosion_dir = os.path.join(output_dir, f'{joint_type}_erosion', str(erosion_score))
        jsn_dir = os.path.join(output_dir, f'{joint_type}_jsn', str(jsn_score))
        os.makedirs(erosion_dir, exist_ok=True)
        os.makedirs(jsn_dir, exist_ok=True)

        # Load the image (or reuse the one loaded for the previous row)
        image_path = os.path.join(image_dir, image_name)
        if image_path != cached_path:
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                continue

            loaded = cv2.imread(image_path)
            if loaded is None:
                print(f"Failed to load image: {image_path}")
                continue
            cached_path, image = image_path, loaded

        # Get image dimensions
        img_height, img_width = image.shape[:2]

        # Convert normalized coordinates to pixel coordinates if necessary
        if normalized:
            x_center *= img_width
            y_center *= img_height
            width *= img_width
            height *= img_height

        # Calculate bounding box corners, clipped to the image
        x1 = max(int(x_center - width / 2), 0)
        y1 = max(int(y_center - height / 2), 0)
        x2 = min(int(x_center + width / 2), img_width)
        y2 = min(int(y_center + height / 2), img_height)

        # Crop the image
        cropped_image = image[y1:y2, x1:x2]

        # Save the cropped image under both label trees
        output_name = f"{os.path.splitext(image_name)[0]}_{row['joint_id']}_{label}.jpg"
        output_path_1 = os.path.join(erosion_dir, output_name)
        output_path_2 = os.path.join(jsn_dir, output_name)
        try:
            cv2.imwrite(output_path_1, cropped_image)
            cv2.imwrite(output_path_2, cropped_image)
        except Exception as exc:  # narrower than a bare except: Ctrl-C still interrupts
            print(f"Failed to write crop: {exc}")
            print(cropped_image.shape)
            print(row)
            return 0
    print('Saved cropped images successfully')

# Load the CSV files
label_df = pd.read_csv('dataset/merged_df.csv')

# Set bounding box sizes equal for each joint type within a patient.
# The maxima are written back to label_df itself: assigning to the frames
# yielded by iterating a groupby only changes temporary copies and would
# leave label_df untouched.
max_box_sizes = label_df.groupby(['patient_id', 'joint'])[['dx', 'dy']].transform('max')
label_df[['dx', 'dy']] = max_box_sizes

# Crop and save images
crop_and_save_images(label_df, os.path.join('dataset','jpeg_inv_clahe'), os.path.join('dataset','non-sorted_inv_clahe'), False)

Saved cropped images successfully


In [5]:
import pandas as pd
import os

label_df = pd.read_csv('dataset/merged_df.csv')

joint_types = label_df['joint'].unique()

# Number of class folders inspected per task; class ids run from 0 to n - 1.
N_EROSION_CLASSES = 6
N_JSN_CLASSES = 5
CROPS_ROOT = os.path.join('dataset', 'non-sorted_inv_clahe')


def count_class_images(root_dir, joints, task, n_classes):
    """
    Count the files stored for every joint type and class id of one task.

    Args:
        root_dir (str): Directory holding the ``<joint>_<task>/<class id>/`` folders.
        joints (iterable): Joint type names; one output column per joint.
        task (str): Folder suffix, e.g. ``'erosion'`` or ``'jsn'``.
        n_classes (int): Class ids ``0 .. n_classes - 1`` are inspected.

    Returns:
        pd.DataFrame: Integer counts indexed by class id with one column per
        joint; a missing class folder counts as 0.
    """
    counts = {}
    for joint in joints:
        per_class = []
        for class_id in range(n_classes):
            class_dir = os.path.join(root_dir, f'{joint}_{task}', str(class_id))
            per_class.append(len(os.listdir(class_dir)) if os.path.isdir(class_dir) else 0)
        counts[joint] = per_class
    return pd.DataFrame(counts, index=range(n_classes))


erosion_df = count_class_images(CROPS_ROOT, joint_types, 'erosion', N_EROSION_CLASSES)
jsn_df = count_class_images(CROPS_ROOT, joint_types, 'jsn', N_JSN_CLASSES)

#jsn_df.to_csv(os.path.join('dataset', 'non-sorted_inv_clahe', 'jsn_data_counts.csv'))
#erosion_df.to_csv(os.path.join('dataset', 'non-sorted_inv_clahe', 'erosion_data_counts.csv'))

jsn_df

Unnamed: 0,DIP,CMC,wrist,RC,ulna,PIP,MCP
0,842,1040,88,128,600,274,434
1,749,259,42,47,0,492,437
2,1184,703,158,149,0,1130,1330
3,218,377,239,229,0,480,633
4,7,21,73,47,0,24,166


In [6]:
# Show the number of cropped images per joint type (columns) and erosion class id 0-5 (rows).
erosion_df

Unnamed: 0,DIP,CMC,wrist,RC,ulna,PIP,MCP
0,2954,2378,292,478,470,2290,2465
1,27,17,155,42,81,71,333
2,15,3,92,34,26,27,154
3,2,0,10,19,2,7,11
4,1,0,1,11,2,2,5
5,1,2,50,16,19,3,32


In [9]:
def count_disbalance(df, param):
    """
    Report, per column, which row position holds more than half of the total.

    Args:
        df (pd.DataFrame): Table of counts; each column is checked on its own.
        param (str): Suffix appended to the column name in the result keys.

    Returns:
        dict: Maps ``'<column>_<param>'`` to the position (as a string) of the
        first value exceeding half of the column sum, or to ``'balanced'``
        when no value does.
    """
    def dominant_position(counts):
        # Strictly more than half of the column total counts as dominant.
        half_total = sum(counts) * .5
        return next(
            (str(position) for position, count in enumerate(counts) if count > half_total),
            'balanced',
        )

    return {
        f'{column}_{param}': dominant_position(df[column].to_list())
        for column in df.columns
    }

# For each joint, report the class id holding more than half of the images, or 'balanced' if none does.
print(count_disbalance(erosion_df, 'erosion'))
print(count_disbalance(jsn_df, 'jsn'))

{'DIP_erosion': '0', 'CMC_erosion': '0', 'wrist_erosion': 'balanced', 'RC_erosion': '0', 'ulna_erosion': '0', 'PIP_erosion': '0', 'MCP_erosion': '0'}
{'DIP_jsn': 'balanced', 'CMC_jsn': 'balanced', 'wrist_jsn': 'balanced', 'RC_jsn': 'balanced', 'ulna_jsn': '0', 'PIP_jsn': 'balanced', 'MCP_jsn': 'balanced'}


In [12]:
from PIL import Image
from tqdm import tqdm

import shutil  # imported here because only this cell needs it

# Source tree produced by the cropping cell: <joint>_<task>/<class>/<file>.jpg
source_root = os.path.join('dataset', 'non-sorted_inv_clahe')
split_root = os.path.join('dataset', 'custom_split_inv_clahe')
test_every = 6  # every 6th file of a class (in sorted name order) goes to 'test', the rest to 'train'

os.makedirs(split_root, exist_ok=True)

# NOTE(review): files are split per class folder, so crops of one patient can
# end up in both train and test -- confirm this is intended.
for group_name in sorted(os.listdir(source_root)):
    group_path = os.path.join(source_root, group_name)
    # Skip stray files (e.g. exported count CSVs) sitting next to the label folders.
    if not os.path.isdir(group_path):
        continue
    for cl in sorted(os.listdir(group_path)):
        class_path = os.path.join(group_path, cl)
        # A stray file inside a label folder must not abort the remaining classes.
        if not os.path.isdir(class_path):
            continue
        train_dir = os.path.join(split_root, group_name, 'train', str(cl))
        test_dir = os.path.join(split_root, group_name, 'test', str(cl))
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(test_dir, exist_ok=True)
        # Sorted so the train/test assignment is the same on every run and machine.
        for i, file_name in enumerate(tqdm(sorted(os.listdir(class_path)))):
            target_dir = test_dir if i % test_every == 0 else train_dir
            # Byte-for-byte copy: no second lossy JPEG encode and no open image handles.
            shutil.copyfile(os.path.join(class_path, file_name), os.path.join(target_dir, file_name))


100%|██████████| 2378/2378 [00:09<00:00, 257.94it/s]
100%|██████████| 17/17 [00:00<00:00, 175.07it/s]
100%|██████████| 3/3 [00:00<00:00, 240.78it/s]
100%|██████████| 2/2 [00:00<00:00, 348.31it/s]
100%|██████████| 1040/1040 [00:02<00:00, 376.02it/s]
100%|██████████| 259/259 [00:00<00:00, 428.03it/s]
100%|██████████| 703/703 [00:01<00:00, 403.35it/s]
100%|██████████| 377/377 [00:00<00:00, 394.79it/s]
100%|██████████| 21/21 [00:00<00:00, 535.55it/s]
100%|██████████| 2954/2954 [00:11<00:00, 253.08it/s]
100%|██████████| 27/27 [00:00<00:00, 263.61it/s]
100%|██████████| 15/15 [00:00<00:00, 245.11it/s]
100%|██████████| 2/2 [00:00<00:00, 284.40it/s]
100%|██████████| 1/1 [00:00<00:00, 172.19it/s]
100%|██████████| 1/1 [00:00<00:00, 186.12it/s]
100%|██████████| 842/842 [00:02<00:00, 397.87it/s]
100%|██████████| 749/749 [00:02<00:00, 366.36it/s]
100%|██████████| 1184/1184 [00:03<00:00, 334.72it/s]
100%|██████████| 218/218 [00:00<00:00, 374.12it/s]
100%|██████████| 7/7 [00:00<00:00, 292.08it/s]
100%