# ARISE-2025

This hackathon was originally about joint detection and classification. 
My part of the work was to build the classification pipeline. Classification was divided into two categories: joint erosion (6 classes) and joint space narrowing (5 classes).

In this file I build the train and val image preprocessing pipeline for all classifiers:
1. transform the scores table: originally it consisted of scores from 3 experts, so in order to get classes we have to take the (rounded) average of all labels
2. merge the bounding_box and scores dataframes
3. use the bounding box data from the merged dataframe to crop images and sort them based on the scores data
4. count classes for all data groups. This will be necessary to find class imbalance
5. split images into train and val directories

In [None]:
import os
import cv2
import json
import pandas as pd

from PIL import Image
from tqdm import tqdm

## Score table transformation

In [None]:
def transform_scores(input_file: str, output_file: str) -> None:
    """
    Collapse the per-expert score rows into one averaged row per joint.

    The expert column is discarded, the first remaining column is treated as
    the joint id, rows without a score are ignored, and the remaining scores
    are averaged per joint and disease and rounded. The result has one row
    per joint with the diseases spread into ``jsn_score`` / ``erosion_score``
    columns (0 where a disease has no score) and is written as CSV.

    Args:
        input_file (str): Path to the raw scores CSV.
        output_file (str): Path where the transformed CSV is written.
    """
    joint_keys = ["joint_id", "patient_id", "hand", "joint", "finger"]

    raw = pd.read_csv(input_file).drop(columns="expert_id")
    # After dropping the expert column, the leading column holds the joint id.
    raw = raw.rename(columns={raw.columns[0]: "joint_id"})
    # Give joints without a finger an explicit placeholder value.
    raw["finger"] = raw["finger"].fillna("none")
    raw = raw.dropna(subset=["score"])

    # Mean over the remaining expert rows, rounded to a class label.
    averaged = (
        raw.groupby(joint_keys + ["disease"], dropna=False)["score"]
        .mean()
        .round()
        .reset_index()
    )

    # One column per disease; a disease with no score for a joint becomes 0.
    wide = averaged.pivot_table(
        index=joint_keys,
        columns="disease",
        values="score",
        fill_value=0,
    ).reset_index()
    wide.columns.name = None

    wide = wide.rename(columns={"erosion": "erosion_score", "JSN": "jsn_score"})
    wide = wide.sort_values(by=["patient_id", "joint_id"], ascending=[False, True])

    wide.to_csv(output_file, index=False)
    print(f"Transformed data saved to {output_file}")

# Average the per-expert scores and write one row per joint to avg_scores.csv
transform_scores('dataset/scores.csv', 'dataset/avg_scores.csv')

## Merging bounding box and scores tables

In [None]:
def merge(bbox_file: str, score_file: str, output_file: str) -> None:
    """
    Join the bounding-box table with the per-joint score table and save it.

    The ``finger`` column is removed from both tables. The bounding-box
    table's unnamed leading column is used as the joint id and reduced
    modulo 42 before the join (presumably 42 joints per patient, so the id
    becomes a per-patient index; confirm against the dataset). Only joints
    present in both tables are kept.

    Args:
        bbox_file (str): path to the CSV file with bounding boxes
        score_file (str): path to the CSV file with scores for each joint
        output_file (str): path of the merged CSV file to write
    """
    join_keys = ['joint_id', 'patient_id', 'hand', 'joint']

    scores = pd.read_csv(score_file).drop(columns=["finger"])
    boxes = pd.read_csv(bbox_file).drop(columns=["finger"])

    boxes = boxes.rename(columns={"Unnamed: 0": "joint_id"})
    boxes["joint_id"] = boxes["joint_id"] % 42

    # Scores on the left, boxes on the right; unmatched rows are dropped.
    combined = scores.merge(boxes, on=join_keys, how='inner')

    combined.to_csv(output_file, index=False)
    print('Merged file saves successfully')

# Join bounding boxes with the averaged scores and write merged_df.csv
merge('dataset/bboxes.csv', 'dataset/avg_scores.csv', 'dataset/merged_df.csv')

## Cropping and saving images

In [None]:
def crop_and_save_images(merged_df: pd.DataFrame, image_dir: str, output_dir: str, normalized: bool = False) -> None:
    """
    Crop every joint listed in ``merged_df`` and save it under its class folders.

    For each row the image ``<patient_id>.jpeg`` is read from ``image_dir``,
    the box given by ``xcenter`` / ``ycenter`` / ``dx`` / ``dy`` is clipped to
    the image and cut out, and the crop is written twice:

        <output_dir>/<joint>_erosion/<erosion_score>/<patient>_<joint_id>_<erosion>_<jsn>.jpg
        <output_dir>/<joint>_jsn/<jsn_score>/<patient>_<joint_id>_<erosion>_<jsn>.jpg

    The crop size is taken from the row as-is. If crops of one joint group of
    one patient should have equal sizes (so that a later
    torchvision.transforms.Resize does not deform the relative size of the
    joint space), the caller has to equalize ``dx`` / ``dy`` beforehand.

    Rows whose image is missing or unreadable, whose box lies completely
    outside the image, or whose crop cannot be written are reported on stdout
    and skipped; the remaining rows are still processed. Class folders are
    only created when a crop is actually written into them.

    Args:
        merged_df (pd.DataFrame): Merged DataFrame containing bbox and label information.
        image_dir (str): Path to the directory containing input images.
        output_dir (str): Path to save the cropped images.
        normalized (bool): Whether the bounding box coordinates are normalized
            (fractions of the image width / height instead of pixels).
    """
    os.makedirs(output_dir, exist_ok=True)

    # All joints of one patient are cut from the same file, so keep the most
    # recently decoded image instead of re-reading it for every row.
    cached_path, cached_image = None, None

    for _, row in merged_df.iterrows():
        # The source image is named after the patient id.
        image_name = str(row["patient_id"]) + ".jpeg"
        joint_type = row['joint']
        erosion_score, jsn_score = int(row['erosion_score']), int(row['jsn_score'])
        label = f"{erosion_score}_{jsn_score}"
        x_center, y_center, width, height = row["xcenter"], row["ycenter"], row["dx"], row["dy"]

        # Load the image (or reuse the one decoded for the previous row)
        image_path = os.path.join(image_dir, image_name)
        if image_path != cached_path:
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                continue

            loaded = cv2.imread(image_path)
            if loaded is None:
                print(f"Failed to load image: {image_path}")
                continue
            cached_path, cached_image = image_path, loaded
        image = cached_image

        # Get image dimensions
        img_height, img_width = image.shape[:2]

        # Convert normalized coordinates to pixel coordinates if necessary
        if normalized:
            x_center *= img_width
            y_center *= img_height
            width *= img_width
            height *= img_height

        # Calculate bounding box coordinates, clipped to the image
        x1 = max(int(x_center - width / 2), 0)
        y1 = max(int(y_center - height / 2), 0)
        x2 = min(int(x_center + width / 2), img_width)
        y2 = min(int(y_center + height / 2), img_height)

        # A box outside the image (or with zero size) yields an empty array,
        # which cv2.imwrite refuses to encode; skip it instead of aborting.
        if x2 <= x1 or y2 <= y1:
            print(f"Empty crop for joint {row['joint_id']} in {image_path}, skipping")
            continue

        # Crop the image
        cropped_image = image[y1:y2, x1:x2]

        # Save the cropped image once per task (erosion / JSN)
        output_name = f"{os.path.splitext(image_name)[0]}_{row['joint_id']}_{label}.jpg"
        target_dirs = (
            os.path.join(output_dir, f'{joint_type}_erosion', str(erosion_score)),
            os.path.join(output_dir, f'{joint_type}_jsn', str(jsn_score)),
        )
        for target_dir in target_dirs:
            os.makedirs(target_dir, exist_ok=True)
            target_path = os.path.join(target_dir, output_name)
            try:
                saved = cv2.imwrite(target_path, cropped_image)
            except cv2.error as err:
                print(f"Failed to save image: {target_path} ({err})")
                continue
            # imwrite signals most failures through its return value.
            if not saved:
                print(f"Failed to save image: {target_path}")
    print('Saved cropped images successfully')

# Load the merged bounding-box / score table
label_df = pd.read_csv('dataset/merged_df.csv')

# Set bounding box sizes equal for each joint type within a patient: every
# box takes the largest dx / dy found among that patient's joints of the same
# type. transform('max') returns values aligned to label_df's index, so the
# assignment really updates label_df (assigning into the sub-frames yielded
# by iterating a groupby only changes temporary copies).
box_size_max = label_df.groupby(['patient_id', 'joint'])[['dx', 'dy']].transform('max')
label_df['dx'] = box_size_max['dx']
label_df['dy'] = box_size_max['dy']

# Crop and save images
crop_and_save_images(label_df, os.path.join('dataset','jpeg_inv_clahe'), os.path.join('dataset','non-sorted_inv_clahe'), False)

## Count classes for all data groups

In [None]:
label_df = pd.read_csv('dataset/merged_df.csv')
joint_types = label_df['joint'].unique()

# One table per task: rows are class ids, columns are joint types.
erosion_df = pd.DataFrame(columns = joint_types, index = range(6))
jsn_df = pd.DataFrame(columns = joint_types, index = range(5))

crops_root = os.path.join('dataset', 'non-sorted_inv_clahe')

# A class folder that does not exist is counted as 0 images.
for suffix, counts in (('erosion', erosion_df), ('jsn', jsn_df)):
    for joint in joint_types:
        for class_id in counts.index:
            class_dir = os.path.join(crops_root, f'{joint}_{suffix}', str(class_id))
            counts.loc[class_id, joint] = len(os.listdir(class_dir)) if os.path.exists(class_dir) else 0

# The count tables are stored next to the class folders they describe.
jsn_df.to_csv(os.path.join(crops_root, 'jsn_data_counts.csv'))
erosion_df.to_csv(os.path.join(crops_root, 'erosion_data_counts.csv'))

In [None]:
# Show the JSN image counts (rows: classes 0-4, columns: joint types)
jsn_df

In [None]:
# Show the erosion image counts (rows: classes 0-5, columns: joint types)
erosion_df

In [None]:
import shutil  # cell-local import: crops are copied byte-for-byte below

source_root = os.path.join('dataset', 'non-sorted_inv_clahe')
split_root = os.path.join('dataset', 'custom_split_inv_clahe')
os.makedirs(split_root, exist_ok=True)

# Splitting data to train and val. Every 6-th image of a class goes to val.
# NOTE(review): the split is per image, not per patient, so crops of the same
# patient can end up in both train and val; confirm this is intended.
for group in sorted(os.listdir(source_root)):
    group_dir = os.path.join(source_root, group)
    # The jsn and erosion count CSVs are stored next to the group folders;
    # skip everything that is not a directory.
    if not os.path.isdir(group_dir):
        continue

    for cl in sorted(os.listdir(group_dir)):
        class_dir = os.path.join(group_dir, cl)
        if not os.path.isdir(class_dir):
            continue

        train_dir = os.path.join(split_root, group, 'train', str(cl))
        val_dir = os.path.join(split_root, group, 'val', str(cl))
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # train/val assignment reproducible between runs and machines.
        for i, file_name in enumerate(tqdm(sorted(os.listdir(class_dir)))):
            destination_dir = val_dir if i % 6 == 0 else train_dir
            # Copy the file instead of decoding and re-saving it, which would
            # recompress the JPEG a second time and lose image quality.
            shutil.copyfile(
                os.path.join(class_dir, file_name),
                os.path.join(destination_dir, file_name),
            )