In [None]:
import os
import cv2
import pydicom
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

## Cropping and CLAHE-ing

In [None]:
def crop_breast_region(image_path, min_pixel=5, padding=10):
    """Crop a grayscale image to the bounding box of its largest bright region.

    The image is binarised at ``min_pixel``, external contours are extracted,
    and the contour with the largest area (assumed to be the breast tissue) is
    used to define the crop window.

    Args:
        image_path: Path of the image to load (read as single-channel grayscale).
        min_pixel: Threshold passed to ``cv2.threshold``; brighter pixels are
            treated as foreground.
        padding: Margin in pixels added on every side of the bounding box,
            clipped to the image borders.

    Returns:
        The cropped image, or the full image when no contour is found.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None rather than raising
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Separate tissue from background
    _, binary = cv2.threshold(img, min_pixel, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(contours) == 0:
        print(f"No contours found in image: {image_path}")
        return img

    # Largest contour by area is assumed to be the breast tissue
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)

    # Clamp each edge independently so the margin never exceeds `padding`,
    # even when the bounding box touches the left/top border.
    height, width = img.shape[:2]
    left = max(0, x - padding)
    top = max(0, y - padding)
    right = min(width, x + w + padding)
    bottom = min(height, y + h + padding)

    return img[top:bottom, left:right]

def clahe(img, clipLimit):
    """Apply contrast-limited adaptive histogram equalisation to ``img``.

    Args:
        img: Image array handed to the OpenCV CLAHE object.
        clipLimit: Contrast clip limit passed to ``cv2.createCLAHE``.

    Returns:
        The equalised image produced with an 8x8 tile grid.
    """
    equalizer = cv2.createCLAHE(clipLimit, tileGridSize=(8, 8))
    return equalizer.apply(img)

def cropNclahe(image_path):
    """Crop the breast region using the threshold preset of the image's dataset.

    The dataset is recognised by looking for its name inside ``image_path``;
    the first matching preset wins. Paths that match no preset fall back to
    ``min_pixel=10`` / ``clipLimit=2.0``.

    Args:
        image_path: Path of the image to crop.

    Returns:
        The result of ``crop_breast_region(image_path, min_pixel)``.
    """
    # (path marker, clipLimit, min_pixel) -- checked in this order
    presets = (
        ("mias", 7, 50),
        ("inbreast", 7, 10),
        ("mini-ddsm", 3, 50),
        ("kau-bcmd", 3.5, 25),
        ("cmmd", 2, 10),
        ("cdd-cesm", 1, 10),
        ("rsna-screening", 4, 40),
        ("dmid", 2.5, 60),
    )

    clipLimit, min_pixel = 2.0, 10
    for marker, preset_clip, preset_min in presets:
        if marker in image_path:
            clipLimit, min_pixel = preset_clip, preset_min
            break

    # The CLAHE step is currently disabled, so clipLimit is selected but unused.
    # img_clahe = clahe(img, clipLimit)
    return crop_breast_region(image_path, min_pixel)

## Testing the appropriate Clip Limit for CLAHE for each dataset

In [None]:
def display_images(dataset, range):
    """Preview CLAHE at several clip limits on a random sample of a dataset.

    Up to 10 rows are sampled from ``{dataset}.csv``. Each figure row shows the
    original image followed by the cropped image equalised with every clip
    limit in ``range``.

    Args:
        dataset: Dataset name; ``{dataset}.csv`` must contain a ``new_path`` column.
        range: Sequence of CLAHE clip limits to compare. The parameter name is
            kept for existing callers; it shadows the built-in inside this
            function only.

    Raises:
        ValueError: If the CSV contains no rows.
        FileNotFoundError: If a sampled image cannot be read.
    """
    df = pd.read_csv(f"{dataset}.csv")
    if df.empty:
        raise ValueError(f"{dataset}.csv contains no rows")

    # Never ask for more samples than the CSV holds
    n_rows = min(10, len(df))
    image_paths = df.sample(n_rows)["new_path"].tolist()

    # squeeze=False keeps axs two-dimensional even for a single row
    fig, axs = plt.subplots(n_rows, len(range) + 1, figsize=(20, 4 * n_rows), squeeze=False)
    for i, image_path in enumerate(image_paths):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")

        axs[i, 0].imshow(image, cmap="gray")
        axs[i, 0].axis("off")
        axs[i, 0].set_title("Original")

        # The crop does not depend on the clip limit, so compute it once per image.
        # cropNclahe returns the cropped image without equalisation, so the
        # swept clip limit is applied here.
        cropped = cropNclahe(image_path)
        for j, clipLimit in enumerate(range):
            img_clahe = clahe(cropped, clipLimit)
            axs[i, j + 1].imshow(img_clahe, cmap="gray")
            axs[i, j + 1].axis("off")
            axs[i, j + 1].set_title(f"CLAHE (clipLimit={clipLimit})")
    plt.show()

## For images having a white background, convert their background to black

In [None]:
def white_background_to_black(img):
    """Invert ``img`` when its border looks white.

    Four pixels are probed, each 10 pixels in from the middle of one edge
    (top, left, right, bottom). If at least two of them are >= 200 the image
    is inverted with ``cv2.bitwise_not``; otherwise it is returned untouched.

    Args:
        img: 2-D image array.

    Returns:
        The inverted image, or ``img`` itself when fewer than two probes are white.
    """
    mid_row = img.shape[0] // 2
    mid_col = img.shape[1] // 2
    probes = (
        img[10, mid_col],    # top edge
        img[mid_row, 10],    # left edge
        img[mid_row, -10],   # right edge
        img[-10, mid_col],   # bottom edge
    )
    white_edges = sum(1 for value in probes if value >= 200)

    if white_edges < 2:
        return img
    return cv2.bitwise_not(img)

## Function to make the background black, crop the breast region and apply CLAHE

In [None]:
def preprocess(dataset):
    """Normalise the background of and crop every image listed in ``{dataset}.csv``.

    For each ``new_path`` in the CSV the image is loaded as grayscale, inverted
    if its background looks white, cropped with ``cropNclahe`` and written to
    the matching path under ``crop+clahe_Dataset``.

    Args:
        dataset: Dataset name; ``{dataset}.csv`` must contain a ``new_path`` column.

    Raises:
        FileNotFoundError: If an image listed in the CSV cannot be read.
    """
    df = pd.read_csv(f"{dataset}.csv")
    # Literal replacement; regex=False keeps this independent of the pandas default
    df["crop+clahe_path"] = df["new_path"].str.replace(
        "Original_Dataset", "crop+clahe_Dataset", regex=False
    )

    for source_path, target_path in tqdm(zip(df["new_path"], df["crop+clahe_path"]), total=len(df)):
        img = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for missing/unreadable files
            raise FileNotFoundError(f"Could not read image: {source_path}")

        normalised = white_background_to_black(img)
        # Only touch the source file when the background was actually inverted;
        # rewriting unchanged images would re-encode (and, for JPEG, degrade) them on every run.
        if not np.array_equal(normalised, img):
            cv2.imwrite(source_path, normalised)

        # cropNclahe reads from disk, so it sees the inverted image written above
        img_clahed = cropNclahe(source_path)

        target_dir = os.path.dirname(target_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        cv2.imwrite(target_path, img_clahed)

## MIAS Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [5, 6, 7, 8, 9]
display_images("mias", clip_limits)
preprocess("mias")

## INbreast Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [5, 6, 7, 8]
display_images("inbreast", clip_limits)
preprocess("inbreast")

## Mini-DDSM Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [2.5, 3, 3.5, 4]
display_images("mini-ddsm", clip_limits)
preprocess("mini-ddsm")

In [None]:
def highlight_roi_from_boundary(image_path, mask_path, save_dir):
    """Mask an image with the filled outline of its ROI and save both results.

    The mask's external contours are filled to produce a solid ROI mask, which
    is AND-ed with the grayscale image so that only the ROI remains visible.

    Args:
        image_path: Path of the source image; the text after the last ``_`` in
            its file name (without extension) is reused as the output index.
        mask_path: Path of the ROI mask image.
        save_dir: Directory that receives the two output files.

    Returns:
        Tuple ``(highlighted_image_path, filled_mask_path)`` of the files written.
    """
    # Extract the index from the file path
    index = os.path.basename(image_path).split('_')[-1].split('.')[0]

    # Load both files as grayscale arrays; the context managers close the
    # underlying file handles as soon as the pixel data has been copied out.
    with Image.open(image_path) as image_file:
        image = np.array(image_file.convert('L'))
    with Image.open(mask_path) as mask_file:
        mask = np.array(mask_file.convert('L'))

    if image.shape != mask.shape:
        # Nearest-neighbour keeps the mask's original values; the default
        # bilinear resize would add intermediate non-zero pixels along the
        # boundary, which findContours then counts as foreground.
        mask = cv2.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)

    # Process the mask to get contours
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Fill every external contour to obtain a solid ROI mask
    filled_mask = np.zeros_like(mask)
    cv2.drawContours(filled_mask, contours, -1, 255, thickness=cv2.FILLED)

    # Keep only the image pixels that fall inside the filled ROI
    highlighted_image = cv2.bitwise_and(image, filled_mask)

    # Make the function usable on its own, without a caller preparing save_dir
    os.makedirs(save_dir, exist_ok=True)

    # Save the images with the same index as in the original image
    highlighted_image_path = os.path.join(save_dir, f'mini-ddsm_{index}.jpg')
    filled_mask_path = os.path.join(save_dir, f'mini-ddsm_ROI_{index}.jpg')

    cv2.imwrite(highlighted_image_path, highlighted_image)
    cv2.imwrite(filled_mask_path, filled_mask)

    return highlighted_image_path, filled_mask_path

def apply_highlight_roi(df, save_dir):
    """Run ``highlight_roi_from_boundary`` on every row and record the output paths.

    Args:
        df: DataFrame with ``new_path`` (image) and ``ROI_path`` (mask) columns.
            It is modified in place.
        save_dir: Output directory; created if it does not exist.

    Returns:
        The same DataFrame with ``final_path`` and ``filled_mask_path`` columns added.
    """
    os.makedirs(save_dir, exist_ok=True)

    # One (highlighted_path, mask_path) pair per row, in row order
    saved_pairs = [
        highlight_roi_from_boundary(row['new_path'], row['ROI_path'], save_dir)
        for _, row in tqdm(df.iterrows(), total=len(df), desc='Processing Images')
    ]

    df['final_path'] = [highlighted for highlighted, _ in saved_pairs]
    df['filled_mask_path'] = [filled for _, filled in saved_pairs]
    return df

# Highlight the ROI of every mini-ddsm image and store the new paths back in the same CSV
mini_ddsm_csv = 'mini-ddsm.csv'
df = apply_highlight_roi(pd.read_csv(mini_ddsm_csv), 'mini-ddsm_highlighted_roi')
df.to_csv(mini_ddsm_csv, index=False)

## KAU-BCMD Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [3, 3.5, 4]
display_images("kau-bcmd", clip_limits)
preprocess("kau-bcmd")

## CMMD Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [2, 2.5, 3]
display_images("cmmd", clip_limits)
preprocess("cmmd")

## CDD-CESM Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [0.5, 1, 1.5]
display_images("cdd-cesm", clip_limits)
preprocess("cdd-cesm")

## RSNA Screening Data Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [3.5, 4, 4.5]
display_images("rsna-screening", clip_limits)
preprocess("rsna-screening")

## DMID Preprocessing

In [None]:
# Clip limits to preview; named clip_limits so the built-in range() is not rebound for later cells.
clip_limits = [2, 2.5, 3]
display_images("dmid", clip_limits)
preprocess("dmid")

# Re-cropping the images which are not cropped properly in the first attempt

In [None]:
def is_black_picture(img):
    """Tell whether more than 80% of the pixels of ``img`` are dark (value <= 50).

    Args:
        img: Image array with at least two dimensions.

    Returns:
        True when the dark-pixel count divided by height * width exceeds 0.8.
    """
    height, width = img.shape[0], img.shape[1]
    dark_count = (img <= 50).sum()
    return dark_count / (height * width) > 0.8

# For each dataset, count the cropped images in which more than 80% of the pixels are dark
datasets = ["mias", "inbreast", "mini-ddsm", "kau-bcmd", "cmmd", "cdd-cesm", "dmid", "rsna-screening"]

for dataset in datasets:
    df = pd.read_csv(f"{dataset}.csv")
    black_pictures = 0
    # Iterate over the column directly: this avoids the built-in range(),
    # which the per-dataset cells above rebind to a list of clip limits.
    for new_path in tqdm(df["new_path"], total=len(df)):
        img_path = new_path.replace("Original_Dataset", "crop+clahe_Dataset")
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for missing/unreadable files
            print(f"Could not read image: {img_path}")
            continue
        if is_black_picture(img):
            black_pictures += 1
            # print the path of the black image
            print(new_path)

    print(f"{dataset}: {black_pictures} out of {df.shape[0]} images are black")

In [None]:
def is_black_picture(img):
    """Report whether dark pixels (value <= 50) make up more than 80% of ``img``.

    Args:
        img: Image array with at least two dimensions.

    Returns:
        True when the dark-pixel count divided by height * width exceeds 0.8.
    """
    pixel_total = img.shape[0] * img.shape[1]
    dark_fraction = np.sum(img <= 50) / pixel_total
    return dark_fraction > 0.8

# mini-ddsm images whose crops were flagged as mostly black.
# Named black_image_paths so the built-in list() is not rebound.
black_image_paths = [
    'Original_Dataset/mini-ddsm/mini-ddsm_6.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_190.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_194.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_1884.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_1886.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_2236.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_2238.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_2240.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_2242.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_2806.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_3190.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_3242.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_3506.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_3570.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4448.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4469.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4470.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4471.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4648.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4678.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4956.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_4958.jpg',
    'Original_Dataset/mini-ddsm/mini-ddsm_5002.jpg',
]

# Re-crop each flagged image with a stricter threshold (min_pixel=100) and
# overwrite the cropped file only when the result is no longer mostly black.
for original_path in black_image_paths:
    img_path = original_path.replace("Original_Dataset", "crop+clahe_Dataset")
    if cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) is None:
        # cv2.imread returns None for missing/unreadable files
        print(f"Could not read image: {img_path}")
        continue
    img = crop_breast_region(img_path, min_pixel=100)
    # check if the image is still black
    if is_black_picture(img):
        print(f"Black Image is still black: {img_path}")
    else:
        cv2.imwrite(img_path, img)