In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
def overlap_images(datasets=None, start_index=49300):
    """Mask every image with its mask and save the result as a JPEG.

    Reads ``dataset.csv`` and, for each selected dataset, walks that dataset's
    rows by position. For every row it loads the image whose path is derived
    from ``new_path`` (with ``Original_Dataset`` swapped for
    ``crop+clahe_Dataset``) and the mask from ``mask_path``, resizes the mask
    to the image size, bitwise-ANDs the two and writes the result to
    ``Preprocessed_Dataset/<dataset>/<dataset>_<i>.jpg``.

    Args:
        datasets: Iterable of dataset names to process. Defaults to
            ``['cdd-cesm']``, the value that used to be hard-coded here.
        start_index: Positional row within each dataset at which to start.
            Defaults to 49300, the value that used to be hard-coded here;
            a dataset with no more rows than this is skipped entirely.
            Pass 0 to process every row.

    Side effects:
        Rewrites ``dataset.csv`` and writes image files under
        ``Preprocessed_Dataset/``. Rows whose image or mask cannot be loaded
        are reported with ``print`` and skipped.
    """
    if datasets is None:
        datasets = ['cdd-cesm']

    dataframe = pd.read_csv('dataset.csv', low_memory=False)
    # NOTE(review): Series.replace matches whole cell values, not substrings,
    # so this only changes cells that are exactly 'Original_Dataset'. The loop
    # below relies on 'Original_Dataset' still appearing inside new_path, so
    # confirm the intent before switching this to .str.replace.
    dataframe['new_path'] = dataframe['new_path'].replace('Original_Dataset', 'Preprocessed_Dataset')
    dataframe.to_csv('dataset.csv', index=False)

    for dataset in tqdm(datasets, desc='Datasets'):
        print(f'Processing dataset: {dataset}')
        df = dataframe[dataframe['dataset'] == dataset]
        output_dir = f'Preprocessed_Dataset/{dataset}'
        os.makedirs(output_dir, exist_ok=True)

        for i in tqdm(range(max(0, start_index), df.shape[0]), desc='Images'):
            row = df.iloc[i]
            img1_path = row['new_path']
            img2_path = row['mask_path']

            # Empty CSV cells are read back as NaN (a float), which would crash
            # str.replace / cv2.imread; report them like any unreadable image.
            if not isinstance(img1_path, str):
                print(f"Error loading image: {img1_path}")
                continue
            if not isinstance(img2_path, str):
                print(f"Error loading image: {img2_path}")
                continue

            img1_path = img1_path.replace('Original_Dataset', 'crop+clahe_Dataset')
            img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
            img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising.
            if img1 is None:
                print(f"Error loading image: {img1_path}")
                continue
            if img2 is None:
                print(f"Error loading image: {img2_path}")
                continue

            # Force the mask to the image's exact size (cv2.resize takes
            # (width, height)); bitwise_and needs equally sized inputs.
            img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

            masked_image = cv2.bitwise_and(img1, img2)
            masked_image = cv2.cvtColor(masked_image, cv2.COLOR_GRAY2BGR)

            output_path = f'{output_dir}/{dataset}_{i}.jpg'
            if not cv2.imwrite(output_path, masked_image):
                print(f"Error saving image: {output_path}")

In [None]:
# Make sure the output root exists, then build the masked images.
os.makedirs('Preprocessed_Dataset', exist_ok=True)
overlap_images()

## Crop the breast region after overlapping the mask

In [None]:
def crop_breast_region(image_path, min_pixel=10):
    """Crop a grayscale image to the padded bounding box of its largest bright region.

    The image is thresholded, external contours are extracted, and the
    bounding rectangle of the contour with the largest area (assumed to be
    the breast tissue) is padded by 10 pixels on each side, clipped to the
    image bounds, and used as the crop window.

    Args:
        image_path: Path of the image to read. If it contains one of the known
            dataset names, that dataset's threshold overrides ``min_pixel``.
        min_pixel: Threshold used when no known dataset name appears in
            ``image_path``; pixels strictly above it count as foreground.

    Returns:
        The cropped grayscale image; the full image if no contour is found;
        ``None`` if the image could not be read.
    """
    # Per-dataset thresholds, tested in this order; the first name found in
    # the path wins.
    dataset_thresholds = {
        "mias": 50,
        "inbreast": 10,
        "mini-ddsm": 50,
        "kau-bcmd": 25,
        "cmmd": 10,
        "cdd-cesm": 10,
        "rsna-screening": 40,
        "dmid": 60,
    }
    for dataset_name, threshold in dataset_thresholds.items():
        if dataset_name in image_path:
            min_pixel = threshold
            break

    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None on failure; without this check the threshold
    # call below fails with an opaque OpenCV error.
    if img is None:
        print(f"Error loading image: {image_path}")
        return None

    # Apply threshold to separate breast tissue from background
    _, binary = cv2.threshold(img, min_pixel, 255, cv2.THRESH_BINARY)

    # Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        print(f"No contours found in image: {image_path}")
        return img

    # Find the largest contour (assuming it's the breast tissue)
    largest_contour = max(contours, key=cv2.contourArea)

    # Get bounding rectangle of the largest contour
    x, y, w, h = cv2.boundingRect(largest_contour)

    # Pad each edge independently from the unpadded box. Deriving the width
    # from the already-shifted origin would add up to 2*padding on the
    # right/bottom whenever the left/top edge is clipped at 0.
    padding = 10
    height, width = img.shape[:2]
    left = max(0, x - padding)
    top = max(0, y - padding)
    right = min(width, x + w + padding)
    bottom = min(height, y + h + padding)

    # Crop the image
    return img[top:bottom, left:right]

In [None]:
def preprocess(dataset):
    """Crop every preprocessed image of one dataset and save the crops.

    Reads ``dataset.csv``, keeps the rows whose ``dataset`` column equals
    ``dataset``, crops each ``preprocessed_image_path`` with
    ``crop_breast_region`` and writes the result to the same path with
    ``Preprocessed_Dataset`` replaced by ``Cropped_Preprocessed_Dataset``.

    Args:
        dataset: Dataset name to select in the ``dataset`` column.

    Side effects:
        Creates directories and writes image files. Rows whose image is
        missing or cannot be processed are reported with ``print`` and
        skipped instead of aborting the whole run.
    """
    dataframe = pd.read_csv("dataset.csv", low_memory=False)
    df = dataframe[dataframe["dataset"] == dataset]
    for src_path in tqdm(df["preprocessed_image_path"], total=len(df), desc=f"Preprocessing {dataset}"):
        # Empty CSV cells are read back as NaN (a float); treat them, and paths
        # that do not exist on disk, as unreadable images.
        if not isinstance(src_path, str) or not os.path.isfile(src_path):
            print(f"Error loading image: {src_path}")
            continue

        img = crop_breast_region(src_path)
        if img is None:
            print(f"Error loading image: {src_path}")
            continue

        img_path = src_path.replace("Preprocessed_Dataset", "Cropped_Preprocessed_Dataset")
        os.makedirs(os.path.dirname(img_path), exist_ok=True)
        if not cv2.imwrite(img_path, img):
            print(f"Error saving image: {img_path}")

In [None]:
# Crop the preprocessed images of every dataset, one dataset at a time.
datasets = ['mias', 'mini-ddsm', 'inbreast', 'kau-bcmd', 'cmmd', 'cdd-cesm', 'dmid', 'rsna-screening']
for dataset in tqdm(datasets, desc='Datasets'):
    preprocess(dataset)

## Check whether more than 90% of the pixels are black; if so, replace the image with the corresponding image from the crop+clahe_Dataset

In [None]:
# Find the images in which more than 90% of the pixels are black.
def check_black_images(black_threshold=60, black_fraction=0.9):
    """Return the preprocessed image paths that are mostly black.

    Reads ``dataset.csv`` and loads every ``preprocessed_image_path`` as
    grayscale. An image is reported when the share of pixels with a value
    below ``black_threshold`` exceeds ``black_fraction``.

    Args:
        black_threshold: Pixel values strictly below this count as black.
        black_fraction: Fraction of black pixels above which an image is
            reported.

    Returns:
        List of matching paths, in CSV order. Images that cannot be loaded
        are reported with ``print`` and skipped.
    """
    dataframe = pd.read_csv('dataset.csv', low_memory=False)
    mostly_black = []
    for image_path in tqdm(dataframe['preprocessed_image_path'], total=len(dataframe), desc='Checking black images'):
        # Empty CSV cells are read back as NaN (a float), which cv2.imread
        # cannot accept as a filename.
        if not isinstance(image_path, str):
            print(f"Error loading image: {image_path}")
            continue
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Error loading image: {image_path}")
            continue
        # Compare the number of black pixels with the total pixel count.
        black_pixels = np.count_nonzero(img < black_threshold)
        if black_pixels > black_fraction * img.size:
            print(f"Image with more than {black_fraction:.0%} black pixels: {image_path}")
            mostly_black.append(image_path)
    return mostly_black

# Run the scan first and only then write the list, so the file reflects the
# scan results instead of an empty list.
black_images = check_black_images()

# Save the mostly-black image paths as a text file, one path per line.
with open('black_images.txt', 'w') as f:
    f.writelines(f"{path}\n" for path in black_images)

In [None]:
# black_images.txt lists the preprocessed images in which more than 90% of the
# pixels are black (pixel value below 60). Each listed image that is still dark
# is overwritten with the corresponding image from crop+clahe_Dataset.

def replace_black_images():
    """Overwrite the mostly-black images with their crop+clahe counterparts.

    Reads the paths listed in ``black_images.txt`` (one per line). Every
    listed image whose mean grayscale value is below 60 is overwritten in
    place with the image found at the same path with ``Preprocessed_Dataset``
    replaced by ``crop+clahe_Dataset``. Images that cannot be loaded are
    reported with ``print`` and skipped.

    NOTE(review): the mean-below-60 test is a different criterion from the
    ">90% of pixels below 60" rule used to build the list; confirm that this
    second filter is intended.
    NOTE(review): the original cell comment mentioned recording the replaced
    images in another CSV file, but no such file was ever written; that is
    still not implemented here.
    """
    # Blank lines (including the one produced by the trailing newline) are
    # dropped; they would otherwise be treated as image paths.
    with open('black_images.txt', 'r') as f:
        black_images = [line.rstrip('\n') for line in f]
    black_images = [path for path in black_images if path]

    for image_path in tqdm(black_images, desc='Images'):
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Error loading image: {image_path}")
            continue
        if np.mean(img) >= 60:
            continue

        new_image_path = image_path.replace('Preprocessed_Dataset', 'crop+clahe_Dataset')
        new_img = cv2.imread(new_image_path, cv2.IMREAD_GRAYSCALE)
        if new_img is None:
            print(f"Error loading image: {new_image_path}")
            continue
        # The target directory already exists: the image was just read from it.
        if not cv2.imwrite(image_path, new_img):
            print(f"Error saving image: {image_path}")

replace_black_images()