In [None]:
import os
import re
import shutil
import pydicom
import tifffile
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [None]:
def rename_and_save_images(data, convert_images=False):
    """Give every image row a sequential ``<dataset>_<n>.jpg`` name and write the dataset CSV.

    The output directory is ``Original_Dataset/<dataset>``, where ``<dataset>`` is the
    ``dataset`` value of the first row. ``data`` is modified in place: the columns
    ``new_path``, ``mask_path`` and ``preprocessed_image_path`` are added and the
    ``../Individual_Original_Datasets/`` prefix is removed from ``original_image_path``.
    The frame is finally written to ``<dataset>.csv`` in the working directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``dataset`` and ``original_image_path`` (string) columns and at least one row.
    convert_images : bool, default False
        When True, ``.dcm`` / ``.png`` / ``.jpeg`` / ``.pgm`` sources are rendered to the
        new ``.jpg`` path with ``plt.imsave(..., cmap='gray')``. With the default, those
        formats only get a name assigned and no image is written. Files with any other
        extension are always copied unchanged to the new path.
    """
    source_root = '../Individual_Original_Datasets/'
    dataset_name = data['dataset'].iloc[0]  # Extract the dataset name from the first row
    print(f"Processing {dataset_name} dataset...")
    new_dir = f"Original_Dataset/{dataset_name}"

    # Create the output directory if it is not there yet
    os.makedirs(new_dir, exist_ok=True)

    print(f"Saving images to {new_dir}...")
    new_paths = []
    j = 0

    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        old_path = row['original_image_path']
        new_path = os.path.join(new_dir, f"{row['dataset']}_{j}.jpg")

        if old_path.endswith('.dcm'):
            # The DICOM is only opened when a conversion was requested
            if convert_images:
                plt.imsave(new_path, pydicom.dcmread(old_path).pixel_array, cmap='gray')
        elif old_path.endswith(('.png', '.jpeg', '.pgm')):
            if convert_images:
                plt.imsave(new_path, mpimg.imread(old_path), cmap='gray')
        else:
            # Copied files must not overwrite an existing output: skip taken numbers
            while os.path.exists(new_path):
                j += 1
                new_path = os.path.join(new_dir, f"{row['dataset']}_{j}.jpg")
            shutil.copy(old_path, new_path)

        new_paths.append(new_path)
        j += 1

    print(f"Total images saved: {j}")

    # Assign by position. Writing through the index label would give every row that
    # shares a label (e.g. after pd.concat without ignore_index) the same path.
    data['new_path'] = new_paths

    # Drop the source-root prefix as a literal string, not as a regular expression
    data['original_image_path'] = data['original_image_path'].str.replace(source_root, '', regex=False)
    # Derived locations for masks and preprocessed images
    data['mask_path'] = data['new_path'].str.replace('Original_Dataset', 'Masks')
    data['preprocessed_image_path'] = data['new_path'].str.replace('Original_Dataset', 'Preprocessed_Dataset')

    # Save the DataFrame to a CSV file
    data.to_csv(f"{dataset_name}.csv", index=False)

In [None]:
def find_missing_and_duplicate_files(data):
    """Print how the files in ``Original_Dataset/<dataset>`` compare with the expected names.

    The expected names are ``<dataset>_0.jpg`` ... ``<dataset>_<len(data)-1>.jpg``, where
    ``<dataset>`` is the ``dataset`` value of the first row of ``data``. The function prints
    the counts of expected, actual, missing and duplicated names, followed by the sorted
    missing and duplicated names when there are any. Nothing is returned.

    Raises whatever ``os.listdir`` raises when the directory does not exist, and
    ``IndexError`` when ``data`` has no rows.
    """
    from collections import Counter

    dataset_name = data.iloc[0]['dataset']

    # Directory containing the renamed images
    new_dir = f"Original_Dataset/{dataset_name}"

    # Names the directory should contain, one per row of the frame
    expected_filenames = [f"{dataset_name}_{i}.jpg" for i in range(len(data))]

    # Names actually present in the directory
    actual_filenames = sorted(os.listdir(new_dir))

    missing_filenames = set(expected_filenames) - set(actual_filenames)

    # Count each name once up front instead of rescanning the list for every entry
    occurrences = Counter(actual_filenames)
    duplicate_filenames = [filename for filename in actual_filenames if occurrences[filename] > 1]

    # Print results
    print(f"Total expected filenames: {len(expected_filenames)}")
    print(f"Total actual filenames: {len(actual_filenames)}")
    print(f"Missing filenames: {len(missing_filenames)}")
    print(f"Duplicate filenames: {len(duplicate_filenames)}")

    if missing_filenames:
        print("Missing filenames:")
        for filename in sorted(missing_filenames):
            print(filename)

    if duplicate_filenames:
        print("Duplicate filenames:")
        for filename in sorted(set(duplicate_filenames)):
            print(filename)

# MIAS

In [None]:
# Load the MIAS annotation table (space-separated text)
mias = pd.read_csv('../Individual_Original_Datasets/MIAS/Info.txt', sep=' ')

# Discard the 'Unnamed: 7' column and switch to the column names shared across datasets
mias = (
    mias
    .drop(columns=['Unnamed: 7'])
    .rename(columns={'CLASS': 'class', 'SEVERITY': 'classification', 'REFNUM': 'patientID', 'BG': 'density', 'RADIUS': 'radius', 'X': 'x', 'Y': 'y'})
)

# Spell out the single-letter severity and background-tissue codes
for code, label in (('B', 'Benign'), ('M', 'Malignant')):
    mias['classification'] = mias['classification'].replace(code, label)
for code, label in (('D', 'Dense'), ('F', 'Fatty'), ('G', 'Glandular')):
    mias['density'] = mias['density'].replace(code, label)

# One .pgm file per reference number, plus the constant descriptor columns
mias = mias.assign(
    image_path='MIAS/all-mias/' + mias['patientID'].astype(str) + '.pgm',
    image_type='full mammogram image',
    dataset='mias',
)

# Point the path at the source-data folder
mias['image_path'] = mias['image_path'].str.replace('MIAS/all-mias/', '../Individual_Original_Datasets/MIAS/all-mias/')

# Keep one row per image with a known path, then use the column name
# the renaming helpers expect
mias = (
    mias
    .dropna(subset=['image_path'])
    .drop_duplicates(subset=['image_path'])
    .rename(columns={'image_path': 'original_image_path'})
)

for summary in (mias.shape, mias.count(), mias.head()):
    print(summary)

In [None]:
# Assign mias_<n>.jpg names and write mias.csv, then compare the expected names
# with the contents of Original_Dataset/mias.
# NOTE(review): as called here, rename_and_save_images does not write an image for
# .pgm sources -- confirm the files already exist in Original_Dataset/mias.
rename_and_save_images(mias)
find_missing_and_duplicate_files(mias)

# INbreast

In [None]:
# Read the INbreast sheet; rows without a file name cannot be tied to an image
inbreast = pd.read_excel('../Individual_Original_Datasets/INbreast/INbreast.xls').dropna(subset=['File Name'])

# A cell holding only an apostrophe is treated as missing; keep the cleaned ACR values
# under 'density' as well
inbreast['ACR'] = inbreast['ACR'].replace("'", np.nan)
inbreast['density'] = inbreast['ACR']

# Remove the columns that are not carried forward and adopt the shared column names
inbreast = (
    inbreast
    .drop(columns=['Patient age', 'Pectoral Muscle Annotation', 'Acquisition date', 'Mass ', 'Micros', 'Distortion', 'Asymmetry', 'Lesion Annotation Status', 'Patient ID', 'Findings Notes (in Portuguese)', 'Other Notes', 'Other Annotations'])
    .rename(columns={'Bi-Rads': 'BIRADS', 'File Name': 'image_path', 'Laterality': 'laterality', 'View': 'view', 'ACR': 'ACR(density)'})
)

# File name -> int -> str, prefixed with the DICOM folder
inbreast['image_path'] = 'INbreast/ALLDICOMs/' + inbreast['image_path'].astype(int).astype(str)

# patientID is the first run of digits in the path, i.e. the file number itself.
# The captured text holds digits only, so no folder prefix needs removing afterwards.
inbreast['patientID'] = inbreast['image_path'].str.extract(r'([0-9]+)', expand=False)

def update_image_path(base_dir, current_path):
    """Resolve a path stem to the file in ``base_dir`` whose name starts with it.

    Parameters
    ----------
    base_dir : str
        Directory that is listed to look for the file.
    current_path : str
        '/'-separated path; only its last component is used as the file-name prefix.

    Returns
    -------
    str or float
        ``os.path.join(base_dir, name)`` for the alphabetically first matching file, or
        ``numpy.nan`` when no file matches, so that a later ``dropna`` on the column can
        discard the row instead of keeping a path that points nowhere.
    """
    number = current_path.split('/')[-1]
    # Sort so the choice does not depend on the order os.listdir happens to return
    matches = sorted(name for name in os.listdir(base_dir) if name.startswith(number))
    if not matches:
        return np.nan
    return os.path.join(base_dir, matches[0])

# Resolve every path stem to the DICOM file found in the base directory
base_directory = '../Individual_Original_Datasets/INbreast/AllDICOMs'  # Replace with your actual base directory path
inbreast['image_path'] = inbreast['image_path'].apply(lambda x: update_image_path(base_directory, x))
inbreast['image_type'] = 'full mammogram image'
inbreast['dataset'] = 'inbreast'

# remove rows with NaN values in image_path
inbreast = inbreast.dropna(subset=['image_path'])

# BIRADs to NBM mapping. A plain '4' has no entry here; it is assigned further down.
birads_to_nbm = {
    '0': 'Normal',
    '1': 'Normal',
    '2': 'Benign',
    '3': 'Benign',
    '4a': 'Suspicious Malignant',
    '4b': 'Suspicious Malignant',
    '4c': 'Suspicious Malignant',
    '5': 'Malignant',
    '6': 'Malignant'
}

dataset = inbreast.copy()

# BIRADS is compared as text from here on (values such as '4a' are not numeric)
dataset['BIRADS'] = dataset['BIRADS'].astype(str)
# Every row of this frame belongs to 'inbreast', so the whole column is mapped
dataset['classification'] = dataset['BIRADS'].map(birads_to_nbm)
print(dataset.head())

density_mapping = {
    1.0: 'A',
    2.0: 'B',
    3.0: 'C',
    4.0: 'D'
}

# Map the numeric density to its letter in a temporary column density2
dataset['density2'] = dataset['density'].map(density_mapping)

# A plain BIRADS 4 is also "Suspicious Malignant". BIRADS holds strings at this point,
# so the comparison has to be against '4'; comparing with the integer 4 never matches.
dataset.loc[dataset['BIRADS'] == '4', 'classification'] = 'Suspicious Malignant'

# Replace the numeric density columns with the letter-coded one
dataset = dataset.drop(columns=['density', 'ACR(density)'])
dataset = dataset.rename(columns={'density2': 'density'})

# change the name of the column image path to original_image_path
dataset = dataset.rename(columns={'image_path': 'original_image_path'})

# drop the rows with NaN values in the original_image_path column
dataset = dataset.dropna(subset=['original_image_path'])

print(dataset.shape)
print(dataset.count())
print(dataset.head())

In [None]:
# `dataset` is the INbreast frame here: assign inbreast_<n>.jpg names and write
# inbreast.csv, then compare the expected names with Original_Dataset/inbreast.
# NOTE(review): as called here, rename_and_save_images does not write an image for
# .dcm sources -- confirm the files already exist in Original_Dataset/inbreast.
rename_and_save_images(dataset)
find_missing_and_duplicate_files(dataset)

# Mini-DDSM

In [None]:
# Build the mini_ddsm frame with the columns patientID, image_path, ROI_path, laterality,
# view, density, classification, age, image_type, dataset
mini_ddsm = pd.read_excel('../Individual_Original_Datasets/Mini-DDSM/DataWMask.xlsx')

# rename the columns
mini_ddsm = mini_ddsm.rename(columns={'Tumour_Contour': 'ROI_path', 'fullPath': 'image_path', 'Side': 'laterality', 'View': 'view', 'Density': 'density', 'Status': 'classification', 'Age': 'age'})

# patientID is the number between the first and second '\' of the image path
mini_ddsm['patientID'] = mini_ddsm['image_path'].str.extract(r'\\([0-9]+)\\', expand=False)
mini_ddsm['image_path'] = 'Mini-DDSM/' + mini_ddsm['image_path']
# replace the \ with / in the image path (literal match, independent of the pandas regex default)
mini_ddsm['image_path'] = mini_ddsm['image_path'].str.replace('\\', '/', regex=False)

# The sheet labels malignant cases 'Cancer'. This has to act on mini_ddsm; the name
# `df` is not defined until a later cell.
mini_ddsm['classification'] = mini_ddsm['classification'].replace('Cancer', 'Malignant')

# Keep only the needed columns; copy so the assignments below act on an independent frame
mini_ddsm = mini_ddsm[['patientID', 'image_path', 'ROI_path', 'laterality', 'view', 'density', 'classification', 'age']].copy()
mini_ddsm['image_type'] = 'full mammogram image'
mini_ddsm['dataset'] = 'mini-ddsm'

# convert - in the ROI_path to nan
mini_ddsm['ROI_path'] = mini_ddsm['ROI_path'].replace('-', np.nan)

# replace all \ with / in the ROI_path column and add Mini-DDSM/ in the beginning
mini_ddsm['ROI_path'] = mini_ddsm['ROI_path'].str.replace('\\', '/', regex=False)
mini_ddsm['ROI_path'] = 'Mini-DDSM/' + mini_ddsm['ROI_path']

mini_ddsm['image_path'] = mini_ddsm['image_path'].str.replace('Mini-DDSM/', '../Individual_Original_Datasets/Mini-DDSM/')

# change the name of the column image path to original_image_path
mini_ddsm = mini_ddsm.rename(columns={'image_path': 'original_image_path'})

# convert the density values 1-4 to A-D; 0 becomes NaN
for number, letter in ((1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (0, np.nan)):
    mini_ddsm['density'] = mini_ddsm['density'].replace(number, letter)

print(mini_ddsm.shape)
print(mini_ddsm.count())
print(mini_ddsm.head())

In [None]:
def rename_and_save_images_w_ROI2(data):
    """Convert or copy each image (and its ROI image, if any) to sequential ``.jpg`` names.

    Outputs go to ``Original_Dataset/<dataset>``, where ``<dataset>`` is the ``dataset``
    value of the first row. ``.dcm`` sources are read with pydicom and ``.png`` / ``.jpeg``
    / ``.pgm`` / ``.jpg`` sources with ``mpimg.imread``; both are written with
    ``plt.imsave(..., cmap='gray')``. For those formats a row with an ROI also gets
    ``<name>_ROI.jpg`` written, and its ``ROI_path`` is set to that file. Any other
    extension is copied unchanged to the next unused name and its ``ROI_path`` is kept.

    ``data`` is modified in place (``new_path`` added; ``ROI_path`` and
    ``original_image_path`` rewritten without the source-root prefix) and written to
    ``../<dataset>.csv``.

    NOTE(review): the other renaming helpers write the CSV into the working directory
    rather than its parent -- confirm which location is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``dataset``, ``original_image_path`` and ``ROI_path`` columns. Paths may start
        with ``Mini-DDSM`` or already carry the ``../Individual_Original_Datasets/`` prefix.
    """
    source_root = '../Individual_Original_Datasets/'
    dataset_name = data['dataset'].iloc[0]  # Extract the dataset name from the first row
    print(f"Processing {dataset_name} dataset...")
    new_dir = f"Original_Dataset/{dataset_name}"

    data['new_path'] = None
    data['ROI_path'] = data['ROI_path'].replace('-', np.nan)
    # Root only the paths that start with the dataset folder. Anchoring at the start leaves
    # an already-rooted path alone instead of prefixing it a second time.
    data['ROI_path'] = data['ROI_path'].str.replace(r'^Mini-DDSM', source_root + 'Mini-DDSM', regex=True)
    data['original_image_path'] = data['original_image_path'].str.replace(r'^Mini-DDSM', source_root + 'Mini-DDSM', regex=True)

    # Create the output directory if it is not there yet
    os.makedirs(new_dir, exist_ok=True)

    print(f"Saving images to {new_dir}...")
    new_paths = []
    roi_paths = []
    j = 0

    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        old_path = row['original_image_path']
        new_path = os.path.join(new_dir, f"{row['dataset']}_{j}.jpg")
        ROI_path = row['ROI_path']
        has_roi = not pd.isna(ROI_path)

        if old_path.endswith('.dcm'):
            plt.imsave(new_path, pydicom.dcmread(old_path).pixel_array, cmap='gray')
            if has_roi:
                roi_img = pydicom.dcmread(ROI_path).pixel_array
                ROI_path = new_path.replace('.jpg', '_ROI.jpg')
                plt.imsave(ROI_path, roi_img, cmap='gray')
        elif old_path.endswith(('.png', '.jpeg', '.pgm', '.jpg')):
            plt.imsave(new_path, mpimg.imread(old_path), cmap='gray')
            if has_roi:
                roi_img = mpimg.imread(ROI_path)
                ROI_path = new_path.replace('.jpg', '_ROI.jpg')
                plt.imsave(ROI_path, roi_img, cmap='gray')
        else:
            # Copied files must not overwrite an existing output: skip taken numbers
            while os.path.exists(new_path):
                j += 1
                new_path = os.path.join(new_dir, f"{row['dataset']}_{j}.jpg")
            shutil.copy(old_path, new_path)

        new_paths.append(new_path)
        roi_paths.append(ROI_path)
        j += 1

    print(f"Total images saved: {j}")

    # Assign by position so rows sharing an index label cannot overwrite each other
    data['new_path'] = new_paths
    data['ROI_path'] = np.array(roi_paths, dtype=object)

    # Strip the source root as a literal (escaped) pattern; Series.replace leaves NaN untouched
    root_pattern = re.escape(source_root)
    data['original_image_path'] = data['original_image_path'].replace(root_pattern, '', regex=True)
    data['ROI_path'] = data['ROI_path'].replace(root_pattern, '', regex=True)

    # Save the DataFrame to a CSV file
    data.to_csv(f"../{dataset_name}.csv", index=False)

In [None]:
# Convert the Mini-DDSM images (and ROI images) to mini-ddsm_<n>.jpg, write the CSV,
# then compare the expected names with the contents of Original_Dataset/mini-ddsm.
# The *_ROI.jpg files written next to the images are included in the "actual" count
# of the check but are not part of the expected names.
rename_and_save_images_w_ROI2(mini_ddsm)
find_missing_and_duplicate_files(mini_ddsm)

# KAU-BCMD

In [None]:
kau_bcmd = pd.read_excel('../Individual_Original_Datasets/king-abdulaziz-uni/correctSheetlast.xlsx', sheet_name='correctSheet')

# Select relevant columns and rename them
kau_bcmd = kau_bcmd[['Percentage of\n grandular tissue(density)', 'PatientID', 'Patient age ', 'Breast  type', 'Breast view', 'Assesment', 'Image path']]
kau_bcmd = kau_bcmd.rename(columns={'Percentage of\n grandular tissue(density)': 'density', 'PatientID': 'patientID', 'Patient age ': 'age', 'Breast  type': 'laterality', 'Breast view': 'view', 'Assesment': 'BIRADS', 'Image path': 'image_path'})

# Keep everything from the first '/' of the sheet's path and root it at the dataset folder.
# expand=False makes str.extract return a Series, so the result stays a single column.
# NOTE(review): the captured text itself starts with '/', so the paths contain a doubled
# slash after the folder name.
kau_bcmd['image_path'] = 'king-abdulaziz-uni/' + kau_bcmd['image_path'].str.extract(r'(/.+)', expand=False)

# Extract age from 'age' column (first run of digits)
kau_bcmd['age'] = kau_bcmd['age'].astype(str).str.extract(r'(\d+)', expand=False)

# Process BIRADS column: keep the second whitespace-separated token
kau_bcmd['BIRADS'] = kau_bcmd['BIRADS'].str.split().str[1]
kau_bcmd['BIRADS'] = kau_bcmd['BIRADS'].replace('nan', np.nan)

# Convert BIRADS to a number; anything unparseable or missing becomes -1
kau_bcmd['BIRADS'] = pd.to_numeric(kau_bcmd['BIRADS'], errors='coerce')  # Coerce errors to NaN
kau_bcmd['BIRADS'] = kau_bcmd['BIRADS'].fillna(-1).astype(int)  # Fill NaNs with -1 and convert to int
kau_bcmd['image_type'] = 'full mammogram image'
kau_bcmd['dataset'] = 'kau-bcmd'

# BIRADs to NBM mapping (values without an entry, including -1, map to NaN)
birads_to_nbm = {
    1.0: 'Normal',
    2.0: 'Benign',
    3.0: 'Benign',
    4.0: 'Suspicious Malignant',  # (Suspicious Malignant)
    5.0: 'Malignant',
    6.0: 'Malignant'
}

# Every row of this frame belongs to 'kau-bcmd', so the whole column is mapped
kau_bcmd['classification'] = kau_bcmd['BIRADS'].map(birads_to_nbm)

density_mapping = {
    '0%-25%': 'A',
    '26%-50%': 'B',
    '51%-75%': 'C',
    '>75%': 'D'
}

# Map the percentage bands to density letters
kau_bcmd['density'] = kau_bcmd['density'].map(density_mapping)

# Replace the .dcm extension with .jpg. regex=False keeps '.' a literal dot on every
# pandas version instead of "any character".
kau_bcmd['image_path'] = kau_bcmd['image_path'].str.replace('.dcm', '.jpg', regex=False)
kau_bcmd['image_path'] = kau_bcmd['image_path'].str.replace('king-abdulaziz-uni/', '../Individual_Original_Datasets/king-abdulaziz-uni/', regex=False)

# Remove rows with NaN values in image_path
kau_bcmd = kau_bcmd.dropna(subset=['image_path'])

# drop the images with duplicate image_path
kau_bcmd = kau_bcmd.drop_duplicates(subset=['image_path'])

print(kau_bcmd.shape)
print(kau_bcmd.count())

# Keep only the rows whose image file exists on disk
kau_bcmd = kau_bcmd[kau_bcmd['image_path'].apply(os.path.exists)]

# rename the column image path to original_image_path
kau_bcmd = kau_bcmd.rename(columns={'image_path': 'original_image_path'})

print(kau_bcmd.shape)
print(kau_bcmd.count())
print(kau_bcmd.head())

In [None]:
# Assign kau-bcmd_<n>.jpg names and write kau-bcmd.csv, then compare the expected
# names with the contents of Original_Dataset/kau-bcmd.
rename_and_save_images(kau_bcmd)
find_missing_and_duplicate_files(kau_bcmd)

# CMMD

In [None]:
# Load and preprocess the CMMD clinical sheet
cmmd = pd.read_excel('../Individual_Original_Datasets/CMMD/CMMD_clinicaldata_revision.xlsx')

cmmd = cmmd.drop(columns=['number'])
# rename LeftRight to laterality, ID1 to patientID and Age to age
cmmd = cmmd.rename(columns={'LeftRight': 'laterality', 'ID1': 'patientID', 'Age': 'age'})

# metadata.csv links each 'Subject ID' to a 'File Location'; that location becomes image_path
metadata = pd.read_csv('../Individual_Original_Datasets/CMMD/metadata.csv')
metadata = metadata.rename(columns={'Subject ID': 'patientID', 'File Location': 'image_path'})

# Replace the '.\' in the location with 'CMMD/'. regex=False is required: as a regular
# expression the pattern ends in a lone backslash, which is not a valid pattern.
metadata['image_path'] = metadata['image_path'].str.replace('.\\', 'CMMD/', regex=False)

# replace all the '\' with '/' in the image_path
metadata['image_path'] = metadata['image_path'].str.replace('\\', '/', regex=False)

# merge the two dataframes on the patientID column
cmmd = pd.merge(cmmd, metadata, on=['patientID'])
cmmd = cmmd[['patientID', 'abnormality', 'classification', 'image_path', 'laterality', 'age', 'subtype']]

# remove rows with NaN values in image_path
cmmd = cmmd.dropna(subset=['image_path'])
cmmd['dataset'] = 'cmmd'
cmmd['image_type'] = 'full mammogram image'

cmmd['image_path'] = cmmd['image_path'].str.replace('CMMD/CMMD/', '../Individual_Original_Datasets/CMMD/CMMD/')

# Remember every row whose image path occurs more than once, then keep the first of each
duplicate_images = cmmd[cmmd.duplicated(subset=['image_path'], keep=False)]
cmmd = cmmd.drop_duplicates(subset=['image_path'])

# One frame per DICOM file name; rows whose file does not exist are dropped
cmmd11 = cmmd.copy()
cmmd11['image_path'] = cmmd11['image_path'] + '/1-1.dcm'
cmmd11 = cmmd11[cmmd11['image_path'].apply(os.path.exists)]
cmmd11['view'] = 'CC'

cmmd12 = cmmd.copy()
cmmd12['image_path'] = cmmd12['image_path'] + '/1-2.dcm'
cmmd12 = cmmd12[cmmd12['image_path'].apply(os.path.exists)]
cmmd12['view'] = 'MLO'

cmmd13 = cmmd.copy()
cmmd13['image_path'] = cmmd13['image_path'] + '/1-3.dcm'
cmmd13 = cmmd13[cmmd13['image_path'].apply(os.path.exists)]
# replace the D1 in patientID with D1.5 for the rows that have a 1-3.dcm
cmmd13['patientID'] = cmmd13['patientID'].str.replace('D1', 'D1.5')
# The view belongs on cmmd13 itself (it was being set on cmmd11 a second time)
cmmd13['view'] = 'CC'

cmmd14 = cmmd.copy()
cmmd14['image_path'] = cmmd14['image_path'] + '/1-4.dcm'
cmmd14 = cmmd14[cmmd14['image_path'].apply(os.path.exists)]
# replace the D1 in patientID with D1.5 for the rows that have a 1-4.dcm
cmmd14['patientID'] = cmmd14['patientID'].str.replace('D1', 'D1.5')
cmmd14['view'] = 'MLO'

# Stack the four frames. They all carry the index labels of the same parent frame, so a
# fresh index is needed; otherwise label-based writes hit several rows at once.
cmmd = pd.concat([cmmd11, cmmd12, cmmd13, cmmd14], ignore_index=True)

# For every patientID that has a 1-3.dcm row: 1-1/1-2 become the left breast (CC/MLO) and
# 1-3/1-4 the right breast (CC/MLO). Computed once per file name instead of once per row.
# NOTE(review): 1-3/1-4 rows of 'D1' patients were renamed to 'D1.5' above, so their
# 1-1/1-2 rows (still 'D1') are not matched here -- confirm this is intended.
second_exam_patients = cmmd.loc[cmmd['image_path'].str.contains('1-3.dcm', regex=False), 'patientID']
has_second_exam = cmmd['patientID'].isin(second_exam_patients)
for filename, side, view in (('1-1.dcm', 'L', 'CC'), ('1-2.dcm', 'L', 'MLO'), ('1-3.dcm', 'R', 'CC'), ('1-4.dcm', 'R', 'MLO')):
    selected = has_second_exam & cmmd['image_path'].str.contains(filename)
    cmmd.loc[selected, 'laterality'] = side
    cmmd.loc[selected, 'view'] = view

# remove entries with NaN values in the image_path column
cmmd = cmmd.dropna(subset=['image_path'])

# change the name of the column image path to original_image_path
cmmd = cmmd.rename(columns={'image_path': 'original_image_path'})

print(cmmd.shape)
print(cmmd.count())
print(cmmd.head())

In [None]:
# Assign cmmd_<n>.jpg names and write cmmd.csv, then compare the expected names
# with the contents of Original_Dataset/cmmd.
# NOTE(review): as called here, rename_and_save_images does not write an image for
# .dcm sources -- confirm the files already exist in Original_Dataset/cmmd.
rename_and_save_images(cmmd)
find_missing_and_duplicate_files(cmmd)

In [None]:
# Reload the CSV written for CMMD; `duplicate_images` must still be defined from the
# cell that built the CMMD frame.
cmmd = pd.read_csv('cmmd.csv')

print(duplicate_images.shape)
print(duplicate_images.columns)

# For every remembered duplicate row, copy its subtype and classification onto the rows
# that have the same patientID and the same laterality. Rows are only updated, never
# added or removed, so the length of the frame is unchanged.
for _, duplicate in duplicate_images.iterrows():
    same_breast = (cmmd['patientID'] == duplicate['patientID']) & (cmmd['laterality'] == duplicate['laterality'])
    cmmd.loc[same_breast, 'subtype'] = duplicate['subtype']
    cmmd.loc[same_breast, 'classification'] = duplicate['classification']

for summary in (cmmd.shape, cmmd.count(), cmmd.head()):
    print(summary)

# Write the corrected frame back over the CSV
cmmd.to_csv('cmmd.csv', index=False)

# CDD-CESM

In [None]:
cdd_cesm = pd.read_excel('../Individual_Original_Datasets/CDD-CESM/Radiology-manual-annotations.xlsx')

# if the BIRADS value is 2$2, replace it with 2
cdd_cesm['BIRADS'] = cdd_cesm['BIRADS'].replace('2$2', int(2))

# convert all entries to strings
cdd_cesm['BIRADS'] = cdd_cesm['BIRADS'].astype(str)

# Any remaining entry containing '$' becomes NaN (raw string, so '\$' is a valid escape)
cdd_cesm['BIRADS'] = cdd_cesm['BIRADS'].replace(r'.*\$.*', np.nan, regex=True)

# add a column called dataset and set it to cdd-cesm
cdd_cesm['dataset'] = 'cdd-cesm'
cdd_cesm = cdd_cesm.rename(columns={'Image_name': 'image_path', 'Patient_ID': 'patientID', 'Side': 'laterality', 'Breast density (ACR)' : 'density', 'View': 'view', 'Age': 'age', 'Pathology Classification/ Follow up': 'classification'})

# Image names are used as file names, so remove the spaces
cdd_cesm['image_path'] = cdd_cesm['image_path'].str.replace(' ', '', regex=False)
cdd_cesm = cdd_cesm.rename(columns={'image_path': 'original_image_path', 'new_path': 'image_path'})

# Keep only the rows whose Type is 'DM'. Copy so the assignment below acts on an
# independent frame rather than on a slice.
cdd_cesm = cdd_cesm[cdd_cesm['Type'] == 'DM'].copy()
cdd_cesm['original_image_path'] = '../Individual_Original_Datasets/CDD-CESM/Low_Energy_Images/' + cdd_cesm['original_image_path'] + '.jpg'

# remove the columns Type, Findings, Tags, Machine and density
cdd_cesm = cdd_cesm.drop(columns=['Type', 'Findings', 'Tags', 'Machine', 'density'])

# remove rows with NaN values in original_image_path
cdd_cesm = cdd_cesm.dropna(subset=['original_image_path'])

print(cdd_cesm.shape)
print(cdd_cesm.count())
print(cdd_cesm.head())

In [None]:
# Assign cdd-cesm_<n>.jpg names and write cdd-cesm.csv, then compare the expected
# names with the contents of Original_Dataset/cdd-cesm.
rename_and_save_images(cdd_cesm)
find_missing_and_duplicate_files(cdd_cesm)

# RSNA Screening Data

In [None]:
# Read the training table and keep only the columns that are carried forward
rsna_screening = pd.read_csv('../Individual_Original_Datasets/EMBED/train.csv').loc[
    :, ['patient_id', 'image_id', 'laterality', 'view', 'age', 'BIRADS', 'density']
]

# age: first run of digits of the value; image_path: <patient_id>/<image_id>.dcm under
# the train_images folder
rsna_screening = rsna_screening.assign(
    age=rsna_screening['age'].astype(str).str.extract(r'(\d+)', expand=False),
    image_path='EMBED/train_images/' + rsna_screening['patient_id'].astype(str) + '/' + rsna_screening['image_id'].astype(str) + '.dcm',
)

# Shared column name for the patient, rows without a path removed, constant descriptors added
rsna_screening = (
    rsna_screening
    .rename(columns={'patient_id': 'patientID'})
    .dropna(subset=['image_path'])
    .assign(image_type='full mammogram image', dataset='rsna-screening')
)

# Drop the entry with image_id 2029770528 (broken entry: the image does not exist);
# image_id is not needed afterwards
rsna_screening = rsna_screening.loc[rsna_screening['image_id'] != 2029770528].drop(columns=['image_id'])

# BIRADs to NBM mapping
birads_to_nbm = {
    0.0: 'Normal',
    1.0: 'Normal',
    2.0: 'Benign',
    3.0: 'Benign',
    4.0: 'Suspicious Malignant',   #(Suspicious Anomaly. Biopsy should be considered)
    5.0: 'Malignant',
    6.0: 'Malignant'
}

# Every row of this frame belongs to 'rsna-screening', so the whole column is mapped
rsna_screening['classification'] = rsna_screening['BIRADS'].map(birads_to_nbm)
rsna_screening['image_path'] = rsna_screening['image_path'].str.replace('EMBED/train_images/', '../Individual_Original_Datasets/EMBED/train_images/')

# Remove rows without a path and use the column name the renaming helpers expect
rsna_screening = (
    rsna_screening
    .dropna(subset=['image_path'])
    .rename(columns={'image_path': 'original_image_path'})
)

for summary in (rsna_screening.shape, rsna_screening.count(), rsna_screening.head()):
    print(summary)

In [None]:
# Assign rsna-screening_<n>.jpg names and write rsna-screening.csv, then compare the
# expected names with the contents of Original_Dataset/rsna-screening.
# NOTE(review): as called here, rename_and_save_images does not write an image for
# .dcm sources -- confirm the files already exist in Original_Dataset/rsna-screening.
rename_and_save_images(rsna_screening)
find_missing_and_duplicate_files(rsna_screening)

# DMID

In [None]:
# Read Metadata.xlsx, skipping the first 30 rows of the sheet
dmid = pd.read_excel('../Individual_Original_Datasets/DMID/Metadata.xlsx', skiprows=30)

# rename the columns
dmid = dmid.rename(columns={'Unnamed: 0': 'patientID', 'Unnamed: 1': 'view', 'Unnamed: 2': 'density', 'Unnamed: 3': 'class', 'Unnamed: 4': 'classification', 'Unnamed: 5': 'x', 'Unnamed: 6': 'y', 'Unnamed: 7': 'radius'})

# The raw view code combines projection and side:
#   MLORT -> MLO / R,  CCRT -> CC / R,  MLOLT -> MLO / L,  CCLT -> CC / L
# Take the side from the letter directly before the 'T'. Matching the first R or L
# anywhere in the code would pick up the 'L' of 'MLO' and label every MLO image as left.
dmid['laterality'] = dmid['view'].str.extract(r'([RL])T', expand=False)
dmid['view'] = dmid['view'].str.replace('RT', '')
dmid['view'] = dmid['view'].str.replace('LT', '')
dmid['image_path'] = 'DMID/DICOM_Images/' + dmid['patientID'].astype(str)+ '.dcm'
# convert path to string
dmid['image_path'] = dmid['image_path'].astype(str)
# remove space from the image_path column
dmid['image_path'] = dmid['image_path'].str.replace(' ', '')
dmid['image_type'] = 'full mammogram image'
dmid['dataset'] = 'dmid'

# remove the duplicate patientID entries
dmid = dmid.drop_duplicates(subset=['patientID'])

# Define the path to the ROI Masks directory
roi_masks_dir = '../Individual_Original_Datasets/DMID/ROI_Masks'

# Get a list of all files in the directory
roi_files = os.listdir(roi_masks_dir)
# Set copy for fast membership tests below
roi_file_names = set(roi_files)

# The candidate mask name is <patientID>.tif with spaces removed; it is kept (joined to
# the mask directory) only if such a file exists, otherwise the entry becomes NaN
dmid['ROI_path'] = dmid['patientID'].astype(str) + '.tif'
dmid['ROI_path'] = dmid['ROI_path'].astype(str)
dmid['ROI_path'] = dmid['ROI_path'].str.replace(' ', '')
dmid['ROI_path'] = dmid['ROI_path'].apply(lambda x: os.path.join(roi_masks_dir, x) if x in roi_file_names else np.nan)

# convert N, B, M in the classification column to Normal, Benign and Malignant
# (the codes occur both with and without a trailing space)
for code, label in (('N ', 'Normal'), ('B ', 'Benign'), ('M ', 'Malignant'), ('N', 'Normal'), ('B', 'Benign'), ('M', 'Malignant')):
    dmid['classification'] = dmid['classification'].replace(code, label)

dmid['image_path'] = dmid['image_path'].str.replace('DMID/DICOM_Images/', '../Individual_Original_Datasets/DMID/DICOM_Images/')

# remove rows with NaN values in image_path
dmid = dmid.dropna(subset=['image_path'])

# change the name of the column image path to original_image_path
dmid = dmid.rename(columns={'image_path': 'original_image_path'})

print(dmid.shape)
print(dmid.count())
print(dmid.head())

In [None]:
def process_and_save_image(old_path, new_path, ROI_path=None):
    """Write the image at ``old_path`` to ``new_path`` and, optionally, its ROI mask beside it.

    Parameters
    ----------
    old_path : str
        Source image. ``.dcm`` files are read with pydicom; anything else with ``plt.imread``.
    new_path : str
        Destination file. The ROI, if any, is written to the same path with ``.jpg``
        replaced by ``_ROI.jpg``.
    ROI_path : str, optional
        TIFF mask read with ``tifffile``. Nothing is written for a falsy value.
    """
    # Read the original image
    if old_path.endswith('.dcm'):
        img = pydicom.dcmread(old_path).pixel_array
    else:
        img = plt.imread(old_path)

    if img.ndim == 3 and img.shape[2] == 4:  # RGBA image
        # plt.imread returns PNG data as floats in [0, 1]; PIL cannot build an image from
        # a float RGBA array, so scale it to 8-bit first
        if np.issubdtype(img.dtype, np.floating):
            img = (np.clip(img, 0.0, 1.0) * 255).round().astype(np.uint8)
        Image.fromarray(img).convert('RGB').save(new_path)
    else:
        # Single-channel data is rendered with the gray colormap
        plt.imsave(new_path, img, cmap='gray' if img.ndim == 2 else None)

    # Process and save ROI if provided
    if ROI_path:
        roi_image = Image.fromarray(tifffile.imread(ROI_path))
        if roi_image.mode == 'RGBA':
            roi_image = roi_image.convert('RGB')
        roi_image.save(new_path.replace('.jpg', '_ROI.jpg'))

def rename_and_save_images_w_ROI(data):
    """Convert every image (and its ROI mask, if any) to sequential ``.jpg`` names.

    Outputs go to ``Original_Dataset/<dataset>``, where ``<dataset>`` is the ``dataset``
    value of the first row. Each row is handed to ``process_and_save_image``. ``data`` is
    modified in place and then written to ``<dataset>.csv``:

    * ``new_path`` holds the new image path;
    * ``ROI_path`` holds the ``<name>_ROI.jpg`` path for rows that have a mask and NaN
      for the others;
    * ``original_image_path`` loses the ``../Individual_Original_Datasets/`` prefix.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``dataset``, ``original_image_path`` and ``ROI_path`` columns. Paths may start
        with ``DMID`` or already carry the ``../Individual_Original_Datasets/`` prefix.
    """
    source_root = '../Individual_Original_Datasets/'
    dataset_name = data['dataset'].iloc[0]  # Extract the dataset name from the first row
    print(f"Processing {dataset_name} dataset...")
    new_dir = f"Original_Dataset/{dataset_name}"

    data['new_path'] = None
    # Root only the paths that start with the dataset folder. Anchoring at the start leaves
    # an already-rooted path alone instead of prefixing it a second time.
    data['original_image_path'] = data['original_image_path'].str.replace(r'^DMID', source_root + 'DMID', regex=True)
    data['ROI_path'] = data['ROI_path'].str.replace(r'^DMID', source_root + 'DMID', regex=True)

    # Create the output directory if it is not there yet
    os.makedirs(new_dir, exist_ok=True)

    print(f"Saving images to {new_dir}...")
    new_paths = []
    roi_outputs = []

    for j, (_, row) in enumerate(tqdm(data.iterrows(), total=data.shape[0])):
        old_path = row['original_image_path']
        new_path = os.path.join(new_dir, f"{row['dataset']}_{j}.jpg")
        ROI_path = row['ROI_path'] if not pd.isna(row['ROI_path']) else None

        process_and_save_image(old_path, new_path, ROI_path)

        new_paths.append(new_path)
        # Same rule process_and_save_image applies: a mask is written only for a truthy
        # ROI path, to the image path with '.jpg' replaced by '_ROI.jpg'
        roi_outputs.append(new_path.replace('.jpg', '_ROI.jpg') if ROI_path else np.nan)

    print(f"Total images saved: {len(new_paths)}")

    # Assign by position so rows sharing an index label cannot overwrite each other
    data['new_path'] = new_paths
    # Record where each mask was written (previously the column ended up holding the
    # original image path for every row)
    data['ROI_path'] = np.array(roi_outputs, dtype=object)
    data['original_image_path'] = data['original_image_path'].str.replace(source_root, '', regex=False)

    # Save the DataFrame to a CSV file
    data.to_csv(f"{dataset_name}.csv", index=False)

In [None]:
# Convert the DMID images (and ROI masks) to dmid_<n>.jpg, write dmid.csv, then compare
# the expected names with the contents of Original_Dataset/dmid.
# The *_ROI.jpg files written next to the images are included in the "actual" count
# of the check but are not part of the expected names.
rename_and_save_images_w_ROI(dmid)
find_missing_and_duplicate_files(dmid)

### Getting the ACR Density and BIRADS from the Reports for DMID

In [None]:
# Load the CSV file
# ('dmid.csv' is the name rename_and_save_images_w_ROI uses for the 'dmid' dataset;
# it is read from the current working directory)
csv_file_path = 'dmid.csv'
df = pd.read_csv(csv_file_path)

# Function to extract the entire line after BIRADS and ACR for density
def extract_birads_and_density(report_text):
    """Pull the raw BIRADS and ACR density strings out of a report.

    Parameters
    ----------
    report_text : str
        Full text of one report.

    Returns
    -------
    tuple
        ``(birads, density)``. Each element is the stripped remainder of the
        line following ``BIRADS:`` / ``ACR`` (case-insensitive), or ``None``
        when the marker is not present. No further normalisation is done here.
    """
    # Regular expression to match the entire line after BIRADS:
    birads_match = re.search(r'BIRADS:\s*(.*)', report_text, re.IGNORECASE)
    # Regular expression to match the entire line after ACR (handles different
    # formats like ACR-A, ACR B, etc.). The leading \b stops the case-insensitive
    # search from matching inside words such as "macrocalcification", and the
    # lookahead rejects longer words that merely start with "acr" ("across")
    # while still accepting a single directly attached letter ("ACRA").
    density_match = re.search(r'\bACR(?![A-Za-z]{2})\s*(.*)', report_text, re.IGNORECASE)

    birads = birads_match.group(1).strip() if birads_match else None
    density = density_match.group(1).strip() if density_match else None

    return birads, density

# Start from empty strings so rows without a report file keep a blank value.
df['BIRADS'] = ''
df['density'] = ''

# Walk every row instead of a hard-coded range(0, 510), so the loop neither
# skips rows nor raises when the CSV has a different number of entries.
for i, patient_id in df['patientID'].items():
    # drop the 3-character prefix (IMG) from the patient ID
    patient_id = patient_id[3:]
    # remove all the spaces from the report_filename
    report_filename = f'Reports/Img{patient_id}.txt'.replace(" ", "")
    if os.path.exists(report_filename):
        with open(report_filename, 'r') as file:
            report_text = file.read()
        birads, density = extract_birads_and_density(report_text)
        df.at[i, 'BIRADS'] = birads
        df.at[i, 'density'] = density
    else:
        print(f"Report file not found at the path {report_filename}")

# Strip punctuation and spaces from the raw density text. regex=False keeps
# ')' and '.' literal regardless of the installed pandas version.
for char in (')', '.', '-', ' ', ','):
    df['density'] = df['density'].str.replace(char, '', regex=False)

# Collapse the BIRADS 4 sub-categories into plain '4'.
for sub_category in ('4a', '4b', '4c'):
    df['BIRADS'] = df['BIRADS'].str.replace(sub_category, '4', regex=False)
# Manual overrides for two specific rows.
# NOTE(review): presumably their report text does not parse cleanly - confirm.
df.at[44, 'BIRADS'] = '3'
df.at[29, 'BIRADS'] = '4'

# Replace all the values in the BIRADS column that are not 1, 2, 3, 4, 5, or 0 with NULL
df['BIRADS'] = df['BIRADS'].apply(lambda x: x if x in ['1', '2', '3', '4', '5', '0'] else np.nan)

# convert the BIRADS column to string (missing values become the text 'nan')
df['BIRADS'] = df['BIRADS'].astype(str)

# For entries whose classification is null, derive it from BIRADS:
# 0 or 1 -> Normal, 2 or 3 -> Benign, 4 -> Suspicious Malignant, 5 -> Malignant.
# Only rows with a missing classification are touched; the previous
# `a and b or c` conditions also overwrote existing labels for BIRADS 1 and 3.
birads_to_classification = {
    '0': 'Normal',
    '1': 'Normal',
    '2': 'Benign',
    '3': 'Benign',
    '4': 'Suspicious Malignant',
    '5': 'Malignant',
}
unlabeled = df['classification'].isnull()
df.loc[unlabeled, 'classification'] = df.loc[unlabeled, 'BIRADS'].map(birads_to_classification)

# Save the updated DataFrame back to a CSV file
df.to_csv('dmid.csv', index=False)

# Merging all the dataset files together

In [None]:
# Load the per-dataset metadata CSVs
mias = pd.read_csv('mias.csv')
inbreast = pd.read_csv('inbreast.csv')
mini_ddsm = pd.read_csv('mini-ddsm.csv')
kau_bcmd = pd.read_csv('kau-bcmd.csv')
cmmd = pd.read_csv('cmmd.csv')
cdd_cesm = pd.read_csv('cdd-cesm.csv')
# add image_type as full mammogram image
cdd_cesm['image_type'] = 'full mammogram image'
rsna_screening = pd.read_csv('rsna-screening.csv')
dmid = pd.read_csv('dmid.csv')

# Concatenate the dataframes. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it every source frame contributes its own
# 0-based labels and label-based access (.at / .loc) would hit several rows.
dataset = pd.concat(
    [mias, inbreast, mini_ddsm, kau_bcmd, cmmd, cdd_cesm, rsna_screening, dmid],
    ignore_index=True,
)

# add a column mask_path by replacing Original_Dataset with Masks
dataset['mask_path'] = dataset['new_path'].str.replace('Original_Dataset', 'Masks', regex=False)

# save the dataframe to a csv file
dataset.to_csv('dataset.csv', index=False)