In [1]:
#Importing Dependencies
import os             # use it to manipulate file/folder directories
import shutil         # to copy or move any data from one dir to another
import numpy as np    # to convert image data to array format
import pandas as pd   # use it to make dataframes
from PIL import Image # use it to access image file in a directory and check its type
import hashlib

In [19]:
# Locations of the source JPEG folders, the labelled output tree and the case-description CSV
src_img_dir = "./jpeg"
dest_parent_dir = "./mass_test_cropped"
csv_file_path = "./csv/mass_case_description_test_set.csv"

df = pd.read_csv(csv_file_path)

# Split the cropped-image path on '/' once; components 1 and 2 become the folder-name columns
cropped_path_parts = df['cropped image file path'].str.split('/')
df['folder_name_1'] = cropped_path_parts.str[1]
df['folder_name_2'] = cropped_path_parts.str[2]

# The three raw path columns are not used again once the folder names are extracted
df = df.drop(columns=['image file path', 'cropped image file path', 'ROI mask file path'])
df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,folder_name_1,folder_name_2
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,1.3.6.1.4.1.9590.100.1.2.259596319110047779433...,1.3.6.1.4.1.9590.100.1.2.308205863110625704423...
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,1.3.6.1.4.1.9590.100.1.2.207144238612220754118...,1.3.6.1.4.1.9590.100.1.2.381440141511137044327...
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,1.3.6.1.4.1.9590.100.1.2.265130777712709757209...,1.3.6.1.4.1.9590.100.1.2.212143028513012144941...
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,1.3.6.1.4.1.9590.100.1.2.668624952114641296207...,1.3.6.1.4.1.9590.100.1.2.154030438134025107421...
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,1.3.6.1.4.1.9590.100.1.2.362221700813915332616...,1.3.6.1.4.1.9590.100.1.2.199593071810497070809...


In [20]:
# Show the table size as (rows, columns)
df.shape

(378, 13)

In [21]:
# Keep only the label column and the two folder-name columns
wanted_columns = ['pathology', 'folder_name_1', 'folder_name_2']
selected_data_df = df.loc[:, wanted_columns]
selected_data_df.head()

Unnamed: 0,pathology,folder_name_1,folder_name_2
0,MALIGNANT,1.3.6.1.4.1.9590.100.1.2.259596319110047779433...,1.3.6.1.4.1.9590.100.1.2.308205863110625704423...
1,MALIGNANT,1.3.6.1.4.1.9590.100.1.2.207144238612220754118...,1.3.6.1.4.1.9590.100.1.2.381440141511137044327...
2,MALIGNANT,1.3.6.1.4.1.9590.100.1.2.265130777712709757209...,1.3.6.1.4.1.9590.100.1.2.212143028513012144941...
3,MALIGNANT,1.3.6.1.4.1.9590.100.1.2.668624952114641296207...,1.3.6.1.4.1.9590.100.1.2.154030438134025107421...
4,BENIGN,1.3.6.1.4.1.9590.100.1.2.362221700813915332616...,1.3.6.1.4.1.9590.100.1.2.199593071810497070809...


In [22]:
# Helper functions for validating, classifying and de-duplicating images, followed by the row-by-row copy routine:
def is_valid_image(file_path):
    """Tell whether ``file_path`` can be opened and verified as an image.

    Returns:
        True when ``Image.open`` followed by ``verify()`` completes, False when
        either step raises ``IOError`` or ``SyntaxError``.
    """
    try:
        with Image.open(file_path) as probe:
            # verify() raises if the file content is not a readable image
            probe.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True

def image_type(image_path):
    """Classify an image file as "Binary", "Grayscale" or "Color".

    The image is converted to a numpy array and classified from its pixel values:

    * "Color"     - a multi-channel image whose first three channels differ.
    * "Binary"    - the intensity plane contains exactly the two values 0 and 255.
    * "Grayscale" - everything else (single channel, or identical colour channels).

    Args:
        image_path: Path of an image file that Pillow can open.

    Returns:
        One of the three strings above, or None if the pixel array is neither
        2-D nor 3-D.
    """
    with Image.open(image_path) as img:
        data = np.array(img)

    # Boolean arrays (expected for Pillow 1-bit images) can never contain 255;
    # map them onto 0/255 so the binary test below applies to them as well.
    if data.dtype == np.bool_:
        data = data.astype(np.uint8) * 255

    if data.ndim == 2:
        intensity = data
    elif data.ndim == 3:
        # With three or more channels compare the first three (a 4th channel is
        # ignored). With fewer, only channel 0 carries intensity, so indexing
        # channel 2 would raise IndexError.
        channel_count = data.shape[2]
        colour_planes = data[:, :, :3] if channel_count >= 3 else data[:, :, :1]
        intensity = colour_planes[:, :, 0]
        if not (colour_planes == intensity[:, :, None]).all():
            return "Color"
    else:
        # Unexpected array rank: keep the historical "no classification" result.
        return None

    # np.unique returns sorted values, so a binary image yields exactly [0, 255].
    if np.array_equal(np.unique(intensity), [0, 255]):
        return "Binary"
    return "Grayscale"

def image_hash(image_path):
    """Return the hexadecimal MD5 digest of the raw bytes stored at ``image_path``."""
    digest = hashlib.md5()
    with open(image_path, 'rb') as stream:
        digest.update(stream.read())
    return digest.hexdigest()
    
def get_unique_filename(folder_name, file_path, current_image_hash):
    """Pick a destination path that does not clash with an existing file.

    Starting from ``file_path``, each existing candidate is hashed with
    ``image_hash`` and compared with ``current_image_hash``. A match means the
    same content is already stored; otherwise the next candidate
    ``<base>_<counter>_<folder_name><ext>`` is tried, with the counter starting
    at 1.

    Args:
        folder_name: Text appended to generated candidate names.
        file_path: Preferred destination path.
        current_image_hash: Hash of the file about to be copied, as produced by
            ``image_hash``.

    Returns:
        The first candidate path that does not exist, or None when an existing
        candidate has the same hash (duplicate content).
    """
    base, ext = os.path.splitext(file_path)
    counter = 1
    while os.path.exists(file_path):
        if image_hash(file_path) == current_image_hash:
            # Identical content already present under this name: report duplicate
            return None
        # Name is taken by different content: try the next numbered candidate
        file_path = f"{base}_{counter}_{folder_name}{ext}"
        counter += 1
    return file_path

# Copy every valid grayscale image from each row's source folder into
# <dest_parent_dir>/<pathology>/, renaming on filename clashes and skipping
# files whose content is already present under the clashing name.
dest_folder = "" # destination folder
for index, row in selected_data_df.iterrows():
    pathology = row['pathology']
    src_folder_name = row['folder_name_2']
    print(f"Folder serial: {index}, name: {src_folder_name}")
    dest_folder = os.path.join(dest_parent_dir, pathology)
    target_dir = os.path.join(src_img_dir, src_folder_name)

    # Skip rows whose source folder is missing (or is not a directory at all)
    if not os.path.isdir(target_dir):
        print(f"Directory {target_dir} does not exist. Skipping...")
        continue

    # The class folder must exist before copying; creating it here lets the
    # cell run on a fresh checkout.
    os.makedirs(dest_folder, exist_ok=True)

    # Sort so that collision renaming does not depend on the arbitrary
    # ordering returned by os.listdir.
    contents = sorted(os.listdir(target_dir))
    print(f"Total files in the dir: {len(contents)}")
    for item in contents:
        full_path_toFile = os.path.join(target_dir, item)
        if not is_valid_image(full_path_toFile):
            print(f"Not an Image File: {item}")
            continue

        img_type = image_type(full_path_toFile)
        if img_type != "Grayscale":
            print(f"Not a Grayscale Image File: {item}")
            continue

        dest_file = os.path.join(dest_folder, item)
        current_image_hash = image_hash(full_path_toFile)
        unique_dest_file = get_unique_filename(src_folder_name, dest_file, current_image_hash)
        if unique_dest_file:  # None means identical content is already stored
            shutil.copy2(full_path_toFile, unique_dest_file)
            print(f"Image File ({img_type}): {item}")
        else:
            print(f"Duplicate Image Skipped: {item}")

Folder serial: 0, name: 1.3.6.1.4.1.9590.100.1.2.30820586311062570442302321942433426184
Total files in the dir: 2
Image File (Grayscale): 1-083.jpg
Image File (Grayscale): 2-272.jpg
Folder serial: 1, name: 1.3.6.1.4.1.9590.100.1.2.381440141511137044327302306604206077287
Total files in the dir: 2
Image File (Grayscale): 1-084.jpg
Image File (Grayscale): 2-273.jpg
Folder serial: 2, name: 1.3.6.1.4.1.9590.100.1.2.212143028513012144941507232513982203672
Total files in the dir: 2
Image File (Grayscale): 1-274.jpg
Image File (Grayscale): 2-085.jpg
Folder serial: 3, name: 1.3.6.1.4.1.9590.100.1.2.15403043813402510742192372832381918984
Total files in the dir: 2
Image File (Grayscale): 1-275.jpg
Image File (Grayscale): 2-086.jpg
Folder serial: 4, name: 1.3.6.1.4.1.9590.100.1.2.199593071810497070809647901570077988031
Total files in the dir: 2
Image File (Grayscale): 1-276.jpg
Image File (Grayscale): 2-087.jpg
Folder serial: 5, name: 1.3.6.1.4.1.9590.100.1.2.44610919611642954332266410812181604922

Image File (Grayscale): 2-129.jpg
Folder serial: 46, name: 1.3.6.1.4.1.9590.100.1.2.422957282012810013841418684460061208674
Total files in the dir: 2
Image File (Grayscale): 1-022.jpg
Image File (Grayscale): 2-130.jpg
Folder serial: 47, name: 1.3.6.1.4.1.9590.100.1.2.303093614813874094019726427331964927001
Total files in the dir: 2
Image File (Grayscale): 1-023.jpg
Image File (Grayscale): 2-131.jpg
Folder serial: 48, name: 1.3.6.1.4.1.9590.100.1.2.140414504511432522026875170760401387879
Total files in the dir: 2
Image File (Grayscale): 1-024.jpg
Image File (Grayscale): 2-132.jpg
Folder serial: 49, name: 1.3.6.1.4.1.9590.100.1.2.195080574210944708801848595380111274367
Total files in the dir: 2
Image File (Grayscale): 1-025.jpg
Image File (Grayscale): 2-133.jpg
Folder serial: 50, name: 1.3.6.1.4.1.9590.100.1.2.92625977011874636004018945600136848409
Total files in the dir: 2
Image File (Grayscale): 1-026.jpg
Image File (Grayscale): 2-134.jpg
Folder serial: 51, name: 1.3.6.1.4.1.9590.100.1

Image File (Grayscale): 2-179.jpg
Folder serial: 91, name: 1.3.6.1.4.1.9590.100.1.2.271196701611666628735372012851189292665
Total files in the dir: 2
Image File (Grayscale): 1-072.jpg
Image File (Grayscale): 2-180.jpg
Folder serial: 92, name: 1.3.6.1.4.1.9590.100.1.2.256403565813600307536474945580729509681
Total files in the dir: 2
Image File (Grayscale): 1-073.jpg
Image File (Grayscale): 2-181.jpg
Folder serial: 93, name: 1.3.6.1.4.1.9590.100.1.2.49644675112519947439011196481198359947
Total files in the dir: 2
Image File (Grayscale): 1-074.jpg
Image File (Grayscale): 2-182.jpg
Folder serial: 94, name: 1.3.6.1.4.1.9590.100.1.2.134854701310583476236543627840768062555
Total files in the dir: 2
Image File (Grayscale): 1-075.jpg
Image File (Grayscale): 2-183.jpg
Folder serial: 95, name: 1.3.6.1.4.1.9590.100.1.2.399661616412709482210273174221545231646
Total files in the dir: 2
Image File (Grayscale): 1-076.jpg
Image File (Grayscale): 2-184.jpg
Folder serial: 96, name: 1.3.6.1.4.1.9590.100.1

Image File (Grayscale): 2-225.jpg
Folder serial: 136, name: 1.3.6.1.4.1.9590.100.1.2.8765695412897876727426080090200147170
Total files in the dir: 2
Image File (Grayscale): 1-118.jpg
Image File (Grayscale): 2-226.jpg
Folder serial: 137, name: 1.3.6.1.4.1.9590.100.1.2.242030448212482239912700364092701339463
Total files in the dir: 2
Image File (Grayscale): 1-119.jpg
Image File (Grayscale): 2-227.jpg
Folder serial: 138, name: 1.3.6.1.4.1.9590.100.1.2.357301018711284777003046565202970682599
Total files in the dir: 2
Image File (Grayscale): 1-120.jpg
Image File (Grayscale): 2-228.jpg
Folder serial: 139, name: 1.3.6.1.4.1.9590.100.1.2.296688914412318095942614557852519960917
Total files in the dir: 2
Image File (Grayscale): 1-121.jpg
Image File (Grayscale): 2-229.jpg
Folder serial: 140, name: 1.3.6.1.4.1.9590.100.1.2.73692977010116789140672214141142360185
Total files in the dir: 2
Image File (Grayscale): 1-122.jpg
Image File (Grayscale): 2-230.jpg
Folder serial: 141, name: 1.3.6.1.4.1.9590.1

Image File (Grayscale): 2-274.jpg
Folder serial: 181, name: 1.3.6.1.4.1.9590.100.1.2.148976172312695421705301976510452820236
Total files in the dir: 2
Image File (Grayscale): 1-167.jpg
Image File (Grayscale): 2-275.jpg
Folder serial: 182, name: 1.3.6.1.4.1.9590.100.1.2.91848545911005523541209798760943510196
Total files in the dir: 2
Image File (Grayscale): 1-168.jpg
Image File (Grayscale): 2-276.jpg
Folder serial: 183, name: 1.3.6.1.4.1.9590.100.1.2.354447778710312542110033340782411712017
Total files in the dir: 2
Image File (Grayscale): 1-169.jpg
Image File (Grayscale): 2-277.jpg
Folder serial: 184, name: 1.3.6.1.4.1.9590.100.1.2.219465915111092818230700566321147872133
Total files in the dir: 2
Image File (Grayscale): 1-170.jpg
Image File (Grayscale): 2-278.jpg
Folder serial: 185, name: 1.3.6.1.4.1.9590.100.1.2.231894579813593468217419823840305212127
Total files in the dir: 2
Image File (Grayscale): 1-171.jpg
Image File (Grayscale): 2-279.jpg
Folder serial: 186, name: 1.3.6.1.4.1.9590

Image File (Grayscale): 2-027.jpg
Folder serial: 226, name: 1.3.6.1.4.1.9590.100.1.2.263175968212620451040123544731301212892
Total files in the dir: 2
Image File (Grayscale): 1-220.jpg
Image File (Grayscale): 2-028.jpg
Folder serial: 227, name: 1.3.6.1.4.1.9590.100.1.2.180751617113935741930229149600506327269
Total files in the dir: 2
Image File (Grayscale): 1-221.jpg
Image File (Grayscale): 2-029.jpg
Folder serial: 228, name: 1.3.6.1.4.1.9590.100.1.2.237554983812675500928641084182457551234
Total files in the dir: 2
Image File (Grayscale): 1-222.jpg
Image File (Grayscale): 2-030.jpg
Folder serial: 229, name: 1.3.6.1.4.1.9590.100.1.2.29979988711620169137293190443121958759
Total files in the dir: 2
Image File (Grayscale): 1-223.jpg
Image File (Grayscale): 2-031.jpg
Folder serial: 230, name: 1.3.6.1.4.1.9590.100.1.2.77391350612919093242582778213778105346
Total files in the dir: 2
Image File (Grayscale): 1-224.jpg
Image File (Grayscale): 2-032.jpg
Folder serial: 231, name: 1.3.6.1.4.1.9590.

Image File (Grayscale): 2-083.jpg
Folder serial: 271, name: 1.3.6.1.4.1.9590.100.1.2.328011571912253968128627544932881689288
Total files in the dir: 2
Image File (Grayscale): 1-276.jpg
Image File (Grayscale): 2-084.jpg
Folder serial: 272, name: 1.3.6.1.4.1.9590.100.1.2.74351343711650842507206772644172627457
Total files in the dir: 2
Image File (Grayscale): 1-277.jpg
Image File (Grayscale): 2-085.jpg
Folder serial: 273, name: 1.3.6.1.4.1.9590.100.1.2.168501304611743333636619593341368522720
Total files in the dir: 2
Image File (Grayscale): 1-278.jpg
Image File (Grayscale): 2-086.jpg
Folder serial: 274, name: 1.3.6.1.4.1.9590.100.1.2.184063067412303497500789400442579095895
Total files in the dir: 2
Image File (Grayscale): 1-279.jpg
Image File (Grayscale): 2-087.jpg
Folder serial: 275, name: 1.3.6.1.4.1.9590.100.1.2.122831666011537404722786976620674592384
Total files in the dir: 2
Image File (Grayscale): 1-280.jpg
Image File (Grayscale): 2-088.jpg
Folder serial: 276, name: 1.3.6.1.4.1.9590

Image File (Grayscale): 2-129.jpg
Folder serial: 316, name: 1.3.6.1.4.1.9590.100.1.2.405215538612316971307803736710660532693
Total files in the dir: 2
Image File (Grayscale): 1-022.jpg
Image File (Grayscale): 2-130.jpg
Folder serial: 317, name: 1.3.6.1.4.1.9590.100.1.2.418529040211352094709719709500765846233
Total files in the dir: 2
Image File (Grayscale): 1-024.jpg
Image File (Grayscale): 2-132.jpg
Folder serial: 318, name: 1.3.6.1.4.1.9590.100.1.2.358777110012040595116278870160086976016
Total files in the dir: 2
Image File (Grayscale): 1-025.jpg
Image File (Grayscale): 2-133.jpg
Folder serial: 319, name: 1.3.6.1.4.1.9590.100.1.2.137435703213093899825836015743539848705
Total files in the dir: 2
Image File (Grayscale): 1-026.jpg
Image File (Grayscale): 2-134.jpg
Folder serial: 320, name: 1.3.6.1.4.1.9590.100.1.2.166780450612529588140273491753803765092
Total files in the dir: 1
Image File (Grayscale): 1-027.jpg
Folder serial: 321, name: 1.3.6.1.4.1.9590.100.1.2.203004761213509821449550

Image File (Grayscale): 2-185.jpg
Folder serial: 361, name: 1.3.6.1.4.1.9590.100.1.2.312109483013126559515254204702956315159
Total files in the dir: 2
Image File (Grayscale): 1-080.jpg
Image File (Grayscale): 2-186.jpg
Folder serial: 362, name: 1.3.6.1.4.1.9590.100.1.2.392521265211049497503641929252775059444
Total files in the dir: 2
Image File (Grayscale): 1-081.jpg
Image File (Grayscale): 2-187.jpg
Folder serial: 363, name: 1.3.6.1.4.1.9590.100.1.2.269247772812800655432369470871957942059
Total files in the dir: 2
Image File (Grayscale): 1-082.jpg
Image File (Grayscale): 2-188.jpg
Folder serial: 364, name: 1.3.6.1.4.1.9590.100.1.2.257877384211555448010431660912151834962
Total files in the dir: 2
Image File (Grayscale): 1-083.jpg
Image File (Grayscale): 2-189.jpg
Folder serial: 365, name: 1.3.6.1.4.1.9590.100.1.2.92323117813490589614713847541894281266
Total files in the dir: 2
Image File (Grayscale): 1-084.jpg
Image File (Grayscale): 2-190.jpg
Folder serial: 366, name: 1.3.6.1.4.1.9590

In [15]:
# Debug check of the source image root. The previous version referenced `img_dir`,
# which no cell in this notebook defines; the configured name is `src_img_dir`.
print(src_img_dir + "/")

./jpeg/
