In [None]:
!python -m pip install -U pip
!python -m pip install pillow-heif
!pip install rembg
!pip install PIL

In [2]:
import glob
import os

from PIL import Image
from pillow_heif import register_heif_opener
from rembg import new_session, remove

## Preprocessing

<ul>
    <li>Resize the images to 512x512</li>
    <li>Remove background noise</li>
    <li>Convert from RGBA to grayscale (Pillow mode <code>L</code>)</li>
    <li>Save it in the proper format (JPEG) in a separate output directory</li>
</ul>

In [4]:
register_heif_opener()  # Enable support for HEIF images, including HEIC, AVIF, and other formats

# Output settings: file format, target size in pixels (width, height), and JPEG quality (0-100).
# NOTE: Pillow's documentation advises against JPEG quality values above 95;
# the value is kept at 100 as originally configured.
target_format = "JPEG"
target_size = (512, 512)
quality = 100

# Create the background-removal session once and reuse it for every image.
# Calling remove() without a session makes rembg build a fresh one on each call.
rembg_session = new_session()

# Process each class folder: raw_data/<class>  ->  datasets/procssed_data/<class>
# (the "procssed_data" spelling is kept on purpose: the verification cells read from that folder).
for classname in ['rock', 'paper', 'scissor']:
    src_dir = os.path.join("..", "..", "raw_data", classname)
    dst_dir = os.path.join("..", "datasets", "procssed_data", classname)

    os.makedirs(dst_dir, exist_ok=True)  # Create destination directory if it doesn't exist

    # Find every entry in the class folder
    for filepath in glob.glob(os.path.join(src_dir, "*")):
        try:
            # Open inside a context manager so the source file handle is always released;
            # resize() returns a new, fully loaded image that outlives the `with` block.
            with Image.open(filepath) as src_img:
                img = src_img.resize(target_size)

            # Remove the background from the resized image
            img = remove(img, session=rembg_session)

            # Convert the background-removed image to single-channel grayscale ('L');
            # this also drops the alpha channel, which JPEG cannot store.
            img = img.convert('L')

            # Derive the output name from the file's stem only. splitext/basename keep
            # stems that contain dots intact and work with any path separator.
            filename = os.path.splitext(os.path.basename(filepath))[0]
            img.save(os.path.join(dst_dir, f"{filename}.{target_format}"), target_format, quality=quality)

            print(f"Converted to JPEG, resized and background removed {filepath}")
        except OSError as e:  # IOError is an alias of OSError, so one name is enough
            # Report which file failed so it can be tracked down, then continue with the rest.
            print(f"Failed to process {filepath}: {e}")

Converted to JPEG, resized and background removed ..\..\raw_data\rock\1c6c150e-fd4c-4328-b19c-1b245f4637af.jpg
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240501_184445.heic
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240501_184727.heic
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240501_213443.heic
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240501_213452.heic
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240503_103813.jpg
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240503_103903.jpg
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240503_103904.jpg
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240503_103905.jpg
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240503_103906.jpg
Converted to JPEG, resized and background removed ..\..\raw_data\rock\20240

### Making sure JPEG is the only format in the output

In [6]:
# Every file sitting inside a class folder of the processed dataset.
filepaths = glob.glob('..\\datasets\\procssed_data\\*\\*')

# Lower-cased text after the last '.' of each path, with duplicates collapsed.
formats = {path.split('.')[-1].lower() for path in filepaths}

print(f'{formats} Must be only JPEG/JPG')

{'jpeg'} Must be only JPEG/JPG


### Count number of `input` vs `output` 

In [8]:
# List every file under the class folders of the raw and the processed datasets.
filepaths_processed = glob.glob('..\\datasets\\procssed_data\\*\\*')
filepaths_not_processed = glob.glob('..\\..\\raw_data\\*\\*')

# Report both totals, raw first, so input and output counts can be compared.
for count, folder in (
    (len(filepaths_not_processed), 'raw_data'),
    (len(filepaths_processed), 'processed_data'),
):
    print(f'{count} is number of images in {folder}')

843 is number of images in raw_data
841 is number of images in processed_data
