In [1]:
import os
# Move the working directory up one level so the relative "data/..." paths used
# by the later cells resolve from there.
# NOTE(review): the target is relative to the current directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir("../")
# Show the resulting working directory as a sanity check.
%pwd

'/home/malik-harris/dental'

In [2]:
from PIL import Image
import os
from tqdm import tqdm

# Resize every image found under <input_dir>/<class>/ to `target_size` and save
# it, converted to RGB, as <output_dir>/<class>/<same file name>.
input_dir = "data/final"
output_dir = "data/preprocessed"
target_size = (224, 224)  # size tuple handed to Image.resize

os.makedirs(output_dir, exist_ok=True)

for class_dir in tqdm(os.listdir(input_dir), desc="Classes"):
    src_class_dir = os.path.join(input_dir, class_dir)
    # Filter out non-directory entries *before* creating anything: otherwise a
    # stray file in input_dir would leave behind an empty folder in output_dir
    # that any later directory listing would mistake for an (empty) class.
    if not os.path.isdir(src_class_dir):
        continue

    dst_class_dir = os.path.join(output_dir, class_dir)
    os.makedirs(dst_class_dir, exist_ok=True)

    img_list = os.listdir(src_class_dir)
    for img_name in tqdm(img_list, desc=f"Processing {class_dir}", leave=False):
        src_img_path = os.path.join(src_class_dir, img_name)
        dst_img_path = os.path.join(dst_class_dir, img_name)
        try:
            # The context manager closes the source file even if conversion,
            # resizing or saving fails.
            with Image.open(src_img_path) as img:
                resized = img.convert("RGB").resize(target_size)
                resized.save(dst_img_path)
        except Exception as e:
            # Best effort: report the entry that could not be processed and
            # carry on with the remaining ones.
            print(f"Error processing {src_img_path}: {e}")

Classes: 100%|██████████| 8/8 [04:40<00:00, 35.10s/it]


In [3]:
import os
import random
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Split each class folder under `pre_dir` into train/val/test subsets and copy
# the files to <final_data_dir>/<split>/<class>/.
pre_dir = "data/preprocessed"
final_data_dir = "data/Final_data"
train_dir = os.path.join(final_data_dir, "train")
val_dir = os.path.join(final_data_dir, "val")
test_dir = os.path.join(final_data_dir, "test")
test_size = 0.1  # fraction of each class held out for testing
val_size = 0.2   # fraction of each class held out for validation
max_images_per_class = 500  # larger classes are randomly down-sampled to this
split_seed = 42  # single seed shared by the down-sampling and both splits

for split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

# val_size is expressed relative to the whole class, but the second split only
# sees what is left after the test images were removed, so rescale it.
val_ratio = val_size / (1 - test_size)

for class_dir in tqdm(os.listdir(pre_dir), desc="Splitting classes"):
    src_dir = os.path.join(pre_dir, class_dir)
    if not os.path.isdir(src_dir):
        continue

    # os.listdir returns entries in arbitrary order; sort so that the seeded
    # sampling/splitting below picks the same files on every run.
    images = sorted(
        f for f in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, f))
    )

    # Limit each class to max_images_per_class images. A dedicated, seeded
    # generator per class keeps the selection reproducible and independent of
    # the order in which the classes are visited.
    if len(images) > max_images_per_class:
        images = random.Random(split_seed).sample(images, max_images_per_class)

    try:
        # First split off the test set ...
        trainval_imgs, test_imgs = train_test_split(
            images, test_size=test_size, random_state=split_seed
        )
        # ... then divide the remainder into train and validation.
        train_imgs, val_imgs = train_test_split(
            trainval_imgs, test_size=val_ratio, random_state=split_seed
        )
    except ValueError as e:
        # train_test_split raises ValueError when a class has too few images to
        # leave every subset non-empty; report it and keep going instead of
        # aborting after some classes have already been copied.
        print(f"Skipping {class_dir}: cannot split {len(images)} images ({e})")
        continue

    # Create class subfolders
    train_class_dir = os.path.join(train_dir, class_dir)
    val_class_dir = os.path.join(val_dir, class_dir)
    test_class_dir = os.path.join(test_dir, class_dir)

    # NOTE(review): files are only ever copied in, never removed, so anything
    # left in these folders by an earlier run with a different selection stays.
    for split_name, split_imgs, split_class_dir in (
        ("train", train_imgs, train_class_dir),
        ("val", val_imgs, val_class_dir),
        ("test", test_imgs, test_class_dir),
    ):
        os.makedirs(split_class_dir, exist_ok=True)
        for img in tqdm(split_imgs, desc=f"Copying {split_name}/{class_dir}", leave=False):
            shutil.copy(os.path.join(src_dir, img), os.path.join(split_class_dir, img))

Splitting classes: 100%|██████████| 8/8 [00:01<00:00,  7.60it/s]
