In [1]:
import os
import urllib.request
import zipfile
import json

def download_coco_validation(base_dir="./coco_validation"):
    """Download and unpack the COCO 2017 validation images and annotations.

    The function is safe to re-run: once an archive has been fully extracted a
    small marker file is written next to the data, and later calls skip that
    archive instead of downloading it again.

    Args:
        base_dir: Folder that receives the extracted data. Defaults to
            "./coco_validation".

    Returns:
        The ``base_dir`` that was used.

    Raises:
        zipfile.BadZipFile: If an archive on disk is corrupt. The corrupt file
            is removed first so the next call downloads a fresh copy.
        OSError / urllib.error.URLError: If a download fails. No partial
            archive is left behind.
    """
    # Kept for backward compatibility; the image archive itself unpacks into
    # a "val2017" folder inside base_dir.
    os.makedirs(os.path.join(base_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(base_dir, "annotations"), exist_ok=True)

    # Only what you need - validation images and annotations
    files_to_download = {
        "val2017.zip": "http://images.cocodataset.org/zips/val2017.zip",
        "annotations_trainval2017.zip": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    }

    for filename, url in files_to_download.items():
        filepath = os.path.join(base_dir, filename)
        done_marker = os.path.join(base_dir, f".{filename}.extracted")

        if os.path.exists(done_marker):
            print(f"✓ {filename} already extracted, skipping")
            continue

        if not os.path.exists(filepath):
            print(f"Downloading {filename}...")
            # Download under a temporary name so an interrupted transfer is
            # never mistaken for a complete archive on the next run.
            partial_path = filepath + ".part"
            try:
                urllib.request.urlretrieve(url, partial_path)
            except BaseException:
                # BaseException so a notebook "interrupt kernel" also cleans up.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
            os.replace(partial_path, filepath)

        print(f"Extracting {filename}...")
        try:
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(base_dir)
        except zipfile.BadZipFile:
            # A corrupt archive would otherwise block every later run.
            os.remove(filepath)
            raise

        # Clean up the archive and record that extraction finished
        os.remove(filepath)
        with open(done_marker, "w", encoding="utf-8"):
            pass
        print(f"✓ Completed {filename}")

    return base_dir

def get_validation_stats(coco_dir):
    """Load the val2017 instance annotations and print summary counts.

    Args:
        coco_dir: Dataset root that contains "annotations/instances_val2017.json".

    Returns:
        The parsed annotation JSON as a dict. The keys "images",
        "annotations" and "categories" are read here to print their lengths.

    Raises:
        FileNotFoundError: If the annotation file is missing.
    """
    annotation_file = os.path.join(coco_dir, "annotations", "instances_val2017.json")

    # JSON is UTF-8; be explicit so the platform's default locale encoding
    # (e.g. a Windows code page) cannot mis-decode non-ASCII text.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    print(f"✓ Images: {len(coco_data['images'])}")
    print(f"✓ Annotations: {len(coco_data['annotations'])}")
    print(f"✓ Categories: {len(coco_data['categories'])}")

    return coco_data

# Execute: fetch and unpack the COCO validation files, then load the
# instance annotations and print their summary counts.
#   coco_path       - dataset root returned by download_coco_validation()
#   validation_data - parsed annotation dict returned by get_validation_stats()
# The recorded cell output shows this guarded branch runs when the cell is
# executed in the notebook.
if __name__ == "__main__":
    coco_path = download_coco_validation()
    validation_data = get_validation_stats(coco_path)


Downloading val2017.zip...
Extracting val2017.zip...
✓ Completed val2017.zip
Downloading annotations_trainval2017.zip...
Extracting annotations_trainval2017.zip...
✓ Completed annotations_trainval2017.zip
✓ Images: 5000
✓ Annotations: 36781
✓ Categories: 80


In [2]:
from PIL import Image
import os
import glob

def resize_coco_images(input_folder, output_folder, size=(224, 224)):
    """
    Write a resized RGB JPEG copy of every image found in ``input_folder``.

    Files matching *.jpg, *.jpeg and *.png directly inside ``input_folder``
    are opened, converted to RGB when they are in another mode, resized to
    ``size`` with LANCZOS resampling and saved to ``output_folder`` under
    their original file name (JPEG, quality 95).

    NOTE: the output keeps the source file name, so a ``.png`` input ends up
    as a ``.png``-named file containing JPEG data.

    A file that fails to process is reported on stdout and skipped; it does
    not stop the run. Progress is printed every 500 successfully saved
    images. Returns nothing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Gather candidates pattern by pattern: jpg, then jpeg, then png
    sources = [
        path
        for pattern in ('*.jpg', '*.jpeg', '*.png')
        for path in glob.glob(os.path.join(input_folder, pattern))
    ]

    done = 0
    for src in sources:
        try:
            dst = os.path.join(output_folder, os.path.basename(src))

            with Image.open(src) as picture:
                # JPEG cannot store alpha/palette data, so normalise to RGB first
                rgb = picture if picture.mode == 'RGB' else picture.convert('RGB')
                rgb.resize(size, Image.LANCZOS).save(dst, 'JPEG', quality=95)

            done += 1
            if done % 500 == 0:
                print(f"Processed {done} images...")

        except Exception as e:
            print(f"Error processing {src}: {e}")

    print(f"✓ Resized {done} images to {size} in {output_folder}")

# Usage - adjust paths to match your setup
# Reads the extracted validation images and writes 224x224 copies into a
# separate sibling folder, so the files in `input_dir` are not overwritten.
input_dir = "./coco_validation/val2017"  # Your COCO images folder
output_dir = "./coco_validation/val2017_224x224"  # Destination for the resized copies

resize_coco_images(input_dir, output_dir, (224, 224))


Processed 500 images...
Processed 1000 images...
Processed 1500 images...
Processed 2000 images...
Processed 2500 images...
Processed 3000 images...
Processed 3500 images...
Processed 4000 images...
Processed 4500 images...
Processed 5000 images...
✓ Resized 5000 images to (224, 224) in ./coco_validation/val2017_224x224


In [3]:
import os
import glob
import random
import shutil
from sklearn.model_selection import train_test_split

def select_and_split_coco_images(source_folder, output_folder,
                                num_samples=1000, test_size=0.2, random_state=42):
    """
    Randomly pick ``.jpg`` images from a folder and copy them into train/test sets.

    Args:
        source_folder: Folder whose ``*.jpg`` files form the candidate pool.
        output_folder: Destination root; "train" and "test" sub-folders are
            created inside it and receive copies of the selected files.
        num_samples: How many images to select. If fewer are available, all
            of them are used and a warning is printed.
        test_size: Passed to ``train_test_split`` (0.2 -> 80% train, 20% test).
        random_state: Seed for both the sampling and the split.

    Returns:
        tuple[int, int]: (number of train images, number of test images).

    Raises:
        ValueError: If ``source_folder`` contains no ``.jpg`` files.

    Note:
        Existing files in the output folders are not removed, so re-running
        with different parameters can leave images from an earlier run behind.
    """
    # Sort the listing: glob returns files in arbitrary, OS-dependent order,
    # and a seeded sample is only reproducible if its input order is fixed.
    all_images = sorted(glob.glob(os.path.join(source_folder, "*.jpg")))

    if not all_images:
        # Fail early with a clear message instead of an opaque splitter error
        raise ValueError(f"No .jpg images found in {source_folder}")

    # Create output directories
    train_dir = os.path.join(output_folder, "train")
    test_dir = os.path.join(output_folder, "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    if len(all_images) < num_samples:
        print(f"Warning: Only {len(all_images)} images available, using all of them")
        num_samples = len(all_images)

    # Randomly sample the specified number of images. A private generator is
    # used so the caller's global `random` state is not reseeded.
    rng = random.Random(random_state)
    selected_images = rng.sample(all_images, num_samples)

    # Split into train and test
    train_files, test_files = train_test_split(
        selected_images,
        test_size=test_size,
        random_state=random_state
    )

    # Copy files to respective directories
    for destination, files in ((train_dir, train_files), (test_dir, test_files)):
        for file_path in files:
            filename = os.path.basename(file_path)
            shutil.copy2(file_path, os.path.join(destination, filename))

    print(f"✓ Selected {num_samples} images randomly")
    print(f"✓ Train set: {len(train_files)} images")
    print(f"✓ Test set: {len(test_files)} images")

    return len(train_files), len(test_files)

# Usage - adjust paths to your setup
# NOTE(review): this samples from the original-resolution val2017 folder, not
# from the "val2017_224x224" copies written by the resize cell - confirm that
# is intended.
# NOTE: `output_dir` was also assigned by the resize cell; it is rebound here
# to the split destination.
source_dir = "./coco_validation/val2017"  # Your COCO images
output_dir = "./coco_split_1000"  # Receives the train/ and test/ sub-folders

train_count, test_count = select_and_split_coco_images(
    source_folder=source_dir,
    output_folder=output_dir,
    num_samples=1000,
    test_size=0.2,  # 80% train, 20% test
    random_state=42  # For reproducible results
)


✓ Selected 1000 images randomly
✓ Train set: 800 images
✓ Test set: 200 images


In [7]:
import os
import shutil
import glob
from pathlib import Path

def combine_medical_datasets(medical_base_folder, non_medical_base_folder, output_folder):
    """
    Merge a medical and a non-medical image dataset into one binary-classification tree.

    Each source folder is expected to contain "train" and/or "test"
    sub-folders. Images are copied (never moved) to::

        output_folder/{train,test}/{medical,non_medical}/<label>_<original name>

    Images are looked up directly inside a split folder first; only when none
    are found there are its immediate sub-folders searched. If two images from
    different sub-folders share a file name, the later one is stored as
    ``<label>_<subfolder>_<original name>`` so that it does not overwrite the
    first and the reported counts match the files on disk.

    Args:
        medical_base_folder: Root of the medical dataset.
        non_medical_base_folder: Root of the non-medical dataset.
        output_folder: Root of the combined dataset to create.

    Returns:
        tuple[dict, dict]: ``(medical_stats, non_medical_stats)``; each maps
        "train" and "test" to the number of images copied for that class.
    """
    extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff']

    # Create output structure with class subdirectories for binary classification
    for split in ("train", "test"):
        for label in ("medical", "non_medical"):
            os.makedirs(os.path.join(output_folder, split, label), exist_ok=True)

    def find_images(folder):
        """Return the image files located directly inside ``folder``."""
        found = []
        for ext in extensions:
            found.extend(glob.glob(os.path.join(folder, ext)))
        return found

    def copy_dataset_split(source_folder, dest_folder, label):
        """Copy images from source train/test to destination with labels"""
        stats = {"train": 0, "test": 0}

        for split in ["train", "test"]:
            source_split_path = os.path.join(source_folder, split)
            dest_split_path = os.path.join(dest_folder, split, label)

            if not os.path.isdir(source_split_path):
                # Make a wrong path visible instead of silently producing an empty class
                print(f"⚠️  WARNING: {source_split_path} not found, no {label} images copied to {split} set")
                continue

            # Handle both flat structure and subfolder structure
            image_files = find_images(source_split_path)

            # If no images found, check subfolders
            if not image_files:
                for subfolder in sorted(os.listdir(source_split_path)):
                    subfolder_path = os.path.join(source_split_path, subfolder)
                    if os.path.isdir(subfolder_path):
                        image_files.extend(find_images(subfolder_path))

            # Names already written in this run; normcase so that names that
            # differ only by case are treated as clashes on case-insensitive
            # file systems.
            used_names = set()

            # Sorted so that clash resolution is deterministic
            for img_path in sorted(image_files):
                filename = os.path.basename(img_path)
                # Prefix with the label to keep the two classes distinguishable
                new_filename = f"{label}_{filename}"

                if os.path.normcase(new_filename) in used_names:
                    # Same name from another sub-folder: add the sub-folder
                    # name, then a counter if that is still not unique.
                    parent = os.path.basename(os.path.dirname(img_path))
                    stem, suffix = os.path.splitext(filename)
                    new_filename = f"{label}_{parent}_{filename}"
                    counter = 1
                    while os.path.normcase(new_filename) in used_names:
                        new_filename = f"{label}_{parent}_{stem}_{counter}{suffix}"
                        counter += 1

                used_names.add(os.path.normcase(new_filename))
                shutil.copy2(img_path, os.path.join(dest_split_path, new_filename))
                stats[split] += 1

            print(f"✓ Copied {stats[split]} {label} images to {split} set")

        return stats

    # Process both datasets
    medical_stats = copy_dataset_split(medical_base_folder, output_folder, "medical")
    non_medical_stats = copy_dataset_split(non_medical_base_folder, output_folder, "non_medical")

    # Print final summary
    total_train = medical_stats["train"] + non_medical_stats["train"]
    total_test = medical_stats["test"] + non_medical_stats["test"]

    print(f"\n🎯 FINAL BINARY CLASSIFICATION DATASET:")
    print(f"Training Set: {total_train} images")
    print(f"  - Medical: {medical_stats['train']}")
    print(f"  - Non-medical: {non_medical_stats['train']}")
    print(f"Test Set: {total_test} images")
    print(f"  - Medical: {medical_stats['test']}")
    print(f"  - Non-medical: {non_medical_stats['test']}")

    # Check for class imbalance
    medical_train = medical_stats["train"]
    non_medical_train = non_medical_stats["train"]
    if medical_train > 0 and non_medical_train > 0:
        ratio = max(medical_train, non_medical_train) / min(medical_train, non_medical_train)
        if ratio > 2:
            print(f"⚠️  WARNING: Class imbalance detected! Ratio: {ratio:.1f}:1")
    elif medical_train != non_medical_train:
        # One class is empty - the most extreme imbalance, so report it too
        print(f"⚠️  WARNING: One class has no training images "
              f"(medical: {medical_train}, non-medical: {non_medical_train})")

    return medical_stats, non_medical_stats

# Usage - UPDATE THESE PATHS based on your structure
# NOTE(review): both inputs are absolute, machine-specific paths, so this cell
# will not run unchanged on another machine. The split cell wrote its output to
# "./coco_split_1000" relative to the working directory - confirm that the
# absolute COCO path below points at that same folder.
medical_dataset_path = "C:/Users/USER/Desktop/Medical&Non-Medical_image_classifier/dataset/version_v5"  # Your medical images folder
non_medical_dataset_path = "C:/Users/USER/Desktop/Medical&Non-Medical_image_classifier/dataset/coco_split_1000"  # Your COCO non-medical images
combined_output_path = "./binary_medical_classifier_dataset"  # Created relative to the working directory

# Each returned dict maps "train"/"test" to the number of images copied for that class
med_stats, non_med_stats = combine_medical_datasets(
    medical_base_folder=medical_dataset_path,
    non_medical_base_folder=non_medical_dataset_path, 
    output_folder=combined_output_path
)


✓ Copied 800 medical images to train set
✓ Copied 200 medical images to test set
✓ Copied 800 non_medical images to train set
✓ Copied 200 non_medical images to test set

🎯 FINAL BINARY CLASSIFICATION DATASET:
Training Set: 1600 images
  - Medical: 800
  - Non-medical: 800
Test Set: 400 images
  - Medical: 200
  - Non-medical: 200
