In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import pickle

# Define the dataset structure.
# Images are read from <data_dir>/<split>/<class name>/ (the path is built
# with os.path.join in load_and_preprocess_images).
data_dir = "Dental_Radiography"  # Root directory of your dataset
splits = ["train", "valid", "test"]
# The position of a name in this list is the integer label stored in the
# y arrays (labels come from enumerate(classes)).
classes = ["Cavity", "Fillings", "Impacted Tooth", "Implant", "Normal"]

def load_and_preprocess_images(data_dir, splits, classes, image_size=(64, 64)):
    """
    Load every PNG under ``data_dir/<split>/<class>/``, convert it to a
    normalized grayscale array, and label it with its class index.

    Args:
        data_dir (str): Root directory containing one sub-directory per split.
        splits (list[str]): Names of the splits to process.
        classes (list[str]): Class directory names. The position of a name in
            this list is the integer label assigned to its images.
        image_size (tuple[int, int]): Target ``(width, height)`` in pixels.
            Images of any other size are resized with Lanczos resampling.

    Returns:
        dict: ``{split: {'X': ndarray, 'y': ndarray}}``. ``X`` has shape
        ``(n_samples, height, width, 1)`` with float64 values in [0, 1];
        ``y`` has shape ``(n_samples,)`` with int64 class indices. A split
        with no readable images yields zero-length arrays of those shapes.

    Notes:
        Missing class directories and unreadable images are reported on
        stdout and skipped; they do not abort the run.
    """
    width, height = image_size

    # Dictionary to store processed data
    dataset = {}

    for split in splits:
        print(f"Processing {split} split...")

        # Lists to store images and labels
        images = []
        labels = []

        # Process each class
        for class_idx, class_name in enumerate(classes):
            class_dir = os.path.join(data_dir, split, class_name)

            # isdir rather than exists: a stray regular file carrying the
            # class name would otherwise make os.listdir raise.
            if not os.path.isdir(class_dir):
                print(f"Warning: Directory {class_dir} not found. Skipping.")
                continue

            # os.listdir returns entries in arbitrary order; sort so the
            # sample order (and therefore the saved arrays) is reproducible.
            image_files = sorted(
                f for f in os.listdir(class_dir) if f.lower().endswith('.png')
            )

            print(f"  Found {len(image_files)} images in class '{class_name}'")

            # Process each image
            for img_file in tqdm(image_files, desc=f"Processing {class_name}"):
                img_path = os.path.join(class_dir, img_file)

                try:
                    # The context manager closes the file handle right away;
                    # convert() returns an independent in-memory image.
                    with Image.open(img_path) as raw_img:
                        img = raw_img.convert('L')

                    # PIL's size is (width, height); resize before the array
                    # conversion so each image is converted only once.
                    if img.size != (width, height):
                        img = img.resize((width, height), Image.LANCZOS)

                    # Convert to numpy array and normalize to [0, 1]
                    img_array = np.array(img) / 255.0

                    # Add image and label to lists
                    images.append(img_array)
                    labels.append(class_idx)

                except Exception as e:
                    # Deliberate best effort: one corrupt file must not
                    # discard the rest of the dataset.
                    print(f"Error processing {img_path}: {e}")

        # Stack into (n_samples, height, width, channels). Explicit dtypes keep
        # empty splits consistent with populated ones.
        X = np.array(images, dtype=np.float64).reshape(len(images), height, width, 1)
        y = np.array(labels, dtype=np.int64)

        # Store in dataset dictionary
        dataset[split] = {
            'X': X,
            'y': y
        }

        print(f"  {split} split: {X.shape[0]} images processed")

    return dataset

def visualize_samples(dataset, classes, num_samples=5):
    """
    Save a grid of randomly chosen training images, one row per class.

    Args:
        dataset (dict): Must contain ``dataset['train']['X']`` (images) and
            ``dataset['train']['y']`` (integer class indices).
        classes (list[str]): Class names; row ``k`` of the grid shows images
            whose label equals ``k``.
        num_samples (int): Maximum number of images drawn per class (also the
            number of grid columns).

    Side effects:
        Writes ``sample_dental_xrays.png`` to the current working directory
        and closes the figure afterwards. Sampling uses NumPy's global random
        state, so call ``np.random.seed`` beforehand for a reproducible grid.
    """
    X_train = dataset['train']['X']
    y_train = dataset['train']['y']

    fig = plt.figure(figsize=(15, 10))

    for class_idx, class_name in enumerate(classes):
        # Get indices for this class
        indices = np.where(y_train == class_idx)[0]

        # Classes with fewer than num_samples images used to be dropped from
        # the figure; show whatever is available instead.
        n_show = min(len(indices), num_samples)
        if n_show == 0:
            continue
        selected_indices = np.random.choice(indices, n_show, replace=False)

        # Plot images
        for i, idx in enumerate(selected_indices):
            ax = fig.add_subplot(len(classes), num_samples, class_idx * num_samples + i + 1)

            # Drop a trailing single-channel axis rather than assuming 64x64
            sample = X_train[idx]
            if sample.ndim == 3 and sample.shape[-1] == 1:
                sample = sample[..., 0]

            ax.imshow(sample, cmap='gray')
            ax.axis('off')

            # Only the first image of each row carries the class name
            if i == 0:
                ax.set_title(class_name)

    fig.tight_layout()
    fig.savefig('sample_dental_xrays.png')
    plt.close(fig)

def save_processed_data(dataset, output_dir="processed_data", class_names=None):
    """
    Save the processed dataset as numpy files plus a pickled class-name list.

    Args:
        dataset (dict): ``{split: {'X': ndarray, 'y': ndarray}}``.
        output_dir (str): Destination directory; created if it does not exist.
        class_names (list[str] | None): Names written to ``class_names.pkl``.
            When omitted, the module-level ``classes`` list is used, which is
            what this function always did before the parameter existed.

    Side effects:
        Writes ``X_<split>.npy`` and ``y_<split>.npy`` for every split, and
        ``class_names.pkl``, into ``output_dir``.
    """
    if class_names is None:
        # Backward-compatible fallback to the notebook-level constant
        class_names = classes

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    for split, arrays in dataset.items():
        # Save X and y arrays
        np.save(os.path.join(output_dir, f"X_{split}.npy"), arrays['X'])
        np.save(os.path.join(output_dir, f"y_{split}.npy"), arrays['y'])

    # Save class names. NOTE: pickle is kept for compatibility with existing
    # readers; only unpickle this file when it comes from a trusted source.
    with open(os.path.join(output_dir, 'class_names.pkl'), 'wb') as f:
        pickle.dump(list(class_names), f)

    print(f"Processed data saved to {output_dir}")

def print_dataset_stats(dataset, class_names=None):
    """
    Print per-split summary statistics and the class distribution.

    Args:
        dataset (dict): ``{split: {'X': ndarray, 'y': ndarray}}``.
        class_names (list[str] | None): Names used to label the class counts;
            index ``k`` in the list corresponds to label ``k`` in ``y``.
            When omitted, the module-level ``classes`` list is used.

    Notes:
        An empty split is reported with zero counts instead of raising:
        ``min``/``max`` are undefined on zero-size arrays and the percentage
        would otherwise divide by zero.
    """
    if class_names is None:
        # Backward-compatible fallback to the notebook-level constant
        class_names = classes

    print("\nDataset Statistics:")
    print("-" * 50)

    for split, arrays in dataset.items():
        X = arrays['X']
        y = arrays['y']
        n_total = len(y)

        print(f"{split.capitalize()} set:")
        print(f"  Total images: {X.shape[0]}")
        print(f"  Shape: {X.shape}")

        if X.size > 0:
            print(f"  Range: [{X.min():.4f}, {X.max():.4f}]")
            print(f"  Mean: {X.mean():.4f}")
            print(f"  Std: {X.std():.4f}")
        else:
            # Pixel statistics are undefined when the split holds no images
            print("  Range/Mean/Std: n/a (no images)")

        # Class distribution
        print("  Class distribution:")
        for class_idx, class_name in enumerate(class_names):
            count = int(np.sum(y == class_idx))
            percentage = (count / n_total) * 100 if n_total else 0.0
            print(f"    {class_name}: {count} ({percentage:.2f}%)")

        print("-" * 30)

if __name__ == "__main__":
    # Step 1: read every split from disk into normalized arrays
    dataset = load_and_preprocess_images(
        data_dir=data_dir,
        splits=splits,
        classes=classes,
    )

    # Step 2: report per-split statistics and class balance
    print_dataset_stats(dataset=dataset)

    # Step 3: write a preview grid of training images to disk
    visualize_samples(dataset=dataset, classes=classes)

    # Step 4: persist the arrays and class names for later cells
    save_processed_data(dataset=dataset)

    print("Preprocessing complete!")

Processing train split...
  Found 576 images in class 'Cavity'


Processing Cavity: 100%|██████████| 576/576 [00:08<00:00, 64.20it/s]


  Found 5242 images in class 'Fillings'


Processing Fillings: 100%|██████████| 5242/5242 [01:18<00:00, 66.55it/s]


  Found 428 images in class 'Impacted Tooth'


Processing Impacted Tooth: 100%|██████████| 428/428 [00:05<00:00, 74.16it/s]


  Found 1784 images in class 'Implant'


Processing Implant: 100%|██████████| 1784/1784 [00:27<00:00, 65.72it/s]


  Found 17106 images in class 'Normal'


Processing Normal: 100%|██████████| 17106/17106 [03:56<00:00, 72.23it/s]


  train split: 25136 images processed
Processing valid split...
  Found 43 images in class 'Cavity'


Processing Cavity: 100%|██████████| 43/43 [00:00<00:00, 58.31it/s]


  Found 540 images in class 'Fillings'


Processing Fillings: 100%|██████████| 540/540 [00:07<00:00, 68.96it/s]


  Found 38 images in class 'Impacted Tooth'


Processing Impacted Tooth: 100%|██████████| 38/38 [00:00<00:00, 78.41it/s]


  Found 159 images in class 'Implant'


Processing Implant: 100%|██████████| 159/159 [00:02<00:00, 73.37it/s]


  Found 2032 images in class 'Normal'


Processing Normal: 100%|██████████| 2032/2032 [00:27<00:00, 73.93it/s]


  valid split: 2812 images processed
Processing test split...
  Found 22 images in class 'Cavity'


Processing Cavity: 100%|██████████| 22/22 [00:00<00:00, 72.71it/s]


  Found 315 images in class 'Fillings'


Processing Fillings: 100%|██████████| 315/315 [00:04<00:00, 71.97it/s]


  Found 32 images in class 'Impacted Tooth'


Processing Impacted Tooth: 100%|██████████| 32/32 [00:00<00:00, 69.74it/s]


  Found 104 images in class 'Implant'


Processing Implant: 100%|██████████| 104/104 [00:01<00:00, 78.44it/s]


  Found 1176 images in class 'Normal'


Processing Normal: 100%|██████████| 1176/1176 [00:18<00:00, 63.76it/s]


  test split: 1649 images processed

Dataset Statistics:
--------------------------------------------------
Train set:
  Total images: 25136
  Shape: (25136, 64, 64, 1)
  Range: [0.0000, 1.0000]
  Mean: 0.4022
  Std: 0.3742
  Class distribution:
    Cavity: 576 (2.29%)
    Fillings: 5242 (20.85%)
    Impacted Tooth: 428 (1.70%)
    Implant: 1784 (7.10%)
    Normal: 17106 (68.05%)
------------------------------
Valid set:
  Total images: 2812
  Shape: (2812, 64, 64, 1)
  Range: [0.0000, 1.0000]
  Mean: 0.3980
  Std: 0.3759
  Class distribution:
    Cavity: 43 (1.53%)
    Fillings: 540 (19.20%)
    Impacted Tooth: 38 (1.35%)
    Implant: 159 (5.65%)
    Normal: 2032 (72.26%)
------------------------------
Test set:
  Total images: 1649
  Shape: (1649, 64, 64, 1)
  Range: [0.0000, 1.0000]
  Mean: 0.4168
  Std: 0.3786
  Class distribution:
    Cavity: 22 (1.33%)
    Fillings: 315 (19.10%)
    Impacted Tooth: 32 (1.94%)
    Implant: 104 (6.31%)
    Normal: 1176 (71.32%)
--------------------

In [4]:
# Sanity check: reload the saved test-split image array from disk.
import numpy as np
x=np.load('./processed_data/X_test.npy')
# Last expression of the cell, so the notebook displays the first sample.
x[0]

array([[[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]]])

In [5]:
# Reload the saved test-split labels and display the first sample's class index.
y=np.load('./processed_data/y_test.npy')
y[0]

np.int64(0)