In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter

In [2]:
# ===== Configuration =====
# Size handed to cv2.resize for every image (224x224 is the usual CNN input).
TARGET_SIZE = (224, 224)

# Input layout read by load_images: <BASE_DIR>/<source>/<class_name>/<image file>
BASE_DIR = 'BreastCancerData'
ONLINE_DIR, OFFLINE_DIR = (
    os.path.join(BASE_DIR, subdir) for subdir in ('OnlineData', 'OfflineData')
)

# Destination folder for the saved arrays; created up front if it is missing.
OUTPUT_DIR = 'preprocessed_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
def load_images(directory, class_mapping, target_size=None):
    """Load, grayscale, resize and normalise every image under ``directory``.

    Expects one sub-directory per class name (``directory/<class_name>/``).
    Files without a .png/.jpg/.jpeg extension are ignored; files OpenCV
    cannot decode are skipped and counted in a warning.

    Args:
        directory: Root folder of one data source.
        class_mapping: Dict mapping class sub-directory name -> label.
        target_size: ``(width, height)`` passed to ``cv2.resize``. Defaults to
            the module-level ``TARGET_SIZE`` when omitted.

    Returns:
        Tuple ``(images, labels, sources)`` of NumPy arrays with one entry per
        loaded image. ``images`` has shape ``(n, height, width)`` and holds
        pixel values divided by 255; ``sources`` repeats the last path
        component of ``directory``. When nothing is loaded, ``images`` still
        has shape ``(0, height, width)`` so it can be concatenated with
        non-empty results.
    """
    if target_size is None:
        target_size = TARGET_SIZE
    width, height = target_size

    # basename/normpath instead of split('/') so Windows separators and a
    # trailing slash still yield the folder name.
    source_name = os.path.basename(os.path.normpath(directory))

    images, labels, sources = [], [], []
    for class_name, label in class_mapping.items():
        class_dir = os.path.join(directory, class_name)
        if not os.path.isdir(class_dir):
            # Still best-effort, but no longer silent about a missing class.
            print(f"Warning: {class_dir} not found, skipping class '{class_name}'")
            continue

        print(f"Processing {class_name} in {source_name}...")
        unreadable = 0
        # Sorted so the sample order is reproducible; os.listdir order is arbitrary.
        for file in tqdm(sorted(os.listdir(class_dir))):
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            img = cv2.imread(os.path.join(class_dir, file))
            if img is None:
                unreadable += 1
                continue

            # Convert to grayscale first, then resize and scale to [0, 1].
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (width, height))

            images.append(resized / 255.0)
            labels.append(label)
            sources.append(source_name)

        if unreadable:
            print(f"Warning: skipped {unreadable} unreadable image(s) in {class_dir}")

    if not images:
        # np.array([]) would be shape (0,), which cannot be concatenated with
        # (n, height, width) arrays from another source.
        return (np.empty((0, height, width), dtype=np.float64),
                np.array([], dtype=int),
                np.array([], dtype=str))

    return np.array(images), np.array(labels), np.array(sources)


In [4]:
# Label encoding per source: 1 = malignant, 0 = every other class.
# The online set has an extra 'normal' class that is also encoded as 0.
offline_mapping = {'malignant': 1, 'benign': 0}
online_mapping = {**offline_mapping, 'normal': 0}

In [6]:
# Read both sources from disk: the offline set first, then the online set
X_offline, y_offline, s_offline = load_images(
    directory=OFFLINE_DIR,
    class_mapping=offline_mapping,
)
X_online, y_online, s_online = load_images(
    directory=ONLINE_DIR,
    class_mapping=online_mapping,
)

Processing malignant in OfflineData...


  7%|▋         | 18/270 [00:00<00:03, 81.90it/s]

100%|██████████| 270/270 [00:07<00:00, 38.02it/s]


Processing benign in OfflineData...


100%|██████████| 287/287 [00:05<00:00, 48.97it/s]


Processing malignant in OnlineData...


100%|██████████| 210/210 [00:11<00:00, 18.23it/s]


Processing benign in OnlineData...


100%|██████████| 288/288 [00:20<00:00, 14.06it/s]


Processing normal in OnlineData...


100%|██████████| 133/133 [00:09<00:00, 14.22it/s]


In [7]:
# Join the two sources along the sample axis, online entries first
X_combined, y_combined, sources = (
    np.concatenate(pair, axis=0)
    for pair in (
        (X_online, X_offline),
        (y_online, y_offline),
        (s_online, s_offline),
    )
)

In [12]:
# ===== 1.2 Data Validation =====
def _summary_line(name, images, labels):
    """Return one summary line with the sample count and per-label counts.

    Label 0 is reported as "Non-cancerous" rather than "Benign" because
    online_mapping encodes both 'benign' and 'normal' as 0; this also matches
    the bar label used by plot_distribution.
    """
    # Every image must have exactly one label.
    assert len(images) == len(labels), f"{name}: image/label count mismatch"
    # count_nonzero is vectorised, unlike the builtin sum over a boolean array.
    malignant = np.count_nonzero(labels == 1)
    non_cancerous = np.count_nonzero(labels == 0)
    return (f"- {name} samples: {len(images)} "
            f"(Malignant: {malignant} | Non-cancerous: {non_cancerous})")


print("\nData Summary:")
print(_summary_line("Online", X_online, y_online))
print(_summary_line("Offline", X_offline, y_offline))
print(_summary_line("Combined", X_combined, y_combined))


Data Summary:
- Online samples: 631 (Malignant: 210 | Benign: 421)
- Offline samples: 557 (Malignant: 270 | Benign: 287)
- Combined samples: 1188 (Malignant: 480 | Benign: 708)


In [13]:
# ===== 1.3 Data Visualization =====
def plot_distribution(labels, title, file_name, output_dir='plots'):
    """Save a two-bar chart of label counts as ``<output_dir>/<file_name>.png``.

    Args:
        labels: Iterable of class labels; the counts of 0 and 1 are charted.
        title: Dataset name shown in the figure title.
        file_name: Output file name without the ``.png`` extension.
        output_dir: Folder the image is written to; created if missing.

    Returns:
        None. The figure is written to disk and closed, not displayed.
    """
    class_counts = Counter(labels)

    # Create the target folder here so the function does not depend on
    # another cell having been run first.
    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(['Non-Cancerous (0)', 'Malignant (1)'],
           [class_counts[0], class_counts[1]],
           color=['skyblue', 'salmon'])
    # The total counts every label present, not only the two charted classes.
    ax.set_title(f'Class Distribution: {title}\nTotal Samples: {sum(class_counts.values())}')
    ax.set_ylabel('Count')
    fig.savefig(os.path.join(output_dir, f'{file_name}.png'))
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [14]:
# Write one class-distribution chart per dataset into the 'plots' folder
os.makedirs('plots', exist_ok=True)
for labels, title, file_name in (
    (y_online, 'Online Data', 'online_distribution'),
    (y_offline, 'Hospital Data', 'hospital_distribution'),
    (y_combined, 'Combined Data', 'combined_distribution'),
):
    plot_distribution(labels, title, file_name)

In [15]:
# ===== Save Processed Data =====
# A single compressed .npz archive holding the per-source arrays, the combined
# arrays, and the per-sample source names.
output_path = os.path.join(OUTPUT_DIR, 'preprocessed_images.npz')
np.savez_compressed(
    output_path,
    X_online=X_online,
    y_online=y_online,
    X_offline=X_offline,
    y_offline=y_offline,
    X_combined=X_combined,
    y_combined=y_combined,
    sources=sources
)

# Report the configured folder instead of a hard-coded copy of its name, so
# the message stays correct if OUTPUT_DIR is changed.
print(f"\nPreprocessing completed. Files saved to '{OUTPUT_DIR}' directory")
print("Class distributions visualized in 'plots' directory")


Preprocessing completed. Files saved to 'preprocessed_data' directory
Class distributions visualized in 'plots' directory
