# Model Exploration
This objective of this project is to evaluates 3 approaches to accurately analyze real-world data: a naive approach, a non deep learning approach, and a neural network-based deep learning approach 

In [None]:
# Imports
import numpy as np
from pathlib import Path
#pip import cv2
from collections import Counter
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from huggingface_hub import HfApi, login
from dotenv import load_dotenv
import os


In [2]:
# Paths
train_dir = Path("data/processed/train")
val_dir = Path("data/processed/val")
test_dir = Path("data/processed/test")

### Naive Approach
Predict the majority class

In [None]:
def count_images_per_class(directory):
    '''Count the number of images per class'''
    counts = {}
    for cls in ["0", "1"]:
        cls_path = directory / cls # Build path to class folder
        counts[cls] = len(list(cls_path.glob("*.png"))) # Line written with GPT-5
    return counts

print("Train set:", count_images_per_class(train_dir))
print("Validation set:", count_images_per_class(val_dir))
print("Test set:", count_images_per_class(test_dir))

Train set: {'0': 155946, '1': 62302}
Validation set: {'0': 17561, '1': 7029}
Test set: {'0': 25231, '1': 9455}


In [None]:
train_counts = count_images_per_class(train_dir) # Count number of images in each class in train set
majority_class = max(train_counts, key=train_counts.get) # Determine which class has the most images
print(f"Majority class in training set: {majority_class}")

Majority class in training set: 0


In [None]:
def naive_predict(directory, predicted_class):
    """
    Predicts the majority class for all images in a given directory.
    """
    y_true = []
    y_pred = []
    for cls in ["0", "1"]:
        cls_path = directory / cls
        files = list(cls_path.glob("*.png")) # List of all PNG files in class folder
        y_true.extend([int(cls)] * len(files)) # Add true lables to y_true
        y_pred.extend([int(predicted_class)] * len(files)) # Add predicted lables to y_pred
    return y_true, y_pred

In [22]:
# Validation set evaluation
y_val_true, y_val_pred = naive_predict(val_dir, majority_class)


print("Validation Set:")
print(classification_report(y_val_true, y_val_pred, zero_division=0))
print("ROC-AUC:", round(roc_auc_score(y_val_true, y_val_pred), 4))

# Test set evaluation
y_test_true, y_test_pred = naive_predict(test_dir, majority_class)

print("Test Set:")
print(classification_report(y_test_true, y_test_pred, zero_division=0))
print("ROC-AUC:", round(roc_auc_score(y_test_true, y_test_pred), 4))

Validation Set:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83     17561
           1       0.00      0.00      0.00      7029

    accuracy                           0.71     24590
   macro avg       0.36      0.50      0.42     24590
weighted avg       0.51      0.71      0.60     24590

ROC-AUC: 0.5
Test Set:
              precision    recall  f1-score   support

           0       0.73      1.00      0.84     25231
           1       0.00      0.00      0.00      9455

    accuracy                           0.73     34686
   macro avg       0.36      0.50      0.42     34686
weighted avg       0.53      0.73      0.61     34686

ROC-AUC: 0.5


The naive baseline predicts the majority class, which in this dataset is benign (0), for all images. It achieved an accuracy of around 73% on the test set which is reasonable due to class imbalance in the dataset: it has a little under 75% benign images and the rest malignant images. However, the model completely fails to detect malignant tissue patches (1), resulting in an F1-score of 0 and ROC-AUC of 0.5. These results provide a good baseline to use as a reference point for the more sophisticated models but is clinically unusable for detecting breast cancer.

### Classical Machine Learning Approach
Random Forest with class weighting to handles imbalance

In [18]:
def extract_features(image_path, bins=32):
    ''' Extracts grayscale histogram features from an image. '''
    img = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE) # Read image in grayscale
    hist = cv2.calcHist([img], [0], None, [bins], [0, 256]) # Compute histogram
    hist = cv2.normalize(hist, hist).flatten() # Normalize and flatten
    return hist

def build_dataset(directory, bins=32):
    ''' Builds X (features) and y (labels) arrays from a directory structure with subfolders for each class. '''
    X, y = [], []
    for cls in ["0", "1"]:
        folder = Path(directory) / cls
        for img_path in folder.glob("*.png"): # Iterate over all PNG files in class folder
            features = extract_features(img_path, bins=bins) # Extract features
            X.append(features)
            y.append(int(cls))
    return np.array(X), np.array(y) # Convert lists to numpy arrays

In [19]:
# Build feature datasets
X_train, y_train = build_dataset(train_dir)
X_val, y_val     = build_dataset(val_dir)
X_test, y_test   = build_dataset(test_dir)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

Train: (218248, 32) Val: (24590, 32) Test: (34686, 32)


In [45]:
# Train Random Forest model
rf = RandomForestClassifier(n_estimators=500, random_state=42, class_weight="balanced", n_jobs=-1)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
# Validation predictions
y_val_pred = rf.predict(X_val) # Predicted classes
y_val_proba = rf.predict_proba(X_val)[:, 1] # Predicted probabilities for positive class

print("Validation Results")
print(classification_report(y_val, y_val_pred, digits=4))
print("ROC-AUC:", round(roc_auc_score(y_val, y_val_proba),4))

# Test predictions
y_test_pred = rf.predict(X_test) # Predicted classes
y_test_proba = rf.predict_proba(X_test)[:, 1] # Predicted probabilities for positive class

print("\nTest Results")
print(classification_report(y_test, y_test_pred, digits=4))
print("ROC-AUC:", round(roc_auc_score(y_test, y_test_proba),4))


Validation Results
              precision    recall  f1-score   support

           0     0.8474    0.9104    0.8778     17561
           1     0.7251    0.5906    0.6509      7029

    accuracy                         0.8190     24590
   macro avg     0.7863    0.7505    0.7644     24590
weighted avg     0.8125    0.8190    0.8129     24590

ROC-AUC: 0.8688

Test Results
              precision    recall  f1-score   support

           0     0.8690    0.9050    0.8867     25231
           1     0.7150    0.6361    0.6732      9455

    accuracy                         0.8317     34686
   macro avg     0.7920    0.7705    0.7799     34686
weighted avg     0.8271    0.8317    0.8285     34686

ROC-AUC: 0.8881


The random forest model was trained on grayscale histogram features and is an improvement compared to the naive baseline. It has an accuracy of 83.2%, an F1-score of 67.3% for malignant patches and a ROC-AUC of 0.8688. This model is still failing to predict some malignant cases, even after balancing the classes.

### Neural Network-based Deep Learning Approach
CNN trained end-to-end

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if available

In [4]:
# Data augmentation and normalization for training
train_transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.RandomHorizontalFlip(p=0.5),   # safe, lesions can appear left/right
    transforms.RandomVerticalFlip(p=0.5),     # safe, orientation doesn’t matter
    transforms.RandomRotation(15),            # small rotations are fine
    transforms.RandomResizedCrop(128, scale=(0.8, 1.0)),  # zoom in/out a bit
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])        # normalize to [-1, 1] range
])

test_transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

train_dataset = datasets.ImageFolder(train_dir, transform=train_transforms)
val_dataset   = datasets.ImageFolder(val_dir, transform=test_transforms)
test_dataset  = datasets.ImageFolder(test_dir, transform=test_transforms)

In [5]:
# Handle imbalance with WeightedRandomSampler
class_counts = np.bincount(train_dataset.targets)
class_weights = 1. / class_counts
sample_weights = [class_weights[label] for label in train_dataset.targets]
sampler = torch.utils.data.WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

In [6]:
# DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler, num_workers=4)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

print("Train:", class_counts)

Train: [155946  62302]


In [7]:
# CNN Architecture
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(256 * 8 * 8, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 1),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

model = SimpleCNN().to(device)

In [8]:
# Loss and optimizer
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(class_counts[0]/class_counts[1], device=device))
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [9]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device).float().unsqueeze(1)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
    return running_loss / len(loader.dataset)

def evaluate(model, loader):
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for imgs, labels in loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs).squeeze(1)
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).long()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            y_prob.extend(probs.cpu().numpy())
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    roc = roc_auc_score(y_true, y_prob)
    return report, roc

In [10]:
# Training loop
best_auc = 0
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_report, val_auc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {train_loss:.4f} | Val ROC-AUC: {val_auc:.4f}")
    if val_auc > best_auc:
        best_auc = val_auc
        torch.save(model.state_dict(), "best_cnn.pth") # 

Epoch 1/10 | Loss: 0.6010 | Val ROC-AUC: 0.9167
Epoch 2/10 | Loss: 0.5393 | Val ROC-AUC: 0.9161
Epoch 3/10 | Loss: 0.5213 | Val ROC-AUC: 0.9114
Epoch 4/10 | Loss: 0.4996 | Val ROC-AUC: 0.9140
Epoch 5/10 | Loss: 0.4871 | Val ROC-AUC: 0.9193
Epoch 6/10 | Loss: 0.4731 | Val ROC-AUC: 0.9191
Epoch 7/10 | Loss: 0.4606 | Val ROC-AUC: 0.9195
Epoch 8/10 | Loss: 0.4480 | Val ROC-AUC: 0.9189
Epoch 9/10 | Loss: 0.4352 | Val ROC-AUC: 0.9154
Epoch 10/10 | Loss: 0.4282 | Val ROC-AUC: 0.9102


In [None]:
# Load HF token from .env
load_dotenv()
login(token=os.getenv("HUGGINGFACE_TOKEN"))

# Create repo and upload
repo_id = "moosejuice13/cnn_breast_cancer_classifier"
api = HfApi()
api.create_repo(repo_id, repo_type="model", exist_ok=True)

api.upload_file(
    path_or_fileobj="best_cnn.pth",
    path_in_repo="breast_cancer_classifier_cnn_model.pth",
    repo_id=repo_id,
)

print(f"Model pushed to https://huggingface.co/{repo_id}")


  from .autonotebook import tqdm as notebook_tqdm
best_cnn.pth: 100%|██████████| 35.1M/35.1M [00:02<00:00, 15.3MB/s]


Model pushed to https://huggingface.co/moosejuice13/cnn_breast_cancer_classifier


In [13]:
# Load best model and evaluate on test set
model.load_state_dict(torch.load("best_cnn.pth"))
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.float().to(device)
        outputs = model(images).squeeze(1)
        preds = torch.sigmoid(outputs).cpu().numpy()
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds)

y_pred_binary = (np.array(y_pred) >= 0.5).astype(int)
print(classification_report(y_true, y_pred_binary, digits=4))
print("ROC-AUC:", roc_auc_score(y_true, y_pred))

              precision    recall  f1-score   support

         0.0     0.9645    0.8251    0.8894     25231
         1.0     0.6632    0.9189    0.7704      9455

    accuracy                         0.8507     34686
   macro avg     0.8138    0.8720    0.8299     34686
weighted avg     0.8823    0.8507    0.8569     34686

ROC-AUC: 0.9438158292050937
