# Mini-Project: Histopathologic Cancer Detection

This notebook provides a solution for the Histopathologic Cancer Detection Kaggle competition, developed as part of the Deep Learning course at CU Boulder. It includes problem explanation, exploratory data analysis (EDA), data preprocessing, model building with hyperparameter tuning, results analysis, and a conclusion.

## 1. Project Explanation
The goal is to classify each 96x96 pixel histopathologic image as cancerous (label=1) or non-cancerous (label=0), where the label indicates whether the central 32x32 pixel region contains cancer. The dataset contains over 220,000 training images and 57,000 test images, each in .tif format. The task is a binary classification problem, requiring a deep learning model to detect cancerous regions accurately.

## 2. Data Description
The dataset includes:
- **Training labels**: A CSV file (`train_labels.csv`) with image IDs and binary labels (0 or 1).
- **Training images**: 96x96 pixel .tif images in the `train` directory.
- **Test images**: 96x96 pixel .tif images in the `test` directory, without labels.
- **Size**: ~220,000 training images, ~57,000 test images.
- **Structure**: Images are RGB, and labels indicate the presence (1) or absence (0) of cancer in the central 32x32 region.

## 3. Imports and Exploratory Data Analysis (EDA)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
from PIL import Image
import torchvision.transforms as transforms
import warnings
warnings.filterwarnings('ignore')

# --- Dataset locations on the Kaggle input mount ---
data_dir = '/kaggle/input/histopathologic-cancer-detection/'
train_images_dir, test_images_dir = (
    os.path.join(data_dir, split) for split in ('train', 'test')
)
train_labels_path = os.path.join(data_dir, 'train_labels.csv')

# --- Read the id/label table ---
train_labels_df = pd.read_csv(train_labels_path)

# --- Tabular overview: shape, head, missing values ---
print(f"Training labels shape: {train_labels_df.shape}")
print(
    "\nFirst 5 rows of training labels:",
    train_labels_df.head().to_markdown(index=False),
    sep="\n",
)
print(
    "\nMissing values in training labels:",
    train_labels_df.isnull().sum(),
    sep="\n",
)

# --- Rows that appear more than once (every occurrence is listed) ---
print(
    "\nDuplicate rows in training labels:",
    train_labels_df.loc[train_labels_df.duplicated(keep=False)],
    sep="\n",
)

# --- Number of images per label ---
class_counts = train_labels_df['label'].value_counts()
print("\nClass distribution:", class_counts, sep="\n")

# --- Bar chart of the class distribution ---
count_fig, count_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=train_labels_df, ax=count_ax)
count_ax.set_title('Class Distribution (0: Non-Cancerous, 1: Cancerous)')
count_ax.set_xlabel('Label')
count_ax.set_ylabel('Count')
plt.show()

# Display sample images
def display_sample_images(df, image_dir, num_samples=3):
    """Show a 2 x ``num_samples`` grid of sampled images, one row per class.

    Args:
        df: DataFrame with an ``id`` column (image file stem) and a ``label``
            column containing 0 or 1.
        image_dir: Directory that holds the ``<id>.tif`` files.
        num_samples: Number of images drawn per class (grid columns). Each
            class must have at least this many rows.
    """
    # squeeze=False keeps `axes` two-dimensional even for num_samples == 1,
    # so the axes[i, j] indexing below never hits a 1-D array.
    fig, axes = plt.subplots(2, num_samples, figsize=(num_samples*4, 8), squeeze=False)
    for i, label in enumerate([0, 1]):
        # Fixed random_state so the same images are shown on every run.
        label_df = df[df['label'] == label].sample(num_samples, random_state=42)
        for j, row in enumerate(label_df.itertuples()):
            img_path = os.path.join(image_dir, f'{row.id}.tif')
            # The context manager releases the file handle once drawn.
            with Image.open(img_path) as img:
                axes[i, j].imshow(img)
            axes[i, j].set_title(f'Label: {label}')
            axes[i, j].axis('off')
    plt.suptitle('Sample Images (Top: Non-Cancerous, Bottom: Cancerous)')
    plt.show()

display_sample_images(train_labels_df, train_images_dir)

# Image statistics: inspect the first training image's size and colour mode.
sample_image_path = os.path.join(train_images_dir, f'{train_labels_df["id"].iloc[0]}.tif')
# Open inside a context manager so the file handle is closed after reading
# the header attributes instead of lingering for the rest of the session.
with Image.open(sample_image_path) as sample_image:
    print(f"\nSample image dimensions: {sample_image.size}")
    print(f"Sample image mode: {sample_image.mode}")

# Pixel intensity distribution
def plot_pixel_intensity(image_dir, sample_ids, title, max_images=3):
    """Overlay pixel-value histograms for the first few images in ``sample_ids``.

    All channels of an image are pooled into one histogram.

    Args:
        image_dir: Directory that holds the ``<id>.tif`` files.
        sample_ids: Sequence of image id strings (file stems).
        title: Title for the figure.
        max_images: Maximum number of ids from ``sample_ids`` to plot
            (defaults to 3, the previously hard-coded limit).
    """
    plt.figure(figsize=(10, 6))
    for img_id in sample_ids[:max_images]:
        img_path = os.path.join(image_dir, f'{img_id}.tif')
        # Read pixels inside a context manager so the file handle is closed
        # as soon as the array has been materialised.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img)
        # Only the first five id characters are shown to keep the legend compact.
        plt.hist(img.ravel(), bins=50, alpha=0.5, label=f'Image {img_id[:5]}')
    plt.title(title)
    plt.xlabel('Pixel Intensity')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

# Draw three image ids (fixed seed) and plot their pixel-value histograms.
sample_ids = train_labels_df['id'].sample(n=3, random_state=42).to_numpy()
plot_pixel_intensity(
    image_dir=train_images_dir,
    sample_ids=sample_ids,
    title='Pixel Intensity Distribution',
)


## 4. Data Preprocessing & Analysis Plan
Based on EDA:
- **Class imbalance**: ~60% non-cancerous, ~40% cancerous. Use class weights in loss function.
- **Preprocessing**: Normalize images using mean and std from ImageNet. Apply data augmentation (flips, rotations) to improve generalization.
- **Plan**: Use a CNN and compare hyperparameter configurations (learning rate, batch size, number of convolutional filters, dropout rate). Evaluate using validation accuracy and AUC.

In [None]:
# Channel statistics used for normalization in both pipelines.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Training pipeline: random flip + rotation, then tensor conversion and normalization.
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=20),
    transforms.ToTensor(),
    transforms.Normalize(mean=list(imagenet_mean), std=list(imagenet_std)),
])

# Validation/test pipeline: deterministic, no augmentation.
val_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=list(imagenet_mean), std=list(imagenet_std)),
])

# Custom dataset
class CancerDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads ``<id>.tif`` images listed in a DataFrame.

    The first column of ``dataframe`` holds the image id (file stem). When a
    column named ``label`` exists, its value is returned as the target;
    otherwise a placeholder label of 0 is returned (unlabeled data).

    Args:
        dataframe: Table of image ids, optionally with a ``label`` column.
        image_dir: Directory containing the ``<id>.tif`` files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, dataframe, image_dir, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        img_name = self.dataframe.iloc[idx, 0]
        # Read the label by column NAME, matching the membership check; the
        # previous positional read (column 1) returned the wrong column when
        # 'label' was not the second column.
        if 'label' in self.dataframe.columns:
            label = int(self.dataframe['label'].iloc[idx])
        else:
            label = 0
        img_path = os.path.join(self.image_dir, f'{img_name}.tif')
        # Image.open is lazy and keeps the file open; convert() forces the
        # pixel data to load and returns an independent RGB image, so the
        # handle can be closed immediately on leaving the `with` block.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# Hold out 20% of the labelled images for validation, stratified on the label.
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    train_labels_df,
    test_size=0.2,
    random_state=42,
    stratify=train_labels_df['label'],
)

# Both splits read from the training image folder; only the training split
# uses the augmenting transform pipeline.
train_dataset = CancerDataset(dataframe=train_df, image_dir=train_images_dir, transform=train_transforms)
val_dataset = CancerDataset(dataframe=val_df, image_dir=train_images_dir, transform=val_transforms)

# Batch loaders: training batches are shuffled, validation batches are not.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=64, shuffle=True, num_workers=4
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=64, shuffle=False, num_workers=4
)


## 5. Model Architecture
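I use a CNN with five convolutional layers, each followed by batch normalization and ReLU, three max-pooling stages, and a fully connected classifier with dropout for regularization. The architecture is inspired by VGG-like models but tailored for 96x96 images.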

In [None]:
import torch.nn as nn

class DeepCNN(nn.Module):
    """VGG-style CNN: five conv/BN/ReLU units, three 2x2 max-pools, MLP head.

    Args:
        num_filters: Channels of the first conv stage; the later stages use
            2x and 4x this value.
        dropout_rate: Dropout probability in the classifier head.
        input_size: Height/width of the (square) RGB input used to size the
            first linear layer. Defaults to 96.
        num_classes: Number of output logits. Defaults to 2.

    Attributes:
        flattened_size: Number of features entering the classifier.
    """

    def __init__(self, num_filters=32, dropout_rate=0.5, input_size=96, num_classes=2):
        super(DeepCNN, self).__init__()
        narrow, medium, wide = num_filters, num_filters * 2, num_filters * 4
        # Layer order (and therefore state_dict keys) matches the flat
        # Sequential layout: conv, bn, relu, ..., pool.
        self.features = nn.Sequential(
            *self._conv_bn_relu(3, narrow),
            *self._conv_bn_relu(narrow, narrow),
            nn.MaxPool2d(2, 2),
            *self._conv_bn_relu(narrow, medium),
            *self._conv_bn_relu(medium, medium),
            nn.MaxPool2d(2, 2),
            *self._conv_bn_relu(medium, wide),
            nn.MaxPool2d(2, 2),
        )
        self.flattened_size = self._probe_flattened_size(input_size)
        self.classifier = nn.Sequential(
            nn.Linear(self.flattened_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, num_classes)
        )

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels):
        """Return the three layers of one 3x3 conv -> batch norm -> ReLU unit."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]

    def _probe_flattened_size(self, input_size):
        """Measure the feature-extractor output length with a dummy forward pass.

        The probe runs in eval mode: in training mode the all-zero dummy batch
        would be folded into every BatchNorm layer's running mean/variance and
        bump ``num_batches_tracked`` before any real data is seen.
        """
        was_training = self.features.training
        self.features.eval()
        try:
            with torch.no_grad():
                dummy_input = torch.zeros(1, 3, input_size, input_size)
                return self.features(dummy_input).flatten(1).size(1)
        finally:
            self.features.train(was_training)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

In [None]:
# Hyperparameter tuning
from sklearn.metrics import roc_auc_score
import numpy as np

def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs=10):
    """Train ``model`` and evaluate it on the validation set after every epoch.

    Args:
        model: Network producing two-class logits.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Object whose ``step(val_loss)`` is called once per epoch.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, val_losses, val_accuracies, val_aucs, best_model)``.
        The first four are per-epoch lists (losses are batch-averaged, accuracy
        is in percent). ``best_model`` is a snapshot of the state dict from the
        epoch with the highest validation AUC, or ``None`` if no epoch ran.
    """
    import copy  # local import: only needed for the best-weights snapshot

    train_losses, val_losses, val_accuracies, val_aucs = [], [], [], []
    best_auc, best_model = float('-inf'), None
    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        train_loss = running_loss / len(train_loader)
        train_losses.append(train_loss)

        # Validation
        model.eval()
        val_loss, correct, total, all_preds, all_labels = 0.0, 0, 0, [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
                # Class-1 probability is the score used for the AUC.
                all_preds.extend(torch.softmax(outputs, dim=1)[:, 1].cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total
        val_auc = roc_auc_score(all_labels, all_preds)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        val_aucs.append(val_auc)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%, Val AUC: {val_auc:.4f}")

        if val_auc > best_auc:
            best_auc = val_auc
            # state_dict() returns references to the live parameter tensors,
            # which later epochs keep mutating. Deep-copy so the snapshot
            # really holds this epoch's weights.
            best_model = copy.deepcopy(model.state_dict())

    return train_losses, val_losses, val_accuracies, val_aucs, best_model

In [None]:
# Try different hyperparameters
hyperparams = [
    {'num_filters': 32, 'dropout_rate': 0.5, 'lr': 0.001, 'batch_size': 64},
    {'num_filters': 64, 'dropout_rate': 0.3, 'lr': 0.0005, 'batch_size': 32},
]

results = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Up-weight the cancerous class (label 1) by the class-count ratio to offset
# the imbalance; counts are looked up by label value.
class_weights = torch.tensor(
    [1.0, class_counts.loc[0] / class_counts.loc[1]], dtype=torch.float
).to(device)

for params in hyperparams:
    print(f"\nTraining with params: {params}")
    # Same seed for every configuration so weight initialisation and shuffling
    # are reproducible and the runs are comparable.
    torch.manual_seed(42)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, num_workers=4)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False, num_workers=4)
    model = DeepCNN(num_filters=params['num_filters'], dropout_rate=params['dropout_rate']).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
    # `verbose` is deprecated on PyTorch LR schedulers, so it is not passed;
    # the final learning rate is printed after training instead.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
    train_losses, val_losses, val_accuracies, val_aucs, best_model = train_and_evaluate(
        model, train_loader, val_loader, criterion, optimizer, scheduler, device
    )
    print(f"Final learning rate: {optimizer.param_groups[0]['lr']:.2e}")
    results.append({
        'params': params,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'val_accuracies': val_accuracies,
        'val_aucs': val_aucs,
        'best_model': best_model
    })

## 6. Results & Analysis
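The plots below compare the training and validation loss, validation accuracy, and validation AUC per epoch for each hyperparameter configuration.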

In [None]:
# Each panel: (subplot slot, title, y-axis label, [(results key, legend prefix), ...])
metric_panels = [
    (1, 'Training and Validation Loss', 'Loss',
     [('train_losses', 'Train Loss'), ('val_losses', 'Val Loss')]),
    (2, 'Validation Accuracy', 'Accuracy (%)',
     [('val_accuracies', 'Val Accuracy')]),
    (3, 'Validation AUC', 'AUC',
     [('val_aucs', 'Val AUC')]),
]

plt.figure(figsize=(12, 8))
for run_number, run in enumerate(results, start=1):
    for slot, panel_title, y_label, curves in metric_panels:
        # Selecting the same slot again re-activates the existing axes, so the
        # curves of every configuration are overlaid in one panel.
        plt.subplot(2, 2, slot)
        for history_key, legend_prefix in curves:
            plt.plot(run[history_key], label=f"{legend_prefix} (Params {run_number})")
        plt.title(panel_title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
plt.tight_layout()
plt.show()

## 7. Submission

In [None]:
# Create submission file
from torch.utils.data import DataLoader

# Choose the configuration whose best epoch reached the highest validation AUC
# and rebuild its network with the weights stored for that run.
best_result = max(results, key=lambda x: max(x['val_aucs']))
best_model_state = best_result['best_model']
model = DeepCNN(num_filters=best_result['params']['num_filters'],
                dropout_rate=best_result['params']['dropout_rate']).to(device)
model.load_state_dict(best_model_state)
model.eval()

# Keep only .tif files (a stray non-image entry would otherwise become a bogus
# id and fail on open) and sort them so the row order is deterministic.
test_ids = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(test_images_dir)
    if file_name.endswith('.tif')
)
test_df = pd.DataFrame({'id': test_ids})
test_dataset = CancerDataset(test_df, test_images_dir, transform=val_transforms)
# shuffle=False keeps predictions aligned with the order of test_df['id'].
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

# Make predictions
predictions = []
image_ids = test_df['id'].values

with torch.no_grad():
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        probs = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()  # Probability of class 1
        predictions.extend(probs)

# Every test image must have exactly one prediction before writing the file.
assert len(predictions) == len(image_ids), "prediction count does not match number of test images"

submission_df = pd.DataFrame({'id': image_ids, 'label': predictions})

submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully!")