# <font color='blue'>**Histopathologic Cancer Detection**</font>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'histopathologic-cancer-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F11848%2F862157%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240428%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240428T080259Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D47d23f59ecb85b2cc2d1e3008e09aa59663595c5e0c7aaedd05e8f35ec28d259add52453597a723e1ee382a7077c781893a25d4d0e5c195249be9ce6b9e5ef726aecbac4bba9db95d8a4eb9c4b138a13ab27648abda811ba841134744adb3504725927af1f39da1d5f66453bcd873612863545be8ca35c60ac9f8130f9a2d7e79175744694b559a9c3b7dbc264e1e0d5048fcf4e06be8e96102d869c49c08f35e295c7846f7a22b90a5b816a27927dd292769e57e576fe77b78635298363d0b252ea253e1a8d9478a740bd7e7e972b92a27053f689c35b4364e65e2b28d628a9c2fb47018d2c7de0a2eaa148eb908f9a99b9a3b64904aaa2614b12ae43bd3961'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


<font color='blue'>**Introduction:**</font>

Histopathologic cancer detection is a crucial task that involves identifying metastatic cancer in small image patches taken from larger digital pathology scans. The dataset used in this project consists of labeled images, where each label indicates the presence (1) or absence (0) of tumor tissue in the corresponding image.

# <font color='blue'>**1. Data Exploration and Understanding**</font>

**In this phase, I delved into the dataset to understand the nature of the images and their labels. I visualized samples of images with and without tumor tissues, explored the distribution of labels, and checked the quality of the images in terms of resolution and contrast.**

## <font color='green'>1.1 Image Visualization</font>

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os

# Load train labels
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

# Display a subset of images with and without tumor tissue
def display_images(samples, title):
    fig, axes = plt.subplots(1, len(samples), figsize=(20, 5))
    for img_path, ax in zip(samples, axes):
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ax.imshow(img)
        ax.axis('off')
    plt.suptitle(title, fontsize=16)
    plt.show()

# Positive samples (with tumor)
positive_samples = train_labels[train_labels['label'] == 1]['id'].head(5).apply(lambda x: f'../input/histopathologic-cancer-detection/train/{x}.tif').tolist()
display_images(positive_samples, "Images with Tumor Tissue")

# Negative samples (without tumor)
negative_samples = train_labels[train_labels['label'] == 0]['id'].head(5).apply(lambda x: f'../input/histopathologic-cancer-detection/train/{x}.tif').tolist()
display_images(negative_samples, "Images without Tumor Tissue")

## <font color='green'>1.2 Label Distribution</font>

In [None]:
# Plotting the distribution of labels
plt.figure(figsize=(8, 5))
train_labels['label'].value_counts().plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Distribution of Tumor (1) and No Tumor (0) Labels')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

## <font color='green'>1.3 Image Quality Check</font>

In [None]:
# Checking the resolution and contrast of a random subset of images
image_paths = train_labels['id'].sample(5).apply(lambda x: f'../input/histopathologic-cancer-detection/train/{x}.tif').tolist()

for img_path in image_paths:
    img = cv2.imread(img_path)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Displaying image details
    print(f"Image: {img_path.split('/')[-1]}")
    print(f"Resolution: {img.shape[0]}x{img.shape[1]}")
    print(f"Contrast (max-min pixel values): {np.max(img_rgb) - np.min(img_rgb)}\n")

# <font color='blue'>2. Data Preprocessing</font>

**The images were resized to a consistent shape of 96x96 pixels. I normalized the pixel values to fall between 0 and 1. The dataset was split into training, validation, and test sets. Additionally, data augmentation techniques were applied to increase the diversity of the training data and prevent overfitting.**

## <font color='green'>2.1 Image Resizing</font>

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Define the desired dimensions
IMG_WIDTH, IMG_HEIGHT = 96, 96

def resize_image(img_path, target_width=IMG_WIDTH, target_height=IMG_HEIGHT):
    img = load_img(img_path, target_size=(target_width, target_height))
    return img_to_array(img)

# Example usage
sample_path = train_labels['id'].iloc[0]
resized_img = resize_image(f'../input/histopathologic-cancer-detection/train/{sample_path}.tif')
print(f"Resized Image Shape: {resized_img.shape}")

## <font color='green'>2.2 Normalization</font>

In [None]:
def normalize_image(img):
    return img / 255.0

# Example usage
normalized_img = normalize_image(resized_img)
print(f"Pixel Range after Normalization: {np.min(normalized_img)} - {np.max(normalized_img)}")

## <font color='green'>2.3 Data Splitting</font>

In [None]:
from sklearn.model_selection import train_test_split

# Splitting the data into training, validation, and test sets (80%, 10%, 10%)
train_data, temp_data, train_labels, temp_labels = train_test_split(train_labels['id'].values, train_labels['label'].values, test_size=0.2, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(temp_data, temp_labels, test_size=0.5, random_state=42)

## <font color='green'>2.4 Data Augmentation</font>

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, load_img, img_to_array
import matplotlib.pyplot as plt

# Data augmentation configuration
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Example: Display some augmented images
img = load_img(f'../input/histopathologic-cancer-detection/train/{train_data[0]}.tif')
x = img_to_array(img)
x = x.reshape((1,) + x.shape)

fig, axes = plt.subplots(1, 5, figsize=(20, 5))
for i, batch in enumerate(datagen.flow(x, batch_size=1)):
    axes[i].imshow(array_to_img(batch[0]))
    axes[i].axis('off')
    if i == 4: # Stop after displaying 5 images
        break
plt.show()

# <font color='blue'>3. Model Development</font>

**I designed a convolutional neural network (CNN) with multiple layers to detect the presence of tumor tissue in the histopathologic images. The model was compiled using the Adam optimizer and binary cross-entropy loss, suitable for a binary classification task.**

## <font color='green'>3.1 Model Architecture</font>

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

model = Sequential()

# Convolutional layers
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Fully connected layers
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

model.summary()

## <font color='green'>3.2 Model Compilation</font>

In [None]:
from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

## <font color='green'>3.3 Model Training</font>

In [None]:
train_labels_df = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv')
train_labels_df['label'] = train_labels_df['label'].astype(str)  # Convert label column to string

In [None]:
# Add the .tif extension to the 'id' column for correct file referencing
train_labels_df['id'] = train_labels_df['id'].apply(lambda x: f"{x}.tif")

# Preparing data generators
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)  # Normalize images

batch_size = 32
train_steps = 8000 // batch_size  # 8000 images for training
val_steps = 2000 // batch_size    # 2000 images for validation

train_gen = train_datagen.flow_from_dataframe(
    dataframe=train_labels_df.head(10000),
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    class_mode='binary',
    batch_size=batch_size,
    subset='training'
)

val_gen = train_datagen.flow_from_dataframe(
    dataframe=train_labels_df.head(10000),
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    class_mode='binary',
    batch_size=batch_size,
    subset='validation'
)

# Training the model
history = model.fit(
    train_gen,
    steps_per_epoch=train_steps,
    validation_data=val_gen,
    validation_steps=val_steps,
    epochs=10
)

# <font color='blue'>4. Model Evaluation</font>

**The model's performance was evaluated using various metrics such as accuracy, precision, recall, F1 score, and ROC-AUC. I also checked for signs of overfitting by comparing the training and validation loss and accuracy. Error analysis was conducted by visualizing misclassified images.**

## <font color='green'>4.1 Performance Metrics</font>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# Adjust the val_steps
val_steps = np.ceil(len(val_gen.classes) / batch_size)

# Predict classes
val_predictions = model.predict(val_gen, steps=val_steps)
val_pred_classes = (val_predictions > 0.5).astype(int).flatten()

# True labels
true_labels = val_gen.classes

# Ensure the lengths match
val_pred_classes = val_pred_classes[:len(true_labels)]

# Calculate metrics
accuracy = accuracy_score(true_labels, val_pred_classes)
precision = precision_score(true_labels, val_pred_classes)
recall = recall_score(true_labels, val_pred_classes)
f1 = f1_score(true_labels, val_pred_classes)
roc_auc = roc_auc_score(true_labels, val_predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

## <font color='green'>4.2 Overfitting Check</font>

In [None]:
# Plotting training and validation loss
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Training and Validation Loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')

plt.tight_layout()
plt.show()

## <font color='green'>4.3 Error Analysis</font>

In [None]:
# Extracting misclassified image indices
misclassified_idx = np.where(val_pred_classes != true_labels)[0]

# Displaying a subset of misclassified images
sample_misclassified = np.random.choice(misclassified_idx, 5)

plt.figure(figsize=(20, 5))
for i, idx in enumerate(sample_misclassified):
    img = load_img(val_gen.filepaths[idx])
    plt.subplot(1, 5, i+1)
    plt.imshow(img)
    plt.title(f"True: {true_labels[idx]}, Pred: {val_pred_classes[idx]}")
    plt.axis('off')
plt.suptitle("Misclassified Images", fontsize=16)
plt.show()

# <font color='blue'>5. Fine-tuning and Optimization</font>

**Hyperparameter tuning was performed to find the optimal configuration for the model. I also explored transfer learning by leveraging the VGG16 model pre-trained on ImageNet. Regularization techniques were applied to prevent overfitting.**

## <font color='green'>5.1 Hyperparameter Tuning</font>

In [None]:
!pip install keras-tuner

from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

In [None]:
def build_model(hp):
    model = Sequential()

    # Convolutional layers
    model.add(Conv2D(hp.Int('input_units', min_value=32, max_value=64, step=32), (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    for i in range(hp.Int('n_layers', 1, 3)):  # adding between 1 and 3 convolutional layers
        model.add(Conv2D(hp.Int(f'conv_{i}_units', min_value=32, max_value=64, step=32), (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # Fully connected layers
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    optimizer = Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4]))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,  # reduced number of model configurations to test
    executions_per_trial=1,
    directory='output',
    project_name='HistoPathologicCancerDetection'
)

# Train for fewer epochs during hyperparameter tuning
tuner.search(train_gen, epochs=5, validation_data=val_gen)

## <font color='green'>5.2 Transfer Learning</font>

In [None]:
from tensorflow.keras.applications import VGG16

# Load the VGG16 model with weights pre-trained on ImageNet
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Create a custom model on top
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

## <font color='green'>5.3 Regularization</font>

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.regularizers import l2

# Load the VGG16 model with weights pre-trained on ImageNet
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

model = Sequential()
model.add(base_model)

# Convolutional layers
# Removed pooling layers and adjusted convolutional layers
model.add(Conv2D(32, (3, 3), activation='relu', kernel_regularizer=l2(0.01), padding='same'))
model.add(Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.01), padding='same'))
model.add(Conv2D(128, (3, 3), activation='relu', kernel_regularizer=l2(0.01), padding='same'))

# Fully connected layers
model.add(Flatten())
model.add(Dense(512, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# <font color='blue'>6. Interpretation and Communication of Results</font>

**Visualizations were created to depict the training progress, showing how the loss and accuracy evolved over epochs. I also utilized Grad-CAM to provide insights into which regions of the images the model focuses on when making predictions.**

## <font color='green'>6.1 Visualization of Training Progress</font>

In [None]:
# Plotting training and validation loss and accuracy
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Training and Validation Loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')

plt.tight_layout()
plt.show()

# <font color='blue'>Conclusion:</font>
After rigorous training, evaluation, and optimization, our model achieved an accuracy of **0.7944%**. While the model performs well, there's always room for improvement. Future work could explore deeper architectures, ensemble methods, or more advanced augmentation techniques.

# <font color='blue'>References:</font>

Histopathologic Cancer Detection Dataset