# Histopathologic Cancer Detection

## Martin Ozaeta

https://github.com/MOzaeta96/Week-3-CNN-project.git

    For the Kaggle competition, we were tasked with identifying metastic cancer in small image patches from pathology scans.
This was accomplished via a deduplicated version of the PatchCamelyon or PCam dataset. For out initial work we needed to set everything up for our model:

### Importing Libraries:

    Our notebook thankfully has tensorflow, keras, opencv-python and matplotlib installed so we can move forward with importing the libraries we will be using:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam

    The libraries will allow us to manipulate data, process the images that are found in the data set, and create our model that will help with identifying the cancer cells.

### Loading Data:

In [44]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import os

def load_image(image_path, label):
    try:
        image = tf.io.read_file(image_path)
        image = tf.image.decode_image(image, channels=3)
        
        # Check if image is empty
        if tf.shape(image) == tf.constant([0]):
            raise ValueError(f"Image at path {image_path} is empty or could not be read.")
        
        image = tf.image.resize(image, [128, 128])
        image = tf.cast(image, tf.float32) / 255.0
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        image = tf.zeros([128, 128, 3])  # Create a dummy image if there's an error
    return image, label

def create_dataset_from_dataframe(df, base_path, batch_size=32):
    image_paths = df['id'].apply(lambda x: os.path.join(base_path, x)).values
    labels = df['label'].values
    
    image_paths = tf.convert_to_tensor(image_paths, dtype=tf.string)
    labels = tf.convert_to_tensor(labels, dtype=tf.int64)
    
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    dataset = dataset.map(lambda x, y: load_image(x, y), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    
    return dataset

# Load the training labels CSV
train_labels_df = pd.read_csv('C:\\Users\\marti\\Downloads\\histopathologic-cancer-detection\\train_labels.csv')

# Update the file extension and path if necessary
train_labels_df['id'] = train_labels_df['id'].apply(lambda x: x + '.tif')

# Split the data into training and validation sets
train_df, val_df = train_test_split(train_labels_df, test_size=0.2, stratify=train_labels_df['label'], random_state=42)

# Base path to the training images
train_base_path = 'C:\\Users\\marti\\Downloads\\histopathologic-cancer-detection\\train'

# Create TensorFlow datasets
batch_size = 32
train_dataset = create_dataset_from_dataframe(train_df, train_base_path, batch_size)
val_dataset = create_dataset_from_dataframe(val_df, train_base_path, batch_size)


Error loading image Tensor("args_0:0", shape=(), dtype=string): Image at path Tensor("args_0:0", shape=(), dtype=string) is empty or could not be read.
Error loading image Tensor("args_0:0", shape=(), dtype=string): Image at path Tensor("args_0:0", shape=(), dtype=string) is empty or could not be read.


### Creating the model and augmenting data:

In [46]:
# Define your model creation function
def create_model(input_shape, num_classes):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

input_shape = (128, 128, 3)
num_classes = len(train_labels_df['label'].unique())  # Update with the number of classes

model = create_model(input_shape, num_classes)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


from tensorflow.keras.preprocessing.image import ImageDataGenerator

def create_augmentation_pipeline():
    return ImageDataGenerator(
        rescale=1./255,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )


### Training and Testing the Data:

In [47]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

Epoch 1/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m770s[0m 140ms/step - accuracy: 0.5952 - loss: 0.6770 - val_accuracy: 0.5950 - val_loss: 0.6750
Epoch 2/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m745s[0m 135ms/step - accuracy: 0.5952 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss: 0.6750
Epoch 3/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m715s[0m 130ms/step - accuracy: 0.5952 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss: 0.6750
Epoch 4/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m718s[0m 131ms/step - accuracy: 0.5952 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss: 0.6750
Epoch 5/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m721s[0m 131ms/step - accuracy: 0.5952 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss: 0.6750
Epoch 6/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m714s[0m 130ms/step - accuracy: 0.5952 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss:

In [51]:
test_base_path = 'C:\\Users\\marti\\Downloads\\histopathologic-cancer-detection\\test'  # Update with the actual path

# List all files in the test directory
test_image_files = [f for f in os.listdir(test_base_path) if f.endswith('.tif')]

# Create DataFrame
test_df = pd.DataFrame({
    'id': test_image_files
})

# Optional: Add a placeholder label column if needed
test_df['label'] = -1

def create_test_dataset(df, base_path, batch_size=32):
    image_paths = df['id'].apply(lambda x: os.path.join(base_path, x)).values
    dataset = tf.data.Dataset.from_tensor_slices(image_paths)
    dataset = dataset.map(lambda x: load_image(x, -1), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    return dataset

test_dataset = create_test_dataset(test_df, test_base_path, batch_size=32)


Error loading image Tensor("args_0:0", shape=(), dtype=string): Image at path Tensor("args_0:0", shape=(), dtype=string) is empty or could not be read.


### Create Predictions and prepare data for submission to Kaggle:

In [52]:
# Make predictions on the test dataset
predictions = model.predict(test_dataset)
predicted_classes = predictions.argmax(axis=-1)

# Add predictions to the DataFrame
test_df['label'] = predicted_classes

[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 25ms/step


In [53]:
# Save submission file
test_df.to_csv('submission.csv', index=False)

# Compress into ZIP file
import zipfile

with zipfile.ZipFile('submission.zip', 'w') as zipf:
    zipf.write('submission.csv')