# Histopathologic Cancer Detection

In [1]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from re import findall
import tensorflow as tf
from tensorflow.keras import layers, models

In [6]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## I. Brief Introduction

* Goal:

    The task of the project is to predict if tumors are on small pathology images. A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue.

* Data description

    1. Data for training and testing is named with an image ID, and we have 220,000 images for training. There are 57,500 images we need to predict if tumors are in the image.

    2. 

## Load

In [2]:
Path.cwd()

PosixPath('/Users/ivanliu/Library/CloudStorage/OneDrive-GiantGroup/model training')

In [26]:
# Training images
train_images = Path.cwd() /'CNN/train/'
train_image_path = [i.as_posix() for i in train_images.glob('*.tif')]
print(f"Number of images: {len(train_image_path)}")

Number of images: 220025


In [21]:
train_image_path[0]

'/Users/ivanliu/Library/CloudStorage/OneDrive-GiantGroup/model training/CNN/train/f0c2a0b8ef3024f407fa97d852d49be0215cafe0.tif'

In [27]:
# Show the training data information
img = plt.imread(train_image_path[0])
print(f"Image data type: {type(img)}")
print(f"Image data shape: {img.shape}")

Image data type: <class 'numpy.ndarray'>
Image data shape: (96, 96, 3)


In [29]:
# Create a dataframe to store the training data
df = pd.DataFrame({'path' : train_image_path})
df['image_id'] = df['path'].apply(lambda x: findall(r'[^\/]+(?=\.)', x)[0])
df['data'] = df['path'].apply(lambda x: plt.imread(x))

In [30]:
df.head()

Unnamed: 0,path,image_id,data
0,/Users/ivanliu/Library/CloudStorage/OneDrive-G...,f0c2a0b8ef3024f407fa97d852d49be0215cafe0,"[[[212, 227, 224], [213, 225, 225], [215, 223,..."
1,/Users/ivanliu/Library/CloudStorage/OneDrive-G...,99ef485f205645918613cd04281098daa7c17819,"[[[193, 139, 175], [136, 84, 122], [125, 78, 1..."
2,/Users/ivanliu/Library/CloudStorage/OneDrive-G...,e2612e173abd0e8bb54a3c3db3f264b63d80bffb,"[[[149, 73, 148], [178, 107, 185], [237, 174, ..."
3,/Users/ivanliu/Library/CloudStorage/OneDrive-G...,6d1bb57c0606f05dbd75f90a8d9e21a57e1267e0,"[[[134, 49, 152], [184, 101, 189], [223, 144, ..."
4,/Users/ivanliu/Library/CloudStorage/OneDrive-G...,9c043ab2adadfeb758c71d21432fccd3e43565c0,"[[[119, 41, 125], [181, 108, 187], [234, 168, ..."


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220025 entries, 0 to 220024
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   path      220025 non-null  object
 1   image_id  220025 non-null  object
 2   data      220025 non-null  object
dtypes: object(3)
memory usage: 5.0+ MB


In [56]:
df['data'][0].reshape(-1, 96, 96, 3).shape

(1, 96, 96, 3)

In [62]:
# Convert the training images as tensor
img_array = df['data'].apply(lambda x: x.reshape(-1, 96, 96, 3)).values
# Concatenate all images
train_images = np.concatenate(img_array)
# Data info
print(f"Training data shape: {train_images.shape}")

Training data shape: (220025, 96, 96, 3)
