# Cancer Tissue Classification using ResNet50
This project aims to classify cancerous and non-cancerous tissue images using the ResNet50 deep learning architecture. We will train the model on the PCam dataset, evaluate its performance, and deploy the model using Gradio.

## Loading the CSV File

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the train labels
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

# Display the first few rows of the dataset
train_labels.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [2]:
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220025 entries, 0 to 220024
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      220025 non-null  object
 1   label   220025 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.4+ MB


In [3]:
# Check the distribution of the labels
train_labels['label'].value_counts()

label
0    130908
1     89117
Name: count, dtype: int64

In [4]:
train_labels.isnull().sum()

id       0
label    0
dtype: int64

In [5]:
import plotly.express as px

# Visualize the distribution of labels using Plotly
fig = px.histogram(
    train_labels,
    x='label',
    title='Distribution of Labels',
    nbins=2,  # Since it's a binary classification
    labels={'label': 'Cancerous (1) vs Non-Cancerous (0)'},
    color='label',
    color_discrete_map={0: 'blue', 1: 'red'}
)

# Update layout for better aesthetics
fig.update_layout(
    xaxis_title="Label",
    yaxis_title="Count",
    bargap=0.2
)

fig.show()

In [6]:
# Calculate the distribution
label_distribution = train_labels['label'].value_counts().reset_index()
label_distribution.columns = ['Label', 'Count']

# Create a pie chart
fig = px.pie(
    label_distribution,
    names='Label',
    values='Count',
    title='Distribution of Cancerous vs Non-Cancerous Images',
    color='Label',
    color_discrete_map={0: 'blue', 1: 'red'}
)

# Update layout for better aesthetics
fig.update_traces(textinfo='percent+label')
fig.update_layout(
    title_font_size=20,
    title_x=0.5,
    legend_title_text='Labels',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

fig.show()

## Data Cleaning of CSV file

In [7]:
train_labels['label'] = train_labels['label'].astype(str)

There are many files which are mentioned in the csv file but are absent in the train folder directory. So, I decided to remove those IDs who are absent in the train directory and index those IDs accordingly.

In [8]:
# Load train_labels.csv
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

# Define paths to the image directories
train_dir = '/kaggle/input/histopathologic-cancer-detection/train/'
test_dir = '/kaggle/input/histopathologic-cancer-detection/test/'

# Get list of all files in the directories
train_files = set(os.listdir(train_dir))
test_files = set(os.listdir(test_dir))

# Function to get filenames without additional extensions
def clean_filename(filename):
    return filename.rstrip('.tif') + '.tif'

# Get list of filenames from train_labels
csv_files = set(train_labels['id'].apply(clean_filename))

# Find missing files in both directories
missing_in_both = csv_files - (train_files | test_files)

# Remove rows corresponding to files missing in both directories
train_labels_cleaned = train_labels[~train_labels['id'].apply(lambda x: clean_filename(x) in missing_in_both)]

# Reset index
train_labels_cleaned = train_labels_cleaned.reset_index(drop=True)

# Add '.tif' suffix to filenames in 'id' column if not already present
train_labels_cleaned['id'] = train_labels_cleaned['id'].apply(lambda x: x if x.endswith('.tif') else f"{x}.tif")

In [9]:
# Verify the changes
train_labels_cleaned

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835.tif,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77.tif,1
2,755db6279dae599ebb4d39a9123cce439965282d.tif,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08.tif,0
4,068aba587a4950175d04c680d38943fd488d6a9d.tif,0
...,...,...
206210,53e9aa9d46e720bf3c6a7528d1fca3ba6e2e49f6.tif,0
206211,d4b854fe38b07fe2831ad73892b3cec877689576.tif,1
206212,3d046cead1a2a5cbe00b2b4847cfb7ba7cf5fe75.tif,0
206213,f129691c13433f66e1e0671ff1fe80944816f5a2.tif,0


In [10]:
# Save cleaned train_labels to a new CSV file
train_labels_cleaned.to_csv('/kaggle/working/train_labels_cleaned.csv', index=False)

## Visualizations of the CSV file

In [11]:
import plotly.express as px

# Load the cleaned train_labels.csv
train_labels_cleaned = pd.read_csv('/kaggle/working/train_labels_cleaned.csv')

# Count the occurrences of each label
label_counts = train_labels_cleaned['label'].value_counts().reset_index()
label_counts.columns = ['label', 'count']

# Create a bar chart for label counts
bar_chart = px.bar(label_counts, x='label', y='count', title='Count of Labels',
                   labels={'label': 'Label', 'count': 'Count'},
                   text='count')
bar_chart.show()

# Calculate the percentage of each label
label_counts['percentage'] = (label_counts['count'] / label_counts['count'].sum()) * 100

# Create a pie chart for label percentages
pie_chart = px.pie(label_counts, names='label', values='percentage', title='Percentage of Labels',
                   labels={'label': 'Label', 'percentage': 'Percentage'})
pie_chart.show()

## Setting up the environment

In [12]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Verify GPU is being used
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


## Load and Prepare Data

In [13]:
# Ensure 'label' column values are strings
train_labels_cleaned['label'] = train_labels_cleaned['label'].astype(str)

# Define the path to the image directory
train_image_dir = '/kaggle/input/histopathologic-cancer-detection/train/'

# Define the ImageDataGenerator with augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,  # 20% for validation
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    fill_mode='nearest'
)

# Create training and validation generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_labels_cleaned,
    directory=train_image_dir,
    x_col='id',
    y_col='label',
    target_size=(224, 224),  # ResNet50 input size
    batch_size=32,
    class_mode='binary',
    subset='training'
)

validation_generator = train_datagen.flow_from_dataframe(
    dataframe=train_labels_cleaned,
    directory=train_image_dir,
    x_col='id',
    y_col='label',
    target_size=(224, 224),  # ResNet50 input size
    batch_size=32,
    class_mode='binary',
    subset='validation'
)

Found 164972 validated image filenames belonging to 2 classes.
Found 41243 validated image filenames belonging to 2 classes.


## Build the ResNet50 model

In [14]:
# Load the ResNet50 model with pre-trained weights from ImageNet
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers on top of the base model
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)  # Binary classification

# Define the model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


## Adding Early Stopping

In [15]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_accuracy',  # Metric to monitor
    patience=3,              # Number of epochs with no improvement after which training will be stopped
    verbose=1,               # Verbosity mode
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored metric
)

## Train the Model

In [16]:
# Train the model with EarlyStopping
history = model.fit(
    train_generator,
    epochs=10,  # Adjust the number of epochs as needed
    validation_data=validation_generator,
    verbose=1,
    callbacks=[early_stopping]  # Include the EarlyStopping callback
)

Epoch 1/10



Your `PyDataset` class should call `super().__init__(**kwargs)` in its constructor. `**kwargs` can include `workers`, `use_multiprocessing`, `max_queue_size`. Do not pass these arguments to `fit()`, as they will be ignored.

I0000 00:00:1725613246.677510     109 service.cc:145] XLA service 0x7d5b5001bd80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1725613246.677575     109 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m   2/5156[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:22[0m 51ms/step - accuracy: 0.3828 - loss: 0.7210   

I0000 00:00:1725613251.610376     109 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5156/5156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3308s[0m 639ms/step - accuracy: 0.7086 - loss: 0.5589 - val_accuracy: 0.7538 - val_loss: 0.5107
Epoch 2/10
[1m5156/5156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2461s[0m 477ms/step - accuracy: 0.7532 - loss: 0.5066 - val_accuracy: 0.7692 - val_loss: 0.4896
Epoch 3/10
[1m5156/5156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2453s[0m 475ms/step - accuracy: 0.7604 - loss: 0.4963 - val_accuracy: 0.7681 - val_loss: 0.4865
Epoch 4/10
[1m5156/5156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2456s[0m 476ms/step - accuracy: 0.7673 - loss: 0.4887 - val_accuracy: 0.7648 - val_loss: 0.4858
Epoch 5/10
[1m5156/5156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2466s[0m 478ms/step - accuracy: 0.7661 - loss: 0.4888 - val_accuracy: 0.7555 - val_loss: 0.4956
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.


##  Evaluate the Model

In [17]:
# Evaluate the model
val_loss, val_accuracy = model.evaluate(validation_generator)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 391ms/step - accuracy: 0.7696 - loss: 0.4894
Validation Loss: 0.48911383748054504
Validation Accuracy: 0.7700700759887695


## Save the Model

In [18]:
# Save the model
model.save('/kaggle/working/resnet50_histopathology_model.h5')