# Setting up the environment

In [None]:
# Mount Google Drive under /content/drive in the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from google.colab import files

# Upload your Kaggle API token JSON file.
# The setup cell that follows copies it by the name kaggle.json, so upload it under that name.
# NOTE(review): the name `files` is later rebound to a directory listing,
# which hides this module from that point on.
uploaded = files.upload()

In [None]:
# Copy the uploaded token into ~/.kaggle/ and make it readable/writable
# by the owner only (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the breast-histopathology-images dataset with the Kaggle CLI.
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

In [None]:
# List the working directory, e.g. to confirm the archive was downloaded.
!ls

In [None]:
# Quietly (-q) extract the downloaded archive into ./dataset.
!unzip -q breast-histopathology-images.zip -d dataset

## Importing modules

In [None]:
# Basic libraries
import numpy as np
import random
from os import listdir
from PIL import Image

# Preprocessing/Visualization
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from keras.utils import to_categorical

# Model creation
import tensorflow as tf
import keras
from keras import layers

# Evaluation
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Defining directories

# Root folder of the extracted dataset; each entry in it is treated as one patient id.
base_path = "dataset/IDC_regular_ps50_idx5/"

# os.listdir returns entries in arbitrary order. Sorting makes the patient
# order reproducible across runs and machines, which matters because the
# dataset is later truncated to its first part.
# NOTE: this rebinds `files`, hiding the google.colab `files` module
# imported during setup.
files = sorted(listdir(base_path))

# Length of the listing is usually the no of patients

print(f"Total no of patients: {len(files)}")

In [None]:
# Saving the data source into an array
#
# Every image becomes one [image_path, label] entry. The directory layout
# walked here is <base_path>/<patient_id>/<label>/<image file>, with label
# being 0 or 1.

dataset = []

for patient_id in files:
    # The patient folder does not depend on the class, so build it once.
    patientid_path = base_path + patient_id
    for c in (0, 1):
        class_path = patientid_path + "/" + str(c) + "/"
        # Append all images of this patient/class in one go.
        dataset.extend([class_path + picture, c] for picture in listdir(class_path))

In [None]:
# Report how many [image_path, label] entries were collected.
print(f"No. of images: {len(dataset)}")

In [None]:
# Peek at the first entry of the dataset: an [image_path, label] pair.

dataset[0]

In [None]:
# Limit the dataset due to performance concerns
#
# Floor division yields the slice bound directly as an int, instead of
# producing a float that then has to be cast back.
# NOTE: this keeps the FIRST eighth of the entries in the order they were
# collected (patient by patient), not a random eighth of the data.

limit = len(dataset) // 8
dataset = dataset[:limit]

# Show the reduced size as the cell output.
len(dataset)

In [None]:
# Data Visualization

# Take the first [path, label] entry and open its image with PIL
image_path, label = dataset[0]
image = Image.open(image_path)

# Dimensions of the image, reported as PIL's (width, height) tuple
print(f"Width and height respectively: {image.size} Pixels")

In [None]:
# Render the previously opened first image in a small 3x3-inch figure

plt.figure(figsize=(3, 3))
plt.imshow(image)
plt.gca().set_title("First Image Sample")
plt.show()

In [None]:
# Separate the data by class: label 0 goes to the BENIGN lists,
# label 1 to the MALIGNANT lists. Paths and labels are kept in parallel lists.

BENIGNdata, MALIGNANTdata = [], []
BENIGNlabels, MALIGNANTlabels = [], []

for img_path, img_label in dataset:
    if img_label == 0:
        BENIGNdata.append(img_path)
        BENIGNlabels.append(img_label)
    elif img_label == 1:
        MALIGNANTdata.append(img_path)
        MALIGNANTlabels.append(img_label)

In [None]:
# Get a random sample of up to 50 image paths from each class.
#
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the class size. The plotting
# grids that consume these samples already handle fewer than 50 images.

BENIGNsample = random.sample(BENIGNdata, min(50, len(BENIGNdata)))
MALIGNANTsample = random.sample(MALIGNANTdata, min(50, len(MALIGNANTdata)))

In [None]:
# Lay out a 5 x 10 grid of axes for the benign sample
fig, ax = plt.subplots(5, 10, figsize=(20, 10))

# Walk the grid in row-major order, so idx equals column + 10 * row
for idx, cell in enumerate(ax.flat):
    if idx >= len(BENIGNsample):
        # Not enough images to fill the grid: remove the empty subplot
        fig.delaxes(cell)
        continue

    # Open the patch with PIL and draw it on this subplot without grid lines
    image = Image.open(BENIGNsample[idx])
    cell.imshow(image)
    cell.grid(False)

# Adjust spacing between subplots
plt.tight_layout()

# Show the grid of benign patches
plt.show()

In [None]:
# Lay out a 5 x 10 grid of axes for the malignant sample
fig, ax = plt.subplots(5, 10, figsize=(20, 10))

# Walk the grid in row-major order, so idx equals column + 10 * row
for idx, cell in enumerate(ax.flat):
    if idx >= len(MALIGNANTsample):
        # Not enough images to fill the grid: remove the empty subplot
        fig.delaxes(cell)
        continue

    # Open the patch with PIL and draw it on this subplot without grid lines
    image = Image.open(MALIGNANTsample[idx])
    cell.imshow(image)
    cell.grid(False)

# Adjust spacing between subplots
plt.tight_layout()

# Show the grid of malignant patches
plt.show()

In [None]:
# Class distribution: per-class counts and each class's share of all samples

labels = ["BENIGN", "MALIGNANT"]
colors = ["green", "orange"]

# Counts are listed in the same order as `labels`
counts = [len(class_paths) for class_paths in (BENIGNdata, MALIGNANTdata)]

total_samples = sum(counts)
percentages = [(class_count / total_samples) * 100 for class_count in counts]

In [None]:
# Bar chart of the number of images per class
plt.figure(figsize=(8, 6))
plt.bar(labels, counts, color=colors)
plt.gca().set(xlabel="Class", ylabel="Count", title="Class Distribution")
plt.show()