This project uses the WSIROI dataset, which is part of the TIGER dataset (https://tiger.grand-challenge.org/).

In [None]:
# Mount Google Drive at /content/drive so this Colab notebook can read files stored on it
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import everything that's needed for the data exploration
from matplotlib import pyplot as plt 
from matplotlib.patches import Rectangle, Patch
from matplotlib.colors import to_rgb
from PIL import Image
from statistics import mean

import cv2
import json
import matplotlib.pyplot as plt
import numpy as np
import statistics

In [None]:
# Load the JSON file (tiger-coco.json) that holds the image records and the
# bounding box annotations. The context manager closes the file handle as
# soon as parsing is finished instead of leaving it open for the session.
with open("/content/drive/Shared drives/TILs/dataset/TIGER/roi-level-annotations/tissue-cells/tiger-coco.json", encoding="utf-8") as annotation_file:
  tissue_cells_json = json.load(annotation_file)

# Data exploration

In [None]:
# Report how many image records the annotation file lists
num_images = len(tissue_cells_json['images'])
print(f"There are {num_images} images.")

In [None]:
# Report how many bounding box annotations the annotation file lists
num_annotations = len(tissue_cells_json['annotations'])
print(f"There are {num_annotations} annotations.")

In [None]:
# Count how many distinct images carry at least one bounding box annotation.
# NOTE: the list is bound to `id`, which shadows the builtin of the same name;
# the name is left unchanged in case other cells read this list.
id = [annotation['image_id'] for annotation in tissue_cells_json['annotations']]
num_annotated_images = len(set(id))
print(f"{num_annotated_images} images have annotations.")

In [None]:
# Print some statistics about the number of annotations per image
from collections import Counter

# Tally the annotations per image id in a single pass over the annotations,
# rather than rescanning the full annotation list for every image.
annotation_counts = Counter(
    annotation['image_id'] for annotation in tissue_cells_json['annotations']
)

# Look up each image by its own id so the result does not rely on the ids
# being exactly 0..N-1; images without any annotation get a count of 0.
ann_per_img = [annotation_counts[image['id']] for image in tissue_cells_json['images']]

# The builtins min()/max() are called directly; no variable named `min` or
# `max` is created, so the builtins stay usable in the rest of the notebook.
print("=== Number of annotations per image ===")
print("Mean:", statistics.mean(ann_per_img), "\nMedian:", statistics.median(ann_per_img), "\nStandard deviation:", statistics.stdev(ann_per_img), "\nMax:", max(ann_per_img), "\nMin:", min(ann_per_img))

In [None]:
# Summarise the image dimensions and split them around the 800 threshold
heights, widths = [], []
# Values greater than 800 go to the "_g" lists, all others to the "_k" lists
hi_g, hi_k = [], []
wi_g, wi_k = [], []
# Running extremes; the starting values act as sentinels for the comparisons below
min_height, min_width, max_height, max_width = 100000, 100000, 0, 0

for image in tissue_cells_json['images']:
  img_height, img_width = image['height'], image['width']
  heights.append(img_height)
  widths.append(img_width)

  # Update the smallest / largest dimension seen so far
  min_height = img_height if img_height < min_height else min_height
  max_height = img_height if img_height > max_height else max_height
  min_width = img_width if img_width < min_width else min_width
  max_width = img_width if img_width > max_width else max_width

  # Bucket each dimension by the 800 threshold
  (wi_g if img_width > 800 else wi_k).append(img_width)
  (hi_g if img_height > 800 else hi_k).append(img_height)

print("=== Size of the images ===")
print(
    "Mean height:", statistics.mean(heights),
    ", Mean width:", statistics.mean(widths),
    "\nMedian height:", statistics.median(heights),
    ", Median width:", statistics.median(widths),
    "\nStandard deviation height:", statistics.stdev(heights),
    ", Standard deviation width:", statistics.stdev(widths),
    "\nMax height:", max_height,
    ", Max width:", max_width,
    "\nMin height:", min_height,
    ", Min width:", min_width,
)

# Unlabelled output: mean of the "_k" bucket followed by the mean of the "_g" bucket
print(statistics.mean(wi_k), statistics.mean(wi_g))
print(statistics.mean(hi_k), statistics.mean(hi_g))

In [None]:
# Plot a histogram of the image heights using bin edges 0, 100, ..., 1500
height_bin_edges = list(range(0, 1501, 100))
plt.hist(heights, bins=height_bin_edges)
plt.title("Histogram height")
plt.show()

In [None]:
# Print the mean height of the large (> 800) and small (<= 800) images.
# hi_k holds every height that is not greater than 800, so its label reads
# "<= 800" to match how the values were bucketed.
print("The mean height of the images with a height > 800:", statistics.mean(hi_g), "\nThe mean height of the images with a height <= 800:", statistics.mean(hi_k))

In [None]:
# Plot a histogram of the image widths using bin edges 0, 100, ..., 1500
width_bin_edges = list(range(0, 1501, 100))
plt.hist(widths, bins=width_bin_edges)
plt.title("Histogram width")
plt.show()

In [None]:
# Print the mean width of the large (> 800) and small (<= 800) images.
# wi_k holds every width that is not greater than 800, so its label reads
# "<= 800" to match how the values were bucketed.
print("The mean width of the images with a width > 800:", statistics.mean(wi_g), "\nThe mean width of the images with a width <= 800:", statistics.mean(wi_k))

In [None]:
# Show an image from the dataset with a given id
image_id = 12  # named image_id so the builtin id() is not shadowed

# Look up the file name of the requested image; fail loudly when the id is
# unknown instead of silently reusing a path left over from an earlier cell.
img = next(
    (image['file_name'] for image in tissue_cells_json['images'] if image['id'] == image_id),
    None,
)
if img is None:
  raise ValueError(f"No image with id {image_id} in the annotation file")
path = "/content/drive/Shared drives/TILs/dataset/TIGER/roi-level-annotations/tissue-cells/" + img

# cv2.imread returns None (it does not raise) when the file cannot be read
tissue_cell_img = cv2.imread(path)
if tissue_cell_img is None:
  raise FileNotFoundError(f"Could not read image file: {path}")

# OpenCV loads colour images in BGR channel order while matplotlib expects
# RGB, so convert before plotting to get the true colours.
tissue_cell_img = cv2.cvtColor(tissue_cell_img, cv2.COLOR_BGR2RGB)
plt.imshow(tissue_cell_img)

In [None]:
# Show an image from the dataset together with its bounding boxes
image_id = 12  # named image_id so the builtin id() is not shadowed

# Look up the file name of the requested image; fail loudly when the id is
# unknown instead of silently reusing a path left over from an earlier cell.
img = next(
    (image['file_name'] for image in tissue_cells_json['images'] if image['id'] == image_id),
    None,
)
if img is None:
  raise ValueError(f"No image with id {image_id} in the annotation file")
path = "/content/drive/Shared drives/TILs/dataset/TIGER/roi-level-annotations/tissue-cells/" + img

# cv2.imread returns None (it does not raise) when the file cannot be read
image = cv2.imread(path)
if image is None:
  raise FileNotFoundError(f"Could not read image file: {path}")
# OpenCV loads colour images in BGR channel order while matplotlib expects RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.imshow(image)

# Keep only the annotations that belong to the selected image
annotations = [
    annotation for annotation in tissue_cells_json['annotations']
    if annotation['image_id'] == image_id
]

for annotation in annotations:
  # The first two bbox values are passed to Rectangle as its anchor point, so
  # they are treated as a box corner rather than the box centre
  # (presumably the COCO [x, y, width, height] layout - verify against the dataset docs).
  bbox_x, bbox_y, width, height = annotation['bbox']
  patch = Rectangle((bbox_x, bbox_y), width, height, fill=False, linewidth=2)
  ax.add_patch(patch)

plt.show()

In [None]:
# Show the pixel-wise segmentation of an image from the dataset with a given id
colour_label = {
    0: 'k', # no label
    1: 'c', # invasive tumor
    2: 'g', # tumor associated stroma
    3: 'y', # in situ tumor
    4: 'crimson', # healthy glands
    5: 'm', # necrosis not in situ
    6: 'darkviolet', # inflamed stroma
    7: 'mediumblue' # rest
}

image_id = 12  # named image_id so the builtin id() is not shadowed

# Look up the file name of the requested image; fail loudly when the id is unknown
img = next(
    (image['file_name'] for image in tissue_cells_json['images'] if image['id'] == image_id),
    None,
)
if img is None:
  raise ValueError(f"No image with id {image_id} in the annotation file")
# Drop the first 8 characters of the file name before appending it to the
# masks directory (presumably a leading "./images"-style prefix - verify
# against the file_name values in the JSON).
img = img[8:]
path = "/content/drive/Shared drives/TILs/dataset/TIGER/roi-level-annotations/tissue-cells/masks" + img

# cv2.imread returns None (it does not raise) when the file cannot be read
label_img = cv2.imread(path)
if label_img is None:
  raise FileNotFoundError(f"Could not read mask file: {path}")

# Paint the colours into a copy and match labels against the untouched
# original, so an already painted pixel can never be mistaken for a label
# that is processed later in the loop.
image = label_img.copy()
for label in np.unique(label_img):
  # Scale the 0..1 colour components to 0..255; a factor of 256 would push a
  # full-intensity component (1.0) outside the uint8 range.
  colour = np.multiply(to_rgb(colour_label[label]), 255).astype(np.uint8)
  # Select the pixels whose three channels all equal the label value
  mask = (label_img == label).all(axis=2)
  image[mask] = colour
  print(label)

plt.imshow(image)

In [None]:
# Print the amount of segmentation annotations per category.
# label_list[c] counts the mask files in which label value c occurs at least once.
label_list = [0] * 8
images_paths = [img_dict['file_name'] for img_dict in tissue_cells_json['images']]
for image_path in images_paths:
  # Drop the first 8 characters of the file name before appending it to the
  # masks directory (presumably a leading "./images"-style prefix - verify).
  image_path = image_path[8:]
  mask_path = "/content/drive/Shared drives/TILs/dataset/TIGER/roi-level-annotations/tissue-cells/masks" + image_path
  mask = cv2.imread(mask_path)
  # cv2.imread returns None (it does not raise) for an unreadable file; stop
  # here with the offending path instead of failing later on the None value.
  if mask is None:
    raise FileNotFoundError(f"Could not read mask file: {mask_path}")
  for label in np.unique(mask):
    label_list[label] += 1

for category, count in enumerate(label_list):
  print("There are", count, "segmentation annotations for category", category)

# Model training

In [None]:
# Set up YOLOv5: clone the repository, change into it and install its requirements
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

import torch
# `utils` is imported after changing into the cloned directory
# (presumably the repository's own utils package - confirm)
import utils
display = utils.notebook_init()  # checks

In [None]:
# Install torch and torchvision with pip
!pip install torch torchvision

In [None]:
# Import everything that's needed for the model training, validation and testing
# NOTE(review): the YOLOv5 setup cell also runs `%cd yolov5`; running both in
# one session would try to enter a nested yolov5 directory - confirm the
# intended working directory.
%cd yolov5
import torch
from yolov5 import utils
import torch
# NOTE(review): this rebinds `utils`, replacing the `from yolov5 import utils` binding above
import utils
from IPython import display
from IPython.display import clear_output
from pathlib import Path
import yaml
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import glob

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# NOTE(review): this rebinds `display`, replacing the IPython `display` module
# imported above; later cells call display.Image on the returned value, so
# notebook_init() presumably returns the IPython display module - confirm.
display = utils.notebook_init()

In [None]:
# Train the YOLOv5 model using transfer learning.
# The run starts from the 'yolov5m.pt' weights, passes `--freeze 12`, uses the
# 'hyp.scratch-high.yaml' hyperparameters for 300 epochs and is registered as
# project 'runs_tils', name 'feature_extraction_yolov5m'.
!python train.py --img 150 --hyp 'hyp.scratch-high.yaml' --batch 256 --epochs 300 --data '/content/drive/Shared drives/TILs/Project/yolov5/data/data.yaml' --weights 'yolov5m.pt' --project 'runs_tils' --name 'feature_extraction_yolov5m' --cache --freeze 12

In [None]:
# Display results.png from the feature_extraction_yolov5m run directory
feature_extraction_results = "/content/drive/Shared drives/TILs/Project/yolov5/runs_tils/feature_extraction_yolov5m/results.png"
display.Image(feature_extraction_results)

# Model fine-tuning

In [None]:
# Fine-tune the model.
# The run starts from the best.pt weights of the 'feature_extraction_yolov5m'
# run, passes no `--freeze` option, uses the 'hyp.VOC.yaml' hyperparameters
# for 150 epochs and is registered as project 'runs_tils', name 'fine-tuning_yolov5m'.
!python train.py --img 150 --hyp 'hyp.VOC.yaml' --batch 256 --epochs 150 --data '/content/drive/Shared drives/TILs/Project/yolov5/data/data.yaml' --weights '/content/drive/Shared drives/TILs/Project/yolov5/runs_tils/feature_extraction_yolov5m/weights/best.pt' --project 'runs_tils' --name 'fine-tuning_yolov5m' --cache

In [None]:
# Display results.png from the fine-tuning_yolov5m run directory
fine_tuning_results = "/content/drive/Shared drives/TILs/Project/yolov5/runs_tils/fine-tuning_yolov5m/results.png"
display.Image(fine_tuning_results)

# Model evaluation

In [None]:
# Evaluate the model using an unseen test set.
# Runs val.py with `--task test` on the best.pt weights of the
# 'fine-tuning_yolov5m' run; the run is registered as project 'runs_tils',
# name 'validation_on_test_data_yolov5m'.
!python val.py --img 150 --weights '/content/drive/Shared drives/TILs/Project/yolov5/runs_tils/fine-tuning_yolov5m/weights/best.pt' --batch 256 --data '/content/drive/Shared drives/TILs/Project/yolov5/data/data.yaml' --task test --project 'runs_tils' --name 'validation_on_test_data_yolov5m'