<a href="https://colab.research.google.com/github/KashishMistry/bloodcellclassification/blob/main/APS360_Blood_Cell_Classification_Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Blood Cell Classification


# Data Processing
- Using primary (https://www.sciencedirect.com/science/article/pii/S2352340920303681) and secondary (https://www.kaggle.com/datasets/paultimothymooney/blood-cells) datasets
- Crop primary images to 280 x 280
- Crop secondary images by hand (we're only using lymphocytes and monocytes from secondary dataset)
- Resize cropped secondary images to 280 x 280
- Merge the two datasets in a new folder using classes from primary dataset
  - For the secondary dataset, use the provided csv file for labels (since images are not classified into folders) to only retrieve lymphocyte and monocyte images
- Apply random modifications to augment the data in classes with fewer than 3000 images
  - Modifications: rotate 90, 180, 270 degrees, flip vertically, flip horizontally
  - All classes should have 3000 images each at the end
- Split into train, validation and testing using an 80-10-10 split

- Make a smaller dataset for faster training
  - From the dataset derived as above, copy 250 images from each class into a new dataset folder
  - Split into train, validation and testing using an 80-10-10 split

In [None]:
import os
import PIL
from PIL import Image
import pandas as pd
#import shutil

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Unzip both datasets
!unzip '/content/drive/My Drive/APS360 Project Team/Data/PBC_dataset_normal_DIB.zip' -d '/root/datasets'    # primary dataset
!unzip '/content/drive/My Drive/APS360 Project Team/Data/Kaggle_Blood_Cell_Images.zip' -d '/root/datasets'  # secondary dataset

# Define data directories for both datasets
data_dir_prim = '/root/datasets/PBC_dataset_normal_DIB'
data_dir_sec = '/root/datasets/Kaggle_Blood_Cell_Images'

# Make a folder to hold all the final images
!mkdir Blood_Cell_Dataset
final_dir = '/content/Blood_Cell_Dataset'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_141392.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_284076.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_392614.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_835279.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_287580.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_187538.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_528886.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_721826.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_349979.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_91181.jpg  
  inflating: /root/datasets/PBC_dataset_normal_DIB/erythroblast/ERB_133116.jpg  
  inflating: /root/datasets/PBC_dataset_norma

In [None]:
# Loop through images in primary dataset, crop them, and add them to final directory

orig_dir_prim = data_dir_prim

# Delete the one known non-image file. Guarded so that re-running this cell after the
# file is already gone does not raise FileNotFoundError.
stray_file = os.path.join(orig_dir_prim, 'neutrophil/.DS_169665.jpg')
if os.path.exists(stray_file):
  os.remove(stray_file)

# Only sub-folders are class folders; anything else at the top level is ignored.
prim_classes = [
    name for name in os.listdir(orig_dir_prim)
    if os.path.isdir(os.path.join(orig_dir_prim, name))
]

max_per_class = 3000                 # don't take more than this many images per class
crop_box = (40, 41, 320, 321)        # (left, upper, right, lower) -> a 280 x 280 window

for class_name in prim_classes:
  src_dir = os.path.join(orig_dir_prim, class_name)
  dst_dir = os.path.join(final_dir, class_name)
  # Make folder for each class (using class names from prim dataset);
  # exist_ok lets the cell be re-run without failing on an existing folder.
  os.makedirs(dst_dir, exist_ok=True)

  # os.listdir returns entries in arbitrary order; sorting makes the subset kept
  # for classes with more than max_per_class images the same on every run.
  prim_images = sorted(os.listdir(src_dir))

  for img in prim_images[:max_per_class]:
    # The context manager closes the source file as soon as the crop is written.
    with PIL.Image.open(os.path.join(src_dir, img)) as open_img:
      open_img.crop(crop_box).save(os.path.join(dst_dir, img))

In [None]:
# Import csv file that contains the mapping of the images to labels for the secondary dataset
df = pd.read_csv(os.path.join(data_dir_sec, 'dataset-master/dataset-master/labels.csv'))

# Loop through images in secondary dataset, resize them, and add them to final directory
orig_dir_sec = os.path.join(data_dir_sec, 'dataset-master/dataset-master/JPEGImages/')
sec_images = os.listdir(orig_dir_sec)

# Only these two categories are taken from the secondary dataset; the value is the
# name of the class folder inside final_dir that receives the image.
category_to_class = {'LYMPHOCYTE': 'lymphocyte', 'MONOCYTE': 'monocyte'}
target_size = (280, 280)  # (width, height), same size as the cropped primary images

for img in sec_images:
  # Characters 11-15 of the file name hold the image number used in the csv file.
  # NOTE(review): assumes an 11-character prefix followed by 5 digits - confirm for all files.
  img_num = int(img[11:16])
  df_img = df[df['Image'] == img_num]

  # An image with no row in the csv has no label, so it cannot be used.
  # (Indexing the first row of an empty frame would raise IndexError.)
  if df_img.empty:
    continue

  class_name = category_to_class.get(df_img.iloc[0]['Category'])
  if class_name is None:
    continue  # any other (or missing) category is not part of the merged dataset

  # The context manager closes the source file once the resized copy is written.
  with PIL.Image.open(os.path.join(orig_dir_sec, img)) as open_img:
    open_img.resize(target_size).save(os.path.join(final_dir, class_name, img))

In [None]:
import random
# apply modifications for data augmentation to get all classes to reach 3000 images
#
# NOTE(review): this runs before the train/val/test split, so a modified copy and the
# image it was made from can end up in different splits - consider splitting first.

random.seed(22)  # same seed value as the dataset split, so the augmentation is repeatable

final_classes = os.listdir(final_dir)
theta = [90, 180, 270] # rotation angles
methods = [PIL.Image.FLIP_LEFT_RIGHT, PIL.Image.FLIP_TOP_BOTTOM] # flip types
target_per_class = 3000

for class_name in final_classes: # each of the class folders in the final directory
  class_dir = os.path.join(final_dir, class_name)
  # Sorted so that, together with the seed, the same source images are picked every run
  # (os.listdir order is arbitrary). The list is taken once, before any modified image
  # is written, so only original images are ever used as a source.
  final_images = sorted(os.listdir(class_dir))
  remainder = target_per_class - len(final_images)

  # Counts down from remainder to 1; a class already at (or above) the target is skipped.
  for modified_num in range(remainder, 0, -1):
    source_name = random.choice(final_images) # pick a random image to modify

    with PIL.Image.open(os.path.join(class_dir, source_name)) as open_img:
      if random.randint(0, 1) == 0: # pick a random modification (rotate or flip)
        # Rotate by an angle taken FROM theta. Passing the random index itself as the
        # angle would rotate by 0, 1 or 2 degrees instead of 90, 180 or 270.
        modified_img = open_img.rotate(angle=random.choice(theta))
      else:
        modified_img = open_img.transpose(method=random.choice(methods))

      modified_img.save(os.path.join(class_dir, 'Modified_'+str(modified_num)+'.jpg'))

In [None]:
# Sanity check after data augmentation: print the number of files in every class
# folder of the final directory (each one is expected to hold 3000 images).

for class_name in final_classes:
  class_dir = os.path.join(final_dir, class_name)
  image_count = len(os.listdir(class_dir))
  print(class_name, ": ", image_count)

erythroblast :  3000
platelet :  3000
monocyte :  3000
basophil :  3000
eosinophil :  3000
lymphocyte :  3000
ig :  3000
neutrophil :  3000


In [None]:
!pip install split_folders
import splitfolders

# Split data into training, validation and test datasets
splitfolders.ratio('/content/Blood_Cell_Dataset', output='/content/Blood_Cell_Dataset_Split', seed=22, ratio=(.8,.1,.1))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting split_folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split_folders
Successfully installed split_folders-0.5.1


Copying files: 24000 files [00:04, 4829.56 files/s]


In [None]:
# Zip final dataset and upload to shared drive folder
!zip -r /content/drive/My\ Drive/APS360\ Project\ Team/Data/Blood_Cell_Dataset_Split.zip /content/Blood_Cell_Dataset_Split

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/LY_832225.jpg (deflated 3%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/Modified_749.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/Modified_1725.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/LY_553290.jpg (deflated 3%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/Modified_956.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/LY_4334.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/LY_865265.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/Modified_1264.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/LY_476808.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Split/train/lymphocyte/Modified_786.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Spl

In [None]:
# Make the small dataset (take 250 images per class)
import shutil

# Created from the absolute path used below, so it does not depend on the current
# working directory; exist_ok lets the cell be re-run.
final_dir_small = '/content/Blood_Cell_Dataset_Small'
os.makedirs(final_dir_small, exist_ok=True)

small_per_class = 250

for class_name in final_classes: # each of the class folders in the final directory
  src_dir = os.path.join(final_dir, class_name)
  dst_dir = os.path.join(final_dir_small, class_name)
  os.makedirs(dst_dir, exist_ok=True)  # make folder for each class

  final_images = os.listdir(src_dir)

  # Take every step-th file so the picks are spread over the whole listing.
  # With 3000 files the step is 12, which gives exactly 250 files. Deriving the step
  # from the actual count avoids an IndexError when a class holds fewer files.
  step = max(1, len(final_images) // small_per_class)

  for img in final_images[::step][:small_per_class]:
    # Copy the file as-is. Opening and re-saving through PIL would re-encode the
    # JPEG, so the small dataset would no longer match the full one pixel for pixel.
    shutil.copyfile(os.path.join(src_dir, img), os.path.join(dst_dir, img))

In [None]:
# Sanity check for the small dataset: print the file count of every class folder
# (each one is expected to hold 250 images).

for class_name in final_classes:
  small_class_dir = os.path.join(final_dir_small, class_name)
  small_count = len(os.listdir(small_class_dir))
  print(small_count)

250
250
250
250
250
250
250
250


In [None]:
# Split data into training, validation and test datasets
splitfolders.ratio('/content/Blood_Cell_Dataset_Small', output='/content/Blood_Cell_Dataset_Small_Split', seed=22, ratio=(.8,.1,.1))
# Zip this dataset and upload to shared drive folder
!zip -r /content/drive/My\ Drive/APS360\ Project\ Team/Data/Blood_Cell_Dataset_Small_Split.zip /content/Blood_Cell_Dataset_Small_Split

Copying files: 2000 files [00:00, 4156.81 files/s]


  adding: content/Blood_Cell_Dataset_Small_Split/ (stored 0%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/ (stored 0%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/ (stored 0%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/Modified_258.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/Modified_338.jpg (deflated 3%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/Modified_916.jpg (deflated 3%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/ERB_523193.jpg (deflated 3%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/ERB_665874.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/Modified_277.jpg (deflated 3%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/ERB_380670.jpg (deflated 2%)
  adding: content/Blood_Cell_Dataset_Small_Split/val/erythroblast/Modified_332.jpg (deflated 2%)
  adding: content/Blood