In [1]:
from __future__ import print_function

import argparse

import os
import sys
import pandas as pd
import torch
from torch.utils import data
from torchvision import transforms
from PIL import Image
import cv2
import  glob
import time
import albumentations
import pandas as pd
import numpy as np
from collections import Counter
from utils.utils import segment_cell

from sklearn.preprocessing import OneHotEncoder# creating instance of one-hot-encoder
from sklearn.model_selection import train_test_split
### Internal Imports
from models.ResNext50 import Myresnext50
from train.train_classification import trainer_classification
from utils.utils import configure_optimizers
from Datasets.DataLoader import Img_DataLoader

### PyTorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import glob

In [2]:
# Load the image-path table and split it into stratified train / validation / test frames.
df = pd.read_pickle('notextimagepaths.pkl')


def _label_counts(frame):
    """Return a {label: count} dict for frame['Label'], most frequent label first.

    Produces the same text as the repr of a Counter without its "Counter(...)"
    wrapper, so the summary lines below do not depend on string slicing.
    """
    return dict(Counter(frame['Label'].to_list()).most_common())


# 80% of the rows go to training; the remaining 20% hold-out is then halved into
# validation and test. Both splits stratify on 'Label' and use a fixed seed.
train_df, holdout_df = train_test_split(df, test_size=0.20, random_state=42, stratify=df['Label'])
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42, stratify=holdout_df['Label'])

print(f"Training set shape: {train_df.shape}          Training set label count: {_label_counts(train_df)} \n")
print(f"Validation set shape: {val_df.shape}         Validation set label count: {_label_counts(val_df)} \n")
# The original line labelled this count as "Validation set"; it describes the test split.
print(f"Test set shape: {test_df.shape}         Test set label count: {_label_counts(test_df)} \n")

Training set shape: (16591, 4)          Training set label count: {'Other': 9474, 'Myeloid': 6158, 'Lymphoid': 959} 

Validation set shape: (2074, 4)         Validation set label count: {'Other': 1185, 'Myeloid': 770, 'Lymphoid': 119} 

Test set shape: (2074, 4)         Validation set label count: {'Other': 1184, 'Myeloid': 770, 'Lymphoid': 120} 



In [3]:
# # Code used to store each image with the text removed in a separate folder, so that this does not have to be done during training, which saves a lot of time
# df = pd.read_pickle('imagepaths.pkl')

# def remove_text(filepath):

#     # Load image
#     img = cv2.imread(filepath)
#     img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
#     # Step 1: Copy the image
#     img_segmented = img_rgb.copy()
    
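#     # Step 2: Define region to black out (horizontal band near the bottom-left corner)
#     height, width, _ = img_segmented.shape
#     corner_hmin = int(height * 0.15)  # band starts 15% of the height above the bottom edge
#     corner_hmax = int(height * 0.05)  # band ends 5% of the height above the bottom edge
#     corner_w = int(width * 0.42)   # band spans the leftmost 42% of the width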
    
#     # Step 3: Black out that region
#     img_segmented[height - corner_hmin : height - corner_hmax, 0 : corner_w] = 0  # set to black
        
#     return img_rgb, img_segmented

# filepaths = df['Filepath'].tolist()

# for filepath in filepaths:
#     filename =  filepath.split('\\')[-1]
#     orig_img, seg_img = remove_text(filepath)
#     img_bgr = cv2.cvtColor(seg_img, cv2.COLOR_RGB2BGR)
#     cv2.imwrite(os.path.join('Datasets/notextimages', filename), img_bgr)

In [4]:
# For testing purposes: check that a model trained on almost no data indeed performs poorly, which ensures that there is no label leakage during evaluation

# train_df_w, train_df_small = train_test_split(train_df, test_size=0.003, random_state=42, stratify=train_df['Label'])
# val_df_w, val_df_small = train_test_split(val_df, test_size=0.05, random_state=42, stratify=val_df['Label'])
# test_df_w, test_df_small = train_test_split(test_df, test_size=0.05, random_state=42, stratify=test_df['Label'])

# print(f"Training set shape: {train_df_small.shape}          Training set label count: {str(Counter(train_df_small['Label'].to_list()))[7:][1:][:-1]} \n")
# print(f"Validation set shape: {val_df_small.shape}         Validation set label count: {str(Counter(val_df_small['Label'].to_list()))[7:][1:][:-1]} \n")
# print(f"Test set shape: {test_df_small.shape}         Validation set label count: {str(Counter(test_df_small['Label'].to_list()))[7:][1:][:-1]} \n")

# # Load df that represents the one hot encoding of each cell type (Myeloid, Lymphoid, other)
# cell_types_df = pd.read_pickle("cell_types_df.pkl")

# # Load model
# resnext50_pretrained = torch.hub.load('pytorch/vision:v0.10.0', 'resnext50_32x4d', pretrained=True)
# my_extended_model = Myresnext50(my_pretrained_model= resnext50_pretrained, num_classes = 3)

# X_train_small = train_df_small['Filepath'].to_list()
# X_val_small = val_df_small['Filepath'].to_list()

# # Load labels
# train_labels_small = train_df_small['Label'].to_list()
# validation_labels_small = val_df_small['Label'].to_list()

In [5]:
# Load filepaths
X_train = train_df['Filepath'].to_list()
X_val = val_df['Filepath'].to_list()

# Load labels
train_labels = train_df['Label'].to_list()
validation_labels = val_df['Label'].to_list()

# Load df that represents the one hot encoding of each cell type (Myeloid, Lymphoid, other)
cell_types_df = pd.read_pickle("cell_types_df.pkl")

# Load model
resnext50_pretrained = torch.hub.load('pytorch/vision:v0.10.0', 'resnext50_32x4d', pretrained=True)
my_extended_model = Myresnext50(my_pretrained_model= resnext50_pretrained, num_classes = 3)

Using cache found in C:\Users\moone/.cache\torch\hub\pytorch_vision_v0.10.0


In [6]:
# Preprocessing pipeline: the only step is per-channel normalisation with the
# mean / std values below (no random augmentation is applied here).
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

normalize_step = albumentations.Normalize(mean=channel_mean, std=channel_std)
transform_pipeline = albumentations.Compose([normalize_step])

In [7]:
# Collect every trainer argument in one mapping, then build the trainer from it.
trainer_settings = dict(
    # data
    train_image_files=X_train,
    validation_image_files=X_val,
    train_labels=train_labels,
    validation_labels=validation_labels,
    df=cell_types_df,
    # model and preprocessing
    model=my_extended_model,
    img_transform=transform_pipeline,
    # optimisation schedule
    init_lr=0.001,
    lr_decay_every_x_epochs=10,
    gamma=0.1,
    weight_decay=0.0005,
    batch_size=32,
    epochs=30,
    # output
    save_checkpoints_dir='checkpoints',
)
trainer = trainer_classification(**trainer_settings)

In [8]:
# For testing purposes

# dataset = Img_DataLoader(img_list=X_train, labels=train_labels, split='train', transform = transform_pipeline, df = cell_types_df)
# shuffle = True
# dataloader = DataLoader(dataset, batch_size=32, num_workers=2, shuffle=shuffle)

In [11]:
# Train the model and keep the trained result in My_model.
# REMINDER: delete old checkpoints before re-running; the trainer is configured
# to save into the existing 'checkpoints' directory.
#
# Pick the device instead of calling .cuda() unconditionally: on a host without
# CUDA, .cuda() would raise only after training had finished, losing the result.
# Assumes train() returns a torch.nn.Module (it already supported .cuda()).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
My_model = trainer.train(my_extended_model).to(device)

==> Create model
==> List learnable parameters
==> Load data
16591
2074
==> Configure optimizer
10
==> Start training
==> Create the saving dictionary
The directory exists, overrode duplicate files
==> Epoch: 1 Step: 400 LR: 0.001000 Total Loss: 0.2995 Runtime: 4.82 s/50 iters.
==> Epoch: 1 Step: 450 LR: 0.001000 Total Loss: 0.5055 Runtime: 5.91 s/50 iters.
==> Epoch: 1 Step: 500 LR: 0.001000 Total Loss: 0.4243 Runtime: 6.03 s/50 iters.
==> Epoch: 1 Step: 550 LR: 0.001000 Total Loss: 0.3183 Runtime: 6.03 s/50 iters.
==> Epoch: 1 Step: 600 LR: 0.001000 Total Loss: 0.1842 Runtime: 5.66 s/50 iters.
==> Epoch: 1 Step: 650 LR: 0.001000 Total Loss: 0.2823 Runtime: 6.14 s/50 iters.
==> Epoch: 1 Step: 700 LR: 0.001000 Total Loss: 0.2615 Runtime: 6.08 s/50 iters.
==> Epoch: 1 Step: 750 LR: 0.001000 Total Loss: 0.1495 Runtime: 6.00 s/50 iters.
==> Epoch: 1 Step: 800 LR: 0.001000 Total Loss: 0.4136 Runtime: 6.17 s/50 iters.
==> Epoch: 1 Step: 850 LR: 0.001000 Total Loss: 0.3049 Runtime: 5.96 s/50