In [1]:
# importing the libraries
import os
import sys
import pandas as pd
import torch
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import  glob
import time
import albumentations
import math
import shutil
from collections import Counter
from scipy.special import softmax
from sklearn.preprocessing import OneHotEncoder# creating instance of one-hot-encoder
from sklearn.model_selection import train_test_split

from models.ResNext50 import Myresnext50
from train.train_classification import trainer_classification
from utils.utils import configure_optimizers
from Datasets.DataLoader import Img_DataLoader

### PyTorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils import data
from torchvision import transforms
from torchcam.methods import GradCAM
from torchcam.utils import overlay_mask


In [2]:
# Load the data, split it into training and validation dataframes
df = pd.read_pickle('notextimagepaths.pkl')

train_df, val_df = train_test_split(df, test_size=0.20, random_state=42, stratify=df['Label'])
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42, stratify=val_df['Label'])
print(f"Training set shape: {train_df.shape}          Training set label count: {str(Counter(train_df['Label'].to_list()))[7:][1:][:-1]} \n")
print(f"Validation set shape: {val_df.shape}         Validation set label count: {str(Counter(val_df['Label'].to_list()))[7:][1:][:-1]} \n")
print(f"Test set shape: {test_df.shape}         Validation set label count: {str(Counter(test_df['Label'].to_list()))[7:][1:][:-1]} \n")

Training set shape: (16630, 4)          Training set label count: {'Other': 9477, 'Myeloid': 6195, 'Lymphoid': 958} 

Validation set shape: (2079, 4)         Validation set label count: {'Other': 1185, 'Myeloid': 774, 'Lymphoid': 120} 

Test set shape: (2079, 4)         Validation set label count: {'Other': 1184, 'Myeloid': 775, 'Lymphoid': 120} 



In [3]:
# Load filepaths, random state ensures testset is the same as in Training.ipynb
X_train = train_df['Filepath'].to_list()
X_val = val_df['Filepath'].to_list()
X_test = test_df['Filepath'].to_list()

# Load labels
train_labels = train_df['Label'].to_list()
validation_labels = val_df['Label'].to_list()
test_labels = test_df['Label'].to_list()

# Load df that represents the one hot encoding of each cell type (Myeloid, Lymphoid, other)
cell_types_df = pd.read_pickle("cell_types_df.pkl")

In [4]:
# Load model
resnext50_pretrained = torch.hub.load('pytorch/vision:v0.10.0', 'resnext50_32x4d', pretrained=True)
my_extended_model = Myresnext50(my_pretrained_model= resnext50_pretrained, num_classes = 3)

Using cache found in C:\Users\moone/.cache\torch\hub\pytorch_vision_v0.10.0


In [5]:
from PIL import Image
import numpy as np
from tqdm import tqdm

# Set target image size for resizing (optional but needed for fair comparison)
resize_shape = (224, 224)

# Load and preprocess images
def load_image(path, size=resize_shape):
    try:
        with Image.open(path) as img:
            img = img.convert("RGB")  # Ensure 3 channels
            img = img.resize(size)
            return np.asarray(img).astype(np.float32).flatten()
    except Exception as e:
        print(f"Error loading {path}: {e}")
        return None

# Precompute all image vectors
images1 = [(path, load_image(path)) for path in tqdm(X_train, desc="Loading List 1")]
images2 = [(path, load_image(path)) for path in tqdm(X_val, desc="Loading List 2")]

# Compare with Euclidean distance
matches = []
threshold = 0.0  # Change this if you want fuzzy matching

for path1, img1 in images1:
    if img1 is None:
        continue
    for path2, img2 in images2:
        if img2 is None:
            continue
        dist = np.linalg.norm(img1 - img2)
        if dist <= threshold:
            matches.append((path1, path2, dist))

# Output matches
for path1, path2, dist in matches:
    print(f"MATCH: {path1} == {path2} (distance: {dist})")

Loading List 1: 100%|████████████████████████████████████████████████████████████| 16630/16630 [09:55<00:00, 27.90it/s]
Loading List 2: 100%|██████████████████████████████████████████████████████████████| 2079/2079 [00:50<00:00, 41.18it/s]


MATCH: D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_131.jpg == D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_127.jpg (distance: 0.0)
MATCH: D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_131.jpg == D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_129.jpg (distance: 0.0)
MATCH: D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_PMY_24.jpg == D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_118.jpg (distance: 0.0)
MATCH: D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_130.jpg == D:\Mathijs\Open Universiteit\Thesis\Implementa

In [10]:
for match in matches:
    print(match[0])

D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_131.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_131.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_PMY_24.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_130.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_117.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_PMY_22.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_158.jpg
D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_154.jpg
D:\Mathijs\

In [26]:
# Images with whom there is a euclidian distance of 0.0 between X_train and X_test
duplicates = ['D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_127.jpg', 'D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221207005714_1_BL_176.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_167.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_141.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101029_1_BL_142.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___211302811131_1_BL_31.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_143.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_135.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_174.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221003005096_1_BL_116.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221003005096_1_BL_115.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_161.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230307000606_1_BL_113.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_137.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_153.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240524000463_1_BL_150.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_129.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_144.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_144.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_121.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_123.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_125.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_155.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_154.jpg']

In [26]:
# Images with whom there is a euclidian distance of 0.0 between e.g. X_train and X_test
duplicatestraintest = ['D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_127.jpg', 'D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221207005714_1_BL_176.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_167.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_141.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101029_1_BL_142.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___211302811131_1_BL_31.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_143.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_135.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_174.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221003005096_1_BL_116.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221003005096_1_BL_115.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_161.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230307000606_1_BL_113.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_137.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_153.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240524000463_1_BL_150.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_129.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_144.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_144.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_121.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_123.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_125.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_155.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_154.jpg']
duplicatesvaltest = ['D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_156.jpg', 'D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_124.jpg']
duplicatestrainval = ['D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_131.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_PMY_24.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_130.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_117.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_PMY_22.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_158.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_154.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_128.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___ERR20250210101550_1_BL_118.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_121.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220607003832_1_BL_156.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_178.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_159.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_PMY_23.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___221005002617_1_BL_145.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_163.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_176.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220607003832_1_BL_155.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_168.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_163.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_156.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___230626004577_1_BL_147.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_130.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___220712002361_1_BL_126.jpg','D:\Mathijs\Open Universiteit\Thesis\Implementation\DeepHeme_training\Datasets\notextimages\AML___240523004481_1_BL_155.jpg']

duplicatesall = list(set(duplicatestraintest + duplicatesvaltest + duplicatestrainval))

# Remove the duplicates from the dataframes
df = pd.read_pickle('imagepaths.pkl')
df1 = pd.read_pickle('segmentedimagepaths.pkl')
df2 = pd.read_pickle('notextimagepaths.pkl')

df.head()

Unnamed: 0,Filepath,Label,Final diagnosis,Classnames
0,D:\Mathijs\Open Universiteit\Thesis\Implementa...,Other,AML,LY
1,D:\Mathijs\Open Universiteit\Thesis\Implementa...,Myeloid,AML,BL
2,D:\Mathijs\Open Universiteit\Thesis\Implementa...,Other,AML,LY
3,D:\Mathijs\Open Universiteit\Thesis\Implementa...,Myeloid,AML,BL
4,D:\Mathijs\Open Universiteit\Thesis\Implementa...,Other,AML,SMU


In [24]:
filepaths = df['Filepath'].tolist()
indices = []
for i in range(filepaths):
    if filepath
filepaths[0].split('\\')[-1] == duplicatesall[0].split('\\'

'AML___231108001537_1_LY_83.jpg'

In [27]:
duplicatesall[0].split('\\')[-1]

'AML___220712002361_1_BL_123.jpg'