In [None]:
# Download the dataset archive from Google Drive by its file id.
!gdown 1gAZ40Q0tdgqb2GW2vvcWbwRYQbPDh3LJ

Downloading...
From (original): https://drive.google.com/uc?id=1gAZ40Q0tdgqb2GW2vvcWbwRYQbPDh3LJ
From (redirected): https://drive.google.com/uc?id=1gAZ40Q0tdgqb2GW2vvcWbwRYQbPDh3LJ&confirm=t&uuid=f4af2b6a-dfbc-4dcc-bb99-56ef0b1ea658
To: /content/AML2.zip
100% 3.41G/3.41G [01:11<00:00, 48.0MB/s]


In [None]:
# Extract the archive; both stdout and stderr are discarded to keep the cell quiet.
!unzip AML2.zip > /dev/null 2> /dev/null

# Classification of Normal and Acute Myeloid Leukemia (AML) with RUNX1-RUNX1T1 Fusion Gene

## Introduction

Acute myeloid leukemia (AML) with t(8;21)(q22;q22.1);RUNX1-RUNX1T1, one of the core-binding factor leukemias, is one of the most common subtypes of AML with recurrent genetic abnormalities and is associated with a favorable outcome. This translocation leads to the formation of a pathological RUNX1-RUNX1T1 fusion protein, disrupting the normal function of the core-binding factor, which plays a crucial role in hematopoietic differentiation and maturation. This chromosomal rearrangement is one of the most common, with an incidence of 15% in children and young adults.

For our study, we obtained images of peripheral blood smears from both normal (control) patients and leukemia patients with the RUNX1-RUNX1T1 fusion gene (RUNX1_RUNX1T1) from The Cancer Imaging Archive (TCIA) [1]. Images of patients from each group were pooled; these were then used to train our CNN model. The model achieved F1 scores of ~0.9 for both normal cells and leukemia cells.

[1] https://www.cancerimagingarchive.net/collection/aml-cytomorphology_mll_helmholtz/

In [None]:
# @title Import libraries

# import system libs
import os
import itertools
import cv2

# import data handling tools
import glob
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# import Deep learning Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# ignore the warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [None]:
import tqdm.notebook as tq

# **Preprocessing Images**

In [None]:
# Gather every file stored as aml_data/<class>/<patient>/<image> and read the
# class and patient names off the second and third path components.
image_data = "aml_data"
files = glob.glob(f"{image_data}//*//*//*")
classes = [path.split('/')[1] for path in files]
patients = [path.split('/')[2] for path in files]

In [None]:
# One row per image file: its path, the patient folder and the class folder.
df = pd.DataFrame(dict(filename=files, patient=patients, label=classes))

In [None]:
# Order rows by file path (the original row labels are kept).
df = df.sort_values("filename")

In [None]:
# Convert every image to a Gaussian-blurred grayscale version, overwriting the
# file in place. Because input and output paths are the same, re-running this
# cell blurs the already processed files again.
all_files = list(df['filename'])
for img_path in tq.tqdm(all_files):
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # read; fail here with the offending path rather than inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # cv2.imwrite reports failure through its return value only.
    if not cv2.imwrite(img_path, blurred):
        raise IOError(f"Could not write image: {img_path}")

  0%|          | 0/81214 [00:00<?, ?it/s]

In [None]:
# The files under aml_data were overwritten with their blurred grayscale
# versions by the loop above, so rename the folder to reflect its content.
!mv aml_data aml_preprocessed

In [None]:
# Extract the archive a second time; presumably this restores the untouched
# originals under aml_data now that the first copy was renamed (later cells
# read from both aml_data and aml_preprocessed).
!unzip AML2.zip > /dev/null 2> /dev/null

In [None]:
# Make the paths relative to the dataset root ("CBFB_MYH11/AQK/image_0.tif")
# so the same value can be joined onto aml_data/, aml_preprocessed/ and
# aml_segmented/. Only an actual "aml_data/" prefix is removed, so re-running
# the cell leaves already relative paths untouched instead of chopping off
# another nine characters.
data_root = "aml_data/"
df['filename'] = df['filename'].map(
    lambda path: path[len(data_root):] if path.startswith(data_root) else path
)

In [None]:
np.unique(df['label'])

array(['CBFB_MYH11', 'NPM1', 'PML_RARA', 'RUNX1_RUNX1T1', 'control'],
      dtype=object)

In [None]:
# Collapse the four AML subtype labels into a single 'malignant' class;
# any other label (here 'control') is carried over unchanged.
df['cancer'] = df['label'].replace(
    ['CBFB_MYH11', 'NPM1', 'PML_RARA', 'RUNX1_RUNX1T1'],
    'malignant',
)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the image rows, stratified on the binary 'cancer' label.
# NOTE(review): rows are split per image, not per patient, so images of the
# same patient can end up in both train and test -- confirm this is intended.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['cancer'],
)

# Record the assignment on df itself: rows whose filename is in the training
# part are 'train', everything else is 'test'.
df['split'] = np.where(df['filename'].isin(train_df['filename']), 'train', 'test')

df

Unnamed: 0,filename,patient,label,cancer,split
66575,CBFB_MYH11/AQK/image_0.tif,AQK,CBFB_MYH11,malignant,train
66651,CBFB_MYH11/AQK/image_1.tif,AQK,CBFB_MYH11,malignant,train
66697,CBFB_MYH11/AQK/image_10.tif,AQK,CBFB_MYH11,malignant,test
66672,CBFB_MYH11/AQK/image_100.tif,AQK,CBFB_MYH11,malignant,train
66505,CBFB_MYH11/AQK/image_101.tif,AQK,CBFB_MYH11,malignant,train
...,...,...,...,...,...
35529,control/ZNL/image_95.tif,ZNL,control,control,train
35431,control/ZNL/image_96.tif,ZNL,control,control,train
35191,control/ZNL/image_97.tif,ZNL,control,control,test
35168,control/ZNL/image_98.tif,ZNL,control,control,train


# **Segmentation Ideas**

What I Need To Do:

- Take the original image and run segmentation.
- At the end, append each segment to a dataframe, with the original image it was segmented from in the next column.
- Save the segmented images in the folder.
- Save the np.array of all of the file paths of the original images, in order.

In [None]:
# Duplicate the preprocessed folder (including its sub-directory layout) as aml_segmented.
!cp -r aml_preprocessed aml_segmented

In [None]:
# Delete every regular *.tif file under aml_segmented; directories are not
# matched by -type f, so the folder structure stays in place.
!find aml_segmented -name "*.tif" -type f -delete

In [None]:
import cv2
import numpy as np
import pandas as pd
import os

#df2 = pd.DataFrame(columns=["segmented_image", "segmented_filepath", "og_image", "og_filepath", "label"])
#nuclei_output_dir = "nuclei_images"
#os.makedirs(nuclei_output_dir, exist_ok=True)

def otsu_threshold(image):
    """Binarise ``image`` with an inverted Otsu threshold.

    The threshold level is chosen by OpenCV (``THRESH_OTSU``); because of
    ``THRESH_BINARY_INV`` pixels above it become 0 and the rest become 255.
    The input is used as-is, no smoothing is applied here.

    Returns the thresholded image produced by ``cv2.threshold``.
    """
    mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    binary = cv2.threshold(image, 0, 255, mode)[1]
    return binary

def refine_segmentation(thresholded_image):
    """Clean up a thresholded mask with a morphological opening followed by
    a closing, both using a 2x2 kernel of ones.

    Returns the mask after both operations.
    """
    structuring_element = np.ones((2, 2), dtype=np.uint8)
    cleaned = thresholded_image
    for operation in (cv2.MORPH_OPEN, cv2.MORPH_CLOSE):
        cleaned = cv2.morphologyEx(cleaned, operation, structuring_element)
    return cleaned

def apply_watershed(original_image, binary_image):
    """Run marker-based watershed on ``original_image`` seeded from ``binary_image``.

    Steps:
      1. Open the mask (3x3 kernel, 2 iterations) and dilate the result
         (3 iterations) to get the "sure background" area.
      2. Threshold the distance transform of the opened mask at 70% of its
         maximum to get the "sure foreground" seeds.
      3. Label the seeds with connected components, shift all labels by one
         and set the undecided band (background minus foreground) to 0.
      4. Call ``cv2.watershed`` with those markers.

    Side effect: pixels where the resulting markers equal -1 are overwritten
    with ``[0, 0, 255]`` in ``original_image`` (the caller's array is modified).

    Returns the marker array returned by ``cv2.watershed``.
    """
    kernel3 = np.ones((3, 3), dtype=np.uint8)
    cleaned = cv2.morphologyEx(binary_image, cv2.MORPH_OPEN, kernel3, iterations=2)

    background = cv2.dilate(cleaned, kernel3, iterations=3)

    distances = cv2.distanceTransform(cleaned, cv2.DIST_L2, 5)
    foreground = cv2.threshold(distances, 0.7 * distances.max(), 255, 0)[1]
    foreground = np.uint8(foreground)

    undecided = cv2.subtract(background, foreground)

    # Shift labels so that 0 stays free for the undecided band.
    labels = cv2.connectedComponents(foreground)[1] + 1
    labels[undecided == 255] = 0

    labels = cv2.watershed(original_image, labels)
    original_image[labels == -1] = [0, 0, 255]
    return labels

def extract_individual_nuclei(markers, original_image):
    """Produce one 64x64 image per labelled region in ``markers``.

    For every distinct marker value greater than 1 a binary mask of that
    region is built, ``original_image`` is masked with it, and the whole
    masked frame (not a crop around the region) is resized to 64x64.
    Marker values <= 1 are skipped (presumably the boundary/background
    labels of the watershed step -- confirm against the marker producer).

    Returns a list of the resized images, ordered by ascending marker value.
    """
    nuclei = []
    for label in np.unique(markers):
        if label > 1:
            region = np.where(markers == label, 255, 0).astype(np.uint8)
            masked = cv2.bitwise_and(original_image, original_image, mask=region)
            nuclei.append(cv2.resize(masked, (64, 64)))
    return nuclei

#data_to_append = []

# Segment every image: threshold the preprocessed (blurred) version, run
# watershed on the original, and save each extracted region as
# aml_segmented/<relative path>_<j>.png.
all_files = list(df['filename'])
for rel_path in tq.tqdm(all_files):
    blurred = cv2.imread("aml_preprocessed/" + rel_path, cv2.IMREAD_GRAYSCALE)
    original_image = cv2.imread("aml_data/" + rel_path)
    if blurred is None or original_image is None:
        # cv2.imread returns None for unreadable files; stop with the path
        # instead of failing later inside the OpenCV calls.
        raise FileNotFoundError(f"Could not read preprocessed/original image for: {rel_path}")

    segmented_image = otsu_threshold(blurred)
    refined_segmented_image = refine_segmentation(segmented_image)
    watershed_markers = apply_watershed(original_image, refined_segmented_image)
    nuclei_images = extract_individual_nuclei(watershed_markers, original_image)

    for j, nucleus_image in enumerate(nuclei_images):
        segmented_filepath = "aml_segmented/" + rel_path + f"_{j}.png"
        # cv2.imwrite only signals failure (e.g. a missing target directory)
        # through its return value, so check it explicitly.
        if not cv2.imwrite(segmented_filepath, nucleus_image):
            raise IOError(f"Could not write segmented image: {segmented_filepath}")

        #data_to_append.append({
        #    "segmented_image": nucleus_image,
        #    "segmented_filepath": segmented_filepath,
        #    "og_image": original_image,
        #    "og_filepath": og_filepath,
        #    "label": label  # Append the label from df['label']
        #})

#df2 = pd.concat([df2, pd.DataFrame(data_to_append)], ignore_index=True)

#print(f"DataFrame df2 contains {len(df2)} entries.")
#print(df2.head())


  0%|          | 0/81214 [00:00<?, ?it/s]

In [None]:
#!zip -r aml_segmented.zip aml_segmented

In [None]:
# List every saved segment and recover the dataset-relative path of the image
# it came from.
segment_data = "aml_segmented"
files = glob.glob(f"{segment_data}//*//*//*")
# Cut the trailing "_<j>.png" (every path contains "_" via the folder name).
originals = [path.rsplit("_", 1)[0] for path in files]
# Cut the leading "aml_segmented/" so the value matches df['filename'].
originals = [stem[len(segment_data) + 1:] for stem in originals]
#classes = [x.split('/')[1] for x in files]
#patients = [x.split('/')[2] for x in files]

In [None]:
# Pair each segment file with the relative path of its source image.
segmented_df = pd.DataFrame(dict(nucleus=files, filename=originals))

In [None]:
# Attach patient / label / split information to every segment via the
# shared 'filename' key (default inner join).
segmented_df = pd.merge(df, segmented_df, on="filename")

In [None]:
# Shuffle all rows once with a fixed seed and renumber the index from 0.
segmented_df = (
    segmented_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Point 'filename' at the image files under aml_data/. Only paths that are not
# already rooted there get the prefix, so re-running the cell does not yield
# "aml_data/aml_data/...".
needs_prefix = ~segmented_df['filename'].str.startswith("aml_data/")
segmented_df.loc[needs_prefix, 'filename'] = (
    "aml_data/" + segmented_df.loc[needs_prefix, 'filename']
)

In [None]:
segmented_df

Unnamed: 0,filename,patient,label,cancer,split,nucleus
0,aml_data/NPM1/OCV/image_250.tif,OCV,NPM1,malignant,train,aml_segmented/NPM1/OCV/image_250.tif_1.png
1,aml_data/RUNX1_RUNX1T1/UWF/image_427.tif,UWF,RUNX1_RUNX1T1,malignant,train,aml_segmented/RUNX1_RUNX1T1/UWF/image_427.tif_...
2,aml_data/PML_RARA/RNQ/image_421.tif,RNQ,PML_RARA,malignant,train,aml_segmented/PML_RARA/RNQ/image_421.tif_0.png
3,aml_data/NPM1/CVW/image_414.tif,CVW,NPM1,malignant,train,aml_segmented/NPM1/CVW/image_414.tif_0.png
4,aml_data/CBFB_MYH11/XIE/image_436.tif,XIE,CBFB_MYH11,malignant,train,aml_segmented/CBFB_MYH11/XIE/image_436.tif_2.png
...,...,...,...,...,...,...
125439,aml_data/control/VPN/image_48.tif,VPN,control,control,train,aml_segmented/control/VPN/image_48.tif_0.png
125440,aml_data/control/LCW/image_476.tif,LCW,control,control,train,aml_segmented/control/LCW/image_476.tif_1.png
125441,aml_data/CBFB_MYH11/BJK/image_239.tif,BJK,CBFB_MYH11,malignant,test,aml_segmented/CBFB_MYH11/BJK/image_239.tif_0.png
125442,aml_data/CBFB_MYH11/POM/image_278.tif,POM,CBFB_MYH11,malignant,test,aml_segmented/CBFB_MYH11/POM/image_278.tif_0.png


# **Preparing For Training**

In [None]:
batch_size = 128                           # from 32 to 128
target_size = (64,64)

# CONTROL METHOD

In [None]:
# Baseline ("control") input pipeline: feed the original, unsegmented images.
# NOTE(review): the generator cell that follows rebinds `train` and `test`,
# so on a top-to-bottom run these two objects are replaced before training.
def _control_flow(split_name):
    """Build an unshuffled generator over the 'filename' images of one split."""
    rows = segmented_df[segmented_df['split'] == split_name]
    return keras.preprocessing.image.ImageDataGenerator().flow_from_dataframe(
        rows,
        x_col="filename",
        y_col="cancer",
        target_size=target_size,
        batch_size=batch_size,
        shuffle=False,
        class_mode="categorical",
    )

train = _control_flow('train')
test = _control_flow('test')

Found 100416 validated image filenames belonging to 2 classes.
Found 25028 validated image filenames belonging to 2 classes.


# EXPERIMENTAL METHOD

In [None]:
# Experimental input pipeline: feed the segmented images in 'nucleus'.
def _nucleus_flow(split_name):
    """Build an unshuffled generator over the 'nucleus' images of one split."""
    rows = segmented_df[segmented_df['split'] == split_name]
    return keras.preprocessing.image.ImageDataGenerator().flow_from_dataframe(
        rows,
        x_col="nucleus",
        y_col="cancer",
        target_size=target_size,
        batch_size=batch_size,
        shuffle=False,
        class_mode="categorical",
    )

train = _nucleus_flow('train')
test = _nucleus_flow('test')

Found 100416 validated image filenames belonging to 2 classes.
Found 25028 validated image filenames belonging to 2 classes.


# **Training Models**

# Sequential

# VGG19

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

image_size = (64, 64)

# VGG19 convolutional base without its classifier head, using the default
# pretrained weights.
vgg19_model = tf.keras.applications.VGG19(include_top=False, input_shape=(image_size[0], image_size[1], 3))

# Freeze the first 20% of the base layers; the remaining layers are fine-tuned.
num_freeze = int(len(vgg19_model.layers) * 0.2)

for n in range(num_freeze):
    vgg19_model.layers[n].trainable = False

# Classifier head: two dense layers with dropout, then a 2-way softmax.
model1 = tf.keras.models.Sequential()
model1.add(vgg19_model)

model1.add(tf.keras.layers.Flatten())
model1.add(tf.keras.layers.Dense(512, activation='relu'))
model1.add(tf.keras.layers.Dropout(0.2))
model1.add(tf.keras.layers.Dense(256, activation='relu'))
model1.add(tf.keras.layers.Dropout(0.2))
model1.add(tf.keras.layers.Dense(2, activation='softmax'))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

model1.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Stop after 5 epochs without val_loss improvement (keeping the best weights)
# and halve the learning rate after 3 such epochs, down to 1e-5.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

# `train` already yields batches of its own size, so no `batch_size` is passed
# to fit(): Keras documents that it must not be specified for generator-style
# inputs (the previous value of 64 had no effect on the step count).
# NOTE(review): the test split doubles as validation data, so early stopping
# and weight restoration are selected on it -- consider a separate validation split.
history = model1.fit(
    train,
    epochs=20,
    validation_data=test,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m80134624/80134624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
Epoch 1/20
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 285ms/step - accuracy: 0.8514 - loss: 0.4473 - val_accuracy: 0.8869 - val_loss: 0.2563 - learning_rate: 1.0000e-04
Epoch 2/20
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 213ms/step - accuracy: 0.9253 - loss: 0.1901 - val_accuracy: 0.9323 - val_loss: 0.1866 - learning_rate: 1.0000e-04
Epoch 3/20
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 204ms/step - accuracy: 0.9489 - loss: 0.1322 - val_accuracy: 0.9356 - val_loss: 0.1636 - learning_rate: 1.0000e-04
Epoch 4/20
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 204ms/step - accuracy: 0.9608 - loss: 0.0997 - val_accuracy: 0.9282 - val_loss: 0.1810 - learning_rate: 1.0000e-04


In [None]:
# Predicted class index per test sample (position of the largest softmax output).
y_pred = model1.predict(test).argmax(axis=1)

[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 90ms/step


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(test.classes, y_pred)

0.9355921368067764

In [None]:
# Take an independent copy of the test rows: columns are added to this frame
# later, and writing into a filtered slice of segmented_df would trigger
# pandas' SettingWithCopy behaviour.
test_df = segmented_df[segmented_df['split']=='test'].copy()

In [None]:
# Add ground-truth and predicted class indices. `assign` returns a new frame,
# so nothing is written into a filtered slice of segmented_df.
# Values are matched by position: this relies on the test generator being
# unshuffled and built from the same filtered rows in the same order.
test_df = test_df.assign(true=test.classes, pred=y_pred)

In [None]:
test_df

Unnamed: 0,filename,patient,label,cancer,split,nucleus,true,pred
6,aml_data/control/GJZ/image_127.tif,GJZ,control,control,test,aml_segmented/control/GJZ/image_127.tif_1.png,0,0
8,aml_data/CBFB_MYH11/ZEE/image_398.tif,ZEE,CBFB_MYH11,malignant,test,aml_segmented/CBFB_MYH11/ZEE/image_398.tif_0.png,1,1
9,aml_data/control/MOR/image_37.tif,MOR,control,control,test,aml_segmented/control/MOR/image_37.tif_2.png,0,0
10,aml_data/control/HQQ/image_50.tif,HQQ,control,control,test,aml_segmented/control/HQQ/image_50.tif_0.png,0,0
20,aml_data/RUNX1_RUNX1T1/UWF/image_374.tif,UWF,RUNX1_RUNX1T1,malignant,test,aml_segmented/RUNX1_RUNX1T1/UWF/image_374.tif_...,1,1
...,...,...,...,...,...,...,...,...
125427,aml_data/NPM1/UVT/image_305.tif,UVT,NPM1,malignant,test,aml_segmented/NPM1/UVT/image_305.tif_0.png,1,1
125432,aml_data/control/CCO/image_195.tif,CCO,control,control,test,aml_segmented/control/CCO/image_195.tif_0.png,0,0
125441,aml_data/CBFB_MYH11/BJK/image_239.tif,BJK,CBFB_MYH11,malignant,test,aml_segmented/CBFB_MYH11/BJK/image_239.tif_0.png,1,1
125442,aml_data/CBFB_MYH11/POM/image_278.tif,POM,CBFB_MYH11,malignant,test,aml_segmented/CBFB_MYH11/POM/image_278.tif_0.png,1,1


In [None]:
# Identify the source image of each segment by cutting the trailing "_<j>.png"
# from the segment path. This has to run on 'nucleus': on 'filename' (which has
# no such suffix) it would cut "image_127.tif" down to "image" and collapse all
# images of a patient into one key.
test_df['original_img'] = test_df['nucleus'].apply(lambda x: "_".join(x.split("_")[:-1]))

In [None]:
# Average the true and predicted class indices over all segments of each image.
by_img = test_df.groupby("filename", as_index=False)[["true", "pred"]].mean()

In [None]:
def threshold(value, cutoff=0.5):
    """Turn a score into a hard 0/1 decision.

    Args:
        value: numeric score to binarise (here: a per-image mean of class indices).
        cutoff: decision boundary; defaults to 0.5, the value previously hard-coded.

    Returns:
        0 if ``value`` is strictly below ``cutoff``, otherwise 1.
    """
    return 0 if value < cutoff else 1

In [None]:
# Turn the per-image averages back into hard 0/1 labels (majority vote).
for column in ('true', 'pred'):
    by_img[column] = by_img[column].map(threshold)

In [None]:
accuracy_score(by_img['true'], by_img['pred'])

0.9389891029982146