# Leukemia Subtypes Recognition 
### → See Section 6 of the Master's thesis: "Identification of Leukemia Subtypes"

## • Imports & preprocessing 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, GaussianNoise
from keras.layers.convolutional import Convolution2D
from keras.layers.convolutional import MaxPooling2D
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

from keras import backend as K

import matplotlib.pyplot as plt

import allidb1_preproc

In [None]:
# Folder of resized images from which the file list (and then the labels) is built.
dataset_folder = 'ALL_IDB1/resized_im'

# One-off preparation steps, kept commented out so that a normal run does not redo them.
# Step 1 - resize img folder (size divided by 10).
# NOTE(review): no module named `preprocess` is imported in this notebook; presumably this
# helper lives in `allidb1_preproc` - confirm before uncommenting.
#preprocess.resize_folder("Datasets/ALL_IDB1/im")

# Step 2 - create the CSV from the img folder (uncomment both lines to create the CSV file).
#myFileList = allidb1_preproc.createFileList(dataset_folder) 
#allidb1_preproc.data_to_CSV(myFileList) # Uncomment to create the CSV file 

# Build the file list for the dataset folder and get the y labels from it.
# NOTE(review): `y` is later split together with the CSV rows, so the file-list order
# presumably has to match the CSV row order - confirm in allidb1_preproc.
myFileListForY = allidb1_preproc.createFileList(dataset_folder)
y = allidb1_preproc.get_yLabels(myFileListForY)
print(y)

## • Training set & test set definition

In [None]:
# Read the pixel data from the CSV and split it, together with the labels `y`,
# into a training part (80 %) and a test part (20 %).
csv_file = "csv/resizedALLIDB1-color.csv"

data = pd.read_csv(csv_file)

# A fixed random_state makes the split identical on every "Restart & Run All";
# without it the reported accuracies change from run to run for reasons that have
# nothing to do with the model.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42
)

print("\nX_train:\n")
print(X_train.head())
print(X_train.shape)

print("\nX_test:\n")
print(X_test.head())
print(X_test.shape)

# Keras works on plain arrays, so drop the DataFrame wrapper.
X_train = X_train.values
X_test = X_test.values

In [None]:
img_size = 100

# Give every flattened row the (1, img_size, img_size) shape that the network's
# input_shape expects, cast to float32, then normalize the inputs from 0-255 to
# between 0 and 1 by dividing by 255.
trainX = X_train.reshape((X_train.shape[0], 1, img_size, img_size)).astype(np.float32)
X_train = trainX / 255.0

testX = X_test.reshape((X_test.shape[0], 1, img_size, img_size)).astype(np.float32)
X_test = testX / 255.0

In [None]:
# Number of classes in the dataset = number of neurons in the final layer.
# It is derived once from the complete label set `y` (highest label + 1, the same rule
# to_categorical applies by default) and passed explicitly to both calls. Letting each
# call infer its own width would give the train and test matrices different numbers of
# columns whenever the highest class happens to be missing from one of the two splits.
class_num = int(np.asarray(y, dtype='int').max()) + 1

# One-hot encode the integer labels.
y_train = to_categorical(y_train, num_classes=class_num)
y_test = to_categorical(y_test, num_classes=class_num)

## • Creating the CNN

In [None]:
# Use the "channels first" layout (channels, height, width), matching input_shape below.
# set_image_data_format is the supported replacement for the legacy
# K.common.set_image_dim_ordering('th'); it is set before any layer is created.
K.set_image_data_format('channels_first')

dropoutValue = 0.1 # why bad? https://stats.stackexchange.com/questions/299292/dropout-makes-performance-wors
noiseValue = 0.8
# Dropout(dropoutValue) after each activation and GaussianNoise(noiseValue) after each
# pooling step were tried and are currently disabled.

model = Sequential()

# Convolutional part: two conv -> batch-norm -> max-pool stages.
# With channels-first tensors the channel axis is axis 1, so BatchNormalization must be
# told axis=1; its default (axis=-1) would normalize over the width axis instead.
model.add(Convolution2D(32, (3, 3), input_shape=(1, img_size, img_size), activation='relu'))
model.add(BatchNormalization(axis=1))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(64, (3, 3), activation='relu'))
model.add(BatchNormalization(axis=1))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())

# Classifier head: after Flatten the features are on the last axis, so the default
# BatchNormalization axis is correct here.
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())

# One output neuron per class; softmax pairs with the categorical cross-entropy loss.
model.add(Dense(class_num, activation='softmax'))

In [None]:
 # Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

In [None]:
# Train the network; the returned History object keeps the per-epoch metrics.
# The test split is passed as validation data, so the val_* curves are computed on it.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=25,
    batch_size=32,
)

In [None]:
# Print the per-epoch training and validation metrics recorded by model.fit().
metrics_log = history.history

# Older Keras releases store the accuracy metric under 'acc', newer ones under
# 'accuracy'; use whichever key this run produced instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in metrics_log else 'accuracy'

sections = [
    ("Acc", acc_key),
    ("Loss", 'loss'),
    ("Val_acc", 'val_' + acc_key),
    ("Val_loss", 'val_loss'),
]

for position, (title, key) in enumerate(sections):
    if position:
        # Blank separator between two consecutive sections.
        print("\n")
    print(title + " :\n")
    print(metrics_log[key])

In [None]:
# Model evaluation on the test split. evaluate() returns the loss followed by the
# compiled metrics (here: accuracy).
# The loss gets a real name instead of `_`: assigning to `_` in a notebook shadows
# IPython's "last output" variable for the rest of the session.
test_loss, acc = model.evaluate(X_test, y_test)
print('accuracy > %.3f' % (acc * 100.0))

In [None]:
# plot diagnostic learning curves
def _plot_train_vs_test(records, key, label, y_max):
    """Draw one metric for the training and validation runs on its own figure.

    records -- dict mapping metric names to per-epoch value lists
    key     -- name of the training metric; the validation one is 'val_' + key
    label   -- text used for the title and the y axis
    y_max   -- upper limit of the y axis
    """
    plt.plot(records[key])
    plt.plot(records['val_' + key])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    plt.ylim(0, y_max)
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()


def summarize_diagnostics_by_epochs(history):
    """Plot accuracy and loss per epoch for the training and validation data.

    history -- object whose `.history` attribute is a dict of per-epoch metric lists
               (as returned by model.fit); needs 'loss', 'val_loss' and the accuracy
               metric under either 'acc'/'val_acc' or 'accuracy'/'val_accuracy'.
    """
    records = history.history

    # Older Keras releases name the metric 'acc', newer ones 'accuracy'; accept both
    # so the plot does not fail with a KeyError after a library upgrade.
    acc_key = 'acc' if 'acc' in records else 'accuracy'

    # summarize history for accuracy
    _plot_train_vs_test(records, acc_key, 'accuracy', 1.1)

    # summarize history for loss --> https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html
    # https://towardsdatascience.com/machine-learning-fundamentals-via-linear-regression-41a5d11f5220
    # https://datascience.stackexchange.com/questions/25267/keras-difference-beetween-val-loss-and-loss-during-training
    _plot_train_vs_test(records, 'loss', 'loss', 12)


summarize_diagnostics_by_epochs(history)