In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator
import shutil
import numpy as np
import pandas as pd
import glob
import math 
import random
import cv2

In [2]:
#load datasets

mutant = pd.read_csv('Downloads/mutant_notG2_full.csv')
wt = pd.read_csv('Downloads/wt_notG2_full.csv')

mutant.head()

Unnamed: 0,Image Path,Initial Label,Manual label,Class
0,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad,bad
1,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad,bad
2,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad,bad
3,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad,bad
4,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,apop,bad


In [3]:
#replace to interphase and non-interphase classes
"""
mutant["Class"].replace({"M phase": "non-interphase", "prophase": "non-interphase", "bad":"non-interphase"}, inplace=True)
wt["Class"].replace({"M phase": "non-interphase", "prophase": "non-interphase", "bad":"non-interphase"}, inplace=True)"""

print(len(wt.index), len(mutant.index))

3428 2774


In [4]:
mutant['Class'].value_counts()

interphase    2706
M phase         27
bad             22
prophase        19
Name: Class, dtype: int64

In [5]:
wt['Class'].value_counts()

interphase    3327
bad             41
M phase         39
prophase        21
Name: Class, dtype: int64

In [6]:
wt.head()

Unnamed: 0,Image Path,Initial Label,Manual Label,Class
0,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad?? meta??,bad
1,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad (folded),bad
2,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad,bad
3,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,bad?? meta??,bad
4,C:/Users/zmvar/Documents/GitHub/cell-cycle/HAP...,not G2,apop,bad


In [12]:
#separate images to folder

files = list(glob.glob('Z:/cell classifier/test_run/images'+"/*.bmp"))
fname = []

for i in range(len(files)):
    fname.append(files[i].split('\\')[1])

for i in range(0, len(wt.index)):
    im = wt['Image Path'][i]
    fold = wt['Class'][i]
    name = wt['Image Path'][i].split('/')[9]
    old = 'Z:/cell classifier/test_run/images/'+name
    if name in fname:
        new = 'Z:/cell classifier/test_run/images/'+fold
        shutil.move(old, new)
print('done!')

done!


In [13]:
for i in range(0, len(mutant.index)):
    im = mutant['Image Path'][i]
    fold = mutant['Class'][i]
    name = mutant['Image Path'][i].split('/')[9]
    old = 'Z:/cell classifier/test_run/images/'+name
    if name in fname:
        new = 'Z:/cell classifier/test_run/images/'+fold
        shutil.move(old, new)
print('done')

done


In [14]:
inter = list(glob.glob('Z:/cell classifier/test_run/images/interphase'+"/*.bmp"))
bad = list(glob.glob('Z:/cell classifier/test_run/images/bad'+"/*.bmp"))
mphase = list(glob.glob('Z:/cell classifier/test_run/images/M phase'+"/*.bmp"))
pro = list(glob.glob('Z:/cell classifier/test_run/images/prophase'+"/*.bmp"))

print(len(inter), len(bad), len(mphase), len(pro))

369 74 98 59


In [None]:
#questions:
# 1) do we want to put new column for type, and concat the mutant and wildtype dataset
# 2) should I use an SVG or CNN for classification
# 3) make each class equal to interphase count
# 4) distinguish interphase and non-interphase

In [None]:
#to-do:
# 1) download full csv files and concat them
# 2) split test and training set
# 3) separate classes into folders
# 4) apply augmentation + standardization - make classes equal to interphase
# 5) covert images to numpy arrays
# 6) https://www.analyticsvidhya.com/blog/2021/07/step-by-step-guide-for-image-classification-on-custom-datasets/
# 7) build model
# 8) test

In [2]:
path = 'Z:/cell classifier/test_run/'
groups = ['interphase', 'bad', 'M phase', 'prophase']
train_dir =path+"images/train/"
test_dir = path+"images/test/"
export_dir = path+"data/" # directory for data export
model_name = "Model1" # name of output model w/o extension

In [21]:
#apply augmentation for non-interphase cells

from skimage import io
datagen = ImageDataGenerator(        
            rotation_range=45,
            width_shift_range=0.2,  
            height_shift_range=0.2,    
            shear_range=0.2,        
            zoom_range=0.2,        
            horizontal_flip=True,         
            fill_mode='nearest')

print("----Augmenting {}----".format(groups[3]))
dir_n = 'Z:/cell classifier/test_run/images/'+groups[3]
files = list(glob.glob(dir_n+"/*.bmp"))

for file in files:
    x = io.imread(file)
    x = x.reshape(1,250,250,1)
    i = 0
    new_p = 'Z:/cell classifier/test_run/images/prophase/'
    pref = file.split('\\')[1].split('.')[0] + '_aug'
    for batch in datagen.flow(x, batch_size=16,save_to_dir=new_p,save_prefix=pref,save_format='bmp'):
        i += 1    
        if i > 5:      
            break
print('done')

----Augmenting prophase----
done


In [22]:
inter = list(glob.glob('Z:/cell classifier/test_run/images/interphase'+"/*.bmp"))
bad = list(glob.glob('Z:/cell classifier/test_run/images/bad'+"/*.bmp"))
mphase = list(glob.glob('Z:/cell classifier/test_run/images/M phase'+"/*.bmp"))
pro = list(glob.glob('Z:/cell classifier/test_run/images/prophase'+"/*.bmp"))

print(len(inter), len(bad), len(mphase), len(pro))

369 370 391 413


In [3]:
#split test and training dataset

for i, group in enumerate(groups):
    print("----Processing {}----".format(group))
    image_dir = path + "images/" + group # original images
    move_train_dir = train_dir + group # destination; training data
    move_test_dir = test_dir + group # destination; test data
    files = list(glob.glob(image_dir+"/*.bmp"))
    print("Files detected:")
    print(len(files)) 
    
    # 20% of data will be moved to "test"
    th = math.floor(len(files)*0.2)
    random.shuffle(files)
    for i in range(th):
        shutil.move(files[i],move_test_dir)
    
    # move rest of data to "train"
    files = glob.glob(image_dir+"/*.bmp")
    for file in files:
        shutil.move(file, move_train_dir)

print("----All done----")

----Processing interphase----
Files detected:
369
----Processing non-interphase----
Files detected:
231
----All done----


In [3]:
#convert to numpy array

X = []
Y = []
for i,group in enumerate(groups):
    image_dir = train_dir + group
    files = glob.glob(image_dir+"/*")
    print("----Processing train: {}----".format(group))
    with open('Z:/cell classifier/data/train-'+group+'.csv', 'w') as f_csv:
        for id, file in enumerate(files):
            img = cv2.imread(file)
            X.append(img)
            f_csv.write(str(file))
            f_csv.write("\n")
            Y.append(i) 
    
X = np.array(X)
Y = np.array(Y)
np.save(export_dir+"/x_train.npy",X)
np.save(export_dir+"/y_train.npy",Y)
print("Number of files generated:")
print(len(Y))

# run image-to-array conversion for test data
X = []
Y = []
for i,group in enumerate(groups):
    image_dir = test_dir + group
    files = glob.glob(image_dir+"/*")
    print("----Processing test: {}----".format(group))
    with open('Z:/cell classifier/data/test-'+group+'.csv', 'w') as f_csv:
        for id, file in enumerate(files):
            img = cv2.imread(file)
            X.append(img)
            f_csv.write(str(file))
            f_csv.write("\n")
            Y.append(i)   
        
X = np.array(X)
Y = np.array(Y)
np.save(export_dir+"/X_test.npy",X)
np.save(export_dir+"/Y_test.npy",Y)
print("Number of files generated:")
print(len(Y))
print("----All done----")

----Processing train: interphase----
----Processing train: non-interphase----
Number of files generated:
481
----Processing test: interphase----
----Processing test: non-interphase----
Number of files generated:
119
----All done----


In [4]:
import numpy as np
import pandas as pd
import random
import keras
import csv
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping
from bayes_opt import BayesianOptimization
from keras.models import Model, load_model, model_from_json
from keras.layers import Input, Conv2D,MaxPooling2D,Dense, Flatten,Dropout
from keras.utils import np_utils

In [5]:
#normalize test and training set, as well as one hot encode y-test and y-train

# load training and test data sets
x_train = np.load(path+"/data/x_train.npy")
y_train = np.load(path+"/data/y_train.npy")

x_test = np.load(path +"/data/X_test.npy")
y_test = np.load(path +"/data/Y_test.npy")

# normalize
x_train = x_train.astype('float32') #integers to floats
x_test = x_test.astype('float32')
x_train = x_train/255
x_test = x_test / 255

y_train = to_categorical(y_train,2)
y_test = to_categorical(y_test,2)

# shuffle sequence of data
index=list(range(x_train.shape[0]))
index=random.sample(index,len(index))
x_train = x_train[index]
y_train = y_train[index]

In [6]:
print(x_train.shape)

(481, 250, 250, 3)


In [7]:
#build model w/ cross validation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

#3 block vgg model
def model_def():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(250,250,3)))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(2, activation='sigmoid'))
    # compile model
    opt = SGD(learning_rate=0.001, momentum=0.9)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = model_def()

In [8]:
#model report
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, mean_squared_error
model.fit(x_train, y_train)
early_stopping = EarlyStopping(patience=5, verbose=1)

# hyperparameters
batch = 89 # batch size

# learning
hist = model.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=batch,callbacks=[early_stopping])

# evaluated accuracy for test data
print(model.evaluate(x_test, y_test))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping
[0.5356975793838501, 0.6974790096282959]


In [9]:
predictions = model.predict(x_test)

classes = np.argmax(predictions, axis =1)
with open('Z:/cell classifier/data/test_val.csv', 'w') as f_csv:
    for c in classes:
        f_csv.write(str(c))
        f_csv.write("\n")

In [None]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []
  
for train_index, test_index in skf.split(x_train, y_train):
    x_train_fold, x_test_fold = x_train[train_index], x_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    model.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(model.score(x_test_fold, y_test_fold))
    
print('\nOverall Accuracy:', mean(lst_accu_stratified)*100, '%')

In [None]:
#test on a random image