# Training of a super simple model for celltype classification

In [1]:
import tensorflow as tf
!which python
!python --version
print(tf.VERSION)
print(tf.keras.__version__)
!pwd #  start jupyter under notebooks/ for correct relative paths

/home/phineas/anaconda3/envs/depiction-env/bin/python
Python 3.6.5 :: Anaconda, Inc.
1.14.0
2.2.4-tf
/home/phineas/Documents/repos/dl-interpretability-compbio/notebooks


In [3]:
import datetime
import inspect
import pandas as pd
import numpy as np
import seaborn as sns
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from depiction.models.celltype.celltype import one_hot_encoding, one_hot_decoding

## A look at the data
Labels are categories 1-20; here is the cell type associated with each label:

In [4]:
# Lookup table of cell types; the first CSV column is used as the index.
meta_series = pd.read_csv(
    '../data/single-cell/metadata.csv',
    index_col=0,
)
meta_series

Unnamed: 0_level_0,cell type name
label,Unnamed: 1_level_1
1,CD11b- Monocyte
2,CD11bhi Monocyte
3,CD11bmid Monocyte
4,Erythroblast
5,HSC
6,Immature B
7,Mature CD38lo B
8,Mature CD38mid B
9,Mature CD4+ T
10,Mature CD8+ T


There are 20 unbalanced classes (described by 13 marker features), and over 80k samples

In [5]:
# Read the measurements, then show the number of non-null `CD45` values per
# category, which serves here as the per-class sample count.
data_df = pd.read_csv('../data/single-cell/data.csv')
data_df.groupby('category')['CD45'].count()

category
1       912
2      6779
3      1278
4     12030
5       261
6       502
7      7796
8       608
9     13964
10     7821
11     3684
12     3025
13     3864
14     6987
15     9564
16      468
17      293
18        5
19      994
20      240
Name: CD45, dtype: int64

In [6]:
# Display ten randomly drawn rows (no seed is set, so the rows differ between runs).
data_df.sample(10)

Unnamed: 0,CD45,CD45RA,CD19,CD11b,CD4,CD8,CD34,CD20,CD33,CD123,CD38,CD90,CD3,category
53353,-0.69492,0.88153,-0.11434,-0.087083,1.6076,0.40803,-0.19655,-0.088004,0.31978,-0.12796,-0.11775,-0.006062,-1.259,11
38224,4.4452,0.47676,-0.077736,-0.016581,3.5801,-0.21988,-0.011004,-0.059784,0.43719,-0.42698,2.3154,0.13289,4.262,9
33373,4.4696,-0.5615,-0.16888,0.34735,3.076,1.0373,-0.1186,-0.014868,-0.2235,0.22026,1.8308,1.1712,4.4117,9
11350,-0.31212,-0.095535,-0.16109,-0.10579,-0.010378,-0.025211,-0.040666,-0.073326,1.1076,-0.097798,1.7045,1.0566,2.8261,4
51254,5.2515,1.0077,0.50964,1.0501,-0.056795,5.1078,0.60179,1.6214,0.2245,-0.22458,2.4123,1.0145,5.5227,10
5565,4.7211,1.4999,-0.24284,3.1604,2.8838,-0.34364,0.93898,-0.09333,4.6788,2.6893,4.2178,0.58529,0.59754,2
2398,3.871,-0.032058,-0.10821,2.5303,0.74386,-0.021683,0.96055,-0.082569,4.3076,1.0666,4.2895,1.2482,0.35031,2
47280,4.3755,0.28705,-0.088491,-0.086775,0.45848,3.2607,-0.12959,-0.05428,-0.2762,0.28983,0.50937,-0.15545,4.0204,10
17804,-0.20823,-0.10315,-0.15351,0.50917,0.4655,-6e-05,1.6054,0.19066,2.0356,-0.14748,2.4124,0.73784,3.0422,4
74743,3.7746,3.1739,-0.23994,0.25733,-0.1721,4.3737,1.6925,0.31966,-0.15248,-0.14114,0.47485,-0.059107,4.2775,15


In [7]:
# Show the source of the project's one-hot helpers: they build on Keras'
# `to_categorical` while taking care of the 1-indexed classes.
for helper in (one_hot_encoding, one_hot_decoding):
    print(inspect.getsource(helper))

def one_hot_encoding(classes):
    return to_categorical(classes)[:, 1:]  # remove category 0

def one_hot_decoding(labels):
    return labels.argmax(axis=1) + 1



In [8]:
# One-hot encode the 1-indexed category column.
classes = data_df['category'].values
labels = one_hot_encoding(classes)

# Rescale every feature column (everything except `category`) to [0, 1].
# NOTE(review): the scaler is fitted on all samples, i.e. before the train/test
# split performed in a later cell, so held-out samples influence the scaling.
min_max_scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
data = min_max_scaler.fit_transform(
    data_df.drop(columns='category').values
)
data.shape

(81075, 13)

In [9]:
# Sanity check: decoding the one-hot labels gives 1-indexed category numbers again.
one_hot_decoding(labels)

array([ 1,  1,  1, ..., 20, 20, 20])

In [10]:
# Hold out 33% of the samples for testing. The split is stratified on the
# category column and uses a fixed seed so it can be repeated.
data_train, data_test, labels_train, labels_test = train_test_split(
    data,
    labels,
    test_size=0.33,
    random_state=42,
    stratify=data_df['category'],
)

In [11]:
# Peek at the one-hot label matrix (one row per sample, one column per category).
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [12]:
# Number of samples per batch, used by the tf.data pipelines for training and testing.
batchsize = 32

In [13]:
# Training pipeline: shuffle -> batch -> repeat.
# The shuffle buffer spans the whole training set so that every epoch sees a
# uniformly random sample order; a buffer of only `2 * batchsize` elements
# would merely permute samples inside a small sliding window.
dataset = tf.data.Dataset.from_tensor_slices((data_train, labels_train))
dataset = dataset.shuffle(data_train.shape[0]).batch(batchsize)
# Repeat indefinitely; the number of batches per epoch is given to `model.fit`
# through `steps_per_epoch`.
dataset = dataset.repeat()

# Evaluation pipeline: one ordered pass over the held-out samples.
testset = tf.data.Dataset.from_tensor_slices((data_test, labels_test))
testset = testset.batch(batchsize)

## I don't know what a simpler network would look like

In [14]:
model = tf.keras.Sequential()
# Add a single softmax layer with one output unit per celltype.
# The input width is read from the training array rather than through
# `tf.data.get_output_shapes`, which is a TensorFlow 1.x-only endpoint; the
# resulting model is the same.
model.add(layers.Dense(
    len(meta_series), activation='softmax',
    input_shape=(data_train.shape[1],)
))

W1103 19:54:27.627181 139803615647552 deprecation.py:506] From /home/phineas/anaconda3/envs/depiction-env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [15]:
# Print the layers, output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                280       
Total params: 280
Trainable params: 280
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Configure training: Adam optimizer (constructed with 0.001), categorical
# cross-entropy loss for the one-hot labels, and categorical accuracy as metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.categorical_accuracy],
)

In [17]:
# evaluation on testset on every epoch
# log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# `steps_per_epoch` is a count of batches and has to be an integer; `np.ceil`
# returns a float, so it is cast explicitly. Rounding up keeps the last,
# partially filled batch of each pass over the training data.
model.fit(
    dataset,
    epochs=20, steps_per_epoch=int(np.ceil(data_train.shape[0] / batchsize)),
    validation_data=testset, #  callbacks=[tensorboard_callback]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f262283e978>

## Is such a simple model interpretable?

In [18]:
# Persist the entire model in a single HDF5 file next to this notebook.
model.save(filepath='./celltype_model.h5')

In [19]:
# tensorboard --logdir logs/fit

In [20]:
# To recreate the exact same model, including weights and optimizer.
# model = tf.keras.models.load_model('../data/models/celltype_dnn_model.h5')

# What is the effect of increasing model complexity?
Play around by adding some layers, then train the model and save it under a new name so you can use it in the other notebook.

![title](https://i.kym-cdn.com/photos/images/newsfeed/000/531/557/a88.jpg)

In [21]:
model = tf.keras.Sequential()
# Add a densely-connected layer with 64 units to the model.
# The input width is read from the training array rather than through
# `tf.data.get_output_shapes`, which is a TensorFlow 1.x-only endpoint.
model.add(layers.Dense(64, activation='relu', input_shape=(data_train.shape[1],)))
# ...
# do whatever you want
# model.add(layers.Dense(64, activation='relu'))
# model.add(layers.Dropout(0.5))
# ...
# Add a softmax layer with output units per celltype:
model.add(layers.Dense(len(meta_series), activation='softmax'))