# Classifying cell types with neural networks

## 1. Imports

In [None]:
!pip install --user scprep

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import scprep

from sklearn import model_selection

# import os
# import h5py
# import sklearn
# import requests

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## 2. Loading the retinal bipolar data

We'll use the same retinal bipolar data you saw in preprocessing and visualization.

In [3]:
scprep.io.download.download_google_drive("1kxsMav1ly_S6pQ1vKeAtlFFW3QVvilz0", "shekhar_data.pkl")
scprep.io.download.download_google_drive("1J4K8bo8Pys-8xayO5vtMK3t5wJ0_TG2Y", "shekhar_clusters.pkl")

In [2]:
data = pd.read_pickle("shekhar_data.pkl")
clusters = pd.read_pickle("shekhar_clusters.pkl")

#### Converting data to `numpy` format

Tensorflow expects data to be stored as a NumPy array.

In [3]:
data = scprep.reduce.pca(data, n_components=100, method='dense').to_numpy()
labels, cluster_names = pd.factorize(clusters['CELLTYPE'])

In [6]:
num_classes = len(np.unique(labels))
num_classes

28

#### Splitting the data into training and validation sets

We'll allocate 80\% of our data for training and 20\% for testing.

In [7]:
# first let's split our data into training and validation sets
train_test_split = int(.8 * data.shape[0])

data_training = data[:train_test_split, :]
labels_training = labels[:train_test_split]
data_validation = data[train_test_split:, :]
labels_validation = labels[train_test_split:]
data_training.shape, data_validation.shape

((15018, 100), (3755, 100))

## 3. Computational graphs

Tensorflow works with an abstract computational graph

In [17]:
# let's make an object in this graph corresponding to our first 10 points
data_tf = tf.constant(data_training[:10, :], dtype=tf.float32)

# and now their corresponding labels
labels_tf = tf.constant(labels_training[:10], dtype=tf.int32)

# look at the output
print(labels_tf)

Tensor("Const_3:0", shape=(10,), dtype=int32)


In [9]:
# compare this to the numpy data we started with
print(labels_training[:10])

[0 1 2 3 2 1 4 0 3 5]


In [10]:
# and now go back to the original cluster names
cluster_names[labels_training[:10]]

Index(['BC5A', 'BC1B', 'BC6', 'Rod BC', 'BC6', 'BC1B', 'BC3B', 'BC5A',
       'Rod BC', 'Muller Glia'],
      dtype='object')

data is a np variable, with actual numbers
data_tf is a tf variable, and is just a set of instructions, i.e. "grab numbers from this variable and make them a constant"

#### Tensorflow's `Session`

tf variables are just *instructions* for how to do computation, not the actual computations themselves
to perform computations as instructed, we need to start a session and ask for the output by "running"

In [11]:
sess = tf.InteractiveSession()

In [12]:
sess.run(labels_tf)

array([0., 1., 2., 3., 2., 1., 4., 0., 3., 5.], dtype=float32)

In [13]:
np.allclose(sess.run(data_tf), data_training[:10])

True

In [14]:
# we can now give instructions for computations on this data and then ask for the output
w = 10 * data_tf + 3
x = w / 2
y = x + w
z = y**2

sess.run(z)

array([[4.55465302e+02, 5.86087031e+04, 4.70691553e+03, 5.71816406e+04,
        2.26201950e+02, 1.89646558e+03, 2.08491040e+03, 9.73422229e-01,
        2.76444971e+03, 2.08067505e+02, 1.80798450e+03, 2.04972794e+02,
        2.91574658e+03, 5.56084277e+03, 1.89451157e+02, 4.93480377e+02,
        8.17330933e+02, 1.20037830e+03, 5.04254181e+02, 3.29946198e+02,
        3.34681976e+02, 1.81121780e+02, 1.18960498e+03, 1.11654945e+02,
        5.72496729e+03, 1.48646570e+03, 1.88747818e+02, 8.97228165e+01,
        1.19735825e+02, 1.76366806e+02, 7.00738342e+02, 3.63830902e+02,
        1.83918190e+01, 7.48567009e+00, 1.55218701e+03, 3.49349976e+03,
        3.63029724e+02, 7.77578857e+02, 1.76109778e+03, 6.54872192e+02,
        4.90719109e+01, 1.01245728e+02, 1.73189758e+02, 2.09980408e+02,
        4.55323877e+03, 2.87891455e+03, 9.27585602e+01, 1.58389636e+03,
        5.67447021e+02, 2.02089600e+03, 3.45625758e+00, 3.18488483e+01,
        1.56000809e+02, 1.67953014e+01, 1.14625403e+03, 2.178075

note now output is a np variable that corresponds to the value of z. we do not have a np variable 
corresponding to w, x, or y. if all we want is the output z, we don't need them!

## Exercise 1 - Print the last 5 rows of the data matrix with their values doubled (using tensorflow operations)

In [60]:
# =================
# Get the last five rows of `data_training`
data_last5 = 
# Create a tensorflow constant storing `data_last5`
tf_last5 = 
# Multiply by two
tf_last5_double =
# Use `sess` to compute the result
data_last5_double =
# Print the result
data_last5_double
# =================

SyntaxError: invalid syntax (<ipython-input-60-c76805c08f8b>, line 3)

## 4. Building a one-layer neural network

#### Build the network architecture

In [15]:
# this function applies the simple feedforward operation
def layer(x, n_dim, name, activation=None):
    # create the weight matrix
    W = tf.get_variable(dtype=tf.float32, shape=[x.get_shape()[-1], n_dim], name='W{}'.format(name))
    # create the bias vector
    b = tf.get_variable(dtype=tf.float32, shape=[n_dim], name='b{}'.format(name))
    # X2 = X1 * W + b
    output = tf.matmul(x, W) + b
    if activation:
        # nonlinear activation function
        output = activation(output)
    return output

# create a hidden (middle) layer
hidden_layer_tf = layer(data_tf, n_dim=100, name='hidden', activation=tf.nn.relu)

# create the output layer used to classify
output_tf = layer(hidden_layer_tf, n_dim=num_classes, name='output', activation=tf.nn.softmax)

Instructions for updating:
Colocations handled automatically by placer.


#### Build the loss function

In [18]:
# we need a loss/score to tell our network how good or bad these results are
# let's use cross-entropy like we talked about
labels_one_hot = tf.one_hot(labels_tf, num_classes)

loss_tf = labels_one_hot * tf.log(output_tf + 1e-6) + (1 - labels_one_hot) * tf.log(1 - output_tf + 1e-6)
loss_tf = -1 * tf.reduce_sum(loss_tf)

#### Create the optimizer and tell it to minimize the loss

In [68]:
# now we need an optimizer that we'll give this loss, and it'll take responsibility
# for updating the network to make this score go down
learning_rate = .00001
opt = tf.train.GradientDescentOptimizer(learning_rate)

# this will be the tf object we call for when we want to take a single step to train our network
train_op = opt.minimize(loss_tf)

#### Initialize variables

In [75]:
# last thing: we need to set our network weights to random values to start
sess.run(tf.global_variables_initializer())

# and that's it! we've built a one-layer neural network!
###
####################################

####################################
### Let's train our network!
# we've built our network, but it probably isn't very good yet
# (it just has random values, after all)

# to check that, let's feed our np data to our tf network and see how it does

output_np, labels_np = sess.run([output_tf, labels_tf])

print(output_np)

[[2.89921794e-04 7.48221006e-04 1.41890964e-03 6.13534707e-04
  4.35154624e-02 3.82874720e-02 3.49754962e-04 6.51041558e-03
  2.25365581e-03 2.42739186e-01 4.66422699e-02 3.84765142e-03
  1.81492288e-02 2.73553102e-04 1.07084503e-02 1.31256774e-03
  1.29130436e-03 1.37352059e-02 1.25516031e-03 1.28589526e-01
  1.04063285e-04 2.02990812e-03 6.55069575e-03 4.09933011e-04
  3.28277677e-01 8.61117244e-03 8.87017474e-02 2.78331572e-03]
 [4.75706984e-05 1.44131377e-01 3.51640023e-02 7.17279863e-06
  4.34001582e-03 8.37495027e-04 5.51097444e-04 4.35080379e-02
  9.00734961e-03 3.05869132e-02 1.92660447e-02 2.82478362e-01
  4.00338368e-03 7.02203065e-03 1.07017078e-03 3.25386645e-04
  4.70022031e-04 3.01296324e-01 3.07184499e-04 1.31993205e-04
  4.92344261e-05 1.32113113e-04 1.27230119e-03 3.66907880e-05
  3.07156821e-03 1.90725172e-04 1.10650353e-01 4.51404703e-05]
 [4.10570588e-04 2.53576902e-04 1.76920667e-02 3.18784121e-04
  5.08534862e-03 2.37004086e-03 3.70532594e-04 1.90910290e-03
  1.47

In [76]:
print(np.argmax(output_np, axis=1))

[24 17 17  4 12  4  9 26 12  7]


In [77]:
print(labels_np)

[0 1 2 3 2 1 4 0 3 5]


In [78]:
print('Correct: {} / {}'.format((np.argmax(output_np, axis=1) == labels_np).sum(), output_np.shape[0]))

Correct: 0 / 10


In [79]:
# not very good :(
# but that's ok, it hasn't had a chance to train yet!

In [80]:
for step in range(1000):
    sess.run(train_op)

    if step % 100 == 0:
        output_np, labels_np = sess.run([output_tf, labels_tf])
        print('Training step {} correct: {} / {}'.format(step, (np.argmax(output_np, axis=1) == labels_np).sum(), output_np.shape[0]))

Training step 0 correct: 0 / 10
Training step 100 correct: 2 / 10
Training step 200 correct: 5 / 10
Training step 300 correct: 5 / 10
Training step 400 correct: 9 / 10
Training step 500 correct: 10 / 10
Training step 600 correct: 10 / 10
Training step 700 correct: 10 / 10
Training step 800 correct: 10 / 10
Training step 900 correct: 10 / 10


In [None]:
# now it knows these 10 data points perfectly
###
####################################

### Start again with placeholders so we can use all of t

In [None]:
####################################
### tf placeholders
# the power of tensorflow is that we are able to define computations
# as we did above, but with 'placeholders' instead of actual data
# we just have to define the shape and type of the variable, and then we don't
# have to give it actual data until we call sess.run
# then we can call the same computation over and over again with
# different data without having to rewrite the tensorflow code

# so now let's start over and do it with tf placeholders
# conveniently, we don't have to specify the number of rows and can instead just use "None"
# to indicate this may vary from batch to batch

In [None]:
tf.reset_default_graph() # a helpful function for clearing the tf code in your existing session
batch_size = 10
data_tf = tf.placeholder(shape=[None, data.shape[1]], dtype=tf.float32, name='data_tf')
labels_tf = tf.placeholder(shape=[None], dtype=tf.int32, name='labels_tf')


hidden_layer_tf = layer(data_tf, 10, 'hidden_layer', activation=tf.nn.relu)

output_tf = layer(hidden_layer_tf, num_classes, 'output_tf', activation=tf.nn.softmax)

labels_one_hot = tf.one_hot(labels_tf, num_classes)

loss_tf = labels_one_hot * tf.log(output_tf + 1e-6) + (1 - labels_one_hot) * tf.log(1 - output_tf + 1e-6)
loss_tf = - tf.reduce_sum(loss_tf)

learning_rate = .001
opt = tf.train.AdamOptimizer(learning_rate)

train_op = opt.minimize(loss_tf)

sess.run(tf.global_variables_initializer())

In [None]:
# now let's train our network with new data each step
step = 0
for epoch in range(100):
    random_order = np.random.choice(data_training.shape[0], data_training.shape[0], replace=False)
    data_randomized = data_training[random_order]
    labels_randomized = labels_training[random_order]
    
    for data_batch, labels_batch in zip(np.array_split(data_randomized, data_randomized.shape[0] // batch_size), np.array_split(labels_randomized, labels_randomized.shape[0] // batch_size)):
        step += 1

        sess.run(train_op, {data_tf: data_batch, labels_tf: labels_batch})

        # evaluate accuracy on both the training and validation datasets every once in awhile
        if step % 10 == 0:
            loss_np = sess.run(loss_tf, {data_tf: data_batch, labels_tf: labels_batch})
            output_np = []
            labels_np = []
            for data_batch, labels_batch in zip(np.array_split(data_training, data_training.shape[0] // ), np.array_split(labels_training, labels_training.shape[0] // batch_size)):
                output_np_ = sess.run(output_tf, {data_tf: data_batch})
                output_np.append(output_np_)
                labels_np.append(labels_batch)
            output_np = np.concatenate(output_np, axis=0)
            labels_np = np.concatenate(labels_np, axis=0)
            acc_training = (np.argmax(output_np, axis=1) == labels_np).sum() / output_np.shape[0]

            output_np = []
            labels_np = []
            for data_batch, labels_batch in zip(np.array_split(data_validation, data_validation.shape[0] // batch_size), np.array_split(labels_validation, labels_validation.shape[0] // batch_size)):
                output_np_ = sess.run(output_tf, {data_tf: data_batch})
                output_np.append(output_np_)
                labels_np.append(labels_batch)
            output_np = np.concatenate(output_np, axis=0)
            labels_np = np.concatenate(labels_np, axis=0)
            acc_validation = (np.argmax(output_np, axis=1) == labels_np).sum() / output_np.shape[0] 
            print('Step {} loss: {:.3f} training accuracy: {:.3f} validation accuracy: {:.3f} '.format(step, loss_np, acc_training, acc_validation))

In [51]:
# how did our network do?
###
####################################

####################################
### Exercise 3

# create a network with a wider hidden layer and compare its performance to the network with 10 hidden neurons we just built

###
####################################

####################################
### Exercise 3

# create a network with *two* hidden layers and compare its performance to the network with one hidden layer we just built

###
####################################



####################################
### Exercise 4

# create a network with *five* hidden layers and compare its performance to the network with one hidden layer we just built

###
####################################



####################################
####################################
####################################
###############
###############
###############
# PART II: Autoencoders
###############
###############
###############
####################################
####################################
####################################

####################################
### let's build an autoencoder
tf.reset_default_graph()
batch_size = 100
data_tf = tf.placeholder(shape=[None, data.shape[1]], dtype=tf.float32, name='data_tf')


# layers will be input -> 100 -> 2 --> 100 -> output
hidden_layer1_tf = layer(data_tf, 100, 'hidden_layer1', activation=tf.nn.relu)
hidden_layer2_tf = layer(hidden_layer1_tf, 2, 'hidden_layer2', activation=None)
hidden_layer3_tf = layer(hidden_layer2_tf, 100, 'hidden_layer3', activation=tf.nn.relu)
output_tf = layer(hidden_layer3_tf, data.shape[1], 'output_tf', activation=None)


# use mean-squared-error reconstruction loss
loss_tf = tf.reduce_mean((data_tf - output_tf)**2)

# this part is all the same as before
learning_rate = .001
opt = tf.train.AdamOptimizer(learning_rate)

train_op = opt.minimize(loss_tf)

sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=.1)))
sess.run(tf.global_variables_initializer())

for epoch in range(10):
    random_order = np.random.choice(data.shape[0], data.shape[0], replace=False)
    data_randomized = data[random_order]

    for data_batch in np.array_split(data_randomized, data_randomized.shape[0] // batch_size):
        if step % 100 == 0:
            loss_np = sess.run(loss_tf, {data_tf: data_batch})
            print("Step: {} Loss: {:.3f}".format(step, loss_np))
        step += 1

        sess.run(train_op, {data_tf: data_batch})
###
####################################



####################################
### rather than evaluating our model with our data like
### we did with the classifier, we can now use our model
### to evaluate our data (aka exploratory data analysis)!


# let's get the 2D internal hidden layer and visualize it
# with a scatter plot
viz_coordinates = []
for data_batch in np.array_split(data, data.shape[0] // batch_size):
    out = sess.run(hidden_layer2_tf, {data_tf: data_batch})
    viz_coordinates.append(out)
viz_coordinates = np.concatenate(viz_coordinates, axis=0)


fig = plt.figure()
ax = fig.subplots(1, 1)
ax.scatter(viz_coordinates[:, 0], viz_coordinates[:, 1], c=labels, s=5)

ax.set_xlabel('AE Coordinate 1')
ax.set_ylabel('AE Coordinate 2')

###
####################################





####################################
### Exercise 4

# notice we used activation=None for the hidden layer we were going to visualize
# repeat the process with other activation functions like tf.nn.relu, tf.nn.sigmoid, tf.nn.tanh, etc...
# and note how the visualization changes. has the data changed at all?

###
####################################



####################################
### Exercise 5

# now turn the activation for the visualization layer back to None, but experiment with
# the activation function for the 100-dimensional layers. is there a change? why?

###
####################################

SyntaxError: invalid syntax (<ipython-input-51-ef2a91e319f7>, line 118)