<a href="https://colab.research.google.com/github/MutianWang/novel-cell/blob/main/gan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.vis_utils import plot_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = '/content/drive/My Drive/Colab Notebooks/Brain Cell/data/'

## Preprocessing

In [None]:
meta = pd.read_csv(path+'meta.csv', header=0)
cols_glut = meta[meta['class']=='Glutamatergic']['sample_name']
cols_non = meta[meta['class']=='Non-neuronal']['sample_name']
cols_gaba = meta[meta['class']=='GABAergic']['sample_name']

In [None]:
def normalize(df):
    # counts per million
    df = df.div(df.sum(axis=1), axis=0) * 10**6
    df = df.fillna(0)
    return df

In [None]:
tmp = pd.DataFrame([[1,2,3],
                    [1,2,3],
                    [1,2,3]])

In [None]:
normalize(tmp)

Unnamed: 0,0,1,2
0,166666.666667,333333.333333,500000.0
1,166666.666667,333333.333333,500000.0
2,166666.666667,333333.333333,500000.0


In [None]:
def read_expression(file1, file2, usecols):
    # point-wise addition of exon and intron tables
    reader1 = pd.read_csv(file1, header=0, usecols=usecols, chunksize=1000)
    reader2 = pd.read_csv(file2, header=0, usecols=usecols, chunksize=1000)

    df = normalize(reader1.get_chunk() + reader2.get_chunk())
    for i in range(1, 51):
        df = pd.concat([df, normalize(reader1.get_chunk() + reader2.get_chunk())])
        if i%10==0:
            print('{}/50'.format(i))

    return df.transpose()

In [None]:
exp_glut = read_expression(path+'exon.csv', path+'intron.csv', cols_glut) # 10525 * 50281
exp_non = read_expression(path+'exon.csv', path+'intron.csv', cols_non) # 914 * 50281
exp_gaba = read_expression(path+'exon.csv', path+'intron.csv', cols_gaba) # 4164 * 50281

10/50
20/50
30/50
40/50
50/50
10/50
20/50
30/50
40/50
50/50
10/50
20/50
30/50
40/50
50/50


In [None]:
np.save(path+'exp_glut', exp_glut)
np.save(path+'exp_non', exp_non)
np.save(path+'exp_gaba', exp_gaba)

# clear RAM
del exp_glut, exp_non, exp_gaba

In [None]:
exp_glut = np.load(path+'exp_glut.npy')[:5000-914]
exp_non = np.load(path+'exp_non.npy')

# first 4086 are Glutamatergic, last 914 are Non-neuronal
exp_train = np.concatenate([exp_glut, exp_non], axis=0)

# clear RAM
del exp_glut, exp_non

In [None]:
np.save(path+'exp_train', exp_train)

## Dimension Reduction

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
pipe = Pipeline([('scaler1', StandardScaler()), ('pca', PCA(n_components=5000)), ('scaler2', MinMaxScaler())])

In [None]:
#exp_train = np.load(path+'exp_train.npy')
exp_train = pipe.fit_transform(exp_train)
np.save(path+'exp_train_pca', exp_train)
del exp_train

In [None]:
exp_test = np.load(path+'exp_gaba.npy')
exp_test = pipe.transform(exp_test)
np.save(path+'exp_test_pca', exp_test)
del exp_test

## Classification

In [None]:
x = np.concatenate([np.load(path+'exp_train_pca.npy'), np.load(path+'exp_test_pca.npy')], axis=0)
y = np.concatenate([np.zeros(4086), np.ones(914), 2*np.ones(4164)]).astype('int8')

In [None]:
dimension = x.shape[1]

In [None]:
def make_classfication_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(1024, input_shape=(dimension,), activation='relu'))
    model.add(layers.Dense(256, activation='relu'))
    #model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))

    return model

In [None]:
model = make_classfication_model()

In [None]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.fit(x, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f12b6e52780>

In [None]:
model.predict(x[0:1])

array([[nan, nan, nan]], dtype=float32)

In [None]:
model.weights

[<tf.Variable 'dense/kernel:0' shape=(5000, 1024) dtype=float32, numpy=
 array([[ 0.00292103, -0.00271823,  0.03652097, ..., -0.00848656,
          0.01721019,  0.0325576 ],
        [ 0.01478301, -0.0146351 ,  0.01334175, ...,  0.02601389,
          0.01639932, -0.01862054],
        [ 0.02751722,  0.02044688, -0.01908464, ...,  0.0147965 ,
         -0.00430434, -0.0043278 ],
        ...,
        [-0.00011265,  0.01241507,  0.00128024, ..., -0.02809043,
         -0.02561568,  0.03600849],
        [ 0.00494689, -0.01594155, -0.00071997, ...,  0.00676259,
          0.0356102 , -0.01729849],
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(1024,) dtype=float32, numpy=
 array([-0.0039803 , -0.00433646,  0.00988532, ..., -0.00057972,
         0.00437393,  0.00635676], dtype=float32)>,
 <tf.Variable 'dense_1/kernel:0' shape=(1024, 512) dtype=float32, numpy=
 array([[        nan,         

## GAN

In [None]:
exp_train = np.load(path+'exp_train_pca.npy')

In [None]:
dimension = exp_train.shape[1]

In [None]:
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
def make_generator_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(256, use_bias=True, input_shape=(128,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Dense(512, use_bias=True))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Dense(512, use_bias=True))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Dense(1024, use_bias=True))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Dense(2048, use_bias=True))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    # sigmoid function will make the range [0,1]
    model.add(layers.Dense(dimension, use_bias=True, activation='sigmoid'))

    return model

In [None]:
def make_discriminator_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(1024, use_bias=True, input_shape=(dimension,)))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))

    model.add(layers.Dense(512, use_bias=True))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))

    model.add(layers.Dense(256, use_bias=True))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))

    model.add(layers.Dense(1))

    return model

In [None]:
generator = make_generator_model()
discriminator = make_discriminator_model()

In [None]:
generator.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_108 (Dense)            (None, 256)               33024     
_________________________________________________________________
batch_normalization_53 (Batc (None, 256)               1024      
_________________________________________________________________
leaky_re_lu_86 (LeakyReLU)   (None, 256)               0         
_________________________________________________________________
dense_109 (Dense)            (None, 512)               131584    
_________________________________________________________________
batch_normalization_54 (Batc (None, 512)               2048      
_________________________________________________________________
leaky_re_lu_87 (LeakyReLU)   (None, 512)               0         
_________________________________________________________________
dense_110 (Dense)            (None, 512)             

In [None]:
discriminator.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_114 (Dense)            (None, 1024)              5121024   
_________________________________________________________________
leaky_re_lu_91 (LeakyReLU)   (None, 1024)              0         
_________________________________________________________________
dropout_33 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_115 (Dense)            (None, 512)               524800    
_________________________________________________________________
leaky_re_lu_92 (LeakyReLU)   (None, 512)               0         
_________________________________________________________________
dropout_34 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_116 (Dense)            (None, 256)             

In [None]:
def discriminator_loss(real_output, fake_output):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss

In [None]:
def generator_loss(fake_output):
    return cross_entropy(tf.ones_like(fake_output), fake_output)

In [None]:
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

In [None]:
def train_step(images):
    noise = tf.random.normal([BATCH_SIZE, 128])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

In [None]:
def train(dataset, epochs):
    for epoch in range(epochs):
        start = time.time()

        for data in dataset:
            train_step(data)

        print ('Time for epoch {} is {} sec'.format(epoch + 1, time.time()-start))

In [None]:
BATCH_SIZE = 500
dataset = tf.data.Dataset.from_tensor_slices(exp_train).shuffle(1000).batch(BATCH_SIZE)

In [None]:
train(dataset, 10)

Time for epoch 1 is 1.1117630004882812 sec
Time for epoch 2 is 0.933995246887207 sec
Time for epoch 3 is 0.9195923805236816 sec
Time for epoch 4 is 0.8918173313140869 sec
Time for epoch 5 is 0.9009981155395508 sec
Time for epoch 6 is 0.8773994445800781 sec
Time for epoch 7 is 0.8955671787261963 sec
Time for epoch 8 is 0.9076099395751953 sec
Time for epoch 9 is 0.881399393081665 sec
Time for epoch 10 is 0.8846614360809326 sec


In [None]:
exp_gen = generator(tf.random.normal([1000,128]))

## Evaluation

In [None]:
from scipy.spatial import distance

In [None]:
mean_glut = np.mean(np.load(path+'exp_train_pca.npy')[:4086], axis=0)
mean_non = np.mean(np.load(path+'exp_train_pca.npy')[4086:], axis=0)
mean_gaba = np.mean(np.load(path+'exp_test_pca.npy'), axis=0)

In [None]:
# L1 distance
res = [0, 0, 0]
for exp in exp_gen:
    d1 = np.sum(np.abs(exp-mean_glut))
    d2 = np.sum(np.abs(exp-mean_non))
    d3 = np.sum(np.abs(exp-mean_gaba))
    i = np.argmin([d1, d2, d3])
    res[i] += 1

assert sum(res) == exp_gen.shape[0]

print(res)

[759, 241, 0]


In [None]:
# L2 distance
res = [0, 0, 0]
for exp in exp_gen:
    d1 = np.sum((exp-mean_glut)**2)
    d2 = np.sum((exp-mean_non)**2)
    d3 = np.sum((exp-mean_gaba)**2)
    i = np.argmin([d1, d2, d3])
    res[i] += 1

assert sum(res) == exp_gen.shape[0]

print(res)

[759, 241, 0]


In [None]:
# cosine distance
res = [0, 0, 0]
for exp in exp_gen:
    d1 = distance.cosine(exp, mean_glut)
    d2 = distance.cosine(exp, mean_non)
    d3 = distance.cosine(exp, mean_gaba)
    i = np.argmin([d1, d2, d3])
    res[i] += 1

assert sum(res) == exp_gen.shape[0]

print(res)

[767, 233, 0]
