# Imports

In [1]:
import os
import numpy as np
import scipy as sp
import pandas as pd

In [2]:
import h5py
import h5sparse
from scipy.sparse import csc_matrix, csr_matrix, coo_matrix, vstack
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

  from ._conv import register_converters as _register_converters


In [3]:
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

In [4]:
# Temporarily switch into ../scripts so the project-local helper modules
# (gvm_scripts, vae_scripts) can be imported from the working directory,
# then always return to the directory the notebook was started in.
_start_dir = os.getcwd()
os.chdir('../scripts')
try:
    from gvm_scripts import *
    from vae_scripts import *
finally:
    # Restore the original working directory even if an import fails, and
    # without assuming the notebook folder is literally named 'notebooks',
    # so the relative '../data' and '../models' paths below keep resolving.
    os.chdir(_start_dir)

Using TensorFlow backend.


In [5]:
# Tab-separated gene information table; its first column becomes the index.
all_genes = pd.read_csv(
    '../data/AE/genes_info.tsv',
    sep='\t',
    index_col=0,
)

# Format gmt

### Convert to gene vector matrix (gvm)

In [6]:
# Input gene-set library file.
gmt_fname = '../data/demo.txt'

# Library name: the input file name with directory and extension stripped.
lib_name = os.path.splitext(os.path.basename(gmt_fname))[0]

# Output locations for the raw and the re-indexed ("formatted") gvm files.
gvm_fname = ''.join(['../data/', lib_name, '.h5'])
formatted_gvm_fname = ''.join(['../data/', lib_name, '_FORMATTED.h5'])

In [7]:
# Build the gvm from the gmt file only when no gvm file exists yet at
# `gvm_fname`; otherwise reuse the existing file.
if not os.path.isfile(gvm_fname):
    gvm = convert_genesetlist(
        get_genesetlist(gmt_fname, 'gmt_fname'),
        to='gvm_h5',
        output_fname=gvm_fname,
    )
else:
    gvm = open_gvm(gvm_fname)

In [8]:
# Shape of the gene vector matrix stored under the 'gvm' key.
gvm['gvm'].shape

(70, 23289)

Each row of the matrix is the gene vector of one gene set. The number of rows is therefore the number of gene sets (samples), and the number of columns is the total number of genes.

### Re-initialize gene column-index to match that of the matrix used to train the autoencoder

In [9]:
# Re-index the gvm columns against `all_genes` and write the result to
# `formatted_gvm_fname`; only the summary is kept here.
summary = format_gvm_h5(
    gvm_fname=gvm_fname,
    all_genes=all_genes,
    output_fname=formatted_gvm_fname,
    max_gs_loss=1.0,
    min_gs_size=1,
    overwrite=False,
    return_value='summary',
)

../data/trunc_alz_genes_proper.gmt.h5:
	8445 out of 23289 genes were removed.
	3629 genes were mapped onto a pre-existing gene.
	0 out of 70 labels were removed.


Genes that are not in the data used to train the autoencoder are removed. Genes that are in the training data but missing from the gvm are added as empty columns. If a geneset loses a proportion of its genes greater than `max_gs_loss`, or if its number of genes is less than `min_gs_size`, it is removed from the gvm.

In [10]:
# Dimensions of the formatted gvm: label (gene-set) count and gene count.
n_labels, n_genes = get_gvm_size(formatted_gvm_fname)
n_labels, n_genes

(70, 19996)

# Get Latent Space Embedding

### Construct autoencoder

In [11]:
# Model family: 'AE' is the vanilla autoencoder.
group = 'AE'

batch_size = 128
# m: middle dimension, l: latent dimension.
m, l = 1000, 50

In [12]:
# Build the network with one input unit per gene column of the formatted gvm.
model = build_vae(
    input_dim=n_genes,
    middle_dim=m,
    latent_dim=l,
    batch_size=batch_size,
    optimizer='Adamax',
    lr=.001,
)
# Pull the full model, the encoder and the decoder out of the returned mapping.
vae = model['vae']
enc = model['enc']
dec = model['dec']

# Weights file is selected by model group and by the middle/latent sizes.
weights_path = '../models/%s/weights/%04dm_%04dl.h5'%(group, m, l)
vae.load_weights(weights_path)

Instructions for updating:
Colocations handled automatically by placer.


### Encode genesets

In [13]:
# Feed the formatted gvm to the encoder in batches. shuffle=False is
# presumably what keeps the rows of `z` in the same order as the gvm labels,
# which are attached positionally later on -- TODO confirm in GeneVec_Generator.
geneset_batches = GeneVec_Generator(
    formatted_gvm_fname,
    gvm_path='gvm',
    batch_size=1000,
    shuffle=False,
)
z = enc.predict_generator(
    geneset_batches,
    workers=4,
    use_multiprocessing=True,
    verbose=0,
)
z.shape

(70, 50)

# Compute Proximity Matrices

### Euclidean distance

In [14]:
# Pairwise Euclidean distances between the rows of the latent embedding `z`.
euc_dist = pairwise_distances(z, metric='euclidean')

In [15]:
# Smallest and largest pairwise Euclidean distance.
euc_dist.min(), euc_dist.max()

(0.0, 117.33293)

### Cosine similarity

In [16]:
# Pairwise cosine similarities between the rows of the latent embedding `z`.
cos_sim = cosine_similarity(z)

In [17]:
# Smallest and largest pairwise cosine similarity.
cos_sim.min(), cos_sim.max()

(-0.055102646, 1.0000004)

### Save results to pd.DataFrame

In [18]:
# Labels stored under 'idx' in the formatted gvm; both matrices are pairwise,
# so the same labels are used for the rows and the columns.
labels = open_gvm(formatted_gvm_fname)['idx']
label_axes = dict(index=labels, columns=labels)

euc_dist_df = pd.DataFrame(euc_dist, **label_axes)
cos_sim_df = pd.DataFrame(cos_sim, **label_axes)

In [19]:
# Preview the top-left 5x5 corner of the Euclidean distance table.
euc_dist_df.iloc[:5, :5]

Unnamed: 0,GSE103359_S14061_mm9_genesdiff_groups_F382,GSE103359_S15010_genesdiff_HS179_HS190_F330,GSE104704_F600,GSE111789_down_S200,GSE111789_up_S200
GSE103359_S14061_mm9_genesdiff_groups_F382,0.0,13.403594,22.606493,17.12236,17.807388
GSE103359_S15010_genesdiff_HS179_HS190_F330,13.403594,0.0,18.056818,20.566029,18.063286
GSE104704_F600,22.606493,18.056818,0.0,27.346394,27.705006
GSE111789_down_S200,17.12236,20.566029,27.346394,0.0,15.34773
GSE111789_up_S200,17.807388,18.063286,27.705006,15.34773,0.0


In [20]:
# Preview the top-left 5x5 corner of the cosine similarity table.
cos_sim_df.iloc[:5, :5]

Unnamed: 0,GSE103359_S14061_mm9_genesdiff_groups_F382,GSE103359_S15010_genesdiff_HS179_HS190_F330,GSE104704_F600,GSE111789_down_S200,GSE111789_up_S200
GSE103359_S14061_mm9_genesdiff_groups_F382,1.0,0.665615,0.619709,0.277656,0.311206
GSE103359_S15010_genesdiff_HS179_HS190_F330,0.665615,1.0,0.796785,0.143618,0.405349
GSE104704_F600,0.619709,0.796785,1.0,0.326417,0.326476
GSE111789_down_S200,0.277656,0.143618,0.326417,1.0,0.454049
GSE111789_up_S200,0.311206,0.405349,0.326476,0.454049,1.0


In [21]:
# Pickle both tables next to the input data, named after the library.
euc_pickle_path = '../data/%s_DIST_EUC.pkl'%lib_name
cos_pickle_path = '../data/%s_DIST_COS.pkl'%lib_name

euc_dist_df.to_pickle(euc_pickle_path)
cos_sim_df.to_pickle(cos_pickle_path)

### Demo for loading results

In [23]:
# Round-trip check: reload the pickled cosine-similarity table and confirm it
# matches the in-memory frame.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were written by this notebook or another trusted source.
cos_sim_df2 = pd.read_pickle('../data/%s_DIST_COS.pkl'%lib_name)

# DataFrame.equals checks shape, labels and values in one call. Unlike
# np.all(df == df2) it returns False instead of raising when the labels
# differ, and it treats NaNs in matching positions as equal.
cos_sim_df.equals(cos_sim_df2)

True

# Clustergrammer

In [24]:
# Star-import the clustergrammer widget package; presumably this is what
# provides the `Network` and `clustergrammer_widget` names used in the next
# cell (the Network instance itself is created there, not here).
from clustergrammer_widget import *

### Cosine similarities

In [25]:
# create a Network instance, passing it the clustergrammer_widget class
net = Network(clustergrammer_widget)

# load the cosine-similarity DataFrame (an in-memory frame, not a file)
net.load_df(cos_sim_df)

# cluster using default parameters
net.cluster()

# make interactive widget; as the last expression it is rendered as the cell output
net.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "GSE103359_S14061_mm9_genesdiff_groups_F382", "ini": 70…