# Infer Ontology from Clustering
This notebook attempts to infer gene ontology for 'dark' genes by hierarchically clustering proteins based on a data source (e.g. CCLE expression).



In [1]:
import pandas as pd
from clustergrammer_widget import *
from scipy.spatial.distance import pdist, squareform
# Shared Network object; the cells below use it to load, normalize, and cluster the data
net = Network()

In [2]:
# Gene metadata; later cells index it as gene_info[<class>]['all'] and gene_info[<class>]['dark']
gene_info = net.load_json_to_dict('../grant_pois/gene_info_with_dark.json')

In [3]:
# Load the CCLE expression table into the Network object
net.load_file('../hzome_data/my_CCLE_exp.txt')

In [4]:
# Export the loaded matrix as a DataFrame so it can be filtered in the cells below
ccle = net.export_df()

In [5]:
# Sanity-check the dimensions of the exported matrix
ccle.shape

(18874, 1037)

## Filter for genes of interest

In [18]:
# All kinase gene names known to the gene metadata
all_kin = gene_info['KIN']['all']

# Put genes on the columns so they can be selected by name below.
# An unconditional `ccle = ccle.transpose()` flips the frame back every time
# this cell is re-run, so only transpose while no kinase name appears among
# the column labels (i.e. the genes are still on the rows).
if not set(all_kin).intersection(ccle.columns):
    ccle = ccle.transpose()

all_genes = ccle.columns.tolist()

In [7]:
# Kinases that also appear in the expression data, in sorted order
shared_genes = set(all_genes) & set(all_kin)
found_kin = sorted(shared_genes)

In [8]:
# Number of kinases that were found in the expression data
len(found_kin)

504

In [9]:
# Select only the kinase columns, then flip so that genes are the rows again
ccle_kin = ccle[found_kin].T
ccle_kin.shape

(504, 1037)

In [10]:
# Replace the data held by the Network object with the kinase-only matrix
net.load_df(ccle_kin)

## Calculate distance matrix (converted to cosine similarity)

In [11]:
# Z-score normalize along the row axis, then export the normalized matrix.
# NOTE(review): keep_orig=False presumably discards the un-normalized values — confirm against clustergrammer docs
net.normalize(axis='row', norm_type='zscore', keep_orig=False)
hzome_data = net.export_df()

In [12]:
# Pairwise cosine distance between rows, expanded from condensed to square
# form and converted to similarity (1 - distance)
inst_dm = 1 - squareform(pdist(hzome_data, metric='cosine'))

In [20]:
# Map gene-class keys to human-readable titles
class_titles = {
    'KIN': 'Kinases',
    'IC': 'Ion Channels',
    'GPCR': 'GPCRs',
}

# Gene class analysed in this notebook and its display title
gene_class = 'KIN'
gene_title = class_titles[gene_class]

In [26]:
# Label each found gene with a (display name, dark-gene category) tuple
################################
found_genes_cat = []
for gene in found_kin:
    # Category string records whether the gene is listed as dark for this class
    dark_label = (
        'Dark Gene: True'
        if gene in gene_info[gene_class]['dark']
        else 'Dark Gene: False'
    )
    found_genes_cat.append((gene_title + ': ' + gene, dark_label))

In [32]:
# Zero out entries whose absolute value falls below the cutoff
sim_cutoff = 0.15
below_cutoff = abs(inst_dm) < sim_cutoff
inst_dm[below_cutoff] = 0

In [33]:
# Wrap the thresholded matrix in a DataFrame, labelling rows and columns with the (name, category) tuples
df_dm = pd.DataFrame(data=inst_dm, columns=found_genes_cat, index=found_genes_cat)

In [34]:
# Load the labelled gene-by-gene matrix into the Network object
net.load_df(df_dm)

In [35]:
# Cluster the loaded matrix.
# NOTE(review): views=[] presumably skips generating additional filtered views — confirm against clustergrammer docs
net.make_clust(views=[])

In [36]:
# Display the clustergram widget for the clustered network
clustergrammer_widget(network=net.widget())

* group 1: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=1bopq
* group 2: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=1bopr
* group 3: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=1bops