In [1]:
import h5py
import pandas as pd
import numpy as np
import json

  from ._conv import register_converters as _register_converters


In [2]:
f = h5py.File("tcga.hdf5", "r+")

In [3]:
# transpose to get 60000x60000 matrix
# ex_matrix = np.transpose(f['ex_matrix'])
data = f['data']
meta = f['meta']
genes = f['meta']['genes']
eids = f['meta']['ensembl_id']
gene_exp = data['processed_expression']

# Subcorrelation matrix

In [4]:
with open("biomart.json") as b:
    eid_to_type = json.load(b)
eid_to_type

{'ENSG00000210049': 'Mt_tRNA',
 'ENSG00000211459': 'Mt_rRNA',
 'ENSG00000210077': 'Mt_tRNA',
 'ENSG00000210082': 'Mt_rRNA',
 'ENSG00000209082': 'Mt_tRNA',
 'ENSG00000198888': 'protein_coding',
 'ENSG00000210100': 'Mt_tRNA',
 'ENSG00000210107': 'Mt_tRNA',
 'ENSG00000210112': 'Mt_tRNA',
 'ENSG00000198763': 'protein_coding',
 'ENSG00000210117': 'Mt_tRNA',
 'ENSG00000210127': 'Mt_tRNA',
 'ENSG00000210135': 'Mt_tRNA',
 'ENSG00000210140': 'Mt_tRNA',
 'ENSG00000210144': 'Mt_tRNA',
 'ENSG00000198804': 'protein_coding',
 'ENSG00000210151': 'Mt_tRNA',
 'ENSG00000210154': 'Mt_tRNA',
 'ENSG00000198712': 'protein_coding',
 'ENSG00000210156': 'Mt_tRNA',
 'ENSG00000228253': 'protein_coding',
 'ENSG00000198899': 'protein_coding',
 'ENSG00000198938': 'protein_coding',
 'ENSG00000210164': 'Mt_tRNA',
 'ENSG00000198840': 'protein_coding',
 'ENSG00000210174': 'Mt_tRNA',
 'ENSG00000212907': 'protein_coding',
 'ENSG00000198886': 'protein_coding',
 'ENSG00000210176': 'Mt_tRNA',
 'ENSG00000210184': 'Mt_tRNA',


In [5]:
def biomart_id_to_type(key):
    if (key in eid_to_type):
        return eid_to_type[key]
    else:
        return None

In [6]:
biomart_gene_types = np.transpose([ biomart_id_to_type(str(key)[2:-1]) for key in eids ])
biomart_filtered_types = biomart_gene_types[biomart_gene_types!=None]
counts = np.unique(biomart_filtered_types, return_counts=True)
counts

(array(['IG_C_gene', 'IG_C_pseudogene', 'IG_D_gene', 'IG_J_gene',
        'IG_J_pseudogene', 'IG_V_gene', 'IG_V_pseudogene', 'Mt_rRNA',
        'Mt_tRNA', 'TEC', 'TR_C_gene', 'TR_J_gene', 'TR_V_gene',
        'TR_V_pseudogene', 'lncRNA', 'miRNA', 'misc_RNA',
        'polymorphic_pseudogene', 'processed_pseudogene', 'protein_coding',
        'pseudogene', 'rRNA', 'rRNA_pseudogene', 'scRNA', 'scaRNA',
        'snRNA', 'snoRNA', 'transcribed_processed_pseudogene',
        'transcribed_unitary_pseudogene',
        'transcribed_unprocessed_pseudogene',
        'translated_processed_pseudogene',
        'translated_unprocessed_pseudogene', 'unitary_pseudogene',
        'unprocessed_pseudogene', 'vaultRNA'], dtype=object),
 array([   13,     7,    22,     7,     2,   140,    92,     2,    16,
          688,     6,    11,    94,    14,  9901,   279,   500,    19,
         4782, 18463,     9,     8,    77,     1,    16,   384,   214,
          353,   101,   623,     1,     1,    27,   790,     

In [7]:
g_types = { 
    "ig": ['IG_C_gene', 'IG_C_pseudogene', 'IG_D_gene', 'IG_J_gene',
        'IG_J_pseudogene', 'IG_V_gene', 'IG_V_pseudogene'],
    "mito": ['Mt_rRNA','Mt_tRNA'], 
    "tr": ['TR_C_gene', 'TR_J_gene', 'TR_V_gene','TR_V_pseudogene'], 
    "pseudo": ['pseudogene', 'transcribed_processed_pseudogene',
               'transcribed_unitary_pseudogene',
               'transcribed_unprocessed_pseudogene',
               'translated_processed_pseudogene',
               'translated_unprocessed_pseudogene',
               'unitary_pseudogene', 'unprocessed_pseudogene', 
               'polymorphic_pseudogene', 'processed_pseudogene'
              ], 
    "rRNA": ["rRNA", 'rRNA_pseudogene'], 
    "other_RNA": ['scRNA', 'scaRNA','snRNA', 'snoRNA', 'misc_RNA', 'vaultRNA'], 
    "protein_coding": ["protein_coding"], 
    "TEC": ["TEC"], "lncRNA": ["lncRNA"], "miRNA": ["miRNA"]
}

flipped = {}
keys = list(g_types.keys())
for i in range(len(g_types)):
    key = keys[i]
    val = g_types[key]
    for j in val:
        flipped[j] = key

In [8]:
print(len(flipped))
print(len(counts[0]))

35
35


In [9]:
gene_exp = pd.DataFrame(np.matrix(gene_exp))

In [16]:
def subcorrelation(gtypes):
    """
    Calculate subcorrelation by looping through the 
    gene types in order, putting their indices into
    categories and then getting the correlation matrix.
    """
    g_idx_dict = {"None": []}
    for idx in range(len(gtypes)):
        gtype = gtypes[idx]
        if gtype == None: continue 
        category = flipped[gtype]
        if category not in g_idx_dict: 
            g_idx_dict[category] = []
        g_idx_dict[category].append(idx)
        
    print("The dict has {} keys".format(len(g_idx_dict)))
    
    for key in g_idx_dict: 
        rows = g_idx_dict[key]
        print("{} has {} values".format(key, len(rows)))
        gene_matrix = gene_exp.iloc[rows]
        corr = np.triu(np.corrcoef(gene_matrix))
        np.fill_diagonal(corr, 0)
        subcor = data.create_dataset("{}_corr".format(key), data=corr)
        gene_order = meta.create_dataset("{}_genes".format(key), data=rows)

In [17]:
subcorrelation(biomart_gene_types)

The dict has 1 keys
None has 886 values


In [None]:
print(18463+6706+9901+283+1116+279+85+125+18+688)
print(sum(counts[1]))

In [18]:
print(list(data.keys()))
print(list(meta.keys()))
f.close()

['None_corr', 'TEC_corr', 'expression', 'ig_corr', 'lncRNA_corr', 'miRNA_corr', 'mito_corr', 'other_RNA_corr', 'processed_expression', 'protein_coding_corr', 'pseudo_corr', 'rRNA_corr', 'tr_corr']
['None_genes', 'TEC_genes', 'ensembl_id', 'genes', 'ig_genes', 'lncRNA_genes', 'miRNA_genes', 'mito_genes', 'other_RNA_genes', 'protein_coding_genes', 'pseudo_genes', 'rRNA_genes', 'tr_genes']


# Correlation matrix

In [7]:
# get the upper triangle and make the diagonal zero of the correlation matrix
cor = np.triu(np.corrcoef(exp))
np.fill_diagonal(cor,0)

In [8]:
# correlation matrix 
cor

array([[ 0.        ,  0.26270527,  0.14656306, ...,  0.25640679,
        -0.17067008,  0.21301928],
       [ 0.        ,  0.        ,  0.00520883, ...,  0.05731222,
        -0.06433745, -0.03968901],
       [ 0.        ,  0.        ,  0.        , ..., -0.08012387,
         0.06000723,  0.02973038],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.07912547,  0.33178938],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.00034693],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
correlation_matrix = f.create_dataset("correlation_matrix", data=cor)

In [14]:
cor = f['correlation_matrix']

In [16]:
# flatten the matrix to get the counts 
flat = np.matrix(cor).flatten()

In [17]:
# remove the zero values in the matrix
flat = flat[flat != 0]

In [22]:
# get the counts for the unique letters in the matrix 
counts = np.unique(flat, return_counts=True)
counts

(matrix([[-0.88279573, -0.87689072, -0.87543078, ...,  0.98587739,
           0.98638712,  0.98646871]]), array([0, 0, 0, ..., 1, 1, 1]))

In [19]:
# Number of 1 values 
counts[1][-1]

1

In [20]:
# Number of counts total (total number of correlations)
sum(counts[1])

743031975

In [21]:
counts

(matrix([[-0.88279573, -0.87689072, -0.87543078, ...,  0.98587739,
           0.98638712,  0.98646871]]), array([0, 0, 0, ..., 1, 1, 1]))

In [41]:
high = counts[0][-4900:]

In [42]:
for index in range(len(high)):
    val = high[index]
    if val >= 0.99:
        print(index+4900)
        break

4975


In [44]:
sum(counts[1][(-4900+75):])

2160824