In [28]:
import numpy as np
import pandas as pd
from tqdm import trange

# Load 2020 MGI data

In [20]:
# Open the .npz archive holding the binary matrix and its labels.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; only use it with an archive from a trusted source.
bi = np.load("mgi_binary_matrix_2020_06.npz", allow_pickle=True)

In [21]:
# Pull the three arrays out of the archive in one step: the matrix stored under
# 'nonsymmetric' and the arrays stored under 'index' and 'columns'.
ns, idx, col = (bi[key] for key in ('nonsymmetric', 'index', 'columns'))

In [22]:
# Wrap the matrix in a DataFrame. No labels are passed, so rows and columns are
# addressed by default integer positions.
# NOTE(review): presumably `idx` / `col` hold the matching row / column labels — confirm.
df = pd.DataFrame(ns)


# Preprocessing 

`attributes_to_genes` maps each attribute (a condition or function, identified by its column position) to the row indices of the genes associated with it.

In [29]:
# For every column position, record which rows carry a 1 in that column.
# np.where returns a 1-tuple of index arrays, so consumers read element [0].
attributes_to_genes = {}
for attr_pos in trange(len(col)):
    is_member = df.iloc[:, attr_pos] == 1
    attributes_to_genes[attr_pos] = np.where(is_member)

100%|██████████| 9619/9619 [00:08<00:00, 1179.64it/s]


# Similarity matrix

Start by computing the gene-by-gene correlation matrix. With it, a gene's score for a given set is easy to compute: it is simply the average of that gene's correlations with the other genes in the set.

In [31]:
# np.corrcoef treats each row of `df` as one variable, so `cor` is a square
# matrix of pairwise correlations between rows.
# NOTE(review): a row with zero variance would yield NaN entries — confirm none exist.
cor = np.corrcoef(df)

In [32]:
# Initialize the similarity matrix with zeros: one row per entry of `idx`
# and one column per entry of `col`.
sim = np.zeros((len(idx), len(col)))

In [33]:
# Convert to a pandas DataFrame so rows can be selected by position with
# .iloc. Note that this rebinds `cor`: after this cell it is a DataFrame
# rather than the array returned by np.corrcoef.
cor = pd.DataFrame(cor)

In [41]:
# For every attribute (column j), score each member gene by its mean
# correlation with the *other* genes annotated with that attribute.
# np.asarray accepts `cor` whether it is a DataFrame or a plain ndarray.
cor_values = np.asarray(cor)
for j in trange(len(col)):
    # np.where stored a 1-tuple; element 0 is the array of member row indices.
    indices = attributes_to_genes[j][0]
    n = len(indices) - 1
    # Empty or single-gene sets have no "other" members to average over,
    # so their entries in `sim` stay at zero.
    if n <= 0:
        continue
    # Square members-by-members block of the correlation matrix. Slicing only
    # the rows and then reading column `i` (a position *within* `indices`)
    # would pick up correlations with global gene `i`, i.e. an unrelated gene.
    block = cor_values[np.ix_(indices, indices)]
    # Each column sum includes the gene's self-correlation (the diagonal);
    # remove it before averaging over the remaining n members.
    sim[indices, j] = (block.sum(axis=0) - np.diag(block)) / n

100%|██████████| 9619/9619 [00:43<00:00, 218.88it/s]


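Check whether any row sums to 0. If one does, something is probably wrong, because each gene is associated with at least one function. (One benign cause: sets containing a single gene are skipped in the loop above, so a gene that belongs only to such sets keeps an all-zero row.)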

In [73]:
# Gather every row of `sim` whose entries add up to exactly zero, then print
# each of them in order.
zero_sum_rows = [scores for scores in sim if sum(scores) == 0]
for scores in zero_sum_rows:
    print(scores)

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [83]:
# Parse the phenotype library into {phenotype name: [gene, ...]}.
# Each line is tab-separated: the term is field 0, field 1 is skipped, and the
# gene names start at field 2.
d = {}
with open("MGI_Mammalian_Phenotype_Level_4_2019.txt") as f:
    for line in f:
        # Strip the line ending first, so the last gene neither keeps a trailing
        # newline nor gets thrown away together with it.
        fields = line.rstrip("\r\n").split("\t")
        # The phenotype name is the term minus its first whitespace-separated token.
        ph = " ".join(fields[0].split()[1:])
        # Skip empty fields (e.g. produced by a trailing tab) rather than
        # assuming the final field is always disposable.
        d[ph] = [gene for gene in fields[2:] if gene]

In [84]:
# Invert the phenotype -> genes mapping into gene -> list of phenotypes,
# preserving the order in which phenotypes are encountered.
gene_dict = {}
for phenotype, members in d.items():
    for gene in members:
        gene_dict.setdefault(gene, []).append(phenotype)

In [86]:
# Iterating a dict yields its keys, so this lists every gene name in insertion order.
gene_list = list(gene_dict)

In [87]:
# Display the array loaded from the archive's 'index' entry.
idx

array(['0610010K14Rik', '1110017D15Rik', '1600029I14Rik', ..., 'Zzef1',
       'a', 'mt-Rnr2'], dtype=object)

In [89]:
# Overlap between the entries of `col` and the gene names parsed from the
# phenotype file.
# NOTE(review): `col` appears to label attributes rather than genes, which
# would explain an empty overlap; if a gene-to-gene comparison was intended,
# `idx` may be the array to compare — confirm.
aset, bset = set(col), set(gene_list)
cset = aset & bset
cset

set()

In [93]:
# NOTE(review): imports normally belong in the first cell of the notebook.
import h5py
# Open the TCGA HDF5 file. This rebinds `f`, which an earlier cell used for a
# text file handle.
# NOTE(review): "r+" opens the file read/write and the handle is not closed in
# the cells shown here; if only reads are needed, "r" would be safer — confirm.
f = h5py.File("tcga.hdf5", "r+")

  from ._conv import register_converters as _register_converters


In [115]:
# Read the 'genes' dataset under 'meta' into an array, transpose it, and keep
# the first row of the result as a plain Python list.
g = np.array(f['meta']['genes']).T.tolist()[0]

In [121]:
# Convert the gene names read from the HDF5 file to text.
# Decoding bytes directly avoids slicing the "b'...'" repr, which mangles
# escaped or non-ASCII bytes and would chop characters off any value that is
# already a str.
gene_arr = [
    s.decode("utf-8") if isinstance(s, bytes) else str(s)
    for s in g
]

In [126]:
# Build sets from the column labels, the gene names read from the HDF5 file and
# the gene names from the phenotype file, then keep the names present in both
# gene collections.
seta, setb, setc = set(col), set(gene_arr), set(gene_list)
setd = setb & setc

In [127]:
# Size of the overlap between the two gene-name sets.
len(setd)

10015

In [131]:
# Report the size of each gene-name set on its own line: first setc, then setb.
print(len(setc), len(setb), sep="\n")

13420
38550
