In [65]:
import pandas as pd
import numpy as np
import h5py

  from ._conv import register_converters as _register_converters


# Preprocessing 

Converting the text file into a dictionary in which each phenotype is a key and its associated genes are in a list as a value.

In [17]:
# Parse the tab-separated gene-set file into `d`, mapping each phenotype term
# (first field of a line) to the list of gene symbols on that line.
# Field 1 is skipped, as in the original slice; genes start at field 2.
# NOTE(review): the file looks like an Enrichr gene-set library (term, empty
# description, genes...) -- confirm against the data source.
d = {}
with open("MGI_Mammalian_Phenotype_Level_4_2019.txt") as f:
    for line in f:
        fields = line.rstrip("\n").split("\t")
        ph = fields[0]
        if not ph:
            # Blank line: there is no phenotype to record.
            continue
        # Do not assume that every line ends with a trailing tab: strip the
        # newline first and drop empty fields, so the last gene on a line is
        # kept whether or not a tab precedes the newline.
        lst = [gene for gene in fields[2:] if gene]
        d[ph] = lst

In [20]:
# Display the phenotype -> gene-list dictionary built in the previous cell.
d

{'MP:0000600 liver hypoplasia': ['LIG1',
  'ARID3A',
  'TRP53',
  'TSC2',
  'TNIP1',
  'TSC1',
  'ATP11A',
  'ERN1',
  'RUNX2',
  'WT1',
  'MYCN',
  'MEIS1',
  'SP3',
  'KAT6A',
  'NF1',
  'RAF1',
  'RB1',
  'RELA',
  'RNASEH2B',
  'RPS27L',
  'HBB-B1',
  'SOCS3',
  'CDH5',
  'DDRGK1',
  'SH3PXD2A',
  'RDH10',
  'JARID2',
  'SPTBN1',
  'CCDC134',
  'SPEN',
  'XBP1',
  'HGF',
  'LSR',
  'EPOR',
  'DHFR',
  'PBX1',
  'EIF6',
  'TTC7',
  'LHX2',
  'KITL',
  'TCEA1',
  'TACC3',
  'KRAS',
  'ARF6',
  'KMT2A',
  'YBX1',
  'GREB1L',
  'SLC20A1',
  'IKZF1',
  'ARHGAP1',
  'GATA2',
  'GATA1',
  'HLX',
  'BAG1',
  'SOX17',
  'MBTD1',
  'ARIH2',
  'CTHRC1',
  'CDK5RAP3'],
 'MP:0001716 abnormal placenta labyrinth morphology': ['MED1',
  'SS18',
  'FOXD3',
  'FBXW8',
  'CDX2',
  'HUS1',
  'ARAP3',
  'GAB1',
  'ETV2',
  'BRAF',
  'DUSP9',
  'TGFBR1',
  'ERN1',
  'MAPK14',
  'CUL4B',
  'SP3',
  'SPINT1',
  'MYH9',
  'ASB4',
  'SYNB',
  'RAF1',
  'SYNA',
  'ATF2',
  'GGNBP2',
  'RB1',
  'DLX3',
  'MPI

Creating the reverse of the above dictionary, in which each gene is a key and its value is a list of its associated phenotypes.

In [22]:
# Invert `d`: map every gene symbol to the list of phenotypes whose gene list
# contains it, in the order the phenotypes are encountered in `d`.
gene_dict = {}
for phenotype, members in d.items():
    for symbol in members:
        gene_dict.setdefault(symbol, []).append(phenotype)

In [23]:
# Display the gene -> phenotype-list dictionary built in the previous cell.
gene_dict

{'LIG1': ['MP:0000600 liver hypoplasia',
  'MP:0001698 decreased embryo size',
  'MP:0000245 abnormal erythropoiesis',
  'MP:0003717 pallor',
  'MP:0009395 increased nucleated erythrocyte cell number',
  'MP:0003984 embryonic growth retardation',
  'MP:0000596 abnormal liver development',
  'MP:0001577 anemia',
  'MP:0000603 pale liver',
  'MP:0000601 small liver',
  'MP:0002123 abnormal definitive hematopoiesis',
  'MP:0002020 increased tumor incidence',
  'MP:0003763 abnormal thymus physiology',
  'MP:0012431 increased lymphoma incidence',
  'MP:0011099 lethality throughout fetal growth and development, complete penetrance',
  'MP:0001262 decreased body weight',
  'MP:0001732 postnatal growth retardation',
  'MP:0000248 macrocytosis',
  'MP:0002643 poikilocytosis',
  'MP:0008973 decreased erythroid progenitor cell number',
  'MP:0009308 increased adenocarcinoma incidence',
  'MP:0000208 decreased hematocrit',
  'MP:0000321 increased bone marrow cell number',
  'MP:0000691 enlarged sp

Create a list of functions and a list of genes in the correct index order for the gene set matrix, which is a binary matrix with genes as rows and functions (phenotypes) as columns. If a gene is associated with a phenotype, the corresponding cell receives a value of 1. Otherwise, the cell receives a value of 0.

In [31]:
# Column order of the matrix follows the keys of `d` (phenotypes/functions);
# row order follows the keys of `gene_dict` (genes).
functions = [*d]
genes = [*gene_dict]
# Start from an all-zero genes x functions matrix.
gene_set = np.zeros(shape=(len(genes), len(functions)))

In [32]:
# Fill the binary membership matrix: gene_set[row, col] = 1 when the gene in
# that row is associated with the function (phenotype) in that column.
#
# Resolve each function's column once through a dictionary instead of testing
# `f in gene_dict[gene]` (a linear list scan) for every gene/function pair.
col_of_function = {name: col for col, name in enumerate(functions)}
for row, gene in enumerate(genes):
    cols = [
        col_of_function[name]
        for name in gene_dict[gene]
        if name in col_of_function
    ]
    if cols:
        gene_set[row, cols] = 1

In [33]:
# Wrap the membership matrix in a DataFrame for a tabular display.
pd.DataFrame(gene_set)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5251,5252,5253,5254,5255,5256,5257,5258,5259,5260
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Report the matrix dimensions: number of genes, then number of functions,
# one per line.
print(len(genes), len(functions), sep="\n")

13420
5261


Create dictionaries to easily get the index associated with a particular gene or function for later computations.

In [38]:
# Reverse lookups from a name to its position in the ordering lists:
# gene symbol -> index in `genes`
gene_to_idx = {symbol: pos for pos, symbol in enumerate(genes)}
# function (phenotype) name -> index in `functions`
f_to_idx = {name: pos for pos, name in enumerate(functions)}

# Similarity matrix

I'm going to try this for now, before I ask for help on Monday. To compute the similarity matrix, I will compare each gene to every other gene in a given set, take the average correlation, and save it in a matrix that has genes as rows and sets as columns. 

Start by computing the gene-by-gene correlation matrix. With it, the average correlation of a gene within a set is easy to compute: just take the average of that gene's correlations with the other genes in the set.

In [40]:
# Correlation coefficients between the rows of gene_set: each row (one gene)
# is treated as a variable, so the result is genes x genes.
cor = np.corrcoef(gene_set, rowvar=True)

In [41]:
# Sanity check: the correlation matrix should be square, M x M,
# where M is the number of genes (i.e. len(genes)).
cor.shape

(13420, 13420)

In [47]:
# Similarity matrix, all zeros to start: one row per gene and one column per
# function, the same layout as gene_set.
sim = np.zeros(shape=(len(genes), len(functions)))

In [None]:
# Wrap the correlation matrix in a pandas DataFrame so that rows can be
# selected by position with .iloc further down.
cor = pd.DataFrame(data=cor)

Create dictionary to convert the list of genes associated with each function to their corresponding indices in the list of genes above. This would help simplify the process of taking the average of the other correlations for each set by using the .iloc function in Pandas that allows row selection.

In [45]:
# For every function, translate its gene symbols into their row indices
# (positions in `genes`) via gene_to_idx, keeping the original gene order.
gene_indices = {
    func: [gene_to_idx[symbol] for symbol in members]
    for func, members in d.items()
}

In [50]:
# Display the function -> gene-row-indices dictionary built in the previous cell.
gene_indices

{'MP:0000600 liver hypoplasia': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58],
 'MP:0001716 abnormal placenta labyrinth morphology': [59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  7,
  71,
  72,
  12,
  73,
  74,
  75,
  76,
  15,
  77,
  78,
  79,
  16,
  80,
  81,
  82,
  83,
  84,
  85,
  21,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138],
 'MP:0001698 decreased

Loop through the indices of functions. From gene_indices,
we can get a list of the indices of genes that are associated
with that particular set/function. We can start by selecting
a sub-correlation matrix with only the rows corresponding to
the indices chosen.

Loop through these indices. For each row index, sum its column
and subtract 1 (for the self-correlation) and divide by the 
number of indices minus 1 to get the mean correlation of this
particular gene to every other gene in this set.

In [61]:
# For every set (column j of `sim`), store for each member gene the mean of
# its correlations with the OTHER members of the same set.
#
# The correlations are read as a plain array so this cell works whether `cor`
# is still a NumPy array or has already been wrapped in a DataFrame.
cor_values = np.asarray(cor)
for j, f in enumerate(functions):
    indices = np.asarray(gene_indices[f], dtype=np.intp)
    n = len(indices) - 1  # number of *other* genes in the set
    if n < 1:
        # A set with fewer than two genes has no "other" members, so the mean
        # is undefined; record NaN explicitly instead of dividing by zero.
        sim[indices, j] = np.nan
        continue
    # Restrict BOTH rows and columns to the set's members. Column k of this
    # block belongs to gene indices[k]; selecting a column by its position in
    # the full matrix instead would pick an unrelated gene for every set whose
    # indices are not 0..n.
    temp = cor_values[np.ix_(indices, indices)]
    # Each column sum includes the gene's self-correlation (the diagonal);
    # remove it and average over the remaining n genes.
    sim[indices, j] = (temp.sum(axis=0) - np.diag(temp)) / n

Check if any row sums to 0. If one does, then there is probably something wrong, because each gene is associated with at least one function.

In [64]:
# Sanity check: print every row of `sim` whose entries sum to exactly 0.
# Row sums are computed in one vectorised call rather than with Python's
# built-in sum() on each row.
# Note: a row containing NaN sums to NaN and is therefore not reported here.
row_sums = sim.sum(axis=1)
for row in sim[row_sums == 0]:
    print(row)

Save the similarity matrix to HDF5 file format since it took a nontrivial amount of time to run.

In [66]:
# Write the similarity matrix to "mpi.hdf5" under the dataset name
# "similarity", then list the file's top-level keys as confirmation.
# The context manager closes the file even if writing the dataset fails.
with h5py.File("mpi.hdf5", "w") as f:
    f.create_dataset("similarity", data=sim)
    print(list(f.keys()))

['similarity']
