# Mouse Gene Ontology (MGI)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://www.informatics.jax.org/

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/MGI/untility_functions.py'>

# Load Data

In [3]:
df = pd.read_table('http://www.informatics.jax.org/downloads/reports/MGI_GenePheno.rpt', header=None)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,,MGI:2166359


In [5]:
df.shape

(204952, 9)

# Get Relevent Columns and Name Them

In [6]:
df.drop(7, axis=1, inplace=True) # drop blank column

In [7]:
col = ['Allelic Composition','Allele Symbol(s)','Allele ID(s)','Genetic Background','Mammalian Phenotype ID','PubMed ID','MGI Marker Accession ID (comma-delimited)','MGI Genotype Accession ID (comma-delimited)']

In [8]:
df.columns = col

In [9]:
df.head()

Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID,MGI Marker Accession ID (comma-delimited),MGI Genotype Accession ID (comma-delimited)
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:2166359


# Load Gene Data

In [10]:
gene_meta = pd.read_table('http://www.informatics.jax.org/downloads/reports/MRK_GXDAssay.rpt', header=None)

In [11]:
gene_meta.set_index(0, inplace=True)

In [12]:
gene_meta.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
MGI:1353431,Pcsk1n,"MGI:5421241,MGI:2662712,MGI:3586139,MGI:482710..."
MGI:1931526,Hist1h1c,"MGI:4825375,MGI:5291705,MGI:3845788"
MGI:2177151,Cd99l2,"MGI:3806997,MGI:3806967,MGI:3806994,MGI:382859..."
MGI:1916858,Sec24d,"MGI:4827936,MGI:5698498"
MGI:1915444,Ndufb4,"MGI:3691813,MGI:5423678,MGI:3691814"


# Get Relevent Data and Map Gene ID's to Symbols

In [13]:
ontology_df = pd.DataFrame()

i = 0

for index in df.index:
    
    i += 1
    
    progress = (i/len(df.index))*100
        
    sys.stdout.write("Progress: %d%%   \r" % (progress))
    sys.stdout.flush()
    
    
    
    if len(df.ix[index, 'MGI Marker Accession ID (comma-delimited)'].split(',')) > 1:
        for mgi in df.ix[index, 'MGI Marker Accession ID (comma-delimited)'].split(','):
            if mgi in gene_meta.index:
                lst = []
                lst.append(gene_meta.ix[mgi, 1])
                lst.append(df.ix[index, 'Mammalian Phenotype ID'])
                temp = pd.DataFrame(data=lst)
                df = pd.concat([df,temp.T])
    else:
        mgi = df.ix[index, 'MGI Marker Accession ID (comma-delimited)'].split(',')[0]
        if mgi in gene_meta.index:
            lst = []
            lst.append(gene_meta.ix[mgi, 1])
            lst.append(df.ix[index, 'Mammalian Phenotype ID'])
            temp = pd.DataFrame(data=lst)
            ontology_df = pd.concat([ontology_df,temp.T])

Progress: 100%   

In [14]:
ontology_df.drop_duplicates(inplace=True)

ontology_df.reset_index(inplace=True)

ontology_df.drop('index', axis=1,inplace=True)

In [25]:
ontology_df.head()

Unnamed: 0,0,1
0,Rb1,MP:0000600
1,Rb1,MP:0001716
2,Rb1,MP:0001698
3,Rb1,MP:0001092
4,Rb1,MP:0000961


In [19]:
ontology_df.shape

(140084, 1)

# Load Ontology Metadata

In [20]:
ontology_meta = pd.read_table('http://www.informatics.jax.org/downloads/reports/VOC_MammalianPhenotype.rpt', header=None)

In [22]:
ontology_meta.set_index(0, inplace=True)

In [23]:
ontology_meta.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
MP:0000001,mammalian phenotype,"the observable morphological, physiological, b..."
MP:0000002,obsolete Morphology,OBSOLETE.
MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...
MP:0000005,increased brown adipose tissue amount,increased amount of the thermogenic form of ad...
MP:0000008,increased white adipose tissue amount,increased quantity of fat-storing cells/tissue


# Map Ontology to Ontology ID's

In [28]:
lst = []

for i,index in enumerate(ontology_df.index):

    progress = ((i+1)/len(ontology_df.index))*100
        
    sys.stdout.write("Progress: %d%%   \r" % (progress))
    sys.stdout.flush()
    
    lst.append(ontology_meta.ix[ontology_df.ix[index, 1], 1])

ontology_df[1] = lst

Progress: 99%   

In [29]:
ontology_df.drop_duplicates(inplace=True)

ontology_df.reset_index(inplace=True)

ontology_df.drop('index', axis=1,inplace=True)

In [30]:
ontology_df.head()

Unnamed: 0,0,1
0,Rb1,liver hypoplasia
1,Rb1,abnormal placenta labyrinth morphology
2,Rb1,decreased embryo size
3,Rb1,abnormal trigeminal ganglion morphology
4,Rb1,abnormal dorsal root ganglion morphology


In [31]:
ontology_df.shape

(140084, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [32]:
ontology_df.set_index(0, inplace=True)

In [33]:
uf.mapgenesymbols(ontology_df)

Progeres: 100%  140084 Out of 140084   

In [34]:
ontology_df.shape

(134516, 1)

# Create Binary Matrix

In [35]:
ontology_df.reset_index(inplace=True)

In [36]:
binary_matrix = uf.createBinaryMatix(ontology_df)

Progeres: 100%  7758 Out of 7758   

In [37]:
binary_matrix.head()

Unnamed: 0,abnormal thrombopoiesis,akinesia,increased ovary apoptosis,decreased skeletal muscle weight,abnormal respiratory system physiology,increased rhabdomyosarcoma incidence,abnormal alveolocapillary membrane morphology,abnormal leukocyte morphology,prenatal lethality,absent immature B cells,...,abnormal renal sodium ion transport,decreased marginal zone B cell number,abnormal thoracic mammary gland morphology,decreased circulating complement protein level,increased pituitary gland tumor incidence,increased pre-B cell number,abnormal neurotransmitter level,abnormal macrophage derived foam cell morphology,increased sensitivity to xenobiotic induced morbidity/mortality,increased erythrocyte clearance
HOXC13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PIK3R5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YES1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENTPD3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RUFY3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
binary_matrix.shape

(7758, 8639)

# Save Binary Matrix

In [40]:
filename = '~/./Documents/Harmonizome/MGI/Output/mgi_binary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [41]:
path = '/Users/moshesilverstein/Documents/Harmonizome/MGI/Output/'

In [42]:
name = 'mgi_gene_set'

In [43]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  8639 Out of 8639   

# Create Attribute Library

In [44]:
path = '/Users/moshesilverstein/Documents/Harmonizome/MGI/Output/'

In [45]:
name = 'mgi_attribute_set'

In [46]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  7758 Out of 7758   

# Create Gene Similarity Matrix

In [47]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [48]:
gene_similarity_matix.head()

Unnamed: 0,HOXC13,PIK3R5,YES1,ENTPD3,RUFY3,GNAT1,FTO,DMRTA1,PASK,HOXC8,...,OMG,GORAB,TUFT1,CDH3,MYOF,PPP4C,MSLN,CSGALNACT1,TBX5,CDC7
HOXC13,1.0,0.0,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.054054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,0.0
PIK3R5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YES1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.023256,0.0
ENTPD3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RUFY3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25


## Save Gene Similarity Matrix 

In [49]:
filename = '~/./Documents/Harmonizome/MGI/Output/mgi_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [50]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [51]:
attribute_similarity_matix.head()

Unnamed: 0,abnormal thrombopoiesis,akinesia,increased ovary apoptosis,decreased skeletal muscle weight,abnormal respiratory system physiology,increased rhabdomyosarcoma incidence,abnormal alveolocapillary membrane morphology,abnormal leukocyte morphology,prenatal lethality,absent immature B cells,...,abnormal renal sodium ion transport,decreased marginal zone B cell number,abnormal thoracic mammary gland morphology,decreased circulating complement protein level,increased pituitary gland tumor incidence,increased pre-B cell number,abnormal neurotransmitter level,abnormal macrophage derived foam cell morphology,increased sensitivity to xenobiotic induced morbidity/mortality,increased erythrocyte clearance
abnormal thrombopoiesis,1.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009434,0.0
akinesia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.0
increased ovary apoptosis,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decreased skeletal muscle weight,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abnormal respiratory system physiology,0.017241,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.009009,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006993,0.0


## Save Attribute Similarity Matrix

In [52]:
filename = '~/./Documents/Harmonizome/MGI/Output/mgi_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [53]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  7758 Out of 7758   

In [54]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,HOXC13,3229
1,PIK3R5,23533
2,YES1,7525
3,ENTPD3,956
4,RUFY3,22902


In [55]:
gene_list.shape

(7758, 2)

### Save Gene List

In [56]:
filename = '~/./Documents/Harmonizome/MGI/Output/mgi_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [57]:
attribute_list = uf.createAttributeList(binary_matrix)

In [58]:
attribute_list.head()

Unnamed: 0,Attributes
0,abnormal thrombopoiesis
1,akinesia
2,increased ovary apoptosis
3,decreased skeletal muscle weight
4,abnormal respiratory system physiology


In [59]:
attribute_list.shape

(8639, 1)

### Save Attribute List

In [60]:
filename = '~/./Documents/Harmonizome/MGI/Output/mgi_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [61]:
path = '/Users/moshesilverstein/Documents/Harmonizome/MGI/Output/'

In [62]:
name = 'mgi_gene_attribute_edge_list'

In [63]:
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  8639 Out of 8639   

 The number of statisticaly relevent gene-attribute associations is: 134408
