# Mouse Gene Ontology (MGI)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://www.informatics.jax.org/

In [39]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
# import untility_functions as uf
%matplotlib inline

In [40]:
def createBinaryMatrix(inputDF, ppi=False):

    if ppi:

        genes = list(set(inputDF.iloc[:,0].unique().tolist()+inputDF.iloc[:,1].unique().tolist()))

        matrix = pd.DataFrame(index=genes, columns=genes, data=0)

        for i, gene in enumerate(genes):

            lst = inputDF[inputDF.iloc[:,0] == gene].iloc[:,1].tolist()
            lst += inputDF[inputDF.iloc[:,1] == gene].iloc[:,0].tolist()
            lst = set(lst)
            lst.discard(gene)
            lst = list(lst)

            matrloc.loc[gene, lst] = 1

        return(matrix)

    else:
        genes = list(set(inputDF.iloc[:,0].unique().tolist()))

        attributes = list(set(inputDF.iloc[:,1].unique().tolist()))

        matrix = pd.DataFrame(index=genes, columns=attributes, data=0.0)

        for i, gene in enumerate(genes):

            lst = inputDF.loc[(inputDF.iloc[:,0] == gene), inputDF.columns[1]].values.tolist()

            matrix.at[gene, lst] = 1

        return(matrix)

In [41]:
def mapgenesymbols(inputDF):
    mappingDF = pd.read_csv('mappingFile_2017.txt', sep='\t', header=None, index_col=0)
    inputDF.reset_index(inplace=True)


    lst1 = []

    for i, index in enumerate(inputDF.index):

        if inputDF.loc[index, inputDF.columns[0]] in mappingDF.index:
            lst1.append(mappingDF.loc[inputDF.loc[index, inputDF.columns[0]], 1])
        else:
            lst1.append(np.nan)


    inputDF[inputDF.columns[0]] = lst1


    inputDF.dropna(inplace=True, subset=[inputDF.columns[0]])
    inputDF.set_index(inputDF.columns[0], inplace=True)

In [42]:
def createUpGeneSetLib(inputDF, name, details=None):

    filenameGMT = f'{name}.gmt'
    for col in inputDF.columns:

        index = inputDF[inputDF[col] == 1].index

        lst = index.values.tolist()

        if len(lst) > 5 and len(lst) <= 2000:

            lst.insert(0, col)
            if details:
                lst.insert(1, details[i])
            else:
                lst.insert(1, 'NA')
            lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
            lst.insert(len(lst), '\n') # add a newline char at the end of each lst

            with open(filenameGMT, 'a') as the_file:
                the_file.writelines(lst)

# Load Data

In [43]:
df = pd.read_table('http://www.informatics.jax.org/downloads/reports/MGI_GenePheno.rpt', header=None)

In [None]:
df.head()

In [None]:
df.shape

# Get Relevent Columns and Name Them

In [None]:
# df.drop(7, axis=1, inplace=True) # drop blank column

In [44]:
col = ['Allelic Composition','Allele Symbol(s)','Allele ID(s)','Genetic Background','Mammalian Phenotype ID','PubMed ID','MGI Marker Accession ID (comma-delimited)','MGI Genotype Accession ID (comma-delimited)']

In [45]:
df.columns = col

In [46]:
df.head()

Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID,MGI Marker Accession ID (comma-delimited),MGI Genotype Accession ID (comma-delimited)
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:2166359


# Load Gene Data

In [47]:
gene_meta = pd.read_table('http://www.informatics.jax.org/downloads/reports/MRK_GXDAssay.rpt', header=None)

In [48]:
gene_meta.set_index(0, inplace=True)

In [49]:
gene_meta.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
MGI:101757,Cfl1,"MGI:4836610,MGI:4836612,MGI:4836613,MGI:483661..."
MGI:101759,Syt4,"MGI:3522221,MGI:3522268,MGI:5421150"
MGI:101760,Sfswap,MGI:4945277
MGI:101761,Hmga2,"MGI:2654714,MGI:2654715,MGI:2657185,MGI:265718..."
MGI:101762,Elk3,"MGI:2655709,MGI:2655710,MGI:3507694,MGI:350888..."


# Get Relevent Data and Map Gene ID's to Symbols

In [50]:
ontology_df = pd.DataFrame()

i = 0

for index in df.index:
    mgi_maid = df.loc[index, 'MGI Marker Accession ID (comma-delimited)'].split(',')
    
    if len(mgi_maid) > 1:
        for mgi in mgi_maid:
            if mgi in gene_meta.index:
                lst = []
                lst.append(gene_meta.loc[mgi, 1])
                lst.append(df.loc[index, 'Mammalian Phenotype ID'])
                df = pd.concat([df, pd.DataFrame(data=lst).T])
    else:
        mgi = mgi_maid[0]
        if mgi in gene_meta.index:
            lst = []
            lst.append(gene_meta.loc[mgi, 1])
            lst.append(df.loc[index, 'Mammalian Phenotype ID'])
            ontology_df = pd.concat([ontology_df, pd.DataFrame(data=lst).T])

In [51]:
ontology_df.drop_duplicates(inplace=True)

ontology_df.reset_index(inplace=True)

ontology_df.drop('index', axis=1,inplace=True)

In [52]:
ontology_df.head()

Unnamed: 0,0,1
0,Rb1,MP:0000600
1,Rb1,MP:0001716
2,Rb1,MP:0001698
3,Rb1,MP:0001092
4,Rb1,MP:0000961


In [53]:
ontology_df.shape

(174909, 2)

# Load Ontology Metadata

In [56]:
ontology_meta = pd.read_table('http://www.informatics.jax.org/downloads/reports/VOC_MammalianPhenotype.rpt', header=None)

In [57]:
ontology_meta.set_index(0, inplace=True)

In [58]:
ontology_meta.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
MP:0000001,mammalian phenotype,"the observable morphological, physiological, b..."
MP:0000002,obsolete Morphology,OBSOLETE.
MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...
MP:0000005,increased brown adipose tissue amount,increased amount of the thermogenic form of ad...
MP:0000008,increased white adipose tissue amount,increased quantity of fat-storing cells/tissue


# Map Ontology to Ontology ID's

In [95]:
lst = []

for index in ontology_df.index:
    term = ontology_meta.loc[ontology_df.loc[index, 1], 1]
                             
    lst.append(f'{term} {ontology_df.loc[index, 1]}')

ontology_df[1] = lst

In [96]:
ontology_df.drop_duplicates(inplace=True)

ontology_df.reset_index(inplace=True)

ontology_df.drop('index', axis=1,inplace=True)

In [97]:
ontology_df.head()

Unnamed: 0,0,1
0,Rb1,liver hypoplasia MP:0000600
1,Rb1,abnormal placenta labyrinth morphology MP:0001716
2,Rb1,decreased embryo size MP:0001698
3,Rb1,abnormal trigeminal ganglion morphology MP:000...
4,Rb1,abnormal dorsal root ganglion morphology MP:00...


In [98]:
ontology_df.shape

(174909, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [99]:
ontology_df.set_index(0, inplace=True)

In [100]:
mapgenesymbols(ontology_df)

In [101]:
ontology_df.shape

(165579, 1)

# Create Binary Matrix

In [102]:
ontology_df.reset_index(inplace=True)

In [103]:
binary_matrix = createBinaryMatrix(ontology_df)

In [104]:
binary_matrix.head()

Unnamed: 0,abnormal aortic valve cusp morphology MP:0010595,increased blood uric acid level MP:0008821,thin tail MP:0000589,decreased susceptibility to dopaminergic neuron neurotoxicity MP:0011452,small basisphenoid bone MP:0004462,absent metacarpal bones MP:0003074,polycythemia MP:0002872,abnormal leukotriene level MP:0009813,fused dorsal root ganglion MP:0000963,increased cerebellar foliation MP:0020531,...,decreased glutamine level MP:0030707,small incus MP:0030106,abnormal synaptic glutamate release MP:0004494,abnormal canal of Schlemm morphology MP:0005204,abnormal milk composition MP:0004047,head tilt MP:0005191,abnormal hippocampus pyramidal cell layer MP:0008284,epididymal inflammation MP:0003596,lacrimal gland hypertrophy MP:0013454,pulmonary artery hypoplasia MP:0010460
FADS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
USP42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FANCA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MMP16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SCNN1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
binary_matrix.shape

(9792, 9753)

# Save Binary Matrix

In [106]:
binary_matrix.to_csv('mgi_binary_matrix.tsv.zip', sep='\t', compression='gzip')

# Create Gene Set Library

In [108]:
createUpGeneSetLib(binary_matrix, 'MGI_Mammalian_Phenotype_Level_4_2021')