# Mouse Gene Ontology (MGI) Mammalian Phenotype processing script

Maxim Kuleshov, 6/14/2021<br/>
Data Source: http://www.informatics.jax.org/

In [29]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
# import untility_functions as uf
from maayanlab_bioinformatics.harmonization import ncbi_genes_lookup
%matplotlib inline

In [30]:
gene_lookup = ncbi_genes_lookup()

In [32]:
gene_lookup('MTFMT')

'MTFMT'

In [33]:
mapper = pd.read_table('http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt')
mapper.head()

Unnamed: 0,DB Class Key,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HGNC ID,OMIM Gene ID,Genetic Location,Genome Coordinates (mouse: GRCm39 human: GRCh38),Nucleotide RefSeq IDs,Protein RefSeq IDs,SWISS_PROT IDs
0,47367020,"mouse, laboratory",10090,Banf1,23825,MGI:1346330,,,Chr19 4.31 cM,Chr19:5414661-5416904(-),"NM_001038231,NM_011793,NM_001286608","NP_001033320,NP_001273537,NP_035923",O54962
1,47367020,human,9606,BANF1,8815,,HGNC:17397,OMIM:603811,Chr11 q13.1,Chr11:66002079-66004149(+),"NM_003860,NM_001143985","NP_001137457,NP_003851,XP_016874004,XP_054226339",O75531
2,47367021,"mouse, laboratory",10090,Pde9a,18585,MGI:1277179,,,Chr17 16.23 cM,Chr17:31605184-31695284(+),"NM_001410689,NM_001410688,XM_036160401,XM_0361...","XP_036016295,XP_036016294,XP_036016293,XP_0360...",O70628
3,47367021,human,9606,PDE9A,5152,,HGNC:8795,OMIM:602973,Chr21 q22.3,Chr21:42653621-42775509(+),"NM_001001571,NM_001315533,NM_002606,NM_0010015...","NP_001001567,NP_001001568,NP_001001569,NP_0010...",O76083
4,47367022,"mouse, laboratory",10090,Asrgl1,66514,MGI:1913764,,,Chr19 6.06 cM,Chr19:9089083-9112930(-),NM_025610,NP_079886,Q8C0M9


In [63]:
mouse = mapper[mapper['NCBI Taxon ID'] == 10090][['DB Class Key', 'Symbol']]
human = mapper[mapper['NCBI Taxon ID'] == 9606][['DB Class Key', 'Symbol']]

mouse.shape, human.shape

((21812, 2), (24586, 2))

In [64]:
len(human['DB Class Key']), len(set(human['DB Class Key']))

(24586, 20120)

In [69]:
len(set(human['DB Class Key']).intersection(set(mouse['DB Class Key'])))

20117

In [79]:
mapper = pd.merge(mouse, human, on='DB Class Key', how="inner", suffixes=('_mouse', '_human'))

In [80]:
len(mapper["Symbol_mouse"].unique()), len(mapper["Symbol_human"].unique())

(20117, 19323)

In [81]:
mapper = mapper.set_index('Symbol_mouse')

In [84]:
mapper = mapper[~mapper.index.duplicated(keep='first')]

In [85]:
mapper

Unnamed: 0_level_0,DB Class Key,Symbol_human
Symbol_mouse,Unnamed: 1_level_1,Unnamed: 2_level_1
Banf1,47367020,BANF1
Pde9a,47367021,PDE9A
Asrgl1,47367022,ASRGL1
Clpb,47367023,CLPB
Emd,47367024,EMD
...,...,...
Zswim1,47387708,ZSWIM1
Zswim3,47387709,ZSWIM3
Zswim4,47387710,ZSWIM4
Zswim9,47387711,ZSWIM9


In [99]:
def createBinaryMatrix(inputDF, ppi=False):

    if ppi:

        genes = list(set(inputDF.iloc[:,0].unique().tolist()+inputDF.iloc[:,1].unique().tolist()))

        matrix = pd.DataFrame(index=genes, columns=genes, data=0)

        for i, gene in enumerate(genes):

            lst = inputDF[inputDF.iloc[:,0] == gene].iloc[:,1].tolist()
            lst += inputDF[inputDF.iloc[:,1] == gene].iloc[:,0].tolist()
            lst = set(lst)
            lst.discard(gene)
            lst = list(lst)

            matrloc.loc[gene, lst] = 1

        return(matrix)

    else:
        genes = list(set(inputDF.iloc[:,0].unique().tolist()))

        attributes = list(set(inputDF.iloc[:,1].unique().tolist()))

        matrix = pd.DataFrame(index=genes, columns=attributes, data=0.0)

        for i, gene in enumerate(genes):

            lst = inputDF.loc[(inputDF.iloc[:,0] == gene), inputDF.columns[1]].values.tolist()

            matrix.loc[gene, lst] = 1

        return(matrix)

In [86]:
def mapgenesymbols(inputDF):
    # mappingDF = pd.read_csv('mappingFile_2017.txt', sep='\t', header=None, index_col=0)
    inputDF.reset_index(inplace=True)


    lst1 = []

    for i, index in enumerate(inputDF.index):

        if inputDF.loc[index, inputDF.columns[0]] in mapper.index:
            lst1.append(mapper.loc[inputDF.loc[index, inputDF.columns[0]], "Symbol_human"])
        else:
            lst1.append(np.nan)


    inputDF[inputDF.columns[0]] = lst1


    inputDF.dropna(inplace=True, subset=[inputDF.columns[0]])
    inputDF.set_index(inputDF.columns[0], inplace=True)

In [4]:
def createUpGeneSetLib(inputDF, name, details=None):

    filenameGMT = f'{name}.gmt'
    for col in inputDF.columns:

        index = inputDF[inputDF[col] == 1].index

        lst = index.values.tolist()

        if len(lst) > 5 and len(lst) <= 2000:

            lst.insert(0, col)
            if details:
                lst.insert(1, details[i])
            else:
                lst.insert(1, 'NA')
            lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
            lst.insert(len(lst), '\n') # add a newline char at the end of each lst

            with open(filenameGMT, 'a') as the_file:
                the_file.writelines(lst)

# Load Data

In [5]:
df = pd.read_table('http://www.informatics.jax.org/downloads/reports/MGI_GenePheno.rpt', header=None)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:2166359


In [7]:
df.shape

(272580, 8)

# Get Relevent Columns and Name Them

In [8]:
# df.drop(7, axis=1, inplace=True) # drop blank column

In [9]:
col = ['Allelic Composition','Allele Symbol(s)','Allele ID(s)','Genetic Background','Mammalian Phenotype ID','PubMed ID','MGI Marker Accession ID (comma-delimited)','MGI Genotype Accession ID (comma-delimited)']

In [10]:
df.columns = col

In [11]:
df.head()

Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID,MGI Marker Accession ID (comma-delimited),MGI Genotype Accession ID (comma-delimited)
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:2166359


# Load Gene Data

In [12]:
gene_meta = pd.read_table('http://www.informatics.jax.org/downloads/reports/MRK_GXDAssay.rpt', header=None)

In [13]:
gene_meta.set_index(0, inplace=True)

In [14]:
gene_meta.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
MGI:101757,Cfl1,"MGI:4836610,MGI:4836612,MGI:4836613,MGI:483661..."
MGI:101759,Syt4,"MGI:3522221,MGI:3522268,MGI:5421150,MGI:7522875"
MGI:101760,Sfswap,"MGI:4945277,MGI:7531520"
MGI:101761,Hmga2,"MGI:2654714,MGI:2654715,MGI:2657185,MGI:265718..."
MGI:101762,Elk3,"MGI:2655709,MGI:2655710,MGI:3507694,MGI:350888..."


# Get Relevent Data and Map Gene ID's to Symbols

In [15]:
ontology_df = pd.DataFrame()

i = 0

for index in df.index:
    mgi_maid = df.loc[index, 'MGI Marker Accession ID (comma-delimited)'].split(',')
    
    if len(mgi_maid) > 1:
        for mgi in mgi_maid:
            if mgi in gene_meta.index:
                lst = []
                lst.append(gene_meta.loc[mgi, 1])
                lst.append(df.loc[index, 'Mammalian Phenotype ID'])
                df = pd.concat([df, pd.DataFrame(data=lst).T])
    else:
        mgi = mgi_maid[0]
        if mgi in gene_meta.index:
            lst = []
            lst.append(gene_meta.loc[mgi, 1])
            lst.append(df.loc[index, 'Mammalian Phenotype ID'])
            ontology_df = pd.concat([ontology_df, pd.DataFrame(data=lst).T])

In [16]:
ontology_df.drop_duplicates(inplace=True)

ontology_df.reset_index(inplace=True)

ontology_df.drop('index', axis=1,inplace=True)

In [17]:
ontology_df.head()

Unnamed: 0,0,1
0,Rb1,MP:0000600
1,Rb1,MP:0001716
2,Rb1,MP:0001698
3,Rb1,MP:0001092
4,Rb1,MP:0000961


In [18]:
ontology_df.shape

(195629, 2)

# Load Ontology Metadata

In [19]:
ontology_meta = pd.read_table('http://www.informatics.jax.org/downloads/reports/VOC_MammalianPhenotype.rpt', header=None)

In [20]:
ontology_meta.set_index(0, inplace=True)

In [21]:
ontology_meta.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
MP:0000001,mammalian phenotype,"the observable morphological, physiological, b..."
MP:0000002,obsolete Morphology,OBSOLETE.
MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...
MP:0000005,increased brown adipose tissue amount,increased amount of the thermogenic form of ad...
MP:0000008,increased white adipose tissue amount,increased quantity of fat-storing cells/tissue


# Map Ontology to Ontology ID's

In [22]:
lst = []

for index in ontology_df.index:
    term = ontology_meta.loc[ontology_df.loc[index, 1], 1]
                             
    lst.append(f'{term} {ontology_df.loc[index, 1]}')

ontology_df[1] = lst

In [23]:
ontology_df.drop_duplicates(inplace=True)

ontology_df.reset_index(inplace=True)

ontology_df.drop('index', axis=1,inplace=True)

In [24]:
ontology_df.head()

Unnamed: 0,0,1
0,Rb1,liver hypoplasia MP:0000600
1,Rb1,abnormal placenta labyrinth morphology MP:0001716
2,Rb1,decreased embryo size MP:0001698
3,Rb1,abnormal trigeminal ganglion morphology MP:000...
4,Rb1,abnormal dorsal root ganglion morphology MP:00...


In [25]:
ontology_df.shape

(195629, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [26]:
ontology_df.set_index(0, inplace=True)

In [28]:
ontology_df

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
Rb1,liver hypoplasia MP:0000600
Rb1,abnormal placenta labyrinth morphology MP:0001716
Rb1,decreased embryo size MP:0001698
Rb1,abnormal trigeminal ganglion morphology MP:000...
Rb1,abnormal dorsal root ganglion morphology MP:00...
...,...
Mtfmt,decreased circulating alanine transaminase lev...
Mtfmt,decreased circulating interleukin-1 beta level...
Mtfmt,decreased subcutaneous adipose tissue amount M...
Mtfmt,decreased epididymal fat pad weight MP:0009289


In [87]:
mapgenesymbols(ontology_df)

In [88]:
ontology_df

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
RB1,liver hypoplasia MP:0000600
RB1,abnormal placenta labyrinth morphology MP:0001716
RB1,decreased embryo size MP:0001698
RB1,abnormal trigeminal ganglion morphology MP:000...
RB1,abnormal dorsal root ganglion morphology MP:00...
...,...
MTFMT,decreased circulating alanine transaminase lev...
MTFMT,decreased circulating interleukin-1 beta level...
MTFMT,decreased subcutaneous adipose tissue amount M...
MTFMT,decreased epididymal fat pad weight MP:0009289


# Create Binary Matrix

In [89]:
ontology_df.reset_index(inplace=True)

In [115]:
ontology_df[ontology_df[1].isna()]

Unnamed: 0,0,1


In [110]:
ontology_group = ontology_df.groupby(1)

In [111]:
ontology_group.get_group("Bergmeister's papilla MP:0012539")

Unnamed: 0,0,1
47652,SEMA3E,Bergmeister's papilla MP:0012539


In [116]:
with open('out/MGI_Mammalian_Phenotype_Level_4_2021.gmt', 'w') as o:
	for group in ontology_group.groups:
		genes = set(ontology_group.get_group(group)[0])
		if len(genes) > 5:
			o.write("\t".join([group, '', *list(genes)]) + "\n")


In [117]:
binary_matrix = createBinaryMatrix(ontology_df)

In [119]:
binary_matrix.head()

In [None]:
binary_matrix.shape

(9792, 9753)

# Save Binary Matrix

In [None]:
binary_matrix.to_csv('out/mgi_binary_matrix.tsv.zip', sep='\t', compression='gzip')

# Create Gene Set Library

In [118]:
createUpGeneSetLib(binary_matrix, 'out/MGI_Mammalian_Phenotype_Level_4_2021')