# Mouse Gene Ontology (MGI)

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: http://www.informatics.jax.org/downloads/reports/index.html

In [1]:
# appyter init
# Hands appyter a zero-argument callable that returns this notebook's global
# namespace: the lambda's default argument is the built-in `globals`, which it
# calls when invoked.
# NOTE(review): presumably this is what makes the `%%appyter` cell magics used
# later in the notebook available -- confirm against the appyter docs.
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os
from datetime import date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local harmonizome helpers) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [5]:
# Record the execution date and the interpreter version for provenance.
print(
    'This notebook was run on:', date.today(),
    '\nPython version:', sys.version,
)

This notebook was run on: 2020-06-28 
Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]


# Initialization

### Load Mapping Dictionaries

In [6]:
# Build the two lookup tables used later in the notebook: `symbol_lookup` is
# passed to uf.map_symbols and `geneid_lookup` to uf.gene_list.
# NOTE(review): the recorded progress bar ("Gathering sources") suggests the
# helper fetches its sources at call time -- confirm in harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:10<00:00,  3.39s/it]


### Output Path

In [None]:
# Prefix shared by every output file name written by the cells below.
output_name = 'mgi'

# Directory that receives all generated artifacts. exist_ok=True keeps the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
path = 'Output/MGI'
os.makedirs(path, exist_ok=True)

In [10]:
%%appyter hide_code
{# Declares the 'data' section; the FileField inputs in the loading cells reference it through section='data'. #}
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the Mouse Gene Ontology (MGI) Dataset',
) %}

# Load Data

In [11]:
%%appyter code_exec

# Read the genotype/phenotype annotation report: tab-separated, no header row.
# Only the columns at positions 4 and 6 are kept; they are named
# 'Phenotype ID' and 'MGI Marker'.
# The FileField template expression supplies the file path. It declares a
# '.rpt' filename constraint and defaults to Input/MGI/MGI_GenePheno.rpt.
df = pd.read_csv({{FileField(
    constraint='.*\.rpt$',
    name='gene_phenotype', 
    label='Genotypes and Phenotype Annotations (rpt)', 
    default='Input/MGI/MGI_GenePheno.rpt',
    section='data')
}}, header=None, usecols=[4, 6], names=['Phenotype ID', 'MGI Marker'], sep='\t')

```python

df = pd.read_csv('Input/MGI/MGI_GenePheno.rpt', header=None, usecols=[4, 6], names=['Phenotype ID', 'MGI Marker'], sep='\t')
```

In [12]:
# Preview the first rows of the loaded annotation table.
df.head()

Unnamed: 0,Phenotype ID,MGI Marker
0,MP:0000600,MGI:97874
1,MP:0001716,MGI:97874
2,MP:0001698,MGI:97874
3,MP:0001092,MGI:97874
4,MP:0000961,MGI:97874


In [13]:
# Row and column counts of the raw annotation table.
df.shape

(231666, 2)

## Load Gene Metadata

In [14]:
%%appyter code_exec

# Read the marker report: tab-separated, no header row. The first column
# becomes the index and the second is kept as the only data column, which
# pandas labels with the integer 1.
# The FileField template expression supplies the file path. It declares a
# '.rpt' filename constraint and defaults to Input/MGI/MRK_GXDAssay.rpt.
# NOTE(review): markers missing from this file map to NaN in the
# "Map Marker to Gene" step -- confirm the default report is the intended
# marker list.
gene_meta = pd.read_csv({{FileField(
    constraint='.*\.rpt$',
    name='gene_metadata', 
    label='MGI Genetic Markers (rpt)', 
    default='Input/MGI/MRK_GXDAssay.rpt',
    section='data')
}}, header=None, usecols=[0, 1], index_col=0, sep='\t')

```python

gene_meta = pd.read_csv('Input/MGI/MRK_GXDAssay.rpt', header=None, usecols=[0, 1], index_col=0, sep='\t')
```

In [15]:
# Preview the marker table (index: first file column, data column labelled 1).
gene_meta.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
MGI:1353431,Pcsk1n
MGI:2177151,Cd99l2
MGI:1916858,Sec24d
MGI:1915444,Ndufb4
MGI:1923953,Cpb1


In [16]:
# Row and column counts of the marker table.
gene_meta.shape

(14934, 1)

## Load Ontology Metadata

In [17]:
%%appyter code_exec

# Read the phenotype vocabulary: tab-separated, no header row. The first
# column becomes the index and the second is kept as the only data column,
# which pandas labels with the integer 1.
# The FileField template expression supplies the file path. It declares a
# '.rpt' filename constraint and defaults to
# Input/MGI/VOC_MammalianPhenotype.rpt.
ontology = pd.read_csv({{FileField(
    constraint='.*\.rpt$',
    name='phenotype_vocabulary', 
    label='Mammalian Phenotype Vocabulary (rpt)', 
    default='Input/MGI/VOC_MammalianPhenotype.rpt',
    section='data')
}}, header=None, usecols=[0, 1], index_col=0, sep='\t')

```python

ontology = pd.read_csv('Input/MGI/VOC_MammalianPhenotype.rpt', header=None, usecols=[0, 1], index_col=0, sep='\t')
```

In [18]:
# Preview the vocabulary table (index: first file column, data column labelled 1).
ontology.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
MP:0000001,mammalian phenotype
MP:0000002,obsolete Morphology
MP:0000003,abnormal adipose tissue morphology
MP:0000005,increased brown adipose tissue amount
MP:0000008,increased white adipose tissue amount


In [19]:
# Row and column counts of the vocabulary table.
ontology.shape

(13491, 1)

# Pre-process Data

## Split Marker Lists

In [20]:
# Expand rows whose 'MGI Marker' field lists several pipe-separated markers
# into one row per marker.
#   * Rows with a missing value in either column are dropped first.
#   * The pattern is a raw string: '\|' in a plain literal is an invalid
#     escape sequence that newer Python versions warn about.
#   * assign() builds the split column on a fresh frame instead of writing
#     into the result of dropna().
#   * explode() repeats the source row's index label for every piece, so the
#     index is rebuilt afterwards to keep row labels unique and positional.
df = (
    df.dropna()
    .assign(**{'MGI Marker': lambda frame: frame['MGI Marker'].str.split(pat=r'\|')})
    .explode('MGI Marker')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Phenotype ID,MGI Marker
0,MP:0000600,MGI:97874
1,MP:0001716,MGI:97874
2,MP:0001698,MGI:97874
3,MP:0001092,MGI:97874
4,MP:0000961,MGI:97874


In [21]:
# Row and column counts after splitting multi-marker rows.
df.shape

(231668, 2)

## Map Marker to Gene

In [22]:
# Replace each MGI marker accession with the value stored for it in column 1
# of gene_meta; markers absent from gene_meta become NaN.
# Series.map looks every marker up by value, so each result stays on its own
# row whatever df's index labels are. Assigning a positionally re-indexed
# Series instead is aligned by label and can put symbols on the wrong rows
# when df's index has gaps or repeated labels.
df['MGI Marker'] = df['MGI Marker'].map(gene_meta[1])
df.head()

Unnamed: 0,Phenotype ID,MGI Marker
0,MP:0000600,Rb1
1,MP:0001716,Rb1
2,MP:0001698,Rb1
3,MP:0001092,Rb1
4,MP:0000961,Rb1


## Map Phenotype ID to Phenotype

In [23]:
# Replace each phenotype ID with the value stored for it in column 1 of
# ontology; IDs absent from ontology become NaN.
# Series.map looks every ID up by value, so each result stays on its own row
# whatever df's index labels are. Assigning a positionally re-indexed Series
# instead is aligned by label and can put names on the wrong rows when df's
# index has gaps or repeated labels.
df['Phenotype ID'] = df['Phenotype ID'].map(ontology[1])

# Re-key the table by the mapped marker column and give the index and the
# remaining column their final names.
df = df.set_index('MGI Marker')
df.index.name = 'Gene Symbol'
df.columns = ['Phenotype']
df.head()

Unnamed: 0_level_0,Phenotype
Gene Symbol,Unnamed: 1_level_1
Rb1,liver hypoplasia
Rb1,abnormal placenta labyrinth morphology
Rb1,decreased embryo size
Rb1,abnormal trigeminal ganglion morphology
Rb1,abnormal dorsal root ganglion morphology


# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [24]:
# Pass the gene-indexed table through the project's symbol mapper using the
# symbol lookup built during initialization.
# NOTE(review): in the recorded run the row count falls from 231668 to 160160;
# presumably unmapped symbols and repeated gene/phenotype pairs are dropped
# (remove_duplicates=True) -- confirm in harmonizome.utility_functions.
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)
df.shape

100%|██████████| 231668/231668 [00:00<00:00, 548004.22it/s]


(160160, 1)

# Analyze Data

## Create Binary Matrix

In [None]:
# Build the matrix that every later output in this notebook is derived from.
# NOTE(review): presumably genes on rows, phenotypes on columns, 1 where an
# association exists -- confirm against uf.binary_matrix.
binary_matrix = uf.binary_matrix(df)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Save the binary matrix under `path` as '<output_name>_binary_matrix',
# requesting npz compression and a uint8 dtype from the helper.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

## Create Gene List

In [None]:
# Derive the gene list from the binary matrix using the gene-ID lookup built
# during initialization.
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
# Dimensions of the gene list.
gene_list.shape

In [None]:
# Save the gene list under `path` as '<output_name>_gene_list', requesting a
# gzip-compressed tsv written without the index.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
# Derive the attribute list from the binary matrix.
# NOTE(review): presumably one entry per phenotype column -- confirm against
# uf.attribute_list.
attribute_list = uf.attribute_list(binary_matrix)
attribute_list.head()

In [None]:
# Dimensions of the attribute list.
attribute_list.shape

In [None]:
# Save the attribute list under `path` as '<output_name>_attribute_list',
# requesting a gzip-compressed tsv.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
# Write the 'gene' / 'up' set library under `path` as '<output_name>_gene_up_set'.
# NOTE(review): the meaning of the 'gene' and 'up' arguments is defined by
# uf.save_setlib -- confirm there.
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
# Write the 'attribute' / 'up' set library under `path` as
# '<output_name>_attribute_up_set'.
# NOTE(review): the meaning of the 'attribute' and 'up' arguments is defined
# by uf.save_setlib -- confirm there.
uf.save_setlib(binary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

## Create Attribute Similarity Matrix

In [None]:
# Compute Jaccard similarity on the transposed binary matrix.
# NOTE(review): assumes uf.similarity_matrix compares rows, so the transpose
# yields attribute-by-attribute similarity -- TODO confirm.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix under `path` as
# '<output_name>_attribute_similarity_matrix', requesting npz compression,
# float32 values, and flagging the matrix as symmetric.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

## Create Gene Similarity Matrix

In [None]:
# Compute Jaccard similarity on the binary matrix as-is.
# NOTE(review): assumes uf.similarity_matrix compares rows, which would make
# this gene-by-gene similarity -- TODO confirm.
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
# Save the gene similarity matrix under `path` as
# '<output_name>_gene_similarity_matrix', requesting npz compression, float32
# values, and flagging the matrix as symmetric.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Derive the gene-attribute edge list from the binary matrix and save it under
# `path` as '<output_name>_edge_list', requesting a gzip-compressed tsv.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
# Bundle the output directory for download.
# NOTE(review): the download link in the next markdown cell points at
# ./output_archive.zip -- confirm that is the file name uf.archive produces.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)