Links

- https://www.mousephenotype.org/help/non-programmatic-data-access/
- https://www.mousephenotype.org/data/phenotypes/MP:0002626
- http://www.informatics.jax.org/downloads/reports/index.html
- http://www.informatics.jax.org/humanDisease.shtml

Issues

- Human phenotype to disease mapping, e.g. https://monarchinitiative.org/phenotype/HP:0001649#disease

---

In [2]:
%load_ext lab_black

In [63]:
from pathlib import Path
from pprint import pprint
from typing import Any, Dict, Optional

import pandas as pd
import requests

---

In [50]:
# Input files, all expected in the local "data" directory
# (presumably downloaded from the sources linked at the top of the notebook).
geno_pheno_file = Path("data") / "genotype-phenotype-assertions-all.csv.gz"
hmd_humanphenotype_file = Path("data") / "HMD_HumanPhenotype.rpt"
mgi_geno_disease_do = Path("data") / "MGI_Geno_DiseaseDO.rpt"
mgi_geno_not_disease_do = Path("data") / "MGI_Geno_NotDiseaseDO.rpt"

# Fail fast, and say which file is missing instead of raising a bare AssertionError.
for _required_file in (
    geno_pheno_file,
    hmd_humanphenotype_file,
    mgi_geno_disease_do,
    mgi_geno_not_disease_do,
):
    assert _required_file.exists(), f"Missing input file: {_required_file}"

In [59]:
# Base URL of the Monarch Initiative REST API; endpoint paths such as
# "/bioentity/<id>" are appended to it when building request URLs.
MONARCH_API = "https://api.monarchinitiative.org/api"

---

# Datasets

In [7]:
# Load the full genotype-phenotype assertions table. The file is a gzipped CSV;
# pandas infers the compression from the ".gz" suffix by default.
df_geno_pheno = pd.read_csv(geno_pheno_file)
# Preview the first rows.
df_geno_pheno.head()

Unnamed: 0,marker_accession_id,marker_symbol,phenotyping_center,colony_id,sex,zygosity,allele_accession_id,allele_symbol,allele_name,strain_accession_id,...,parameter_name,top_level_mp_term_id,top_level_mp_term_name,mp_term_id,mp_term_name,p_value,percentage_change,effect_size,statistical_method,resource_name
0,MGI:1920347,Dact2,BCM,DACUB,not_considered,homozygote,MGI:5584408,Dact2<tm1b(EUCOMM)Wtsi>,tm1b(EUCOMM)Wtsi,MGI:2159965,...,Total bilirubin,MP:0005376,homeostasis/metabolism phenotype,MP:0005635,decreased circulating bilirubin level,3.970158e-09,-143.755506,1.540103,Linear Model Using Generalized Least Squares f...,IMPC
1,MGI:2685256,Ankub1,UC Davis,BL5612,female,homozygote,MGI:5708256,Ankub1<tm1.1(KOMP)Vlcg>,tm1.1(KOMP)Vlcg,MGI:2683688,...,Heart,MP:0005385,cardiovascular system phenotype,MP:0000266,abnormal heart morphology,0.0,,1.0,Supplied as data,IMPC
2,MGI:2385213,Pdik1l,UC Davis,CR1169,male,homozygote,MGI:6152611,Pdik1l<em1(IMPC)Mbp>,em1(IMPC)Mbp,MGI:2683688,...,Heart,MP:0005385,cardiovascular system phenotype,MP:0000266,abnormal heart morphology,0.0,,1.0,Supplied as data,IMPC
3,MGI:3028577,Arl5c,UC Davis,BL3866,female,homozygote,MGI:5558161,Arl5c<tm1.1(KOMP)Vlcg>,tm1.1(KOMP)Vlcg,MGI:2683688,...,Heart,MP:0005385,cardiovascular system phenotype,MP:0000266,abnormal heart morphology,0.0,,1.0,Supplied as data,IMPC
4,MGI:109232,Npas2,UC Davis,CR10417,female,homozygote,MGI:6276857,Npas2<em1(IMPC)Mbp>,em1(IMPC)Mbp,MGI:2683688,...,Heart,MP:0005385,cardiovascular system phenotype,MP:0000266,abnormal heart morphology,0.0,,1.0,Supplied as data,IMPC


In [8]:
# Column names, dtypes and non-null counts; shows which columns have missing values.
df_geno_pheno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46342 entries, 0 to 46341
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   marker_accession_id     46340 non-null  object 
 1   marker_symbol           46340 non-null  object 
 2   phenotyping_center      46342 non-null  object 
 3   colony_id               46342 non-null  object 
 4   sex                     46342 non-null  object 
 5   zygosity                46342 non-null  object 
 6   allele_accession_id     46323 non-null  object 
 7   allele_symbol           46323 non-null  object 
 8   allele_name             44950 non-null  object 
 9   strain_accession_id     46342 non-null  object 
 10  strain_name             46332 non-null  object 
 11  project_name            46332 non-null  object 
 12  project_fullname        0 non-null      float64
 13  pipeline_name           46342 non-null  object 
 14  pipeline_stable_id      46342 non-null

In [10]:
# Write the first 500 rows to a small CSV sample; index=False omits the row index.
df_geno_pheno.head(500).to_csv("data/genotype-phenotype-head.csv", index=False)

In [16]:
# Distinct (accession id, symbol) pairs for the markers in the assertions table.
df_markers = df_geno_pheno.loc[
    :, ["marker_accession_id", "marker_symbol"]
].drop_duplicates()
# Number of distinct non-null ids and symbols; a mismatch between the two
# means ids and symbols are not in one-to-one correspondence.
print(df_markers["marker_accession_id"].nunique())
print(df_markers["marker_symbol"].nunique())
df_markers

6374
6373


Unnamed: 0,marker_accession_id,marker_symbol
0,MGI:1920347,Dact2
1,MGI:2685256,Ankub1
2,MGI:2385213,Pdik1l
3,MGI:3028577,Arl5c
4,MGI:109232,Npas2
...,...,...
46157,MGI:1891361,Psg29
46176,MGI:2686212,Dcdc2b
46185,MGI:2675296,Crebzf
46256,MGI:1891832,Patz1


In [15]:
# Show every row whose marker_symbol occurs more than once
# (keep=False flags all occurrences, not only the repeats).
df_markers[df_markers["marker_symbol"].duplicated(keep=False)]

Unnamed: 0,marker_accession_id,marker_symbol
692,MGI:97565,Pgm1
1894,MGI:97564,Pgm1


In [17]:
# Distinct alleles (accession id, symbol, name) found in the assertions table.
df_allele = df_geno_pheno.loc[
    :, ["allele_accession_id", "allele_symbol", "allele_name"]
].drop_duplicates()
df_allele

Unnamed: 0,allele_accession_id,allele_symbol,allele_name
0,MGI:5584408,Dact2<tm1b(EUCOMM)Wtsi>,tm1b(EUCOMM)Wtsi
1,MGI:5708256,Ankub1<tm1.1(KOMP)Vlcg>,tm1.1(KOMP)Vlcg
2,MGI:6152611,Pdik1l<em1(IMPC)Mbp>,em1(IMPC)Mbp
3,MGI:5558161,Arl5c<tm1.1(KOMP)Vlcg>,tm1.1(KOMP)Vlcg
4,MGI:6276857,Npas2<em1(IMPC)Mbp>,em1(IMPC)Mbp
...,...,...,...
46176,MGI:5637175,Dcdc2b<tm1b(KOMP)Wtsi>,tm1b(KOMP)Wtsi
46185,MGI:5763778,Crebzf<em1(IMPC)J>,em1(IMPC)J
46256,MGI:5695909,Patz1<tm1.1(KOMP)Vlcg>,tm1.1(KOMP)Vlcg
46272,MGI:5548604,Dennd1c<tm1b(EUCOMM)Wtsi>,tm1b(EUCOMM)Wtsi


In [18]:
# Distinct Mammalian Phenotype terms (id + name) found in the assertions table.
df_phenotype = df_geno_pheno.loc[:, ["mp_term_id", "mp_term_name"]].drop_duplicates()
df_phenotype

Unnamed: 0,mp_term_id,mp_term_name
0,MP:0005635,decreased circulating bilirubin level
1,MP:0000266,abnormal heart morphology
6,MP:0002644,decreased circulating triglyceride level
7,MP:0002797,increased thigmotaxis
8,MP:0001410,head bobbing
...,...,...
44771,MP:0013692,increased CD5-positive Ly6C-positive T cell nu...
44832,MP:0002217,small lymph nodes
44966,MP:0001158,abnormal prostate gland morphology
44993,MP:0009204,absent external male genitalia


In [28]:
# Load the HMD_HumanPhenotype report (tab-separated).
# Passing `names` without `header` makes pandas treat the file as header-less.
# index_col=False stops pandas from using the first column as the index
# (presumably needed because the report's lines end with a trailing tab -- TODO confirm).
df_hmd = pd.read_csv(
    hmd_humanphenotype_file,
    sep="\t",
    index_col=False,
    names=[
        "Human Marker Symbol",
        "Human Entrez Gene ID",
        "HomoloGene ID",
        # "HGNC Association?",
        "HGNC Association",
        "Mouse Marker Symbol",
        "MGI Marker Accession ID",
        # "High-level Mammalian Phenotype ID (space-delimited)",
        "High-level Mammalian Phenotype ID",
    ],
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df_hmd.info()
df_hmd

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18797 entries, 0 to 18796
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Human Marker Symbol                18797 non-null  object 
 1   Human Entrez Gene ID               18797 non-null  int64  
 2   HomoloGene ID                      18060 non-null  float64
 3   HGNC Association                   18797 non-null  object 
 4   Mouse Marker Symbol                18797 non-null  object 
 5   MGI Marker Accession ID            18797 non-null  object 
 6   High-level Mammalian Phenotype ID  11955 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.0+ MB
None


Unnamed: 0,Human Marker Symbol,Human Entrez Gene ID,HomoloGene ID,HGNC Association,Mouse Marker Symbol,MGI Marker Accession ID,High-level Mammalian Phenotype ID
0,A1BG,1,11167.0,yes,A1bg,MGI:2152878,
1,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376..."
2,A2M,2,37248.0,yes,A2m,MGI:2449119,MP:0005376
3,A3GALT2,127550,16326.0,yes,A3galt2,MGI:2685279,
4,A4GALT,53947,9690.0,yes,A4galt,MGI:3512453,"MP:0005376, MP:0005386, MP:0005387, MP:0005397..."
...,...,...,...,...,...,...,...
18792,ZYG11A,440590,66294.0,yes,Zyg11a,MGI:2446208,
18793,ZYG11B,79699,14600.0,yes,Zyg11b,MGI:2685277,"MP:0005378, MP:0005380, MP:0010768"
18794,ZYX,7791,31164.0,yes,Zyx,MGI:103072,MP:0005384
18795,ZZEF1,23140,9027.0,yes,Zzef1,MGI:2444286,"MP:0001186, MP:0003631, MP:0005367, MP:0005375..."


In [30]:
# Human <-> mouse gene mapping: keep only the three identifying columns,
# give them snake_case names, and drop repeated rows.
df_hmd_gene_mapping = (
    df_hmd[["Human Marker Symbol", "Mouse Marker Symbol", "MGI Marker Accession ID"]]
    .rename(
        columns={
            "Human Marker Symbol": "human_gene_label",
            "Mouse Marker Symbol": "mouse_gene_label",
            "MGI Marker Accession ID": "mouse_gene_id",
        }
    )
    .drop_duplicates()
)
df_hmd_gene_mapping

Unnamed: 0,human_gene_label,mouse_gene_label,mouse_gene_id
0,A1BG,A1bg,MGI:2152878
1,A1CF,A1cf,MGI:1917115
2,A2M,A2m,MGI:2449119
3,A3GALT2,A3galt2,MGI:2685279
4,A4GALT,A4galt,MGI:3512453
...,...,...,...
18792,ZYG11A,Zyg11a,MGI:2446208
18793,ZYG11B,Zyg11b,MGI:2685277
18794,ZYX,Zyx,MGI:103072
18795,ZZEF1,Zzef1,MGI:2444286


In [39]:
# Load the MGI_Geno_DiseaseDO report (tab-separated).
# Passing `names` without `header` makes pandas treat the file as header-less;
# index_col=False stops pandas from using the first column as the index.
df_mgi_geno_disease_do = pd.read_csv(
    mgi_geno_disease_do,
    sep="\t",
    index_col=False,
    names=[
        "Allelic Composition",
        "Allele Symbol(s)",
        "Allele ID(s)",
        "Genetic Background",
        "Mammalian Phenotype ID",
        "PubMed ID",
        # "PubMed ID (pipe-delimited)",
        "MGI Marker Accession ID",
        # "MGI Marker Accession ID (pipe-delimited)",
        "DO ID",
        # "DO ID (pipe-delimited)",
    ],
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df_mgi_geno_disease_do.info()
df_mgi_geno_disease_do

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36826 entries, 0 to 36825
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Allelic Composition      36826 non-null  object
 1   Allele Symbol(s)         36826 non-null  object
 2   Allele ID(s)             36826 non-null  object
 3   Genetic Background       36826 non-null  object
 4   Mammalian Phenotype ID   36826 non-null  object
 5   PubMed ID                34674 non-null  object
 6   MGI Marker Accession ID  36826 non-null  object
 7   DO ID                    36826 non-null  object
dtypes: object(8)
memory usage: 2.2+ MB
None


Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID,MGI Marker Accession ID,DO ID
0,Ednra<tm1Ywa>/Ednra<tm1Ywa>,Ednra<tm1Ywa>,MGI:1857473,129S/SvEv-Ednra<tm1Ywa>,MP:0002127,9449664,MGI:105923,DOID:12583
1,Ednra<tm1Ywa>/Ednra<tm1Ywa>,Ednra<tm1Ywa>,MGI:1857473,129S/SvEv-Ednra<tm1Ywa>,MP:0000452,9449664,MGI:105923,DOID:12583
2,Ednra<tm1Ywa>/Ednra<tm1Ywa>,Ednra<tm1Ywa>,MGI:1857473,129S/SvEv-Ednra<tm1Ywa>,MP:0002108,9449664,MGI:105923,DOID:12583
3,Ednra<tm1Ywa>/Ednra<tm1Ywa>,Ednra<tm1Ywa>,MGI:1857473,129S/SvEv-Ednra<tm1Ywa>,MP:0000614,9449664,MGI:105923,DOID:12583
4,Ednra<tm1Ywa>/Ednra<tm1Ywa>,Ednra<tm1Ywa>,MGI:1857473,129S/SvEv-Ednra<tm1Ywa>,MP:0001823,9449664,MGI:105923,DOID:12583
...,...,...,...,...,...,...,...,...
36821,Nexn<tm1.1Chen>/Nexn<tm1.1Chen>,Nexn<tm1.1Chen>,MGI:6510878,Not Specified,MP:0002795,30982350,MGI:1916060,DOID:0110424
36822,Nexn<tm1.1Chen>/Nexn<tm1.1Chen>,Nexn<tm1.1Chen>,MGI:6510878,Not Specified,MP:0002833,30982350,MGI:1916060,DOID:0110424
36823,Nexn<tm1.1Chen>/Nexn<tm1.1Chen>,Nexn<tm1.1Chen>,MGI:6510878,Not Specified,MP:0011085,30982350,MGI:1916060,DOID:0110424
36824,Nexn<tm1.1Chen>/Nexn<tm1.1Chen>,Nexn<tm1.1Chen>,MGI:6510878,Not Specified,MP:0011925,30982350,MGI:1916060,DOID:0110424


In [40]:
# Peek at the first 50 distinct "DO ID" values to check their format
# (some cells hold several pipe-delimited DOIDs).
df_mgi_geno_disease_do["DO ID"].drop_duplicates().head(50)

0                                   DOID:12583
42                                  DOID:14761
71                                DOID:0110948
78                                DOID:0110120
106                                 DOID:11836
189                               DOID:0110605
193                               DOID:0110858
195                               DOID:0110875
234                               DOID:0110859
255                               DOID:0060543
258                                  DOID:2223
261                               DOID:0110942
298                                    DOID:83
303                     DOID:0060783|DOID:8534
352                               DOID:0090028
355                                  DOID:9970
385                                 DOID:11476
443                                 DOID:14175
446                                  DOID:1415
460     DOID:0110913|DOID:0110914|DOID:0110915
465                                 DOID:14705
482          

In [45]:
# Phenotype (MP) -> disease (DO) pairs, one DO term per row.
df_mp_doid = (
    df_mgi_geno_disease_do[["Mammalian Phenotype ID", "DO ID"]]
    .rename(columns={"Mammalian Phenotype ID": "mp_id", "DO ID": "do_id"})
    # "DO ID" is pipe-delimited: split each cell into a list, then explode the
    # list into one row per DO term.
    .assign(do_id=lambda df: df["do_id"].str.split("|"))
    .explode("do_id")
    # De-duplicate only after exploding: "A|B" and "A" for the same MP term
    # would otherwise both survive and yield the pair (mp_id, A) twice.
    .drop_duplicates()
)
df_mp_doid

Unnamed: 0,mp_id,do_id
0,MP:0002127,DOID:12583
1,MP:0000452,DOID:12583
2,MP:0002108,DOID:12583
3,MP:0000614,DOID:12583
4,MP:0001823,DOID:12583
...,...,...
36818,MP:0000274,DOID:0110424
36819,MP:0002753,DOID:0110424
36822,MP:0002833,DOID:0110424
36823,MP:0011085,DOID:0110424


In [52]:
# Load the MGI_Geno_NotDiseaseDO report (tab-separated), using the same column
# layout as the MGI_Geno_DiseaseDO report.
# Passing `names` without `header` makes pandas treat the file as header-less;
# index_col=False stops pandas from using the first column as the index.
df_mgi_geno_not_disease_do = pd.read_csv(
    mgi_geno_not_disease_do,
    sep="\t",
    index_col=False,
    names=[
        "Allelic Composition",
        "Allele Symbol(s)",
        "Allele ID(s)",
        "Genetic Background",
        "Mammalian Phenotype ID",
        "PubMed ID",
        # "PubMed ID (pipe-delimited)",
        "MGI Marker Accession ID",
        # "MGI Marker Accession ID (pipe-delimited)",
        "DO ID",
        # "DO ID (pipe-delimited)",
    ],
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df_mgi_geno_not_disease_do.info()
df_mgi_geno_not_disease_do

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 963 entries, 0 to 962
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Allelic Composition      963 non-null    object
 1   Allele Symbol(s)         963 non-null    object
 2   Allele ID(s)             963 non-null    object
 3   Genetic Background       963 non-null    object
 4   Mammalian Phenotype ID   963 non-null    object
 5   PubMed ID                953 non-null    object
 6   MGI Marker Accession ID  963 non-null    object
 7   DO ID                    963 non-null    object
dtypes: object(8)
memory usage: 60.3+ KB
None


Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID,MGI Marker Accession ID,DO ID
0,Pax3<Sp-2H>/Pax3<Sp-2H>,Pax3<Sp-2H>,MGI:1856293,involves: C57BL/6,MP:0000729,7600971|8631247,MGI:97487,DOID:0110949
1,Pax3<Sp-2H>/Pax3<Sp-2H>,Pax3<Sp-2H>,MGI:1856293,involves: C57BL/6,MP:0003054,,MGI:97487,DOID:0110949
2,Pax3<Sp-2H>/Pax3<Sp-2H>,Pax3<Sp-2H>,MGI:1856293,involves: C57BL/6,MP:0003090,8631247,MGI:97487,DOID:0110949
3,Pax3<Sp-2H>/Pax3<Sp-2H>,Pax3<Sp-2H>,MGI:1856293,involves: C57BL/6,MP:0004206,8631247,MGI:97487,DOID:0110949
4,Pax3<Sp-2H>/Pax3<Sp-2H>,Pax3<Sp-2H>,MGI:1856293,involves: C57BL/6,MP:0011091,,MGI:97487,DOID:0110949
...,...,...,...,...,...,...,...,...
958,Nmnat1<tm1Ruch>/Nmnat1<tm1Ruch>,Nmnat1<tm1Ruch>,MGI:6272867,involves: 129S7/SvEvBrd,MP:0002169,29674119,MGI:1913704,DOID:0110005
959,Cenpf<em3Bko>/Cenpf<em3Bko>,Cenpf<em3Bko>,MGI:6287441,involves: C57BL/6 * C57BL/6NTac * DBA/2,MP:0002169,30856164,MGI:1313302,DOID:0110595
960,Sgcb<em1Isrd>/Sgcb<em1Isrd>,Sgcb<em1Isrd>,MGI:6119737,involves: 129 * C57BL/6N,MP:0002169,29360879,MGI:1346523,DOID:0110279
961,Eftud2<em2Lajm>/Eftud2<+>,Eftud2<+>|Eftud2<em2Lajm>,MGI:6358539,involves: CD-1 * FVB/N,MP:0003984,31276534,MGI:1336880,DOID:0080196


In [53]:
# Phenotype (MP) -> disease (DO) pairs from the "NotDiseaseDO" report,
# one DO term per row.
df_mp_doid_1 = (
    df_mgi_geno_not_disease_do[["Mammalian Phenotype ID", "DO ID"]]
    .rename(columns={"Mammalian Phenotype ID": "mp_id", "DO ID": "do_id"})
    # "DO ID" is pipe-delimited: split each cell into a list, then explode the
    # list into one row per DO term.
    .assign(do_id=lambda df: df["do_id"].str.split("|"))
    .explode("do_id")
    # De-duplicate only after exploding: "A|B" and "A" for the same MP term
    # would otherwise both survive and yield the pair (mp_id, A) twice.
    .drop_duplicates()
)
df_mp_doid_1

Unnamed: 0,mp_id,do_id
0,MP:0000729,DOID:0110949
1,MP:0003054,DOID:0110949
2,MP:0003090,DOID:0110949
3,MP:0004206,DOID:0110949
4,MP:0011091,DOID:0110949
...,...,...
958,MP:0002169,DOID:0110005
959,MP:0002169,DOID:0110595
960,MP:0002169,DOID:0110279
961,MP:0003984,DOID:0080196


In [55]:
# MP terms that appear in both the DiseaseDO and the NotDiseaseDO mappings.
intersect = set(df_mp_doid["mp_id"]) & set(df_mp_doid_1["mp_id"])
len(intersect)

619

---

# API

In [61]:
# Example lookup of a single MP term through the Monarch "bioentity" endpoint.
# Note in the response that the returned "id" can differ from the id requested.
mp_id = "MP:0002626"
url = f"{MONARCH_API}/bioentity/{mp_id}"
# Always pass a timeout: without one, requests waits indefinitely on an
# unresponsive server and the cell never finishes.
r = requests.get(url, timeout=30)
r.raise_for_status()
pprint(r.json())

{'association_counts': None,
 'category': ['phenotype', 'quality'],
 'consider': None,
 'deprecated': None,
 'description': 'A rapid heartrate that exceeds the range of the normal '
                'resting heartrate for age.',
 'id': 'HP:0001649',
 'iri': 'http://purl.obolibrary.org/obo/HP_0001649',
 'label': 'Tachycardia',
 'replaced_by': None,
 'synonyms': [{'pred': 'synonym', 'val': 'Elevated heart rate', 'xrefs': None},
              {'pred': 'synonym', 'val': 'Heart racing', 'xrefs': None},
              {'pred': 'synonym', 'val': 'Increased heart rate', 'xrefs': None},
              {'pred': 'synonym', 'val': 'Racing heart', 'xrefs': None},
              {'pred': 'synonym', 'val': 'Rapid heart beat', 'xrefs': None},
              {'pred': 'synonym', 'val': 'Fast heart rate', 'xrefs': None}],
 'taxon': {'id': None, 'label': None},
 'types': None,
 'xrefs': ['UMLS:C4020868',
           'UMLS:C0039231',
           'MSH:D013610',
           'SNOMEDCT_US:3424008',
           'SNOMEDC

In [66]:
def mp_to_hp(mp_id: str, timeout: float = 30.0) -> Optional[str]:
    """Look up an MP term in the Monarch API and return the id it resolves to.

    The function returns the ``id`` field of the ``/bioentity/<mp_id>``
    response unchanged. That id is not guaranteed to be an HP term: for many
    inputs the API answers with the same ``MP:`` id, so callers that need a
    real HP mapping must check the ``HP:`` prefix themselves.

    Args:
        mp_id: Mammalian Phenotype term id, e.g. ``"MP:0002626"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``id`` reported by the API, or ``None`` when ``mp_id`` is missing
        or empty, the request fails, or the response has no usable ``id``.
    """
    # Missing values (None / NaN from pandas) and empty strings cannot be
    # looked up; skip the request instead of querying ".../bioentity/nan".
    if not isinstance(mp_id, str) or not mp_id:
        return None
    url = f"{MONARCH_API}/bioentity/{mp_id}"
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        return r.json()["id"]
    # Best-effort lookup: network/HTTP errors, an undecodable body (ValueError)
    # or an unexpected payload shape (KeyError/TypeError) all mean "no mapping".
    # Unlike a bare `except:`, this lets KeyboardInterrupt and genuine
    # programming errors propagate.
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return None

In [67]:
# Smoke-test the helper on the example term defined earlier in the notebook.
mp_to_hp(mp_id)

'HP:0001649'

In [68]:
# Resolve every distinct MP term through the Monarch API.
# This issues one HTTP request per term, so the cell is slow.
df_mp_to_hp = (
    df_phenotype[["mp_term_id"]]
    .drop_duplicates()
    # Pass the function directly; a lambda wrapper adds nothing.
    .assign(hp_id=lambda df: df["mp_term_id"].apply(mp_to_hp))
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df_mp_to_hp.info()
df_mp_to_hp

<class 'pandas.core.frame.DataFrame'>
Int64Index: 758 entries, 0 to 45202
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   mp_term_id  757 non-null    object
 1   hp_id       730 non-null    object
dtypes: object(2)
memory usage: 17.8+ KB
None


Unnamed: 0,mp_term_id,hp_id
0,MP:0005635,MP:0005635
1,MP:0000266,HP:0001627
6,MP:0002644,HP:0012153
7,MP:0002797,MP:0002797
8,MP:0001410,MP:0001410
...,...,...
44771,MP:0013692,MP:0013692
44832,MP:0002217,MP:0002217
44966,MP:0001158,MP:0001158
44993,MP:0009204,MP:0009204


In [69]:
# Save the MP -> id lookup results to CSV; index=False omits the row index.
df_mp_to_hp.to_csv("data/mp_to_hp.csv", index=False)

In [70]:
# Keep only rows where both columns are populated, i.e. lookups that returned an id.
df_mp_to_hp.dropna()

Unnamed: 0,mp_term_id,hp_id
0,MP:0005635,MP:0005635
1,MP:0000266,HP:0001627
6,MP:0002644,HP:0012153
7,MP:0002797,MP:0002797
8,MP:0001410,MP:0001410
...,...,...
44771,MP:0013692,MP:0013692
44832,MP:0002217,MP:0002217
44966,MP:0001158,MP:0001158
44993,MP:0009204,MP:0009204


In [71]:
# Flag which of the returned ids are genuine HP terms (prefix "HP:").
foo = df_mp_to_hp[["hp_id"]].dropna()
foo = foo.assign(is_hp=foo["hp_id"].str.startswith("HP:"))
foo

Unnamed: 0,hp_id,is_hp
0,MP:0005635,False
1,HP:0001627,True
6,HP:0012153,True
7,MP:0002797,False
8,MP:0001410,False
...,...,...
44771,MP:0013692,False
44832,MP:0002217,False
44966,MP:0001158,False
44993,MP:0009204,False


In [78]:
# Count how many returned ids are HP terms (True) versus not (False).
foo["is_hp"].value_counts()

False    501
True     229
Name: is_hp, dtype: int64

---

# Mapping coverage

In [46]:
# gene mapping
df_cover_gene = df_markers.merge(
    df_hmd_gene_mapping.rename(
        columns={
            "mouse_gene_id": "marker_accession_id",
            "mouse_gene_label": "marker_symbol",
        }
    ),
    on=["marker_accession_id", "marker_symbol"],
)
print(df_cover_gene.info())
print(
    df_cover_gene["marker_accession_id"].drop_duplicates().pipe(len)
    / df_markers["marker_accession_id"].drop_duplicates().pipe(len)
)
df_cover_gene

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6221 entries, 0 to 6220
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   marker_accession_id  6221 non-null   object
 1   marker_symbol        6221 non-null   object
 2   human_gene_label     6221 non-null   object
dtypes: object(3)
memory usage: 194.4+ KB
None
0.9684705882352941


Unnamed: 0,marker_accession_id,marker_symbol,human_gene_label
0,MGI:1920347,Dact2,DACT2
1,MGI:2685256,Ankub1,ANKUB1
2,MGI:2385213,Pdik1l,PDIK1L
3,MGI:3028577,Arl5c,ARL5C
4,MGI:109232,Npas2,NPAS2
...,...,...,...
6216,MGI:1920412,Otud3,OTUD3
6217,MGI:2686212,Dcdc2b,DCDC2B
6218,MGI:2675296,Crebzf,CREBZF
6219,MGI:1891832,Patz1,PATZ1


In [79]:
# phenotype to disease mapping
df_cover_phenotype = df_phenotype.merge(
    df_mp_doid.rename(columns={"mp_id": "mp_term_id"}), on=["mp_term_id"]
)
print(df_cover_phenotype.info())
print(
    df_cover_phenotype["mp_term_id"].drop_duplicates().pipe(len)
    / df_phenotype["mp_term_id"].drop_duplicates().pipe(len)
)
df_cover_phenotype

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6652 entries, 0 to 6651
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   mp_term_id    6652 non-null   object
 1   mp_term_name  6652 non-null   object
 2   do_id         6652 non-null   object
dtypes: object(3)
memory usage: 207.9+ KB
None
0.716358839050132


Unnamed: 0,mp_term_id,mp_term_name,do_id
0,MP:0000266,abnormal heart morphology,DOID:0110875
1,MP:0000266,abnormal heart morphology,DOID:0110106
2,MP:0000266,abnormal heart morphology,DOID:6419
3,MP:0000266,abnormal heart morphology,DOID:1933
4,MP:0000266,abnormal heart morphology,DOID:0090145
...,...,...,...
6647,MP:0002217,small lymph nodes,DOID:0060010
6648,MP:0001158,abnormal prostate gland morphology,DOID:10283
6649,MP:0001158,abnormal prostate gland morphology,DOID:4674
6650,MP:0001158,abnormal prostate gland morphology,DOID:0111330
