In [8]:
import pandas as pd
import requests
import json
import gzip
import sys

from Bio import SeqIO
from enrichment_functions import * 

# Constants
# Europe PMC annotations API endpoint (annotationsByArticleIds); passed to
# map_protein_to_diseases in the DO-mining cell.
URL = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds'
# Gzipped .tab file passed to map_protein_to_diseases together with URL.
PUBMED_IDS_PATH = '../../data/human_pubmed.tab.gz'
# JSON file that is loaded into protein_to_do.
DOA_PATH = '../../data/uniprot_do.json'
# Disease Ontology .obo file handed to parse_disease_ontology.
DOID_PATH = '../../data/doid.obo'

## Mine and parse DO

In [9]:
# DO mining (disabled by default).
# The mined mapping is normally read back from DOA_PATH, so this step only
# has to run when that file needs to be regenerated: set REBUILD_DOA = True.
REBUILD_DOA = False

if REBUILD_DOA:
    # { acc : disease ids list}
    protein_to_do = map_protein_to_diseases(PUBMED_IDS_PATH, URL)

    # Save result. json.dump needs a writable file object (not a path string),
    # and writing to DOA_PATH keeps the saved file in sync with the one that
    # gets loaded afterwards.
    with open(DOA_PATH, 'w') as file:
        json.dump(protein_to_do, file, indent=1)

In [10]:
# Read the stored DO annotations for human proteins (DOA) from JSON
with open(DOA_PATH) as doa_file:
    protein_to_do = json.loads(doa_file.read())

In [11]:
# Open the DO ontology file and parse it while the handle is still open
with open(DOID_PATH) as obo_file:
    ontology_parts = parse_disease_ontology(obo_file)
    labels, ancestors, min_depth = ontology_parts

# Original dataset

In [12]:
# Load the original dataset (hits retrieved by our model), one entry per line
with open("../../datasets/original.txt") as hits_file:
    hits_text = hits_file.read()
dataset = hits_text.splitlines()

In [13]:
# The background dataset is the entire human proteome in SwissProt.
# Record ids have the form sp|P46108|CRK_HUMAN; the second '|'-separated field is kept.
human = SeqIO.parse('../../data/SwissProt_humans_reference_all.fasta','fasta')
background = [record.id.split('|')[1] for record in human]

In [14]:
# Count the ancestors for the dataset, then for the background,
# using the same ontology ancestors and protein -> DO mapping for both
shared_args = (ancestors, protein_to_do)
dataset_count = count_ancestors(dataset, *shared_args)
background_count = count_ancestors(background, *shared_args)

In [15]:
# Run the Fisher exact test on the dataset counts against the background counts
result_fisher = fisher_test(
    dataset_count,
    background_count,
    min_depth,
    labels,
)

In [16]:
# Rank terms from highest to lowest odds ratio. Rebinding the name to the sorted
# copy (instead of inplace=True) leaves the frame returned by fisher_test unmodified.
result_fisher = result_fisher.sort_values(by="OddRatio", ascending=False)

In [17]:
# Attach the depth and description columns, then preview the first rows
result = add_depth_description(df=result_fisher, min_depth=min_depth, labels=labels)
result.head()

Unnamed: 0,OddRatio,p-value,depth,label
111512,128.694517,0.01540089,7,metachondromatosis
6420,128.694517,0.01540089,6,pulmonary valve stenosis
1884,92.883528,2.037241e-08,6,viral hepatitis
11702,86.017452,0.0005869084,6,dysgammaglobulinemia
12385,77.615748,2.493879e-05,5,shigellosis


In [22]:
# Dimensions of the annotated result table as (rows, columns)
result.shape

(125, 4)

In [18]:
# Bonferroni correction:
# divide the significance level by the number of tests to get the adjusted threshold
# http://www.biostathandbook.com/multiplecomparisons.html
alpha = 0.05
m = len(result_fisher)
adjusted_threshold = alpha / m
print(f"Adjusted threshold: {adjusted_threshold}")

Adjusted threshold: 0.0004


In [24]:
# Keep terms with depth <= 4 whose p-value passes the Bonferroni-adjusted threshold.
# The mask is built from `result`, the frame returned by add_depth_description, so it
# does not rely on that helper also adding the "depth" column to result_fisher in place.
filter_terms = (result["depth"] <= 4) & (result["p-value"] <= adjusted_threshold)

result[filter_terms].head()

Unnamed: 0,OddRatio,p-value,depth,label


In [35]:
# Show the terms ordered by increasing p-value (sort_values returns a sorted copy)
result_fisher.sort_values(by='p-value', ascending=True)

Unnamed: 0,OddRatio,p-value,depth,label
0111512,128.694517,3.070280e-09,7,metachondromatosis
6420,128.694517,2.037241e-08,6,pulmonary valve stenosis
1884,92.883528,2.493879e-05,6,viral hepatitis
11702,86.017452,7.165420e-05,6,dysgammaglobulinemia
12385,77.615748,1.923081e-04,5,shigellosis
7188,64.511780,2.381738e-04,6,autoimmune thyroiditis
14499,64.345953,2.742622e-04,6,Fabry disease
0050120,42.896432,5.869084e-04,4,hemophagocytic lymphohistiocytosis
11294,42.896432,6.328204e-04,6,arteriovenous malformation
8568,32.253272,8.758576e-04,3,infectious mononucleosis


array([3.07028020e-09, 2.03724143e-08, 2.49387861e-05, 7.16541992e-05,
       1.92308056e-04, 2.38173845e-04, 2.74262220e-04, 5.86908404e-04,
       6.32820401e-04, 8.75857635e-04, 9.92545071e-04, 1.37993908e-03,
       1.90835360e-03, 1.98302410e-03, 2.41328977e-03, 2.57428514e-03,
       2.57428514e-03, 3.13029911e-03, 3.82704261e-03, 7.15907346e-03,
       1.46455598e-02, 1.52460138e-02, 1.54008909e-02, 1.54008909e-02,
       1.71381133e-02, 2.30123913e-02, 2.32101739e-02, 3.05652026e-02,
       3.05652026e-02, 3.15867281e-02, 3.80597761e-02, 3.80597761e-02,
       3.80597761e-02, 3.80597761e-02, 4.54965598e-02, 4.93554046e-02,
       5.12921809e-02, 5.28759981e-02, 5.32558291e-02, 5.52052380e-02,
       6.01985321e-02, 6.01985321e-02, 6.74645993e-02, 7.20527776e-02,
       7.26550858e-02, 7.46746342e-02, 7.46746342e-02, 8.18290678e-02,
       8.31288574e-02, 8.55907557e-02, 8.79167258e-02, 8.89283275e-02,
       8.89283275e-02, 9.59728380e-02, 1.02963020e-01, 1.04699333e-01,
      