# Human Disease Association Wormbase WS266

Author: Zachary Flamholz  
Date: 07-2018  
Database: https://wormbase.org/#012-34-5  
Data: ftp://ftp.wormbase.org/pub/wormbase/releases/WS266/ONTOLOGY/disease_association.WS266.wb, ftp://ftp.wormbase.org/pub/wormbase/releases/WS266/ONTOLOGY/disease_ontology.WS266.obo

# Versions of modules in use

In [21]:
%load_ext version_information
%version_information numpy, pandas

Software,Version
Python,3.6.5 64bit [GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)]
IPython,6.4.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.5
pandas,0.23.1
Mon Jul 30 11:33:37 2018 EDT,Mon Jul 30 11:33:37 2018 EDT


## load libraries

In [1]:
import pandas as pd
import numpy as np
import sys, datetime
import networkx
import obonet

## read in data

In [3]:
# Load the WormBase disease-association table. The file is tab-separated and
# has no header row, so pandas labels the columns by integer position.
wb_disease = pd.read_csv(
    'in_production/disease_association.WS266.wb',
    sep='\t',
    header=None,
)

In [4]:
# Preview the first rows to see which positional column holds which field.
wb_disease.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,WB,WBGene00000001,aap-1,,DOID:2583,PMID:19029536,IEA,ENSEMBL:ENSG00000145675|OMIM:615214,D,,Y110A7A.10,gene,taxon:6239,20180609,WB,,
1,WB,WBGene00000002,aat-1,,DOID:0060439,PMID:19029536,IEA,ENSEMBL:ENSG00000155465|OMIM:222700,D,,F27C8.1,gene,taxon:6239,20180609,WB,,
2,WB,WBGene00000004,aat-3,,DOID:0060439,PMID:19029536,IEA,ENSEMBL:ENSG00000155465|OMIM:222700,D,,F52H2.2,gene,taxon:6239,20180609,WB,,
3,WB,WBGene00000020,abt-2,,DOID:0110015,PMID:19029536,IEA,ENSEMBL:ENSG00000198691|OMIM:153800,D,,F12B6.1,gene,taxon:6239,20180609,WB,,
4,WB,WBGene00000020,abt-2,,DOID:1388,PMID:19029536,IEA,ENSEMBL:ENSG00000165029|OMIM:205400,D,,F12B6.1,gene,taxon:6239,20180609,WB,,


In [11]:
# (number of association rows, number of columns) in the loaded table.
wb_disease.shape

(3037, 17)

In [5]:
# Distinct values in column 3, to check whether the column carries any data.
# (The table was read with header=None, so the column label 3 is its position.)
wb_disease[3].unique()

array([nan])

In [7]:
# Number of distinct values in column 5 (the preview shows "PMID:..." references
# there). dropna=False counts a missing value as one distinct entry, matching
# len(...unique()).
wb_disease[5].nunique(dropna=False)

221

In [8]:
# Distinct values in column 6 (short codes such as 'IEA' in the preview).
wb_disease[6].unique()

array(['IEA', 'IMP'], dtype=object)

In [9]:
# Distinct values in column 12 ("taxon:..." identifiers in the preview), to
# confirm whether every row refers to the same taxon.
wb_disease[12].unique()

array(['taxon:6239'], dtype=object)

In [10]:
# Number of distinct disease identifiers ("DOID:..." values, column 4).
# dropna=False counts a missing value as one distinct entry, matching
# len(...unique()).
wb_disease[4].nunique(dropna=False)

1186

## build dictionary of diseases and associated genes

In [17]:
# Map each disease identifier (column 4, "DOID:...") to the list of gene
# symbols (column 2, e.g. "aap-1") annotated to it. One list entry is kept per
# association row, in file order, so a gene may appear more than once.
diseases = {}

# Walk the two columns in lockstep rather than doing a positional .iloc
# lookup for every cell of every row.
for disease_id, gene in zip(wb_disease.iloc[:, 4], wb_disease.iloc[:, 2]):
    diseases.setdefault(disease_id, []).append(gene)

## get the ontology for diseases

In [13]:
# Parse the disease ontology OBO file with obonet, then take the node view
# with attribute data so a term's attributes (e.g. its 'name') can be looked
# up by its identifier.
graph = obonet.read_obo('in_production/disease_ontology.WS266.obo')
term_data = graph.nodes(data=True)

In [15]:
# Sanity check: look up the human-readable name of one term by its DOID.
term_data['DOID:0001816']['name']

'angiosarcoma'

## build gmt

In [19]:
# Write the gene-set library. Each disease term with at least MIN_GENES
# distinct genes becomes one line of the form
#   <term name>(<DOID>) TAB <empty description> TAB gene TAB gene ... TAB NEWLINE
MIN_GENES = 5  # terms with fewer distinct genes are left out of the library

# The output name carries the current year and month, e.g. ..._2018_07.gmt
filename = 'human_disease_wormbase_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
terms = diseases.keys()
n_terms = len(terms)

# The context manager closes the file even if a term lookup raises part-way
# through the loop.
with open(filename, 'w') as gmt_file:
    for i, term in enumerate(terms):

        progressPercent = ((i+1)/n_terms)*100
        sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), n_terms))
        sys.stdout.flush()

        # De-duplicate once, then sort so the file is identical from run to
        # run (iteration order of a set of strings can differ between
        # interpreter sessions). key=str keeps the sort safe if a non-string
        # value such as a missing gene symbol is present.
        genes = sorted(set(diseases[term]), key=str)
        if len(genes) < MIN_GENES:
            continue

        name = term_data[term]['name'] + '(' + term + ')'
        # Term label followed by an empty description field.
        gmt_file.write("%s\t\t" % name)
        for gene in genes:
            gmt_file.write("%s\t" % gene)
        gmt_file.write("\n")

Progress: 100%  1186 Out of 1186   