## Get phenotypes associated with a list of genes


### Set up 


In [96]:
import json
#from pprint import pprint

In [97]:
import requests

INDEX_PAGE = "http://idr-demo.openmicroscopy.org/webclient/?experimenter=-1"

# Create the HTTP session that the later cells reuse for every API call.
# It is deliberately NOT wrapped in a `with` block: leaving the `with` would
# close the session, yet the gene/phenotype loop further down still calls
# session.get() on it.
session = requests.Session()

# Load the index page once through the session
# (presumably so the session picks up any cookies the server sets - confirm).
request = requests.Request('GET', INDEX_PAGE)
prepped = session.prepare_request(request)
response = session.send(prepped)

# Stop here on an HTTP error status instead of failing later inside the loop.
response.raise_for_status()

### First read in the table with the list of genes
The example contains 
* a gene that has an image annotated with a phenotype not mapped to any ontology (PSMA1)
* a gene that has no images annotated with any phenotypes (PSMA5)
* a gene that has images annotated with multiple phenotypes, some of which are mapped to more than one ontology term (PREX2)
* a gene that is not annotated to any images (NOTAGENE)


In [98]:
# Read the whole gene list file, then break it into one entry per line.
with open('FiveExampleGenes.txt') as gene_file:
    contents = gene_file.read()
genes = contents.splitlines()

### Check that the table has been read in properly - print out first 5 values and check the length of the list

In [99]:
# Show the first five gene symbols as a quick check that the file was read.
genes[:5]


['PSMA1', 'PSMA2', 'PSMA5', 'PREX2', 'NOTAGENE']

In [100]:
# Number of gene symbols read from the file.
len(genes)

5

### Set up the URLs we are going to use to query IDR

In [101]:
# URL templates for the IDR endpoints queried below. The {placeholders} are
# filled in with str.format() when each request is built.
IDR_HOST = "http://idr-demo.openmicroscopy.org"
MAPR_API = IDR_HOST + "/mapr/api/{key}"

SCREENS_PROJECTS_URL = MAPR_API + "/?value={value}"
PLATES_URL = MAPR_API + "/plates/?value={value}&id={screen_id}"
IMAGES_URL = MAPR_API + "/images/?value={value}&node={parent_type}&id={parent_id}"
ATTRIBUTES_URL = IDR_HOST + "/webclient/api/annotations/?type=map&image={image_id}"

### Then get the phenotypes

For each gene in our list
* get the screens it is in
* get the plates in those screens that contain the gene
* then the images
* then the annotations of those images

NOTE: to be added - should also do this for datasets

#### First open up the file that the results will be written to and create the column header row

In [102]:
# Open the results file and write the tab-separated column header row.
# The handle is left open on purpose: the gene loop that follows writes its
# rows to `file` and closes it when done.
header_columns = ['Gene Symbol', 'Screen', 'Plate', 'ImageID',
                  'AuthorPhenotype', 'CMPOTerm', 'CMPOAccession']
file = open('GenesWithPhenotypes.txt', 'w')
file.write('\t'.join(header_columns) + '\n')

#### Then iterate through each gene in the list and fetch the phenotypes

In [103]:


# For every gene, walk screens -> plates -> images -> map annotations and
# write one tab-delimited row for each image that has phenotype annotations.
#
# Uses names created in earlier cells: `genes`, `session`, the *_URL
# templates and the open output handle `file`, which is closed here.
#
# Cases the data can contain:
#   * gene not associated with any images
#   * gene associated with images, none of which has a phenotype
#   * image with one phenotype and no ontology mapping
#   * image with one phenotype and one or more ontology mappings
#   * image with several phenotypes, each with a varying number of mappings
#
# TODO: an image with several phenotypes still gets all of them appended to
# a single row, so its columns no longer line up with the header. A nested
# structure (e.g. a dictionary of dictionaries) may be a better fit.
try:
    for gene in genes:
        screens_url = SCREENS_PROJECTS_URL.format(key='gene', value=gene)

        for s in session.get(screens_url).json()['screens']:
            screen_id = s['id']
            screen_name = s['name']
            plates_url = PLATES_URL.format(
                key='gene', value=gene, screen_id=screen_id)

            for p in session.get(plates_url).json()['plates']:
                plate_id = p['id']
                plate_name = p['name']
                images_url = IMAGES_URL.format(
                    key='gene', value=gene,
                    parent_type='plate', parent_id=plate_id)

                for i in session.get(images_url).json()['images']:
                    image_id = str(i['id'])
                    annotations_url = ATTRIBUTES_URL.format(image_id=image_id)

                    # Start a fresh row for every image so that nothing
                    # carries over from the previous image or plate.
                    geneRow = []

                    for a in session.get(annotations_url).json()['annotations']:
                        # Each entry of a['values'] is treated as a pair whose
                        # element [1] is the value (presumably [key, value]).
                        for kvpair in a['values']:
                            if 'Phenotype' in kvpair:
                                geneRow.extend([gene, screen_name, plate_name,
                                                image_id, kvpair[1]])

                            if 'Phenotype Term Name' in kvpair:
                                geneRow.append(kvpair[1])

                            if 'Phenotype Term Accession' in kvpair:
                                geneRow.append(kvpair[1])

                    # Write the row inside the image loop: one row per image,
                    # and images without phenotype annotations are skipped.
                    if geneRow:
                        printRow = '\t'.join(geneRow) + '\n'
                        # Parenthesised single-argument print behaves the same
                        # on Python 2 and Python 3.
                        print(printRow)
                        file.write(printRow)
finally:
    # Close the results file even if a request fails part-way through.
    file.close()
    


PSMA1	idr0010-doil-dnadamage/screenA (3)	2-60	3074578	increased DNA damage	cell response to DNA damage phenotype	CMPO_0000415

PSMA1	idr0010-doil-dnadamage/screenA (3)	7-14	3090872	increased DNA damage	cell response to DNA damage phenotype	CMPO_0000415

PSMA1	idr0012-fuchs-cellmorph/screenA (2)	HT11	1818569	other phenotype

PSMA2	idr0010-doil-dnadamage/screenA (3)	2-60	3074601	increased DNA damage	cell response to DNA damage phenotype	CMPO_0000415

PSMA2	idr0010-doil-dnadamage/screenA (3)	7-14	3090824	increased DNA damage	cell response to DNA damage phenotype	CMPO_0000415

PSMA2	idr0013-neumann-mitocheck/screenA (11)	LT0002_02	1484705	dynamic changes (automatic)	increased variability of nuclear shape in population	CMPO_0000345

PSMA2	idr0013-neumann-mitocheck/screenA (11)	LT0002_24	1485036	dynamic changes (automatic)	increased variability of nuclear shape in population	CMPO_0000345

PSMA2	idr0013-neumann-mitocheck/screenA (11)	LT0002_51	1485556	dynamic changes (automatic)	increased var