# Reproducing Fig. 1

This notebook examplifies how to reproduce figure 1 of the article. the annotations from all screens will be downloaded and parsed to build statistics on phenotypes, which will eb displayed using bokeh.

## Preamble

In [14]:
from pandas import DataFrame,concat, read_csv
import omero
import numpy as np

import matplotlib.pyplot as plt
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show, save
from bokeh.charts import Scatter
from bokeh.models.formatters import TickFormatter, String, List, Dict, Int
from bokeh.models import FixedTicker,HoverTool
import bokeh.palettes as bpal

output_notebook()
%matplotlib inline

In [7]:
from idr import connection

conn = connection()

In [8]:
def getBulkAnnotationAsDf(screenID,conn):
#    ofId=8118685
    sc=conn.getObject('Screen',screenID)
    for ann in sc.listAnnotations():
        if isinstance(ann, omero.gateway.FileAnnotationWrapper):
            if (ann.getFile().getName()=='bulk_annotations'):
                if (ann.getFile().getSize()> 1476250900): #about 140Mb?
                    print "that's a big file..."
                    return None
                ofId=ann.getFile().getId()
                break


    original_file = omero.model.OriginalFileI(ofId, False)

    openTable = conn.c.sf.sharedResources().openTable(original_file)
    rowCount = openTable.getNumberOfRows()

    #table to df

    column_names = [col.name for col in openTable.getHeaders()]

    black_list = []
    column_indices = []
    for column_name in column_names:
        if column_name in black_list:
            continue
        column_indices.append(column_names.index(column_name))

    table_data = openTable.slice(column_indices, None)
    data = []
    for index in range(rowCount):
        row_values = [column.values[index] for column in table_data.columns]
        data.append(row_values)

    dfAnn=DataFrame(data)
    dfAnn.columns=column_names
    return dfAnn

In [9]:
def appendPhInfo(phall,screen,df):
# extract all phenotypes informatino from given bulk annotation file and
# append it to the phall
    phcol=df.columns[[('Phenotype' in s) and ('Term Accession' in s) for s in df.columns ]]
    for s in phcol:
        ph=df[s].unique()
        if ph[0]!='':
            ph=ph[0]            
            desc=df[s.replace('Accession', 'Name')].unique()[0]
        else:
            ph=ph[1]
            desc=df[s.replace('Accession', 'Name')].unique()[1]
        dfph=df[df[s]!='']
        try:
            phall[ph]['n']=phall[ph]['n']+len(dfph)
            if not (screen in phall[ph]['screens']): 
                phall[ph]['screens'].append(screen)

        except Exception as e: 
            print(e)
            phcur={'n':len(dfph),'screens':[screen],'desc':desc}
            phall[ph]=phcur
    

## Build and display figure

In [10]:

#screens=[3,206,1202]
screens= conn.getObjects("Screen")

phall={}
for sc in screens:
    scId=sc.getId()
    name=sc.getName()
    print 'loading '+str(scId)+'::'+name
    try:
        df=getBulkAnnotationAsDf(scId,conn)
        appendPhInfo(phall,name,df)
    except:
        print 'fail!'

loading 3::idr0001-graml-sysgro/screenA
'PATO_0000460'
'CMPO_0000369'
'CMPO_0000370'
'CMPO_0000371'
'CMPO_0000372'
'CMPO_0000365'
'CMPO_0000077'
'CMPO_0000367'
'CMPO_0000116'
'CMPO_0000364'
'CMPO_0000366'
'CMPO_0000118'
'GO_0022403'
'CMPO_0000383'
'CMPO_0000388'
'CMPO_0000378'
'CMPO_0000387'
'CMPO_0000412'
'CMPO_0000410'
'CMPO_0000413'
loading 102::idr0002-heriche-condensation/screenA
'CMPO_0000329'
'CMPO_0000328'
loading 51::idr0003-breker-plasticity/screenA
'CMPO_0000391'
'CMPO_0000392'
'CMPO_0000393'
'CMPO_0000394'
'CMPO_0000395'
'CMPO_0000396'
'CMPO_0000397'
'CMPO_0000398'
'CMPO_0000400'
'CMPO_0000401'
'CMPO_0000402'
loading 202::idr0004-thorpe-rad52/screenA
'CMPO_0000415'
loading 253::idr0006-fong-nuclearbodies/screenA
'CMPO_0000404'
'CMPO_0000405'
'CMPO_0000406'
'CMPO_0000407'
'CMPO_0000408'
'CMPO_0000409'
loading 201::idr0007-srikumar-sumo/screenA
'PATO_0000462'
loading 154::idr0008-rohn-actinome/screenA
'CMPO_0000129'
'CMPO_0000128'
'CMPO_0000270'
'CMPO_0000267'
'CMPO_0000274'


In [15]:
#load grounping of phenotypes for ordering and coloring them. Decided manually offline (see paper)

dfColor=read_csv('CMPOAccessionToPhenotypeCategoriesNov2016.csv')
colors={}
for i,grp in enumerate(dfColor.CmpoPhenotypeCategory.unique()):
    colors[grp]=bpal.Set3_10[i%10]

In [16]:
#add the information to the data and sort it
for ph in phall.keys():
#    print ph
    try:
        phall[ph]['group']=dfColor[dfColor['CmpoAcc']==ph]['CmpoPhenotypeCategory'].values[0]
        phall[ph]['groupColor']=colors[phall[ph]['group']]
        phall[ph]['FigureCmpoName']=dfColor[dfColor['CmpoAcc']==ph]['FigureCmpoName'].values[0]
        
    except:
        print 'pass:'+ph
        del phall[ph]

phalls=sorted(phall.values(), key=lambda x: x['group'])

pass:GO_0022403
pass:PATO_0000460
pass:PATO_0000462
pass:CMPO_0000369
pass:GO_0070887


In [17]:
TOOLS="pan,wheel_zoom,reset"

p = figure(title = "Fig 1", tools=TOOLS,y_axis_type="log",width=1200,toolbar_location="above")


source = ColumnDataSource(
    data=dict(
        ph=[ph['FigureCmpoName'] for ph in phalls],
        n=[ph['n'] for ph in phalls],
        names=[ph['screens'] for ph in phalls],
        desc=[ph['desc'] for ph in phalls],
        x=[2*x for x in range(len(phall.keys()))],
        r=[1*len(ph['screens']) for ph in phalls],
        color=[ph['groupColor'] for ph in phalls],
        groupName=[ph['group'] for ph in phalls]
    ))

label_data = {2*i:x for i,x in enumerate([ph['FigureCmpoName'] for ph in phalls])}

cir=p.circle('x','n',radius='r',source=source,color='color')


hover = HoverTool(
        tooltips=[
            ("Term", "@ph"),
            ("Description", "@desc"),
            ("Number of samples", "@n"),
            ("Screens name", "@names"),
            ("group", "@groupName")
    ]
    )
p.add_tools(hover)

JS_CODE =  """
        _ = require "underscore"
        Model = require "model"
        p = require "core/properties"
        class FixedTickFormatter extends Model
          type: 'FixedTickFormatter'
          doFormat: (ticks) ->
            labels = @get("labels")
            return (labels[tick] ? "" for tick in ticks)
          @define {
            labels: [ p.Any ]
          }
        module.exports =
          Model: FixedTickFormatter
    """

class FixedTickFormatter(TickFormatter):

    labels = Dict(Int, String, help="""
    A mapping of integer ticks values to their labels.
    """)

    __implementation__ = JS_CODE


p.xaxis.formatter = FixedTickFormatter(labels=label_data)
p.xaxis.ticker = FixedTicker(ticks=sorted(label_data.keys()))
p.xaxis.major_label_orientation = np.pi/4.
p.xaxis.axis_label_text_font_size = "10pt"
show(p)
#save(p,'fig1.html')