In [179]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import shelve
import random
import itertools
import omero
import scipy.cluster.hierarchy as hchy
from seaborn import clustermap

from pandas import Series,DataFrame,read_csv, merge,concat,read_hdf
from omero.gateway import BlitzGateway
from matplotlib.widgets import Slider
from matplotlib import gridspec
from sklearn.externals import joblib
from contextlib import closing

from sklearn import random_projection
import sklearn.neighbors as nn
import sklearn.manifold as man
import sklearn.decomposition as dec
from sklearn.preprocessing import scale,robust_scale
from sklearn.metrics.pairwise import euclidean_distances


%matplotlib nbagg
plt.rcParams['image.cmap']='gray'

#hard coded dirs
dbpath='/home/ubuntu/idr_homes/szleo/features/idr0008-rohn-actinome/screenB/output/'
dataDir='/home/ubuntu/data'
scratchDataDir='/scratch/rohnFeatures'


In [18]:
def getImIDs(plns, wells,fields):
    lids=list()
    try:
        conn = BlitzGateway('demo', 'cambridge', host='orca-3.openmicroscopy.org', port=4064)
        connected = conn.connect()

        for pln,wstr,f in zip(plns,wells,fields):
        
            r=ord(wstr[0:1])-65
            c=int(wstr[1:])-1
            
#            print pln,wstr,f,r,c

            Q='''select well from Well as well 
            left outer join fetch well.wellSamples as ws 
            left outer join fetch ws.image as img 
            where well.plate.name = \'''' + pln+ '\' and well.row='+str(r)+' and well.column='+str(c)

            w=conn.getQueryService().findAllByQuery(Q,None)[0]
            im = w.getWellSample(int(f)-1).getImage()
            lids.append(im.getId().getValue())
    finally:
        conn._closeSession()
    
    return lids

In [19]:
def getMapAnnotationsAsDicts(imids):
    la=list()
    try:
        conn = BlitzGateway('demo', 'cambridge', host='orca-3.openmicroscopy.org', port=4064)
        connected = conn.connect()

        for imid in imids:
            im= conn.getObject("Image",imid)
            a=im.getAnnotation()
            la.append(dict(a.getValue()))
    finally:
        conn._closeSession()
    
    return la

In [4]:
def db2pd(fin):
#from https://gist.github.com/manics/bc003272b038027faf653d576f3c9393

    # f = "plate1_1_013_5_features.db"

#    print "Opening %s" % fin
    with closing(shelve.open(fin, "r")) as f:
        values = list(f.itervalues())

    x = values[0]

    header_meta = []
    header_ft = []
    header_ftn = []
    # width_ft = []

    for k in sorted(x.keys()):
        v = x[k]
        if isinstance(v, list):
            header_ft.append(k)
            header_ftn.extend('%s_%02d' % (k, i) for i in xrange(len(v)))
            # width_ftn.append(len(v))
        else:
            header_meta.append(k)

    assert len(header_ft) == 124
    assert len(header_meta) == 11

#    print "Processing %s" % fin
    rows = []
    for vs in values:
        meta = [vs[h] for h in header_meta]
        ft = [vs[h] for h in header_ft]
        r = itertools.chain.from_iterable([meta] + ft)
        rows.append(list(r))

#    print "Creating dataframe %s" % fin
    df = DataFrame(rows, columns=(header_meta + header_ftn))

    #print "Saving %s" % fout
    #df.to_hdf(fout, 'wndcharm', complevel=9, complib='zlib')

    return df


In [5]:
def getRohnTile(imid,x,y,w,h,chan=0):
#    plateId,x,y,w,h,ti,tf=422,0,0,200,200,50,150
    try:
        conn = BlitzGateway('demo', 'cambridge', host='orca-2.openmicroscopy.org', port=4064)
        connected = conn.connect()

        im = conn.getObject("Image", imid)
        pix=im.getPrimaryPixels()

        z=0;c=chan;
        tile = (x, y, w, h) 
        plane = pix.getTile(tile=tile)
    finally:
        conn._closeSession()
    
    return plane

In [6]:
def goneFishing(df,qry,s2w,nbrs):
#    pln,x,y,t='plate1_1_013_5',504,384,220  #division at 220

#    qry=df[(df.x==x) & (df.y==y) & (df.series==se)].iloc[:,11:]

#    dfq=df[df.name!=pln]
    chan=0
    

#    hook=getRohnTile(imid,x,y,w,h,chan)    
    
    distances, indices = nbrs.kneighbors(qry)
    nnn=len(indices[0])
    d,r=divmod(nnn,4)
               
    w=df.w.iloc[0]
    h=df.h.iloc[0]
    tiles=np.zeros((h,w,nnn))
    for ind,ii in zip(indices[0],range(nnn)):
        try:
            secur=df.series.iloc[ind]
            x=df.x.iloc[ind]
            y=df.y.iloc[ind]
            imindcur=s2w[(s2w.PLATE==pln)&(s2w.SERIES==secur)].imIds.iloc[0]
            tiles[:,:,ii]=getRohnTile(imindcur,x,y,w,h,chan) 
        except:
            continue

    
    plt.figure(figsize=(12,12))

    imc=buildComposite(tiles,d+(1&r),4,smpl=1)
#    plt.figure(figsize=(12,15))
    plt.imshow(imc)
    

In [7]:
def buildComposite(st,n,m,smpl=None):
    #nxm shots from st in a grid, as an image
    nr=st.shape[0]
    nc=st.shape[1]
    if smpl==None:
        smpl=st.shape[2]/(n*m)
    res=np.zeros((nr*n,nc*m))
    for i in range(n):
        for j in range(m):
            try:
                res[i*nr:i*nr+nr,j*nc:j*nc+nc]=st[:,:,(i*m+j)*smpl]
            except:
                break
    return res

## preping data

In [14]:
#metadata: linking omero, features, annotations

s2w=read_csv(os.path.join(dataDir,'series_to_well.tsv'),sep='\t')
lids=getImIDs(s2w.PLATE, s2w.WELL,s2w.FIELD)
s2w['imIds']=lids
la=getMapAnnotationsAsDicts(s2w.imIds)
s2w['annotations']=la
s2w.to_csv(os.path.join(scratchDataDir,'series_to_well_IDAnnotation.tsv'),sep='\t')

In [8]:
s2w=read_csv(os.path.join(scratchDataDir,'series_to_well_IDAnnotation.tsv'),sep='\t')

In [8]:
pln='Plate10_Actinome1'
df=DataFrame()
for s in range(s2w[s2w.PLATE==pln].SERIES.max()):
    if s%50==0:
        print s
    fin=os.path.join(dbpath, pln,'116_86',pln+'_'+str(s)+'_features'+'.db')
    df=concat((df,db2pd(fin)))

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100


In [12]:
fout=os.path.join(dataDir,pln+'_features'+'.h5')
df.to_hdf(fout, 'wndcharm', complevel=9, complib='zlib')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block0_items] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block1_items] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block2_values] [items->[u'img_path', u'name', u'version']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block2_items] [items->None]

  f(store)


## All .db to h5

In [11]:
s2w.PLATE.unique()[4:]

array(['Plate3_Actinome1', 'Plate4_Actinome1', 'Plate5_Actinome1',
       'Plate6_Actinome1', 'Plate7_Actinome1', 'Plate8_Actinome1',
       'Plate9_Actinome1'], dtype=object)

In [None]:
#pln='Plate10_Actinome1'
#for pln in s2w.PLATE.unique():
for pln in s2w.PLATE.unique()[4:]:
    print pln
    df=DataFrame()
    for s in range(s2w[s2w.PLATE==pln].SERIES.max()):
        if s%100==0:
            print s
        fin=os.path.join(dbpath, pln,'116_86',pln+'_'+str(s)+'_features'+'.db')
        df=concat((df,db2pd(fin)))
    fout=os.path.join(scratchDataDir,pln+'_features'+'.h5')
    df.to_hdf(fout, 'wndcharm', complevel=9, complib='zlib')
    

Plate3_Actinome1
0
100

## loading stuff

In [12]:
pln='Plate10_Actinome1'
fout=os.path.join(scratchDataDir,pln+'_features'+'.h5')
print fout

/scratch/rohnFeatures/Plate10_Actinome1_features.h5


In [3]:
s2w=read_csv(os.path.join(dataDir,'series_to_well_IDAnnotation.tsv'),sep='\t')
df=read_hdf(os.path.join(dataDir,'Plate10_Actinome1_features.h5'))

## DAPI PCA

In [13]:
pca= joblib.load(os.path.join(dataDir,'PCA_4condentationwells.pkl')) 

In [21]:
dfcond=read_hdf(os.path.join(dataDir,'PCA250features_testwells.h5'))

In [27]:
dfdapi=df[df.c==0].iloc[:,11:]
dfdapi=scale(dfdapi)
dfdapi=pca.transform(dfdapi)



In [32]:
s2w.head()

Unnamed: 0.1,Unnamed: 0,PLATE,SERIES,WELL,FIELD,imIds,annotations
0,0,Plate10_Actinome1,0,A01,1,106449,{}
1,1,Plate10_Actinome1,1,A01,2,106759,{}
2,2,Plate10_Actinome1,2,A01,3,106760,{}
3,3,Plate10_Actinome1,3,A02,1,107217,{}
4,4,Plate10_Actinome1,4,A02,2,107218,{}


In [35]:
imid=random.choice(s2w[s2w.PLATE==pln].imIds.unique())
x=random.choice(df.x.unique())
y=random.choice(df.y.unique())
w=116
h=86
im=getRohnTile(imid,x,y,w,h,chan=0)

In [36]:
plt.figure()
plt.imshow(im)

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x7fe959cfc390>

In [53]:

#pln,x,y,t='plate1_2_006_5',168,896,66  #division at 30 and 66

#at random
#x=random.choice(df.x.unique())
#y=random.choice(df.y.unique())
#t=random.choice(range(df.t.max()))
#pln=random.choice(df['name'].unique())

#dfq=df[df.name!=pln]
nbrs = nn.NearestNeighbors(n_neighbors=12, algorithm='ball_tree').fit(dfdapi) 




In [79]:
plncond,x,y,t='plate1_1_013_5',504,384,220  #division at 220

qry=dfcond[(dfcond.x==x) & (dfcond.y==y) & (dfcond.name==plncond) & (dfcond.t==t)].iloc[:,12:]
goneFishing(df,qry,s2w,nbrs)

Traceback (most recent call last):
  File "/home/ubuntu/OMERO.server-5.1.4-ice35-b55/lib/python/omero/gateway/__init__.py", line 4160, in __call__
    return self.f(*args, **kwargs)
  File "/home/ubuntu/OMERO.server-5.1.4-ice35-b55/lib/python/omero_api_RawPixelsStore_ice.py", line 883, in getTile
    return _M_omero.api.RawPixelsStore._op_getTile.invoke(self, ((z, c, t, x, y, w, h), _ctx))
InternalException: exception ::omero::InternalException
{
    serverStackTrace = ome.conditions.InternalException:  Wrapped Exception: (java.lang.RuntimeException):
loci.formats.FormatException: Invalid tile size: x=0, y=435, w=116, h=86
	at ome.io.bioformats.BfPixelBuffer.getTileDirect(BfPixelBuffer.java:495)
	at ome.services.RawPixelsBean.getTile(RawPixelsBean.java:789)
	at sun.reflect.GeneratedMethodAccessor510.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at org.springframework

<IPython.core.display.Javascript object>

## phenotype clustering

In [112]:
conn = BlitzGateway('demo', 'cambridge', host='orca-3.openmicroscopy.org', port=4064)
connected = conn.connect()

In [113]:
ofId=4810255
original_file = omero.model.OriginalFileI(ofId, False)

openTable = conn.c.sf.sharedResources().openTable(ofile)
rowCount = openTable.getNumberOfRows()

In [58]:
for col in openTable.getHeaders():
    print "   ", col.name
rowCount = openTable.getNumberOfRows()
print "Row count:", rowCount

    Plate
    Well Number
    Well
    Plate_Well
    siRNA Identifier
    siRNA Pool Name
    Sense Sequence
    siRNA Catalog Number
    Gene Identifier
    Gene Symbol (Ensembl 56)
    Control Type
    Control Comments
    Quality Control
    Comments
    Final Well Hit Status Actinome 1
    Final Well Hit status Actinome 2
    Overall Hit Gene Consensus
    Has phenotypes
    Phenotype 1
    Phenotype 1 Term Name
    Phenotype 1 Term Accession
    Phenotype 2
    Phenotype 2 Term Name
    Phenotype 2 Term Accession
    Phenotype 3
    Phenotype 3 Term Name
    Phenotype 3 Term Accession
    Phenotype 4
    Phenotype 4 Term Name
    Phenotype 4 Term Accession
    Phenotype 5
    Phenotype 5 Term Name
    Phenotype 5 Term Accession
    Phenotype 6
    Phenotype 6 Term Name
    Phenotype 6 Term Accession
    Phenotype 7
    Phenotype 7 Term Name
    Phenotype 7 Term Accession
    Phenotype 8
    Phenotype 8 Term Name
    Phenotype 8 Term Accession
    Phenotype 9
    Phenotype 9 Term 

In [114]:
column_names = [col.name for col in openTable.getHeaders()]

black_list = []
column_indices = []
for column_name in column_names:
    if column_name in black_list:
        continue
    column_indices.append(column_names.index(column_name))

In [119]:
table_data = openTable.slice(column_indices, None)
data = []
for index in range(rowCount):
    row_values = [column.values[index] for column in table_data.columns]
    data.append(row_values)
    
dfRhonAnn=DataFrame(data)
dfRhonAnn.columns=column_names


In [124]:

#transforming phenotypes into one hot booleans
BoolCols=[]
for col in dfRhonAnn.columns:
    if ('Phenotype' in col)&~('Term' in col):
        dfRhonAnn['Bool'+col]=~(dfRhonAnn[col] =='')
        BoolCols.append('Bool'+col)


In [152]:
phenMap=dfRhonAnn.groupby('siRNA Pool Name')[BoolCols].sum()>0
phenMap=phenMap[phenMap.sum(axis=1)>0]
Z = hchy.linkage(phenMap, 'ward')
Zt = hchy.linkage(phenMap.transpose(), 'ward')


In [169]:
den=hchy.dendrogram(Z,no_plot=True)
dent=hchy.dendrogram(Zt,no_plot=True)
phenMapSorted=DataFrame(phenMap.values[dent['ivl'],den['ivl']])

  app.launch_new_instance()


IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (40,) (106,) 

In [177]:
np.array(den['ivl'])

array(['42', '69', '44', '52', '2', '58', '65', '84', '50', '101', '48',
       '10', '74', '72', '17', '59', '28', '104', '80', '81', '54', '20',
       '56', '0', '21', '1', '9', '3', '23', '27', '32', '24', '30', '16',
       '8', '14', '70', '36', '18', '26', '91', '105', '45', '51', '11',
       '68', '60', '33', '53', '49', '103', '94', '31', '71', '89', '73',
       '75', '15', '41', '79', '86', '19', '34', '66', '63', '39', '83',
       '4', '6', '61', '62', '57', '25', '55', '43', '67', '95', '93',
       '96', '85', '35', '78', '76', '87', '46', '64', '77', '5', '22',
       '40', '102', '82', '12', '13', '7', '37', '97', '100', '90', '98',
       '88', '38', '99', '92', '29', '47'], 
      dtype='|S3')

In [176]:
phenMap.values[np.array(den['ivl']),np.array(dent['ivl'])]

IndexError: arrays used as indices must be of integer (or boolean) type

In [186]:
clustermap(phenMap,row_linkage=Z,col_linkage=Zt)


<IPython.core.display.Javascript object>

<seaborn.matrix.ClusterGrid at 0x7f2ecc19a150>

In [126]:
dfRhonAnn[dfRhonAnn['siRNA Pool Name']=='WDR1'].head()
#dfRhonAnn['siRNA Pool Name'].unique()

Unnamed: 0,Plate,Well Number,Well,Plate_Well,siRNA Identifier,siRNA Pool Name,Sense Sequence,siRNA Catalog Number,Gene Identifier,Gene Symbol (Ensembl 56),...,BoolPhenotype 31,BoolPhenotype 32,BoolPhenotype 33,BoolPhenotype 34,BoolPhenotype 35,BoolPhenotype 36,BoolPhenotype 37,BoolPhenotype 38,BoolPhenotype 39,BoolPhenotype 40
1204,330,53,41448,Plate2_Actinome1_C5,WDR1 pool,WDR1,,,NM_005112,WDR1,...,True,False,False,False,False,True,True,False,True,True
3268,338,197,44605,Plate7_Actinome1_I5,WDR1-01,WDR1,GGAAAGUGCGUCAUCCUAA,D-011984-01,NM_005112,,...,True,False,False,False,False,True,True,False,True,True
3270,338,199,44404,Plate7_Actinome1_I7,WDR1-02,WDR1,GGUGGGAUUUACGCAAUUA,D-011984-02,NM_005112,,...,True,False,False,False,False,True,True,False,True,True
3272,338,201,44733,Plate7_Actinome1_I9,WDR1-03,WDR1,GCGGCAAGUCCUACAUUUA,D-011984-03,NM_005112,,...,True,False,False,False,False,True,True,False,True,True
3274,338,203,44415,Plate7_Actinome1_I11,WDR1-04,WDR1,CCACGGGAAGCGAUGAUAA,D-011984-04,NM_005112,,...,True,False,False,False,False,True,True,False,True,True


## Stuff

In [8]:
pln='Plate10_Actinome1'
s2w[(s2w.PLATE==pln)].iloc[9].annotations

"{'siRNA Catalog Number': 'D-007290-01', 'Gene Identifier URL': 'http://www.ncbi.nlm.nih.gov/nuccore/NM_005470', 'siRNA Pool Name': 'ABI1', 'siRNA Identifier': 'ABI1-01', 'Gene Identifier': 'NM_005470'}"

In [81]:
distances, indices = nbrs.kneighbors(qry)

In [83]:
indices

array([[11945, 40182,  7862, 26907, 35467, 31497,  3119, 27295, 15635,
        17221, 35528, 12220]])

In [62]:
DataFrame(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,328,1,40633,Plate1_Actinome1_A1,,,,,,,...,,,,,,,,,Plate1_Actinome1,a1
1,328,2,40728,Plate1_Actinome1_A2,,,,,,,...,,,,,,,,,Plate1_Actinome1,a2
2,328,3,40544,Plate1_Actinome1_A3,siCONTROL,siCONTROL,,,,,...,,,,,,,,,Plate1_Actinome1,a3
3,328,4,40525,Plate1_Actinome1_A4,siCONTROL,siCONTROL,,,,,...,,,,,,,,,Plate1_Actinome1,a4
4,328,5,40812,Plate1_Actinome1_A5,ABI1 pool,ABI1,,,NM_005470,,...,,,,,,,,,Plate1_Actinome1,a5
5,328,6,40615,Plate1_Actinome1_A6,CYFIP2 pool,CYFIP2,,,NM_014376,,...,,,,,,,,,Plate1_Actinome1,a6
6,328,7,40875,Plate1_Actinome1_A7,ABI2 pool,ABI2,,,NM_005759,,...,,,,,,,,,Plate1_Actinome1,a7
7,328,8,40877,Plate1_Actinome1_A8,DAAM1 pool,DAAM1,,,NM_014992,,...,,,,,,,,,Plate1_Actinome1,a8
8,328,9,-1,Plate1_Actinome1_A9,,,,,,,...,,,,,,,,,Plate1_Actinome1,
9,328,10,40552,Plate1_Actinome1_A10,DAAM2 pool,DAAM2,,,NM_015345,,...,,,,,,,,,Plate1_Actinome1,a10
