In [1]:
#import napari
import numpy as np
import scipy.ndimage as ndimage
import matplotlib.pyplot as plt
from skimage import io
import glob
from skimage.filters import threshold_local
from skimage.filters import threshold_li
from skimage.color import rgb2gray
from math import sqrt
from skimage.morphology import disk, ball
from skimage.feature import blob_dog, blob_log, blob_doh
from skimage.filters.rank import enhance_contrast
from skimage.exposure import adjust_gamma
import pickle as pkl
import sys
from datetime import date
import os, psutil
import re
process = psutil.Process(os.getpid())
# Grab utilities from the imaging directories dir
utilsDir = re.sub(r'Registration', 'Imaging Utilities', str(sys.path[0]))
# Homebrew utilities for importing ims files & basic image manipulation
sys.path.append(utilsDir)
from functools import reduce
from scipy.spatial.distance import cdist, pdist
from skimage.exposure import rescale_intensity
from skimage.segmentation import find_boundaries

# Use this function for local thresholding of images
def lthresh(imgstk):
    th = threshold_local(imgstk, 65, offset=0)
    imgstk = imgstk>= th
    return imgstk
# Use this function to threshold images
def thresh(imgstk):
    th = threshold_li(imgstk)
    imgstk = imgstk>= th
    return imgstk
def spotcall(img, min_sig, max_sig, nsig, th):
    blobs_log = blob_log(img, min_sigma=min_sig, max_sigma=max_sig, num_sigma=nsig, threshold=th)
    # Compute radii in the 3rd column.
    blobs_log[:, 2] = blobs_log[:, 2] * sqrt(2)
    color = 'lime'
    title = 'Laplacian of Gaussian'
    return blobs_log

In [2]:
config = {
        "data_url" : "20230721_GFP_1",
    }
print(config)

{'data_url': '20230721_GFP_1'}


In [3]:
runName = config['data_url']
opdir = '/mnt/disks/external/jg4159/20230721_GFP/' # Path to output
ip = opdir + "RegisteredImages_reverse_" + runName + ".pkl" # path to stack
mk = opdir + "Unfiltered_segmentation_2D_reverse_20230721_GFP_1.pickle" #path to masks
print(ip)
print(mk)
print(runName)

/mnt/disks/external/jg4159/20230721_GFP/RegisteredImages_reverse_20230721_GFP_1.pkl
/mnt/disks/external/jg4159/20230721_GFP/Unfiltered_segmentation_2D_reverse_20230721_GFP_1.pickle
20230721_GFP_1


In [4]:
import glob
#Loading ims files for different cycles
lof = glob.glob(mk)
filehandler = open(lof[0], 'rb')
masks_mem, masks_nuc, ref_mem = pkl.load(filehandler)
print(len(ref_mem))
filehandler.close()

4832


In [9]:
### VARIABLE DEFINITION ###

maxpro = True # Set false if 3D registration required
interactive = False #If true will show napari visualizations
runOnServer = True
filehandler = open(ip, 'rb')
img_stack_all = pkl.load(filehandler)
filehandler.close()

In [13]:
for img in img_stack_all[1:]:
    for i in range(img.shape[0]):
        img[i] *= 65535
        img[i] = img[i].astype(np.uint16)
        m = np.median(img[i][img[i] > 0])
        print(np.percentile(img[i], (97.5, 99.8)))

[140. 275.]
[164. 302.]
[272. 398.]
[133. 244.]
[142. 215.]
[360. 547.]
[160. 290.]
[139. 203.]
[495. 749.]
[130. 153.]
[157. 228.]
[234. 387.]


In [14]:
# get the CM rounds
img_stack = img_stack_all[:4]
print(len(img_stack))

4


In [15]:
# get the GFP rounds
GFP_stack = img_stack_all[4]
print(len(GFP_stack))

3


In [21]:
num_chn = 2
num_cyc = 4
# threshold the CM rounds
gamstack=[]
for i in range(num_cyc):
        for j in range(num_chn):
            if (j==0): #640
                gamstack.append(rescale_intensity(img_stack[i][j].astype('float64'), (130,140), (0, 255)))
            else: #488
                gamstack.append(rescale_intensity(img_stack[i][j].astype('float64'),(130,140), (0, 255)))

In [10]:
# Generate 2D masks
import time
start_time = time.time()

# Convert ref_mem to 2Dimensional coordinates
nuclei_mask = np.zeros(masks_nuc.shape)
membrane_mask = np.zeros(masks_mem.shape)
nucMask_flat = nuclei_mask.ravel()
memMask_flat = membrane_mask.ravel()

for i, cellID in enumerate(ref_mem):
    nuc_id = ref_mem[cellID]["nuc_id"]
    nucMask_flat[ref_mem[cellID]["Nuclei Pixels"]] = nuc_id
    ref_mem[cellID]['Nuclei Pixels 2D'] = np.where(nuclei_mask==nuc_id)
    memMask_flat[ref_mem[cellID]['Membrane Pixels']] = cellID
    ref_mem[cellID]['Membrane Pixels 2D'] = np.where(membrane_mask==cellID)
    if i%1000==0:
        print(f"Finished Membrane {i}")
print(f'--- ref_mem conversion to 2D: {(time.time()-start_time)} seconds')
filehandler = open(opdir+"Unfiltered_segmentation_2D_reverse_" + runName +".pickle", 'wb')
pkl.dump((masks_mem, masks_nuc, ref_mem), filehandler)
filehandler.close()

Finished Membrane 0


KeyboardInterrupt: 

In [11]:
# check if memMaskgd exists
try:
    filehandler = open(opdir+'memMaskgd0_reverse_' + runName + '.pkl', 'rb')
    memMaskgd0 = pkl.load(filehandler)
    filehandler.close()
except:
    memMaskgd_exists = False
else:
    print("Exists!")
    memMaskgd_exists = True
# make config if it does not exist already (e.g. passed in by papermill) for manual running
if not(memMaskgd_exists):
    start_time = time.time()
    memMaskgd0 = {}
    i=0
    for key in ref_mem.keys():
        if i%1000==0:
            print(i)
        i=i+1
        spgd={}
        x0 = ref_mem[key]["Membrane Pixels 2D"][0]
        y0 = ref_mem[key]["Membrane Pixels 2D"][1]
        cellset = set([tuple([x,y]) for x,y in zip(x0,y0)]) #save all pixels for this cell\n",
        spgd['cellset']=cellset
        memMaskgd0[key] = spgd
    print(f'--- generate memeMaskgd0: {(time.time()-start_time)} seconds')
    filehandler = open(opdir+'memMaskgd0_reverse_' + runName + '.pkl', 'wb')
    pkl.dump(memMaskgd0, filehandler)
    filehandler.close()

0


KeyError: 'Membrane Pixels 2D'

In [18]:
import pandas as pd
spot_df = pd.DataFrame(0,columns = range(len(gamstack)), 
                   index = memMaskgd0.keys())
spot_df

Unnamed: 0,0,1,2,3,4,5,6,7
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
5198,0,0,0,0,0,0,0,0
5199,0,0,0,0,0,0,0,0
5200,0,0,0,0,0,0,0,0
5201,0,0,0,0,0,0,0,0


In [35]:
# for each image, assess and remove background spots
import time
start_time = time.time()

all_spots = np.zeros(shape=(0,3))
total_spots_cycle = []
retained_spots_cycle = []
spotlist = []

bkg_spot_th = 0
#file2.write(f"background removal threshold: {bkg_spot_th}\n")

cyc=0

for img in gamstack:
    print("Cycle " + str(cyc//3+1)+ ", channel" + str(cyc%3) + ":")
    #file2.write(f"Cycle {cyc//3+1}, channel {cyc%3}:\n")
    spots = spotcall(img[:,:],min_sig=2,max_sig=10,nsig=1,th=0.5)# Decrease threshold if spots are not identified and min sigma if spots are smaller\n",
    print(len(spots))
    #file2.write(f"total spots: {len(spots)}\n")
    spotset = set([tuple(x) for x in spots[:,:2].astype(int)])
    retained_spots= []
    num_retained = 0
    #mask_cyc={}
    for key, values in memMaskgd0.items():
        cellset=memMaskgd0[key]["cellset"]  # all 2d pixel positions for this mem id    
        spcell = np.array([[x[0],x[1],1] for x in spotset & cellset]) # find pixels (i.e. spots) that are in both spotset & cellset
        spot_df.loc[key, cyc] = len(spcell)
        if len(spcell) >= bkg_spot_th:
            num_retained = num_retained + len(spcell)
            #mask_cyc[key] = spcell
            spcell = spcell.tolist()
            retained_spots.extend(spcell)
    print("Retained spots above background: "+str(len(retained_spots)))
    #file2.write(f"Retained spots above background: {len(retained_spots)}\n")
    spotlist.append(np.array(retained_spots))

    total_spots_cycle.append(len(spots))
    retained_spots_cycle.append(len(retained_spots))
    all_spots = np.concatenate((all_spots, retained_spots), axis=0)
    cyc=cyc+1

total_spots = all_spots.shape[0]
print("Total # of spots:", total_spots)
#file2.write(f"Total # of spots: {total_spots}\n")
all_spots = np.unique(all_spots, axis=0)
print("Total # of unique spots:", all_spots.shape[0])
#file2.write(f"Total # of unique spots: {all_spots.shape[0]}\n")
total_spots = all_spots.shape[0]
print("--- %s seconds ---" % (time.time() - start_time))
#file2.write(f"--- %s seconds --- {time.time() - start_time}\n")

Cycle 1, channel0:


  r1 = blob1[-1] / blob2[-1]
  pos1 = blob1[:ndim] / (max_sigma * root_ndim)
  pos2 = blob2[:ndim] / (max_sigma * root_ndim)
  d = np.sqrt(np.sum((pos2 - pos1)**2))


48992
Retained spots above background: 38400
Cycle 1, channel1:
13546
Retained spots above background: 10660
Cycle 1, channel2:
36695
Retained spots above background: 29035
Cycle 2, channel0:
53104
Retained spots above background: 41800
Cycle 2, channel1:
40233
Retained spots above background: 32170
Cycle 2, channel2:
55535
Retained spots above background: 43926
Cycle 3, channel0:
65192
Retained spots above background: 51462
Cycle 3, channel1:
54532
Retained spots above background: 43298
Total # of spots: 290751
Total # of unique spots: 182883
--- 30.317699670791626 seconds ---


In [36]:
filehandler = open(opdir+'Spotcalling_spotlist_' + str(total_spots) + "_" + runName + '.pkl', 'wb')
pkl.dump(spotlist, filehandler)
filehandler.close()

filehandler = open(opdir+'Spotcalling_allspots_' + str(total_spots) + "_"  + runName + '.pkl', 'wb')
pkl.dump(all_spots, filehandler)
filehandler.close()

spot_df.to_csv(opdir+runName+'_'+str(total_spots)+'_spotbycycle.csv')

## codebook for guide identification

In [26]:
import pandas as pd
codebook = '/mnt/disks/external/jg4159/BEpilot/sgpilot_CM_GFP_codebook_self.csv'
cb = pd.read_csv(codebook, sep=',', header=0, index_col='Gene')
print(cb)
cb_list = np.asarray(cb.values.tolist(),dtype=bool)
genes = cb.index.tolist()
print(np.asarray(cb_list[0]).shape)

           R0  R1  R2  R3  R4  R5  R6  R7
Gene                                     
GFP001      1   0   0   1   0   1   1   0
UA001       1   1   0   1   0   0   1   0
NTC001      1   0   0   0   0   1   1   1
UA002       1   1   0   0   0   1   1   0
NTC002      1   0   0   1   0   0   1   1
UA003       0   0   1   1   1   1   0   0
GFP002      0   1   1   1   1   0   0   0
UA004       0   0   1   0   1   1   0   1
UA005       0   1   1   0   1   1   0   0
UA006       0   0   1   1   1   0   0   1
NTC003      0   0   0   1   1   1   1   0
UA007       0   1   0   1   1   0   1   0
GFP003      0   0   0   0   1   1   1   1
UA008       0   1   0   0   1   1   1   0
NTC004      0   0   0   1   1   0   1   1
UA009       0   0   1   1   0   1   1   0
UA010       0   1   1   1   0   0   1   0
UA011       0   0   1   0   0   1   1   1
GFP004      0   1   1   0   0   1   1   0
UA012       0   0   1   1   0   0   1   1
UA013       1   0   1   1   0   1   0   0
UA014       1   1   1   1   0   0 

In [37]:
xmax = gamstack[0].shape[0]
ymax = gamstack[0].shape[1]
del gamstack

In [38]:
spotset=[]
for j in spotlist:
    spotset.append(set([tuple(x) for x in j[:,:2]]))

In [41]:
num_chn=2
num_cyc=4
num_pxl_list=[4]#0,0.5,1,1.5,2,2.5,3,3.5,4
guide_spots_list=[]

for num_pxl in num_pxl_list:    
    print(num_pxl)
    import time
    start_time = time.time()

    ## Guide detection, generating the bitcode by searching the spot pixel in all cycles within a given radius
    cycles = np.zeros((all_spots.shape[0],num_chn*num_cyc),dtype=bool)
    for i in range(all_spots.shape[0]): #all_spots.shape[0]
        if i%100000==0:
            print(i)
        k = 0
        # create a potential set
        spot_i = np.reshape(all_spots[i,:2],[1,2])[0]
        #print(spot_i)
        grid = []
        for dx in range(5):
            for dy in range(5):
                coord1 = tuple([np.max([0,spot_i[0]-dx]).astype(int), np.max([0,spot_i[1]-dy]).astype(int)])
                #print(coord1)
                grid.append(coord1)
                coord2 = tuple([np.min([xmax, spot_i[0]+dx]).astype(int), np.min([ymax, spot_i[1]+dy]).astype(int)])
                #print(coord2)
                grid.append(coord2)
                coord3 = tuple([np.max([0, spot_i[0]-dx]).astype(int), np.min([ymax, spot_i[1]+dy]).astype(int)])
                grid.append(coord3)
                coord4 = tuple([np.min([xmax, spot_i[0]+dx]).astype(int), np.max([0, spot_i[1]-dy]).astype(int)])
                grid.append(coord4)
        grid = set(grid)
        #print(grid)
        #break
        
        for j in spotset: 
            # give the radius is three, we can pre-set a coordinate set
            # find spots that fall within the grid, calculate distance if any
            spcell = np.array([[x[0],x[1]] for x in grid & j])
            if len(spcell>0):
                #print(spcell)
                a = cdist(np.reshape(all_spots[i,:2],[1,2]), spcell, metric='euclidean')
                #print(a)
                if np.min(a) < num_pxl: # less than num_pxl pixels 
                    cycles[i,k]=1
            k = k+1
        #print(cycles[i,:])
    print("--- %s seconds ---" % (time.time() - start_time))

# de-duplicaton
    start_time = time.time()
    spotdict = {}
    guide_spots=0
    print(len(cb_list))
    for j in range(len(cb_list)):
        print(j)
        k = 0
        abc = []
        dedup = []
        for i in range(0, cycles.shape[0]):
            if np.array_equal(cycles[i,:], np.asarray(cb_list[j])):
                k=k+1
                abc.append(all_spots[i])
        spotcycle = np.asarray(abc)
        for sp in range(spotcycle.shape[0]):
            a = cdist(np.reshape(spotcycle[sp,:2],[1,2]), spotcycle[:,:2], metric='euclidean')#distance
            if len(np.where(a<num_pxl)[1]) > 1:
                dedup.append(spotcycle[np.where(a<num_pxl)[1][0],:])
            else:
                dedup.append(spotcycle[sp,:])
        #print(dedup)
        #print(np.unique(np.asarray(dedup),axis=0).shape)
        spotdict[genes[j]] = np.unique(np.asarray(dedup),axis=0)
        guide_spots = guide_spots + len(dedup)
        #print("gene: "+genes[j]+'\tcount: '+str(len(dedup)))
    print("Total guide spots: ", guide_spots)
    #file2.write(f"Total guide spots: {guide_spots}\n")
    print("Guide spots/Total spots:", guide_spots, '/', total_spots, '=', guide_spots/total_spots)
    #file2.write(f"Guide spots/Total spots: {guide_spots} / {total_spots} = {guide_spots/total_spots}\n")
    guide_spots_list.append(guide_spots)
    print("--- %s seconds ---" % (time.time() - start_time))
    #file2.write(f"--- %s seconds --- {time.time() - start_time}\n")
    # Save max projection images for next step in pipeline
    filehandler = open(opdir+'Spots-with-dist-OPT_numpxl='+str(num_pxl)+"_"+str(guide_spots)+"_" + runName + '.pkl', 'wb')
    pkl.dump(spotdict, filehandler)
    filehandler.close()

4
0
100000
--- 223.93669271469116 seconds ---
35
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
Total guide spots:  118288
Guide spots/Total spots: 118288 / 182883 = 0.6467960389976105
--- 37.61781620979309 seconds ---


In [42]:
        # Create a dictionary where every key is a foci coordinate, and the value is the guide identity
        spotMatch = {}
        names = list(spotdict.keys())
        for i, guide in enumerate(spotdict):
            for spot in spotdict[guide]:
                if spotMatch.get((spot[0], spot[1])):
                    spotMatch[(spot[0], spot[1])].append(names[i])
                else:
                    spotMatch[(spot[0], spot[1])] = [names[i]]

        # create an empty dataframe with cell_id and guide names
        mem_df = pd.DataFrame(0,columns = spotdict.keys(), 
                           index = memMaskgd0.keys())        

        # For each nuclei, see which spots in the membrane are indexed in the key 
        for i, cellID in enumerate(memMaskgd0):
            for name in names:
                memMaskgd0[cellID][name] = 0

            # For each Mmebrane Coordinate set, look for a spot
            memCoord = memMaskgd0[cellID]['cellset']
            for coord in memCoord:
                amplicons = spotMatch.get(coord)
                if amplicons:
                    for amplicon in amplicons:
                        memMaskgd0[cellID][amplicon] += 1
                        mem_df.loc[cellID,amplicon] += 1

        mem_max = mem_df.max(axis='columns') # guide with the max num of spots
        mem_sec = mem_df.apply(lambda x: x.nlargest(2).iloc[1], axis=1)
        max_idx = mem_df.idxmax(axis=1)
        guide_num = 0
        for i, cellID in enumerate(ref_mem):
            guide_spot_ratio = mem_max[cellID]/(mem_max[cellID] + mem_sec[cellID]) # guide purity
            if mem_max[cellID] >= 3 and guide_spot_ratio >= 0.66:
                ref_mem[cellID]['Guide ID'] = max_idx[cellID]
                guide_num = guide_num + 1
            else:
                ref_mem[cellID]['Guide ID'] = "None"
        print(guide_num)
        print(guide_num/len(ref_mem))
        mem_df.to_csv(opdir+runName+'_GFP_'+str(guide_spots)+"spots_"+str(guide_num)+'cells_cell2guide.csv')

3252
0.6730132450331126


