# FISH - Pipeline - A Python notebook to simulate FISH data

```
Author: Luis U. Aguilera
Contact Info: luis.aguilera@colostate.edu

Copyright (c) 2021 Munsky Group 
Colorado State University 
Licensed under BSD 3-Clause License.
```

### Libraries

In [1]:
# Importing libraries
import sys
import pathlib
import warnings
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage
from skimage import morphology
from scipy.ndimage import gaussian_filter
from matplotlib.patches import Rectangle
import tifffile
from skimage import measure

warnings.filterwarnings("ignore")

### Path to source directories

In [2]:
# Defining directories
current_dir = pathlib.Path().absolute()
fa_dir = current_dir.parents[0].joinpath('src')
database_simulation_path = current_dir.parents[0].joinpath('database_simulation')
# Importing fish_analyses module
sys.path.append(str(fa_dir))
import fish_analyses as fa

# Path to credentials
desktop_path = pathlib.Path.home()/'Desktop'
path_to_config_file = desktop_path.joinpath('config.yml')

# Loading experimental data. Order of colors channels in the experimental data.

Laser intensities for each channel
* [Ch0] 405 nm for DAPI
* [Ch1] 488 nm for MS2-MCP-GFP reporter
* [Ch2] 561 nm for cytosol marker
* [Ch3] 647 nm for smiFISH MS2-Cy5



In [3]:
# Path to data
data_folder_path = database_simulation_path.joinpath('MS2-CY5_Cyto543_560_woStim')
path_to_masks_dir = database_simulation_path.joinpath('MS2-CY5_Cyto543_560_woStim__masks_nuc_60__cyto_100')
#dataframe_file_path = pathlib.Path('/home/luisub/Desktop/FISH_Processing/simulation/MS2-CY5_Cyto543_560_woStim__analysis_nuc_60__cyto_100__psfz_350__psfyx_96__ts_538/dataframe_merged.csv')

In [4]:
# Read dataframe
#complete_df = pd.read_csv(dataframe_file_path)
# Read each image and save as a list
list_images, path_files, list_files_names, number_images = fa.ReadImages(directory= data_folder_path).read()
# read mask images
list_images_masks, path_files_masks, list_files_names_masks, number_images_masks = fa.ReadImages(directory= path_to_masks_dir).read()
# Reordering indices
#masks_nuclei_indexes = [index for index, element in enumerate(list_files_names_masks) if 'masks_nuclei_R' in element]

In [5]:
def quantify_fish_spots (image,mask_image):
    # Spot detection parameters
    channels_with_FISH = 3
    channels_with_nucleus = 0
    minimum_spots_cluster = 4
    psf_z=350                                  # Theoretical size of the PSF emitted by a [rna] spot in the z plan, in nanometers
    psf_yx=160                                 # Theoretical size of the PSF emitted by a [rna] spot in the yx plan, in nanometers
    voxel_size_z=500                           # Microscope conversion px to nanometers in the z axis.
    voxel_size_yx=160                          # Microscope conversion px to nanometers in the xy axis.
    list_voxels = [voxel_size_z,voxel_size_yx]
    list_psfs = [psf_z, psf_yx]
    threshold_for_spot_detection=400

    dataframe_complete_image, _ ,_ = fa.SpotDetection(image=image,
                                        FISH_channels=channels_with_FISH,
                                        channels_with_cytosol=None,
                                        channels_with_nucleus=channels_with_nucleus, 
                                        minimum_spots_cluster=minimum_spots_cluster,
                                        masks_nuclei=mask_image, 
                                        list_voxels=list_voxels,
                                        list_psfs=list_psfs, 
                                        show_plots=False,
                                        threshold_for_spot_detection=threshold_for_spot_detection).get_dataframe()
    return dataframe_complete_image

In [6]:
def get_background_pixels(original_image,masks_image,size_output=10000,selected_output_channels=None):
    masks_image_binary =  np.where(masks_image > 0, 1, 0).astype(bool) 
    inv_masks_image_binary = np.logical_not(masks_image_binary)
    num_z_slices = original_image.shape[0]
    num_color_channels = original_image.shape[3]
    image_removing_cells = np.zeros_like(original_image)
    background_elements_in_image = np.zeros((num_color_channels,size_output))
    for j in range(num_color_channels):
        for i in range (num_z_slices):
            rescaled_image = fa.RemoveExtrema(original_image[i,:,:,j],min_percentile=0, max_percentile=90).remove_outliers() 
            temp_non_zeros = rescaled_image [np.nonzero(rescaled_image)]
            min_background_threshold = np.quantile(temp_non_zeros, 0.5)
            image_removing_cells[i,:,:,j] = rescaled_image * inv_masks_image_binary
            background_pixels = image_removing_cells[:,:,:,j][np.nonzero(image_removing_cells[:,:,:,j])].flatten()
            filtered_background_pixels = background_pixels[ background_pixels >= min_background_threshold ] #& (background_pixels < max_background_threshold) ]
        background_elements_in_image[j,:] = np.random.choice(filtered_background_pixels, size = size_output)
        
    if not (selected_output_channels is None):
        background_elements_in_image = background_elements_in_image[selected_output_channels,:]
    return background_elements_in_image

# Creating cell library

In [7]:
def image_cell_selection(image,image_with_bg=None, scaling_value_radius_cell=1.1):
    max_image_selected_cell = np.max(image[:,:,:,0],axis=0)
    cyto_area_px = np.count_nonzero(max_image_selected_cell)
    SCALING_RADIUS_CYTOSOL = scaling_value_radius_cell
    # selecting only the dataframe containing the values for the selected field
    y_max_image_shape = image.shape[1]-1
    x_max_image_shape = image.shape[2]-1
    # Cell location in image
    scaling_value_radius_cell = scaling_value_radius_cell # use this parameter to increase or decrease the number of radius to plot from the center of the cell.
    cyto_loc_y,cyto_loc_x = np.round(ndimage.measurements.center_of_mass(max_image_selected_cell)).astype(int)
    cyto_radius_px = int(np.sqrt(cyto_area_px)*SCALING_RADIUS_CYTOSOL)
    # Detecting if a mask for the cytosol was used. If true, the code will plot the complete cell. Else, it will only plot the cell nucleus.
    x_min_value = cyto_loc_x - cyto_radius_px
    x_max_value = cyto_loc_x + cyto_radius_px
    y_min_value = cyto_loc_y - cyto_radius_px
    y_max_value = cyto_loc_y + cyto_radius_px
    # making sure that the selection doesnt go outside the limits of the original image
    x_min_value = np.max((0,x_min_value ))
    y_min_value = np.max((0,y_min_value ))
    x_max_value = np.min((x_max_value,x_max_image_shape))
    y_max_value = np.min((y_max_value,y_max_image_shape))
    # coordinates to select in the image 
    subsection_image_with_selected_cell = image[:,y_min_value: y_max_value,x_min_value:x_max_value,:]
    # returning image with bg
    if not (image_with_bg is None):
        subsection_image_with_selected_cell_with_bg = image_with_bg[:,y_min_value: y_max_value,x_min_value:x_max_value,:]
    return subsection_image_with_selected_cell,subsection_image_with_selected_cell_with_bg

In [8]:
def extracting_individual_cells(original_image, masks_image, remove_extreme_values=False,dilate_mask=False,selected_output_channels=None):
    # List of masks
    dilation_size=3
    list_cell_masks=[]
    list_library_cells=[]
    list_library_masks=[]
    list_library_cells_with_background=[]
    # Extracting each mask
    n_masks =np.max(masks_image)
    for i in range(1, n_masks+1 ):
        tested_mask = np.where(masks_image == i, 1, 0).astype(bool)
        if dilate_mask == True:
            tested_mask = morphology.binary_dilation(tested_mask,footprint=np.ones((dilation_size, dilation_size)), out=None) 
        list_cell_masks.append(tested_mask)
    list_processed_cell_ids = []
    # Iterate for each and multiply all z-slices by the mask
    for mask_index in range(0,n_masks):
        num_z_slices = original_image.shape[0]
        num_color_channels = original_image.shape[3]
        image_selected_cell = np.zeros_like(original_image)
        image_selected_cell_with_bg = np.zeros_like(original_image)
        # testing if cell is on border
        tested_mask= list_cell_masks[mask_index]
        is_cell_in_border =  np.any( np.concatenate( ( tested_mask[:,0],tested_mask[:,-1],tested_mask[0,:],tested_mask[-1,:] ) ) ) 
        # making zeros all elements outside cell
        if is_cell_in_border == False:
            list_processed_cell_ids.append(mask_index)
            for j in range(num_color_channels):
                for i in range (num_z_slices):
                    if remove_extreme_values == True:
                        rescaled_image = fa.RemoveExtrema(original_image[i,:,:,j],min_percentile=0, max_percentile=99.5).remove_outliers() 
                    else:
                        rescaled_image = original_image[i,:,:,j]
                    image_selected_cell[i,:,:,j] = rescaled_image * tested_mask
                    image_selected_cell_with_bg[i,:,:,j] = rescaled_image
            # Extracting each cell 
            temp_subsection_image_with_selected_cell,temp_subsection_image_with_selected_cell_with_bg = image_cell_selection(image=image_selected_cell, image_with_bg=image_selected_cell_with_bg ,scaling_value_radius_cell=1.1)
            # Identify rows and columns with all zeros and removing these rows and columns from the final image
            zero_rows = np.all(temp_subsection_image_with_selected_cell[0,:,:,0] == 0, axis=1)
            zero_cols = np.all(temp_subsection_image_with_selected_cell[0,:,:,0] == 0, axis=0)
            number_px_rows = np.count_nonzero( ~zero_rows)
            number_px_columns = np.count_nonzero( ~zero_cols)
            subsection_image_with_selected_cell_without_zeros = np.zeros((num_z_slices,number_px_rows,number_px_columns,num_color_channels ))
            subsection_image_with_selected_cell_with_zeros = np.zeros((num_z_slices,number_px_rows,number_px_columns,num_color_channels ))
            subsection_tested_mask = np.zeros_like(tested_mask)
            # Remove zero rows and columns
            for j in range(num_color_channels):
                for i in range (num_z_slices):
                    subsection_image_with_selected_cell = temp_subsection_image_with_selected_cell[i,:,:,j]
                    subsection_image_with_selected_cell_with_bg  = temp_subsection_image_with_selected_cell_with_bg[i,:,:,j] 
                    subsection_image_with_selected_cell_without_zeros[i,:,:,j] = subsection_image_with_selected_cell[~zero_rows,:][:, ~zero_cols]  
                    subsection_image_with_selected_cell_with_zeros[i,:,:,j] = subsection_image_with_selected_cell_with_bg[~zero_rows,:][:, ~zero_cols]  
            # mask
            zero_rows_mask = np.all(tested_mask[:,:] == 0, axis=1)
            zero_cols_mask = np.all(tested_mask[:,:] == 0, axis=0)
            subsection_tested_mask = tested_mask[~zero_rows_mask,:][:, ~zero_cols_mask]  
            # appending all cells into a list
            if not (selected_output_channels is None):
                list_library_cells.append(subsection_image_with_selected_cell_without_zeros[:,:,:,selected_output_channels])
            else:
                list_library_cells.append(subsection_image_with_selected_cell_without_zeros)
            list_library_cells_with_background.append(subsection_image_with_selected_cell_with_zeros)
            list_library_masks.append(subsection_tested_mask)
    
    return list_library_cells, list_library_masks, list_library_cells_with_background,list_processed_cell_ids

In [9]:
def complete_function_to_generate_library (list_images,list_images_masks, show_plots=True,selected_output_channels=None):
    
    # Creating folder to store masks
    cell_library_folder_path = pathlib.Path().cwd().joinpath('cell_library')
    cell_library_folder_path.mkdir(exist_ok=True)
    
    counter=0
    # Spot detection parameters
    minimum_spots_cluster = 10
    # parameters for calculating number of spots
    spot_type=0
    minimal_number_of_spots_to_save_cell = 50
    number_images = len(list_images)
    list_background_arrays =[]
    # initiating dataframe
    #df_library = pd.DataFrame(columns=['cell_id','centroid_y','centroid_x', 'size','number_of_spots','ts_size'])
    df_library = pd.DataFrame(columns=['cell_id', 'size','number_of_spots','ts_size'])
    for img_index in range(number_images):
        cell_id_in_each_image =0
        # Quantifying FISH spots
        dataframe_complete_image = quantify_fish_spots (image=list_images[img_index],
                                                        mask_image=list_images_masks[img_index])
        # Extracting background pixels in image
        background_elements_in_image = get_background_pixels(original_image=list_images[img_index],
                                                        masks_image=list_images_masks[img_index],
                                                        size_output=10000,
                                                        selected_output_channels=selected_output_channels)
        list_background_arrays.append(background_elements_in_image)
        
        # Function to extract individual cells
        list_library_cells_final =[]
        list_library_cells,list_library_masks,_,list_processed_cell_ids = extracting_individual_cells(original_image=list_images[img_index],
                                                        masks_image=list_images_masks[img_index],
                                                        remove_extreme_values=False,
                                                        selected_output_channels=selected_output_channels)
        # iterate the dataframe using the index obtained from list_processed_cell_ids. 
        for k, df_index in enumerate (list_processed_cell_ids):
        # iterating for each cells in the image. Notice that only complete cells are processed
            # Calculate spots in cells
            selected_image = list_library_cells[k]
            selected_mask = list_library_masks[k]
            # number of spots per nucleus
            condition = dataframe_complete_image['cell_id'] ==df_index
            dataframe = dataframe_complete_image[condition]
            number_of_spots = len( dataframe_complete_image.loc[ (dataframe_complete_image['cell_id']==df_index) 
                                            & (dataframe_complete_image['is_cluster']==False)  
                                            & (dataframe_complete_image['is_nuc']==True) 
                                            & (dataframe_complete_image['spot_type']==spot_type)  ] ) 
            # Number of RNA in a TS
            try:
                ts_size =  np.max(dataframe.loc[ (dataframe['cell_id']==df_index) & (dataframe['is_cluster']==True) &   (dataframe['cluster_size']>=minimum_spots_cluster)  ].cluster_size.values)
            except:
                ts_size = 0
            # Size of the nucleus of each cell
            nuc_size = dataframe.loc[   (dataframe['cell_id']==df_index) ].nuc_area_px.values[0]   
            #centroid_y,centroid_x = ndimage.measurements.center_of_mass(selected_mask)

            # Creating a dataframe with the cell properties
            if (number_of_spots>minimal_number_of_spots_to_save_cell):
                # save image as a tif file 
                cell_image_path = cell_library_folder_path.joinpath('cell_'+str(counter)+'.npy')
                #tifffile.imsave(str(cell_library_folder_path.joinpath('cell_'+str(counter)+'.tif')), selected_image, metadata={'axes': 'ZYXC'})
                np.save(cell_image_path, selected_image)
                list_library_cells_final.append(selected_image)
                #df_library.loc[cell_id] = [cell_id, np.round(centroid_y,0),np.round(centroid_x,0),nuc_size,number_of_spots,ts_size]
                # Creating the row save for each cell in the dataframe 
                df_library.loc[counter] = [counter,nuc_size,number_of_spots,ts_size]
                cell_id_in_each_image +=1
                counter+=1
                if show_plots ==True:
                    fa.Plots.plot_single_cell(image=selected_image, 
                                            df=dataframe, 
                                            selected_channel=0, 
                                            min_ts_size=5,
                                            show_spots=True,
                                            show_legend = True,
                                            image_name=None,
                                            microns_per_pixel = 0.16,
                                            max_percentile=95.,
                                            selected_colormap='viridis')
            del dataframe
        del background_elements_in_image, list_library_cells,list_processed_cell_ids,dataframe_complete_image
    
    # changing data typo to int
    new_dtypes = {'cell_id':int,'size':int,'number_of_spots':int, 'ts_size':int}
    df_library = df_library.astype(new_dtypes)
    # Specify the specific file path to save the DataFrame
    dataframe_path = str(cell_library_folder_path.joinpath('dataframe_library.csv'))
    # Save the DataFrame to the specified file path
    df_library.to_csv(dataframe_path, index=False)
    # Saving background as numpy array
    bg_path = str(cell_library_folder_path.joinpath('background_pixels_library.npy'))
    # Concatenate background_array horizontally
    background_elements_in_all_images= np.concatenate(list_background_arrays, axis=1)
    np.save(bg_path, background_elements_in_all_images)
    return list_library_cells_final, df_library, background_elements_in_all_images


In [10]:
list_library_cells_final, df_library, background_elements_in_image = complete_function_to_generate_library (list_images,
                                                                                                            list_images_masks,
                                                                                                            show_plots=False,
                                                                                                            selected_output_channels=[0,3])

In [11]:
df_library

Unnamed: 0,cell_id,size,number_of_spots,ts_size
0,0,7733,476,20
1,1,5166,305,53
2,2,5471,288,18
3,3,5454,292,58
4,4,5888,317,50
...,...,...,...,...
85,85,6311,117,0
86,86,7632,483,87
87,87,6684,207,16
88,88,7723,221,0


In [12]:
list_library_cells_final[0].shape

(27, 112, 92, 2)

In [13]:
background_elements_in_image.shape

(2, 120000)

In [14]:
import os
cell_library_folder_path = pathlib.Path().cwd().joinpath('cell_library')
def get_folder_size(folder_path):
    total_size = 0

    for dirpath, dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            total_size += os.path.getsize(file_path)

    return total_size

# Example usage:
size_in_bytes = get_folder_size(cell_library_folder_path)
size_in_kb = size_in_bytes / 1024
size_in_mb = size_in_kb / 1024
size_in_mb

307.37653827667236

In [15]:
raise

RuntimeError: No active exception to reraise

In [None]:
def complete_function_to_generate_library (list_images,list_images_masks,selected_image_number, show_plots=True):
    
    # Spot detection parameters
    channels_with_FISH = [3]
    channels_with_nucleus = [0]
    minimum_spots_cluster = 5
    psf_z=350                                  # Theoretical size of the PSF emitted by a [rna] spot in the z plan, in nanometers
    psf_yx=160                                 # Theoretical size of the PSF emitted by a [rna] spot in the yx plan, in nanometers
    voxel_size_z=500                           # Microscope conversion px to nanometers in the z axis.
    voxel_size_yx=160                          # Microscope conversion px to nanometers in the xy axis.
    list_voxels = [voxel_size_z,voxel_size_yx]
    list_psfs = [psf_z, psf_yx]
    threshold_for_spot_detection=400
    # parameters for calculating number of spots
    spot_type=0
    number_cells_in_crop = 1
    minimal_number_of_spots_to_save_cell = 50
    cell_id =0
    # initiating dataframe
    df_library = pd.DataFrame(columns=['cell_id','centroid_y','centroid_x', 'size','number_of_spots','ts_size'])
    # Function to extract individual cells
    list_library_cells_final =[]
    list_library_cells,list_library_masks,list_library_cells_with_background,list_processed_cell_ids = extracting_individual_cells(original_image=list_images[selected_image_number],
                                                    masks_image=list_images_masks[masks_nuclei_indexes[selected_image_number]],
                                                    remove_extreme_values=False)
    number_of_detected_cells = len(list_library_cells)
    
    # iterate the dataframe using the index obtained from list_processed_cell_ids. 
    #for index, df_index in enumerate (list_processed_cell_ids):
    
    
    
    # iterating for each cells in the image. Notice that only complete cells are processed
    for k in range (number_of_detected_cells):
        # Calculate spots in cells
        selected_image = list_library_cells[k]
        selected_image_with_bg = list_library_cells_with_background[k]
        selected_mask = list_library_masks[k]
        dataframe, _ ,_ = fa.SpotDetection(image=selected_image_with_bg,
                                            FISH_channels=channels_with_FISH,
                                            channels_with_cytosol=None,
                                            channels_with_nucleus=channels_with_nucleus, 
                                            minimum_spots_cluster=minimum_spots_cluster,
                                            masks_nuclei=selected_mask, 
                                            list_voxels=list_voxels,
                                            list_psfs=list_psfs, 
                                            show_plots=False,
                                            threshold_for_spot_detection=threshold_for_spot_detection).get_dataframe()
        # number of spots per nucleus
        number_of_spots = np.asarray([len( dataframe.loc[  (dataframe['cell_id']==i) & (dataframe['is_cluster']==False)  & (dataframe['is_nuc']==True) & (dataframe['spot_type']==spot_type)  & (dataframe['is_cell_fragmented']!=-1)    ].spot_id) for i in range(0, number_cells_in_crop)])[0]
        # Number of RNA in a TS
        try:
            ts_size =  np.max(dataframe.loc[ (dataframe['is_cluster']==True) &   (dataframe['cluster_size']>=minimum_spots_cluster)  ].cluster_size.values)
        except:
            ts_size = 0
        # Size of the nucleus of each cell
        nuc_size = [dataframe.loc[   (dataframe['cell_id']==i) ].nuc_area_px.values[0] for i in range(0, number_cells_in_crop)][0]
        nuc_size = np.asarray(nuc_size)
        centroid_y,centroid_x = ndimage.measurements.center_of_mass(selected_mask)
        # Creating a dataframe with the cell properties
        if number_of_spots>minimal_number_of_spots_to_save_cell:
            list_library_cells_final.append(selected_image)
            df_library.loc[cell_id] = [cell_id, np.round(centroid_y,0),np.round(centroid_x,0),nuc_size,number_of_spots,ts_size]
            cell_id +=1
            if show_plots ==True:
                fa.Plots.plot_single_cell(image=selected_image, 
                                        df=dataframe, 
                                        selected_channel=3, 
                                        min_ts_size=5,
                                        show_spots=True,
                                        show_legend = True,
                                        image_name=None,
                                        microns_per_pixel = 0.16,
                                        max_percentile=95.,
                                        selected_colormap='viridis')
        del dataframe
    new_dtypes = {'cell_id':int, 'centroid_y':int, 'centroid_x':int,'size':int,'number_of_spots':int, 'ts_size':int}
    df_library = df_library.astype(new_dtypes)
            
    return list_library_cells_final, df_library


In [None]:
list_library_cells_final, df_library = complete_function_to_generate_library (list_images,list_images_masks,selected_image_number=0, show_plots=True)

In [None]:
df_library

In [None]:
raise