# FISH - Data interpretation - A Python interactive notebook to interpret FISH data

```
Author: Luis U. Aguilera
Contact Info: luis.aguilera@colostate.edu

Copyright (c) 2021 Munsky Group 
Colorado State University 
Licensed under BSD 3-Clause License.

```

### Notebook summary 


- Load a directory with data quantified by FISH_pipeline.ipynb
- Establish a connection to Network-attached storage (NAS) using [pysmb](https://github.com/miketeo/pysmb)
- Compares quantifications for multiple conditions in a single plot
- Compares mRNA spots in the  nucleus, cytosol and the comple cell.
  
----

<img src= /home/luisub/Desktop/FISH_Processing/docs/images/code_architecture.png alt="drawing" width="1000"/>

## Importing libraries

In [7]:
import sys
import matplotlib.pyplot as plt 
from  matplotlib.ticker import FuncFormatter
import numpy as np 
import pandas as pd
import pathlib
import warnings
import glob
import seaborn as sns
import zipfile
import shutil
import scipy.stats as stats
warnings.filterwarnings("ignore")

## Defining paths

In [8]:
# Defining directories
current_dir = pathlib.Path().absolute()
fa_dir = current_dir.parents[0].joinpath('src')
# Importing fish_analyses module
sys.path.append(str(fa_dir))
import fish_analyses as fa
# Local folder path
local_folder_path = pathlib.Path().absolute().joinpath('temp_zip_analyses')
local_folder_path
# Path to credentials
desktop_path = pathlib.Path.home()/'Desktop'
# Connection to munsky-nas
path_to_config_file = desktop_path.joinpath('config.yml')
share_name = 'share'

## List of folders to process

In [9]:
def Huy_60X(mandatory_substring):
    list_dirs=(
    'smFISH_images/Linda_smFISH_images/Confocal/20220714/MS2-CY5_Cyto543_560_woStim',
    'smFISH_images/Linda_smFISH_images/Confocal/20220714/MS2-CY5_Cyto543_560_18minTPL_5uM' ,
    'smFISH_images/Linda_smFISH_images/Confocal/20220714/MS2-CY5_Cyto543_560_5hTPL_5uM' )
    list_labels = [ 'woSTM','18minTPL_5uM','5hTPL_5uM']
    plot_title_suffix= "MS2_CY5_60X"
    mandatory_substring = mandatory_substring      #'nuc_80__cyto_0__psfz_350__psfyx_160__ts_220'
    return list_dirs, list_labels, plot_title_suffix, mandatory_substring

In [10]:
def Huy_100X(mandatory_substring):
    list_dirs=(
    'smFISH_images/Linda_smFISH_images/Confocal/20211014/MS2-CY5-0minTPL',
    'smFISH_images/Linda_smFISH_images/Confocal/20211014/MS2-CY5-3minTPL',
    'smFISH_images/Linda_smFISH_images/Confocal/20211015/MS2-CY5-6minTPL',
    'smFISH_images/Linda_smFISH_images/Confocal/20211015/MS2-CY5-9minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211015/MS2-CY5-12minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211019/MS2-CY5-15minTPL',
    'smFISH_images/Linda_smFISH_images/Confocal/20211019/MS2-CY5-18minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211019/MS2-CY5-21minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211021/MS2-CY5-24minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211021/MS2-CY5-27minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211021/MS2-CY5-30minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20211021/MS2-CY5-60minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20210921/MS2-Cy5-120minTPL', 
    'smFISH_images/Linda_smFISH_images/Confocal/20210921/MS2-Cy5-240minTPL' )
    list_labels = [ '0min_TPL','3min_TPL','6min_TPL','9min_TPL','12min_TPL','15min_TPL','18min_TPL','21min_TPL','24min_TPL','27min_TPL','30min_TPL','60min_TPL', '120min_TPL','240min_TPL']
    plot_title_suffix= "MS2_CY5_100X"
    mandatory_substring = mandatory_substring      #'nuc_180__cyto_0__psfz_350__psfyx_120__ts_auto'
    return list_dirs, list_labels, plot_title_suffix, mandatory_substring

----

# Running the codes

----

To run the code please provide the list of folder to process. The code also needs a **mandatory_substring** this string describes the parameters used to generate the data. In short, the string can look like this:  'nuc_80__cyto_200__psfz_350__psfyx_160__ts_220'.

and this means:

nuc_80    : the size used to segment the nucleus \
cyto_200  : the size used to segment the cytosol \
psfz_350  : the size of the psf in z \
psfyx_160 : the size of the  psf in yx \
ts_220 : a threshold for spot detection 

In [11]:
# To download data from NAS it is necessary to use CSU network or use the CSU VPN.
#list_dirs, list_labels, plot_title_suffix, mandatory_substring = Huy_100X(mandatory_substring='nuc_80__cyto_0__psfz_350__psfyx_160__ts_220')
list_dirs, list_labels, plot_title_suffix, mandatory_substring = Huy_60X(mandatory_substring='nuc_180__cyto_0__psfz_350__psfyx_120__ts_auto')
minimal_TS_size = 3 # Just for the plotting
connect_to_NAS = True


## Connecting to NAS and extracting data

In [12]:
if connect_to_NAS == True:
    # Reading the data from NAS, unziping files, organizing data as single dataframe for comparison. 
    list_local_files = fa.Utilities.read_zipfiles_from_NAS(list_dirs,path_to_config_file,share_name, mandatory_substring, local_folder_path)
    list_local_folders = fa.Utilities.unzip_local_folders(list_local_files,local_folder_path)
else: 
    list_local_folders = list_dirs # Use this line to process files from a local repository
# Extracting data from each repository
list_spots_total, list_spots_nuc, list_spots_cytosol, list_number_cells, list_transcription_sites,list_cell_size,list_dataframes,list_nuc_size = fa.Utilities.extracting_data_for_each_df_in_directory(  list_local_folders=list_local_folders,current_dir=current_dir,minimal_TS_size=minimal_TS_size)


Connection established
Connection established
Connection established


In [13]:
print('number of cells in each dataset: ', list_number_cells)

number of cells in each dataset:  []


# Converting Dataframe to use in Brian's Matlab code.

# Extracting data for Matlab codes


In [14]:
psf_z=350                # Theoretical size of the PSF emitted by a [rna] spot in the z plan, in nanometers
psf_yx=160               # Theoretical size of the PSF emitted by a [rna] spot in the yx plan, in nanometers
voxel_size_z=500         # Microscope conversion px to nanometers in the z axis.
voxel_size_yx=160        # Microscope conversion px to nanometers in the xy axis.

In [15]:
scale = np.array ([ voxel_size_z/psf_z, voxel_size_yx/psf_yx, voxel_size_yx/psf_yx ])

In [16]:
def extract_spot_classification_from_df(df,threshold_in_pixels=2,show_plots = False):
    number_cells = df['cell_id'].nunique()
    array_spot_type_per_cell = np.zeros((number_cells, 7)).astype(int) # this array will store the spots separated  as types: spot_0_only, spot_1_only, or spot_0_1
    for cell_id in range(number_cells):
        # retrieving the coordinates for spots type 0 and 1 for each cell 
        array_spots_0 = np.asarray( df[['z','y','x']][(df["cell_id"] == cell_id) & (df["spot_type"] == 0)] ) # coordinates for spot_type_0 with shape [num_spots_type_0, 3]
        array_spots_1 = np.asarray( df[['z','y','x']][(df["cell_id"] == cell_id) & (df["spot_type"] == 1)] ) # coordinates for spot_type_1 with shape [num_spots_type_1, 3]
            
        total_spots0 = array_spots_0.shape[0]
        total_spots1 = array_spots_1.shape[0]
        # Concatenating arrays from spots 0 and 1
        array_all_spots = np.concatenate((array_spots_0,array_spots_1), axis=0) 
        # Calculating a distance matrix. 
        distance_matrix = np.zeros( (array_all_spots.shape[0], array_all_spots.shape[0])) #  the distance matrix is an square matrix resulting from the concatenation of both spot  types.
        for i in range(len(array_all_spots)):
            for j in range(len(array_all_spots)):
                if j<i:
                    distance_matrix[i,j] = np.linalg.norm( ( array_all_spots[i,:]-array_all_spots[j,:] ) * scale )
        # masking the distance matrix. Ones indicate the distance is less or equal than threshold_in_pixels
        mask_distance_matrix = (distance_matrix <= threshold_in_pixels) 
        # Selecting the right-lower quadrant as a subsection of the distance matrix that compares one spot type versus the other. 
        subsection_mask_distance_matrix = mask_distance_matrix[total_spots0:, 0:total_spots0].copy()
        if show_plots == True:
            plt.imshow(mask_distance_matrix, cmap='Greys_r')
            plt.imshow(subsection_mask_distance_matrix,cmap='Greys_r')
        # Calculating each type of spots in cell
        is_spot_only_type_0 = np.all(~subsection_mask_distance_matrix, axis =1 ) # Testing if all the columns are ones of inv(subsection_mask_distance_matrix). Representing spot type 0.
        is_spot_only_type_1 = np.all(~subsection_mask_distance_matrix, axis =0 ) #  Testing if all the rows are ones of inv(subsection_mask_distance_matrix). Representing spot type 1.
        num_type_0_only = np.sum(is_spot_only_type_0) 
        num_type_1_only =np.sum(is_spot_only_type_1) 
        num_type_0_1 = (total_spots0 - num_type_0_only) + (total_spots1 - num_type_1_only) # Number of spots in both channels
        array_spot_type_per_cell[cell_id,:] = np.array([cell_id, num_type_0_only, num_type_1_only, num_type_0_1, num_type_0_only+num_type_0_1, 
                                                        num_type_1_only+num_type_0_1, num_type_0_only+num_type_1_only+num_type_0_1]).astype(int)
        list_labels = ['cell_id','number_spots_type_0','number_spots_type_1','number_spots_type_0_1','munber_0', 'number_1','total']
        # creating a dataframe
        df_spots_classification = pd.DataFrame(data=array_spot_type_per_cell, columns=list_labels)
    return df_spots_classification  


In [17]:
# creating a folder to store all plots
destination_folder = pathlib.Path().absolute().joinpath('results', 'data_'+plot_title_suffix+'__'+mandatory_substring)
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder) 

In [18]:
# Iterating for each time point (experimental condition)
num_time_points = len(list_dataframes)
for i in range(0, num_time_points):
    df_spots_classification = extract_spot_classification_from_df(df=list_dataframes[i],threshold_in_pixels=2,show_plots = False)
    df_spots_classification.to_csv(pathlib.Path().absolute().joinpath(destination_folder,plot_title_suffix+'_classification_'+'time_'+str(i)+'.csv'))
    # saving the  original dataframe  back to the same folder
    list_dataframes[i].to_csv(pathlib.Path().absolute().joinpath(destination_folder,plot_title_suffix+'_complete_'+'time_'+str(i)+'.csv'))


In [19]:
# remove temporary folder
shutil.rmtree(local_folder_path)

FileNotFoundError: [Errno 2] No such file or directory: '/home/luisub/Desktop/FISH_Processing/notebooks/temp_zip_analyses'