# Integrated Simulations

In [None]:
import os; from os import listdir; from os.path import isfile, join
import re  
from skimage.io import imread
from skimage.exposure import rescale_intensity
import numpy as np 
from tqdm.notebook import tqdm
from timeit import default_timer as timer
import scipy
import pandas as pd
import shutil
import pathlib
import sys
import seaborn as sns
import rsnapsim as rss
import scipy.stats as stats
import matplotlib.pyplot as plt 

In [None]:
# Input locations, all resolved relative to the grandparent of the working directory.
current_dir = pathlib.Path().absolute()
_base_dir = current_dir.parents[1]
_databases_dir = _base_dir / 'DataBases'
sequences_dir = _databases_dir / 'gene_files'
video_dir = _databases_dir / 'videos_for_sim_cell'
# Despite the "_dir" suffix, this points at a single .npy file.
trajectories_dir = _databases_dir / 'rsnapsim_simulations' / 'kdm5b_ssa.npy'
rsnaped_dir = _base_dir / 'rsnaped'
gene_file = sequences_dir / 'KDM5B_withTags.txt'
masks_dir = _databases_dir / 'masks_for_sim_cell'

In [None]:
# Importing rSNAPed
sys.path.append(str(rsnaped_dir))
import rsnaped as rsp

In [None]:
rsp.Banner().print_banner()

In [None]:
# Main simulation parameters to vary when testing; each comment gives the suggested test range.
number_of_simulated_cells = 8       # suggested test range: min 1, max 10
number_spots_per_cell = 40           # suggested test range: min 5, max 200
simulation_time_in_sec = 40          # suggested test range: min 10, max 100
diffusion_coefficient = 3          # suggested test range: min 0.1, max 2. NOTE(review): the current value (3) is outside this range; confirm which is intended.
min_percentage_time_tracking = 0.3   # (normalized) minimum time to consider a trajectory.
average_cell_diameter = 400  # forwarded to the tracking step; units are not stated here (presumably pixels, TODO confirm).

In [None]:
# Options for the tracking step and for what the simulation writes to disk.
intensity_calculation_method = 'disk_donut'  # options listed by the author: 'total_intensity', 'disk_donut', 'gaussian_fit'
mask_selection_method = 'max_area' # options listed by the author: 'max_spots', 'max_area'
use_optimization_for_tracking = 1 # 0 = do not use optimization, 1 = use optimization

# Channel indices forwarded to the tracking step.
selected_channel_tracking = 0
selected_channel_segmentation = 1

# NOTE(review): the selected value 'generate_from_gaussian' was missing from the original option list on this line;
# the same variable is assigned again (same value) in the next cell. Option names should be confirmed against rsnaped.
frame_selection_empty_video = 'generate_from_gaussian' # options listed by the author: 'constant', 'shuffle', 'loop', 'linear_interpolation', 'generate_from_gaussian'
show_plot = 1  # Flag to show plots for the detection and tracking process.
dataframe_format = 'long' # options listed by the author: 'short', 'long'

save_as_tif = True # option to save the simulated video
save_dataframe = True # option to save the simulation output as a dataframe in csv format.
store_videos_in_memory = False  # forwarded to rsp.simulate_cell; presumably controls whether videos are returned in memory (TODO confirm)

In [None]:
# Spot appearance and gene-expression rates forwarded to rsp.simulate_cell.
spot_size = 7 # spot size for the simulation and tracking.
spot_sigma = 1
elongation_rate = 10  # units not stated here (TODO confirm against rsnaped/rsnapsim)
initiation_rate = 0.03  # units not stated here (TODO confirm against rsnaped/rsnapsim)

simulated_RNA_intensities_method = 'random_values'
# NOTE(review): this repeats the assignment made in the previous cell with the same value.
frame_selection_empty_video = 'generate_from_gaussian' # options listed by the author: 'constant', 'shuffle', 'loop', 'generate_from_gaussian'

In [None]:
basal_intensity_in_background_video = 10000
scale_intensity_in_base_video=True

In [None]:
intensity_scale_ch0 = 2
intensity_scale_ch1 = 2
intensity_scale_ch2 = None

In [None]:
particle_detection_size = spot_size

## Running the simulations

In [None]:
# Run the cell simulation with the parameters defined in the cells above and unpack its seven return values.
# video_path and dataframe_path are reused below to locate the saved videos and the simulation dataframe.
list_videos, list_dataframe_simulated_cell, merged_dataframe_simulated_cells, ssa_trajectories, list_files_names, video_path, dataframe_path = rsp.simulate_cell( video_dir, 
                                                                        list_gene_sequences = gene_file,
                                                                        list_number_spots= number_spots_per_cell,
                                                                        list_target_channels_proteins = 1,
                                                                        list_target_channels_mRNA = 0, 
                                                                        list_diffusion_coefficients=diffusion_coefficient,
                                                                        list_elongation_rates=elongation_rate,
                                                                        list_initiation_rates=initiation_rate,
                                                                        masks_dir=masks_dir, 
                                                                        list_label_names=1,
                                                                        number_cells = number_of_simulated_cells,
                                                                        simulation_time_in_sec = simulation_time_in_sec,
                                                                        step_size_in_sec = 1,
                                                                        save_as_tif = save_as_tif, 
                                                                        save_dataframe = save_dataframe,
                                                                        frame_selection_empty_video=frame_selection_empty_video,
                                                                        spot_size = spot_size,
                                                                        spot_sigma = spot_sigma,
                                                                        intensity_scale_ch0 = intensity_scale_ch0,
                                                                        intensity_scale_ch1 = intensity_scale_ch1,
                                                                        intensity_scale_ch2 = intensity_scale_ch2,
                                                                        # NOTE(review): hard-coded to 'long' rather than using the dataframe_format variable;
                                                                        # the later cells read columns such as 'green_int_mean' and 'SNR_green' from this dataframe,
                                                                        # so this may be deliberate - confirm before changing.
                                                                        dataframe_format = 'long',
                                                                        simulated_RNA_intensities_method=simulated_RNA_intensities_method,
                                                                        store_videos_in_memory= store_videos_in_memory,
                                                                        scale_intensity_in_base_video=scale_intensity_in_base_video,
                                                                        basal_intensity_in_background_video=basal_intensity_in_background_video)

In [None]:
number_images = len(list_videos)

In [None]:
# List the .tif videos written to video_path and build the absolute path of each one.
list_files_names = sorted(
    [f for f in listdir(video_path) if isfile(join(video_path, f)) and ('.tif') in f],
    key=str.lower,
)
# Order by the digits embedded in each file name (numeric, not lexicographic, order).
# The pattern is a raw string so that \D is not treated as an invalid string escape;
# names without any digit sort first instead of raising ValueError on int('').
list_files_names.sort(key=lambda f: int(re.sub(r'\D', '', f) or '-1'))
path_files = [str(video_path.joinpath(f).resolve()) for f in list_files_names]

# The number of images to process is taken from the number of simulated cells,
# replacing the value computed from list_videos in the previous cell.
number_images = number_of_simulated_cells
number_images

# Display simulations for single time point

## <span style="color:red">Channel 0</span>

In [None]:
# Showing the simulated images
list_videos = [imread(f)[:,:,:,:] for f in  path_files] # List with all the videos
rsp.VisualizerImage(list_videos,list_files_names=list_files_names,selected_channel =0,selected_time_point= 0,normalize=0,individual_figure_size=7).plot()

## <span style="color:lightgreen">Channel 1</span>

In [None]:
# Showing the simulated images
rsp.VisualizerImage(list_videos,list_files_names=list_files_names,selected_channel =1,selected_time_point= 0,normalize=0,individual_figure_size=7).plot()
del list_videos

In [None]:
# list_DataFrame_particles_intensities= []
# list_array_intensities = []
# list_time_vector = []
# list_selected_mask = []
# for i in tqdm(range(0,number_images)): 
#     selected_video = imread(path_files[i]) # Loading the video
#     DataFrame_particles_intensities, selected_mask, array_intensities, time_vector, _,_, _, _ = rsp.PipelineTracking(selected_video,
#                                                                                                                     particle_size=particle_detection_size,
#                                                                                                                     file_name=list_files_names[i],
#                                                                                                                     selected_channel_tracking = selected_channel_tracking,
#                                                                                                                     selected_channel_segmentation = selected_channel_segmentation,
#                                                                                                                     intensity_calculation_method =intensity_calculation_method, 
#                                                                                                                     mask_selection_method = mask_selection_method,
#                                                                                                                     show_plot=show_plot,
#                                                                                                                     use_optimization_for_tracking=use_optimization_for_tracking,
#                                                                                                                     real_positions_dataframe = list_dataframe_simulated_cell[i],
#                                                                                                                     average_cell_diameter=average_cell_diameter,
#                                                                                                                     print_process_times=1,
#                                                                                                                     min_percentage_time_tracking=min_percentage_time_tracking,
#                                                                                                                     dataframe_format=dataframe_format).run()    
#     list_DataFrame_particles_intensities.append(DataFrame_particles_intensities)
#     list_array_intensities.append(array_intensities)
#     list_time_vector.append(time_vector)
#     list_selected_mask.append(selected_mask)


In [None]:
# Run detection/tracking on every video found in video_path and unpack the four returned lists.
# The simulated-cell dataframes are passed as real_positions_dataframe; how rsnaped uses them is not visible here.
list_DataFrame_particles_intensities, list_array_intensities, list_time_vector, list_selected_mask = rsp.image_processing( files_dir_path=video_path,
                                                                                                                            particle_size=particle_detection_size,
                                                                                                                            selected_channel_tracking = selected_channel_tracking,
                                                                                                                            selected_channel_segmentation = selected_channel_segmentation,
                                                                                                                            intensity_calculation_method =intensity_calculation_method, 
                                                                                                                            mask_selection_method = mask_selection_method,
                                                                                                                            show_plot=show_plot,
                                                                                                                            use_optimization_for_tracking=use_optimization_for_tracking,
                                                                                                                            real_positions_dataframe = list_dataframe_simulated_cell,
                                                                                                                            average_cell_diameter=average_cell_diameter,
                                                                                                                            print_process_times=1,
                                                                                                                            min_percentage_time_tracking=min_percentage_time_tracking,
                                                                                                                            dataframe_format=dataframe_format)

# Dataframe

In [None]:
list_DataFrame_particles_intensities[0].head()

# Comparing intensity distributions

## "Real" intensities from SSA

In [None]:
selected_time_point = 0 #simulation_time_in_sec-1

In [None]:
def remove_extrema(vector ,max_percentile = 98):
    '''Drop non-positive values and values at or above a percentile cut-off.

    Parameters
    ----------
    vector : array-like
        Values to filter. Lists and tuples are converted to a NumPy array.
    max_percentile : float, optional
        Percentile (0-100) of the positive values used as the exclusive upper bound.

    Returns
    -------
    Same container kind as the (converted) input, holding the values ``v`` with
    ``0 < v < percentile(positive values, max_percentile)``. If no positive
    value exists, the empty selection is returned and nothing is printed.

    Notes
    -----
    Prints ``0`` and the rounded cut-off as a side effect.
    '''
    # Plain sequences do not support element-wise comparison or boolean indexing.
    if isinstance(vector, (list, tuple)):
        vector = np.asarray(vector)
    vector = vector[vector > 0]
    # A percentile is undefined for an empty selection; return it as is.
    if vector.size == 0:
        return vector
    max_val = np.percentile(vector, max_percentile)
    new_vector = vector[vector < max_val]
    print(0, round(max_val, 2))
    return new_vector

In [None]:
# Load the stored SSA trajectories and min-max normalize the values at the selected time point.
# NOTE(review): this rebinds ssa_trajectories, replacing the array returned by rsp.simulate_cell; confirm that is intended.
ssa_trajectories = np.load(str(trajectories_dir))
ssa_trajectories_timePoint = ssa_trajectories[:, selected_time_point].flatten()
#ssa_trajectories_timePoint= remove_extrema(ssa_trajectories_timePoint)
_ssa_min = np.amin(ssa_trajectories_timePoint)
_ssa_max = np.amax(ssa_trajectories_timePoint)
ssa_trajectories_timePoint_normalized = (ssa_trajectories_timePoint - _ssa_min) / (_ssa_max - _ssa_min)

In [None]:
list_array_intensities[1].shape

## Recovered intensities from tracking

In [None]:
# Pool, across all cells, the channel-1 intensities of every tracked spot at the selected time point.
# The leading empty float array keeps the result dtype and the zero-cell case the same as repeated np.append.
_per_cell_green = [
    list_array_intensities[cell_index][:, selected_time_point, 1].flatten()
    for cell_index in range(0, number_images)
]
all_cells_green_int = np.concatenate([np.array([])] + _per_cell_green)
# Keep positive intensities only, then drop the upper tail.
all_cells_green_int = all_cells_green_int[all_cells_green_int > 0]
all_cells_green_int = remove_extrema(all_cells_green_int)
# Min-max normalization to [0, 1].
_green_min = np.amin(all_cells_green_int)
_green_max = np.amax(all_cells_green_int)
all_cells_green_int_normalized = (all_cells_green_int - _green_min) / (_green_max - _green_min)

In [None]:
merged_dataframe_simulated_cells.head()

## Loading intensities from image. "Perfect tracking"

In [None]:
dataframe_path

In [None]:
def extract_intensity_from_dataframe(dataframe_path, number_images,selected_time=0,selected_field='green_int_mean', remove_negative_values=True,remove_extreme_values=True):
    '''Read one column of the simulation CSV at a given frame and min-max normalize it.

    Parameters
    ----------
    dataframe_path : str or path-like
        CSV file containing at least a 'frame' column and ``selected_field``.
    number_images : int
        Not used by this function; kept so existing callers keep working.
    selected_time : int, optional
        Value of the 'frame' column to select.
    selected_field : str, optional
        Name of the column to extract.
    remove_negative_values : bool, optional
        If true, keep only values >= 0. Negative values may appear if the
        intensity in the background is higher than the intensity in the spot.
    remove_extreme_values : bool, optional
        If true, filter the values with ``remove_extrema`` (which also drops
        values that are not strictly positive).

    Returns
    -------
    extracted_data : numpy.ndarray
        The selected (and optionally filtered) values.
    normalized_extracted_data : numpy.ndarray
        ``(x - min) / (max - min)`` of ``extracted_data``. An empty selection
        gives an empty array, and a selection whose values are all equal gives
        zeros, instead of raising or producing NaN.
    '''
    temporal_dataframe = pd.read_csv(dataframe_path)
    is_selected_frame = temporal_dataframe['frame'] == selected_time
    extracted_data = temporal_dataframe.loc[is_selected_frame, selected_field].values
    if remove_negative_values:
        extracted_data = extracted_data[extracted_data >= 0]
    # Skip the percentile filter when nothing is left to filter.
    if remove_extreme_values and extracted_data.size:
        extracted_data = remove_extrema(extracted_data)
    # Nothing to normalize: return empty arrays rather than failing inside np.min.
    if extracted_data.size == 0:
        return extracted_data, extracted_data.astype(float)
    min_value = np.min(extracted_data)
    value_range = np.max(extracted_data) - min_value
    if value_range == 0:
        # All values are equal, so min-max normalization would be 0/0.
        normalized_extracted_data = np.zeros(extracted_data.shape, dtype=float)
    else:
        normalized_extracted_data = (extracted_data - min_value) / value_range
    return extracted_data,normalized_extracted_data

In [None]:
intensity_values_in_image_flat,intensity_values_in_image_normalized =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='green_int_mean')

# Intensity histograms in arbitrary units (au)

In [None]:
# Histograms of the raw intensities (arbitrary units) from the three sources, one panel each.
# density=True normalizes each histogram to unit area, so the y axis is a probability density, not a count.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
hist_panels = [
    (ssa_trajectories_timePoint, 'Simulation', 'orangered'),
    (intensity_values_in_image_flat, 'Image', 'cyan'),
    (all_cells_green_int, 'Tracking', 'chartreuse'),
]
for hist_ax, (hist_values, hist_title, hist_color) in zip(axes, hist_panels):
    hist_ax.hist(hist_values, bins=60, density=True, stacked=True, color=hist_color)
    hist_ax.set(title=hist_title, xlabel='intensities (au)', ylabel='probability density')
plt.tight_layout()

## Normalizing intensities to 1.

$ X_{norm} = \frac{X - \min(X)}{\max(X) - \min(X)} $

In [None]:
# Histograms of the min-max normalized intensities from the three sources, one panel each.
# All three panels show normalized values, so they share the '(norm)' x label.
# density=True normalizes each histogram to unit area, so the y axis is a probability density, not a count.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
norm_hist_panels = [
    (ssa_trajectories_timePoint_normalized, 'Simulation', 'orangered'),
    (intensity_values_in_image_normalized, 'Image', 'cyan'),
    (all_cells_green_int_normalized, 'Tracking', 'chartreuse'),
]
for hist_ax, (hist_values, hist_title, hist_color) in zip(axes, norm_hist_panels):
    hist_ax.hist(hist_values, bins=100, density=True, stacked=True, color=hist_color)
    hist_ax.set(title=hist_title, xlabel='intensities (norm)', ylabel='probability density')

plt.tight_layout()

### Statistics from normalized distributions

In [None]:
print('mean ssa:              ', np.round(np.mean(ssa_trajectories_timePoint_normalized),3) )
print('mean perfect tracking: ', np.round(np.mean(intensity_values_in_image_normalized),3)  )
print('mean tracking:         ', np.round(np.mean(all_cells_green_int_normalized),3)  )

In [None]:
print('std ssa:              ', np.round(np.std(ssa_trajectories_timePoint_normalized),3) )
print('std perfect tracking: ', np.round(np.std(intensity_values_in_image_normalized),3)  )
print('std tracking:         ', np.round(np.std(all_cells_green_int_normalized),3)  )

## Cumulative frequencies

In [None]:
# The three normalized intensity samples; data1..data3 are reused by the comparison cells below.
data1 = ssa_trajectories_timePoint_normalized
data2 = all_cells_green_int_normalized
data3 = intensity_values_in_image_normalized

# Empirical cumulative distribution: sorted values against evenly spaced probabilities in [0, 1).
data_sorted_1, data_sorted_2, data_sorted_3 = (
    np.sort(sample) for sample in (data1, data2, data3)
)
p_1, p_2, p_3 = (
    np.linspace(0, 1, len(sample), endpoint=False) for sample in (data1, data2, data3)
)

# One curve per sample.
plt.plot(data_sorted_1, p_1, 'orangered', linewidth=3, label='Simulation')
plt.plot(data_sorted_2, p_2, 'chartreuse', linewidth=3, label='tracking')
plt.plot(data_sorted_3, p_3, 'cyan', linewidth=3, label='Image')

plt.legend()
plt.title('cumfreq')
plt.ylabel('Cumulative probability')
plt.xlabel('Normalized intensity')
plt.show()

# Sample sizes behind each curve.
print('Number of spots for Simulation:', len(data1))
print('Number of spots recovered from tracking:', len(data2))
print('Number of spots recovered from image:', len(data3))

## Comparison using the KS-distance

In [None]:
# Kolmogorov-Smirnov statistic between the SSA sample and each recovered sample.
# ks_distance ends up holding the SSA-vs-image value.
ks_result_tracking = scipy.stats.kstest(data1, data2)
ks_distance = ks_result_tracking.statistic
print('The KS-distance between SSA and tracking is:' , round(ks_distance, 2))

ks_result_image = scipy.stats.kstest(data1, data3)
ks_distance = ks_result_image.statistic
print('The KS-distance between SSA and image is:' , round(ks_distance, 2))

#ks_distance = scipy.stats.kstest(data3,data2).statistic
#print('The KS-distance between image and tracking is:' , round(ks_distance,2))

## Comparison using the Anderson-Darling distance


In [None]:
# k-sample Anderson-Darling statistic between the SSA sample and each recovered sample.
# Only the statistic is used; ad_distance ends up holding the SSA-vs-image value.
ad_result_tracking = scipy.stats.anderson_ksamp([data1, data2], midrank=False)
ad_distance = ad_result_tracking.statistic
print('The AD-distance between SSA and tracking is:' , round(ad_distance, 2))

ad_result_image = scipy.stats.anderson_ksamp([data1, data3], midrank=False)
ad_distance = ad_result_image.statistic
print('The AD-distance between SSA and image is:' , round(ad_distance, 2))

#
## Comparison using likelihood function
#

In [None]:
def LL_fun(real_data,simulation_data,nbins=30):
    '''Log-likelihood of binned real data under the histogram density of simulated data.

    The real data are histogrammed into ``nbins`` bins; the simulated data are
    histogrammed as a density over the same bin edges. Bins where the simulated
    density is exactly zero are given a floor of 1e-7 so the logarithm is finite.
    Returns the sum over bins of (real count) * log(simulated density).
    '''
    observed_counts, bin_edges = np.histogram(real_data, bins=nbins)
    simulated_density, _ = np.histogram(simulation_data, bins=bin_edges, density=True)
    # Floor empty bins to avoid log(0).
    floored_density = np.where(simulated_density == 0, 1e-7, simulated_density)
    return np.dot(observed_counts, np.log(floored_density))


In [None]:
## LOG LIKELIHOOD OF THE INTENSITY DISTRIBUTIONS

LL_ssa_tracking = LL_fun(real_data= data_sorted_1,simulation_data=data_sorted_2,nbins=100)
print('The Likelihood between SSA and tracking is:' , round(LL_ssa_tracking,2))

LL_ssa_img = LL_fun(real_data=data_sorted_1, simulation_data=data_sorted_3,nbins=100)
print('The Likelihood between SSA and image is:' , round(LL_ssa_img,2))

# Scatter Plots

In [None]:
def plot_scatter_spots_cell_size(x,y,plot_title,selected_color = '#1C00FE',xlabel='',ylabel=''):
    '''Show a seaborn joint plot of ``y`` against ``x`` annotated with their Pearson correlation.

    Parameters
    ----------
    x, y : array-like
        Paired values to plot; passed to ``stats.pearsonr`` and ``np.percentile``.
    plot_title : str
        Figure title.
    selected_color : str, optional
        Color given to ``sns.jointplot`` and to the regression line.
    xlabel, ylabel : str, optional
        Axis labels of the joint axes.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    '''
    # Pearson correlation; only r is used (p is not shown).
    r, p = stats.pearsonr(x, y)
    df_join_distribution = pd.DataFrame({'X':x,'Y':y})
    #plt.figure(figsize=(6,5))
    # NOTE(review): sns.set changes seaborn's settings for the session, not only for this figure - confirm this is intended.
    sns.set(font_scale = 1.3)
    b = sns.jointplot(data=df_join_distribution, y='Y', x='X', color= selected_color , marginal_kws=dict(bins=40, rug=True))
    # Layers added to the joint axes, in order: rug, KDE contours, regression with scatter.
    b.plot_joint(sns.rugplot, height=0, color=[0.7,0.7,0.7], clip_on=True)
    b.plot_joint(sns.kdeplot, color=[0.5,0.5,0.5], levels=5)
    b.plot_joint(sns.regplot,scatter_kws={'color': 'orangered',"s":10, 'marker':'o'}, line_kws={'color': selected_color,'lw': 2} )
    # Invisible placeholder line so the legend can display the correlation value as text.
    blank_plot, = b.ax_joint.plot([], [], linestyle="", alpha=0)
    b.ax_joint.legend([blank_plot],['r={:.2f}'.format( np.round(r,2))],loc='upper left',)
    # Restrict both axes to the 1st-99th percentile range of the data.
    b.ax_joint.set_xlim(np.percentile(x,1), np.percentile(x,99))
    b.ax_joint.set_ylim(np.percentile(y,1), np.percentile(y,99))
    b.fig.suptitle(plot_title)
    b.ax_joint.set_xlabel(xlabel)
    b.ax_joint.set_ylabel(ylabel)
    # Make the first collection on the joint axes fully transparent; presumably this is the default
    # scatter drawn by jointplot, leaving only the regplot points visible (depends on the order above - verify).
    b.ax_joint.collections[0].set_alpha(0)
    b.fig.tight_layout()
    # Leave room at the top for the title.
    b.fig.subplots_adjust(top=0.92) 
    #name_plot = plot_title +'.pdf'  
    #plt.savefig(name_plot, transparent=False,dpi=1200, bbox_inches = 'tight', format='pdf')
    plt.show()
    #pathlib.Path().absolute().joinpath(name_plot).rename(pathlib.Path().absolute().joinpath(destination_folder,name_plot))
    return

In [None]:
# Red vs. green mean spot intensity at frame 0, read from the simulation dataframe without any filtering.
# Columns noted by the author as available in the dataframe:
#red_int_mean	green_int_mean	blue_int_mean	red_int_std	green_int_std	blue_int_std	x	y	SNR_red	SNR_green	SNR_blue	background_int_mean_red	background_int_mean_green	background_int_mean_blue	background_int_std_red	background_int_std_green	background_int_std_blue
int_red, _ = extract_intensity_from_dataframe(
    dataframe_path=dataframe_path,
    number_images=number_images,
    selected_time=0,
    selected_field='red_int_mean',
    remove_negative_values=False,
    remove_extreme_values=False,
)
int_green, _ = extract_intensity_from_dataframe(
    dataframe_path=dataframe_path,
    number_images=number_images,
    selected_time=0,
    selected_field='green_int_mean',
    remove_negative_values=False,
    remove_extreme_values=False,
)
plot_scatter_spots_cell_size(
    int_red,
    int_green,
    plot_title='Green vs Red',
    selected_color='#1C00FE',
    xlabel='red int',
    ylabel='green int',
)

## <span style="color:lightgreen">Channel 1</span>

In [None]:
# Green channel at frame 0: mean spot intensity against the background standard deviation and against the SNR.
# The intensity column is read once and reused for both plots (it was previously loaded from the CSV twice).
intensity,_ =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='green_int_mean', remove_negative_values=False,remove_extreme_values=False)

# Intensity vs. std of the background intensity.
background,_ =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='background_int_std_green', remove_negative_values=False,remove_extreme_values=False)
plot_scatter_spots_cell_size(intensity,background,plot_title='Green signal',selected_color = '#1C00FE',ylabel='std(background int)',xlabel='Intensity')

# Intensity vs. SNR.
SNR,_ =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='SNR_green', remove_negative_values=False,remove_extreme_values=False)
plot_scatter_spots_cell_size(intensity,SNR,plot_title='Green signal',selected_color = '#1C00FE',xlabel='Intensity',ylabel='SNR')

## <span style="color:red">Channel 0</span>

In [None]:

# Red channel at frame 0: mean spot intensity against the background standard deviation and against the SNR.
# The intensity column is read once and reused for both plots (it was previously loaded from the CSV twice).
intensity,_ =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='red_int_mean', remove_negative_values=False,remove_extreme_values=False)

# Intensity vs. std of the background intensity.
background,_ =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='background_int_std_red', remove_negative_values=False,remove_extreme_values=False)
plot_scatter_spots_cell_size(intensity,background,plot_title='Red signal',selected_color = '#1C00FE',ylabel='std(background int)',xlabel='Intensity')

# Intensity vs. SNR.
SNR,_ =  extract_intensity_from_dataframe(dataframe_path=dataframe_path, number_images=number_images,selected_time=0,selected_field='SNR_red', remove_negative_values=False,remove_extreme_values=False)
plot_scatter_spots_cell_size(intensity,SNR,plot_title='Red signal',selected_color = '#1C00FE',xlabel='Intensity',ylabel='SNR')