### Postprocessing the Baysor segmentations
##### Baysor often produces cells that do not overlap any nucleus, or cells that contain multiple nuclei. This script corrects those cases. Because we are very confident in our nuclei segmentations, we can make these adjustments reliably.

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import glob
import alphashape
import geopandas as gpd
import seaborn as sns
from shapely.ops import transform
import imageio as io
from core_functions.baysor_postprocessing import *
import warnings
from concurrent.futures import ThreadPoolExecutor 


##### Put the path to the folders where the Baysor runs are stored

In [3]:
# Root folder of the experiment; entries under it whose names match 'day*'
# are globbed into `input_folders` further down and processed one by one.
data_dir = 'D:/amonell/timecourse_final'

##### Create AnnData objects from the postprocessed Baysor segmentation

In [4]:
# Every entry directly under data_dir whose name starts with 'day' is treated
# as one run folder to process.
day_pattern = os.path.join(data_dir, 'day*')
input_folders = glob.glob(day_pattern)

#### To run without multithreading

In [7]:
# Suppress all Python warnings for the rest of the session so the per-stage
# progress output below stays readable.
warnings.filterwarnings("ignore")
for input_file in tqdm(input_folders):
    print(input_file)

    # Create the per-run output folder. Only the "already exists" case is
    # tolerated; any other failure (missing parent, permissions, ...) is raised
    # here instead of being misreported and surfacing later at write time.
    adata_dir = os.path.join(input_file, 'adatas')
    try:
        os.mkdir(adata_dir)
    except FileExistsError:
        print('Adatas dir already exists')

    # Each stage prints its label, then 'done' on the same line once the
    # corresponding helper has returned.
    print('Preparing Transcripts...', end = ' ')
    transcripts, transcripts_cellpose = prepare_transcripts(input_file)
    print('done')

    print('Assigning nuclei to Baysor Cells...', end = ' ')
    result = assign_nuclei_to_cells(transcripts, transcripts_cellpose)
    print('done')

    print('Finding the most common nucleus per cell...', end = ' ')
    transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus = find_main_nucleus(
        transcripts, transcripts_cellpose, result
    )
    print('done')

    print('Splitting cells with multiple nucleus assignments...', end = ' ')
    transcripts_with_gt_and_main_nucleus_filtered = reassign_multiple_nuclei(
        transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus
    )
    print('done')

    print('Making adata...', end = ' ')
    adata = make_adata(transcripts_with_gt_and_main_nucleus_filtered)
    print('done')

    # Persist the result next to the run's inputs.
    adata.write(os.path.join(adata_dir, '01_preprocessed.h5ad'))



#### To run with multithreading

In [6]:
def process_input_folder(input_file):
    """Run the Baysor postprocessing steps on one run folder and save the result.

    The folder is passed to ``prepare_transcripts``; the resulting tables go
    through ``assign_nuclei_to_cells``, ``find_main_nucleus`` and
    ``reassign_multiple_nuclei``, and the object returned by ``make_adata`` is
    written to ``<input_file>/adatas/01_preprocessed.h5ad``.

    Args:
        input_file: Path of the run folder to process. It is also the parent
            of the ``adatas`` output directory.

    Returns:
        None. The only output is the written ``.h5ad`` file.

    Raises:
        OSError: If the ``adatas`` directory cannot be created for any reason
            other than already existing (e.g. ``input_file`` does not exist).
    """
    print(input_file)

    # Only an existing directory is tolerated. Other mkdir failures propagate
    # immediately rather than being reported as "already exists".
    adata_dir = os.path.join(input_file, 'adatas')
    try:
        os.mkdir(adata_dir)
    except FileExistsError:
        print('Adatas dir already exists')

    transcripts, transcripts_cellpose = prepare_transcripts(input_file)

    result = assign_nuclei_to_cells(transcripts, transcripts_cellpose)

    transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus = find_main_nucleus(
        transcripts, transcripts_cellpose, result
    )

    transcripts_with_gt_and_main_nucleus_filtered = reassign_multiple_nuclei(
        transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus
    )

    adata = make_adata(transcripts_with_gt_and_main_nucleus_filtered)

    adata.write(os.path.join(adata_dir, '01_preprocessed.h5ad'))
# Fan the run folders out over a thread pool; max_workers can be adjusted as needed.
with ThreadPoolExecutor(max_workers=16) as pool:
    # map() yields results in input order; draining it through tqdm drives the
    # progress bar and re-raises any exception raised by a worker.
    folder_results = pool.map(process_input_folder, input_folders)
    list(tqdm(folder_results, total=len(input_folders)))