### Postprocessing the Baysor segmentations
##### Baysor often produces cells that do not overlap any nucleus, or cells that contain multiple nuclei. This script corrects both cases. Because we are very confident in our nuclei segmentations, we can make these adjustments reliably.

##### This code uses timecourse_env_01 as the anaconda environment

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import glob
import alphashape
import geopandas as gpd
import seaborn as sns
from shapely.ops import transform
import imageio as io
from core_functions.baysor_postprocessing import *
import warnings
from concurrent.futures import ThreadPoolExecutor

##### Put the path to the folders where the Baysor runs are stored

In [None]:
# Root folder searched for the per-run "day*" sub-folders.
data_dir = "D:/amonell/timecourse_final"

##### Create AnnData objects from the post-processed Baysor segmentation

In [None]:
# glob gives no ordering guarantee; sort so positional look-ups such as
# input_folders[1] / input_folders[2] always refer to the same run folder.
input_folders = sorted(glob.glob(os.path.join(data_dir, "day*")))

#### To run without multithreading

In [None]:
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
for input_file in tqdm(input_folders):
    print(input_file)
    # Only "directory already exists" is an expected failure here; any other
    # OSError (missing parent, permissions, ...) should surface instead of being
    # reported as an existing directory.
    try:
        os.mkdir(os.path.join(input_file, "adatas"))
    except FileExistsError:
        print("Adatas dir already exists")

    print("Preparing Transcripts...", end=" ")
    transcripts, transcripts_cellpose = prepare_transcripts(input_file)
    print("done")

    print("Assigning nuclei to Baysor Cells...", end=" ")
    result = assign_nuclei_to_cells(transcripts, transcripts_cellpose)
    print("done")

    print("Finding the most common nucleus per cell...", end=" ")
    transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus = (
        find_main_nucleus(transcripts, transcripts_cellpose, result)
    )
    print("done")

    print("Splitting cells with multiple nucleus assignments...", end=" ")
    transcripts_with_gt_and_main_nucleus_filtered = reassign_multiple_nuclei(
        transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus
    )
    print("done")

    print("Making adata...", end=" ")
    anndata = make_adata(transcripts_with_gt_and_main_nucleus_filtered)
    print("done")

    # Persist the result next to the run's input files.
    anndata.write(os.path.join(input_file, "adatas", "01_preprocessed.h5ad"))

#### To run with multithreading

In [None]:
def process_input_folder(input_file):
    """Run the post-processing pipeline for a single run folder.

    Steps, in order: create ``<input_file>/adatas`` if it is missing, prepare
    the transcripts, assign nuclei to Baysor cells, find the most common
    nucleus per cell, split cells with multiple nucleus assignments, build the
    AnnData object and write it to ``adatas/01_preprocessed.h5ad``.

    Args:
        input_file: Path of the run folder to process.

    Returns:
        None; the result is written to disk.
    """
    print(input_file)
    # Only "directory already exists" is expected; other OSErrors must surface
    # instead of being reported as an existing directory.
    try:
        os.mkdir(os.path.join(input_file, "adatas"))
    except FileExistsError:
        print("Adatas dir already exists")

    transcripts, transcripts_cellpose = prepare_transcripts(input_file)

    result = assign_nuclei_to_cells(transcripts, transcripts_cellpose)

    transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus = (
        find_main_nucleus(transcripts, transcripts_cellpose, result)
    )

    transcripts_with_gt_and_main_nucleus_filtered = reassign_multiple_nuclei(
        transcripts_with_gt_and_main_nucleus_filtered, groupby_most_common_nucleus
    )

    anndata = make_adata(transcripts_with_gt_and_main_nucleus_filtered)

    anndata.write(os.path.join(input_file, "adatas", "01_preprocessed.h5ad"))


# Fan process_input_folder out over a thread pool; adjust max_workers as needed.
with ThreadPoolExecutor(max_workers=16) as executor:
    folder_results = executor.map(process_input_folder, input_folders)
    # Draining the iterator advances the progress bar once per finished folder.
    list(tqdm(folder_results, total=len(input_folders)))

In [None]:
# Keep the transcripts whose coordinates, multiplied by 1 / 0.2125, fall strictly
# inside the crop window (y is tested against minx/maxx, x against miny/maxy).
transcript_subset_fov = transcripts_with_gt_and_main_nucleus_filtered.loc[
    (transcripts_with_gt_and_main_nucleus_filtered.y * (1 / 0.2125) > minx)
    & (transcripts_with_gt_and_main_nucleus_filtered.y * (1 / 0.2125) < maxx)
    & (transcripts_with_gt_and_main_nucleus_filtered.x * (1 / 0.2125) > miny)
    & (transcripts_with_gt_and_main_nucleus_filtered.x * (1 / 0.2125) < maxy)
]


def make_alphashape(points: pd.DataFrame, alpha: float):
    """Return the alpha shape of ``points``.

    ``points`` is converted to a NumPy array and passed to
    ``alphashape.alphashape`` together with ``alpha``.
    """
    return alphashape.alphashape(np.array(points), alpha=alpha)


# One alpha shape per "split_cell", built from the x/y positions of the
# transcripts in the crop that have a cell assignment.
shapes = (
    transcript_subset_fov.loc[transcript_subset_fov.cell.notna()]
    .groupby("split_cell")[["x", "y"]]
    .apply(make_alphashape, alpha=0.05)
)

In [None]:
# Wrap the per-cell shapes in a GeoSeries so they can be transformed and plotted with geopandas.
shapes = gpd.GeoSeries(shapes)

In [None]:
import json


def get_pixel_size(path: str) -> float:
    """Read the ``pixel_size`` entry from a run's ``experiment.xenium`` file.

    Args:
        path: Folder that contains ``experiment.xenium``. An empty string
            falls back to the run folder that used to be hard-coded here, so
            existing ``get_pixel_size("")`` calls keep working.

    Returns:
        The value stored under the ``"pixel_size"`` key of the JSON file.
    """
    default_dir = (
        "D:/amonell/timecourse/output-XETG00095__0011274__SI_d6__20230825__004851"
    )
    run_dir = path if path else default_dir
    # The context manager closes the handle; the previous bare open() leaked it.
    with open(os.path.join(run_dir, "experiment.xenium")) as file:
        experiment = json.load(file)
    return experiment["pixel_size"]


# Module-level pixel size read from experiment.xenium; scale_to_image and the plots divide by it.
pixel_size = get_pixel_size("")


def scale_to_image(x, y):
    """Divide both coordinates by the module-level ``pixel_size`` and return them as a tuple."""
    scaled_x = x / pixel_size
    scaled_y = y / pixel_size
    return (scaled_x, scaled_y)


# ax.set_xlim((0, np.max(transcript_subset_fov.x.values)))
# ax.set_ylim((np.max(transcript_subset_fov.y.values), 0))

# Entry 3 of seaborn's current colour palette, used for the cell outlines.
colors = sns.color_palette()[3]
# Run every geometry's coordinates through scale_to_image.
shapes2 = shapes.apply(lambda geom: transform(scale_to_image, geom))


from shapely.affinity import scale

fig, ax = plt.subplots(1, 1, figsize=(15, 15))
img_cropped = img[minx:maxx, miny:maxy]
ax.imshow(img_cropped, vmax=np.percentile(img_cropped, 99.9))

# Shift each scaled polygon by (-miny, -minx) so it lines up with the cropped image.
adjusted_shapes = gpd.GeoSeries(
    [sa.translate(polygon, -miny, -minx) for polygon in shapes2]
)

# Translucent fill first, then a stronger outline on top.
adjusted_shapes.plot(facecolor=colors, edgecolor="none", alpha=0.2, ax=ax)
adjusted_shapes.plot(facecolor="none", edgecolor=colors, alpha=0.7, ax=ax)
ax.set_xlim((0, 1000))
# ax.set_ylim((1500, 500))

# Overlay the transcripts flagged with overlaps_nucleus == 1, shifted like the polygons.
nuclear_points = transcript_subset_all[transcript_subset_all.overlaps_nucleus == 1]
plt.scatter(
    (nuclear_points.x.values / pixel_size) - miny,
    (nuclear_points.y.values / pixel_size) - minx,
    s=1,
    linewidths=0.01,
    alpha=0.5,
    c="white",
)
plt.savefig("C:/Users/amonell/Downloads/newest_seg.png")
plt.show()

In [None]:
# Attach each cell's most common nucleus, then number the rows so they can be
# used as positional indices into cyto_nuc / cell_by_gene.
merge_dic = cyto_nuc.merge(keydf, left_index=True, right_on="cell_number", how="left")
merge_dic["inds"] = list(range(len(merge_dic.index)))
groupby_most_common_nucleus = merge_dic.groupby("nucleus")

new_cyto_nuc = []
new_cell_by_gene = []
names = []
sets = {}
for group_name, group_data in tqdm(groupby_most_common_nucleus):
    indices = group_data.inds.values
    member_cells = group_data.cell_number.values
    # The first cell of the group lends its name to the merged cell.
    representative = member_cells[0]
    names.append(representative)
    sets.update(dict.fromkeys(member_cells, representative))
    # Sum the rows of every cell that shares this nucleus.
    new_cyto_nuc.append(cyto_nuc.iloc[indices].values.sum(axis=0))
    new_cell_by_gene.append(cell_by_gene.iloc[indices].values.sum(axis=0))

new_cell_by_gene = pd.DataFrame(
    np.array(new_cell_by_gene), columns=cell_by_gene.columns, index=names
)
new_cyto_nuc = pd.DataFrame(
    np.array(new_cyto_nuc), index=names, columns=cyto_nuc.columns
)

In [None]:
# Map every transcript's cell to its merged cell name (None when the cell is not in `sets`).
new_cell_column = list(map(sets.get, transcripts.cell))

In [None]:
# Store the merged-cell label of every transcript alongside the original assignment.
transcripts["new_cell"] = new_cell_column

##### Splitting multi-nucleus cells

In [None]:
transcripts

In [None]:
# Merged count matrix as X, genes as var, per-cell transcript totals as obs.
anndata = sc.AnnData(
    new_cell_by_gene.values,
    var=pd.DataFrame(index=new_cell_by_gene.columns),
    obs=new_cyto_nuc,
)

# Store an independent copy: assigning anndata.X itself makes the "raw" layer
# share memory with X, so a later in-place change to X would silently alter it.
anndata.layers["raw"] = anndata.X.copy()
anndata.obs["cytoplasmic_transcripts"] = (
    anndata.obs["total_transcripts"] - anndata.obs["nuclear_transcripts"]
)
# Stored as a fraction (nuclear / total), despite the column name.
anndata.obs["nuclear_transcript_percentage"] = (
    anndata.obs["nuclear_transcripts"] / anndata.obs["total_transcripts"]
)
anndata.var["gene"] = anndata.var.index.values
anndata.obs["cell"] = anndata.obs.index.values
# Mean transcript position per merged cell serves as the cell's coordinate.
cell_spatial = transcripts.groupby("new_cell")[["x", "y"]].mean()
anndata.uns["points"] = transcripts
anndata.obs = anndata.obs.merge(
    cell_spatial, how="left", left_index=True, right_index=True
)
anndata.obsm["X_spatial"] = anndata.obs[["x", "y"]].values
# Drop every variable whose name contains "BLANK" or "NegControl".
anndata = anndata[
    :,
    ~(
        (anndata.var.index.str.contains("BLANK"))
        | (anndata.var.index.str.contains("NegControl"))
    ),
]

In [None]:
# Outer-join, on the index, the rows with overlaps_nucleus == 1 from both transcript tables.
merged_transcripts = transcripts[transcripts.overlaps_nucleus == 1].merge(
    transcripts_cellpose[transcripts_cellpose.overlaps_nucleus == 1],
    how="outer",
    right_index=True,
    left_index=True,
)

In [None]:
# Keep only transcripts whose merged cell is present in the AnnData's obs index.
transcript_subset = transcripts[transcripts["new_cell"].isin(anndata.obs.index)]

In [None]:
# Crop window used by the filtering and plotting cells: minx/maxx bound the y coordinate
# (and slice image axis 0), miny/maxy bound the x coordinate (and slice image axis 1);
# transcript coordinates are multiplied by 1 / 0.2125 before being compared.
minx = 8000
miny = 20000
maxx = 10000
maxy = 28000

In [None]:
# Transcripts of cells kept in the AnnData whose coordinates, multiplied by
# 1 / 0.2125, fall strictly inside the crop window.
transcript_subset_fov = transcript_subset.loc[
    transcript_subset["new_cell"].isin(anndata.obs.index)
    & (transcript_subset.y * (1 / 0.2125) > minx)
    & (transcript_subset.y * (1 / 0.2125) < maxx)
    & (transcript_subset.x * (1 / 0.2125) > miny)
    & (transcript_subset.x * (1 / 0.2125) < maxy)
]

In [None]:
# Same crop window as above, applied to every transcript regardless of its cell.
transcript_subset_all = transcripts.loc[
    (transcripts.y * (1 / 0.2125) > minx)
    & (transcripts.y * (1 / 0.2125) < maxx)
    & (transcripts.x * (1 / 0.2125) > miny)
    & (transcripts.x * (1 / 0.2125) < maxy)
]

In [None]:
def make_alphashape(points: pd.DataFrame, alpha: float):
    """Return the alpha shape of ``points``.

    ``points`` is converted to a NumPy array and passed to
    ``alphashape.alphashape`` together with ``alpha``.
    """
    return alphashape.alphashape(np.array(points), alpha=alpha)


# One alpha shape per merged cell ("new_cell"), built from the x/y positions of
# the transcripts in the crop that have a cell assignment.
shapes = (
    transcript_subset_fov.loc[transcript_subset_fov.cell.notna()]
    .groupby("new_cell")[["x", "y"]]
    .apply(make_alphashape, alpha=0.05)
)

In [None]:
def make_alphashape(points: pd.DataFrame, alpha: float):
    """Return the alpha shape of ``points``.

    ``points`` is converted to a NumPy array and passed to
    ``alphashape.alphashape`` together with ``alpha``.
    """
    return alphashape.alphashape(np.array(points), alpha=alpha)


# One alpha shape per original cell ("cell"), built from every transcript in
# the crop that has a cell assignment.
shapes_all = (
    transcript_subset_all.loc[transcript_subset_all.cell.notna()]
    .groupby("cell")[["x", "y"]]
    .apply(make_alphashape, alpha=0.05)
)

In [None]:
# Wrap the merged-cell shapes in a GeoSeries so they can be transformed and plotted with geopandas.
shapes = gpd.GeoSeries(shapes)

In [None]:
# Same wrapping for the shapes of the original cells.
shapes_all = gpd.GeoSeries(shapes_all)

In [None]:
import imageio as io


def import_image(path: str):
    """Read ``morphology_mip.ome.tif`` from the folder ``path`` and return the image."""
    return io.imread(os.path.join(path, "morphology_mip.ome.tif"))


# Load the morphology image of this hard-coded run folder; the plotting cells crop it.
img = import_image(
    "D:/amonell/timecourse/output-XETG00095__0011274__SI_d6__20230825__004851"
)

In [None]:
import shapely.affinity as sa

In [None]:
import json


def get_pixel_size(path: str) -> float:
    """Read the ``pixel_size`` entry from a run's ``experiment.xenium`` file.

    Args:
        path: Folder that contains ``experiment.xenium``. An empty string
            falls back to the run folder that used to be hard-coded here, so
            existing ``get_pixel_size("")`` calls keep working.

    Returns:
        The value stored under the ``"pixel_size"`` key of the JSON file.
    """
    default_dir = (
        "D:/amonell/timecourse/output-XETG00095__0011274__SI_d6__20230825__004851"
    )
    run_dir = path if path else default_dir
    # The context manager closes the handle; the previous bare open() leaked it.
    with open(os.path.join(run_dir, "experiment.xenium")) as file:
        experiment = json.load(file)
    return experiment["pixel_size"]


# Module-level pixel size read from experiment.xenium; scale_to_image and the plots divide by it.
pixel_size = get_pixel_size("")


def scale_to_image(x, y):
    """Divide both coordinates by the module-level ``pixel_size`` and return them as a tuple."""
    scaled_x = x / pixel_size
    scaled_y = y / pixel_size
    return (scaled_x, scaled_y)


# ax.set_xlim((0, np.max(transcript_subset_fov.x.values)))
# ax.set_ylim((np.max(transcript_subset_fov.y.values), 0))

# Entry 3 of seaborn's current colour palette, used for the cell outlines.
colors = sns.color_palette()[3]
# Run every geometry's coordinates through scale_to_image.
shapes2 = shapes.apply(lambda geom: transform(scale_to_image, geom))


from shapely.affinity import scale

fig, ax = plt.subplots(1, 1, figsize=(15, 15))
img_cropped = img[minx:maxx, miny:maxy]
ax.imshow(img_cropped, vmax=np.percentile(img_cropped, 99.9))

# Shift each scaled polygon by (-miny, -minx) so it lines up with the cropped image.
adjusted_shapes = gpd.GeoSeries(
    [sa.translate(polygon, -miny, -minx) for polygon in shapes2]
)

# Translucent fill first, then a stronger outline on top.
adjusted_shapes.plot(facecolor=colors, edgecolor="none", alpha=0.2, ax=ax)
adjusted_shapes.plot(facecolor="none", edgecolor=colors, alpha=0.7, ax=ax)
ax.set_xlim((0, 1000))
# ax.set_ylim((1500, 500))

# Overlay the transcripts flagged with overlaps_nucleus == 1, shifted like the polygons.
nuclear_points = transcript_subset_all[transcript_subset_all.overlaps_nucleus == 1]
plt.scatter(
    (nuclear_points.x.values / pixel_size) - miny,
    (nuclear_points.y.values / pixel_size) - minx,
    s=1,
    linewidths=0.01,
    alpha=1,
    c="white",
)
plt.savefig("C:/Users/amonell/Downloads/seg_no_tan.png")
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
img_cropped = img[minx:maxx, miny:maxy]
ax.imshow(img_cropped, vmax=np.percentile(img_cropped, 99.9))
new_adjusted_shapes = []
for a in adjusted_shapes:
    # list.append never raises, so the former try/except could not filter
    # anything. The "Not polygon" message suggests the goal was to skip
    # geometries that are not polygons, which is what this check does.
    # NOTE(review): confirm that points/lines should indeed be left out.
    if getattr(a, "geom_type", None) in ("Polygon", "MultiPolygon"):
        new_adjusted_shapes.append(a)
    else:
        print("Not polygon")
new_adjusted_shapes = gpd.GeoSeries(new_adjusted_shapes)
# Plot the adjusted polygons
new_adjusted_shapes.plot(facecolor=colors, edgecolor="none", alpha=0.2, ax=ax)
new_adjusted_shapes.plot(facecolor="none", edgecolor=colors, alpha=0.7, ax=ax)
ax.set_xlim((0, 1000))
# ax.set_ylim((1500, 500))
# Overlay the transcripts flagged with overlaps_nucleus == 1, shifted like the polygons.
plt.scatter(
    (
        transcript_subset_all[transcript_subset_all.overlaps_nucleus == 1].x.values
        / pixel_size
    )
    - miny,
    (
        transcript_subset_all[transcript_subset_all.overlaps_nucleus == 1].y.values
        / pixel_size
    )
    - minx,
    s=1,
    linewidths=0.01,
    alpha=1,
    c="white",
)
plt.savefig("C:/Users/amonell/Downloads/seg_no_tan.png")
plt.show()

In [None]:
# Scale the original-cell shapes with scale_to_image.
shapes_all2 = shapes_all.apply(lambda geom: transform(scale_to_image, geom))
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
img_cropped = img[minx:maxx, miny:maxy]
ax.imshow(img_cropped, vmax=np.percentile(img_cropped, 99.9))

# Shift each scaled polygon by (-miny, -minx) so it lines up with the cropped image.
adjusted_shapes = gpd.GeoSeries(
    [sa.translate(polygon, -miny, -minx) for polygon in shapes_all2]
)

# Translucent fill first, then a stronger outline on top.
adjusted_shapes.plot(facecolor=colors, edgecolor="none", alpha=0.2, ax=ax)
adjusted_shapes.plot(facecolor="none", edgecolor=colors, alpha=0.7, ax=ax)

# Overlay every transcript of the crop, shifted like the polygons.
scatter_x = (transcript_subset_all.x.values / pixel_size) - miny
scatter_y = (transcript_subset_all.y.values / pixel_size) - minx
plt.scatter(scatter_x, scatter_y, s=1, linewidths=0.01, alpha=1, c="white")
ax.set_xlim((0, 1000))
# ax.set_ylim((1500, 500))
plt.show()

In [None]:
input_file = input_folders[2]

In [None]:
# Load both transcript tables of the selected run folder; the first CSV column becomes the index.
tc = pd.read_csv(os.path.join(input_file, "transcripts_cellpose.csv"), index_col=0)
t = pd.read_csv(os.path.join(input_file, "transcripts.csv"), index_col=0)

In [None]:
# Drop transcripts without a cell assignment.
t_ = t.dropna(subset=["cell"])
# Count transcripts per (cell, gene); unstack turns genes into columns, with 0 for absent pairs.
cell_by_gene = t_.groupby(["cell", "gene"]).size().unstack(fill_value=0)

In [None]:
# Per-cell gene counts restricted to transcripts with overlaps_nucleus == 1.
transcripts_nucleus = t_.loc[t_["overlaps_nucleus"] == 1]
cell_by_gene_nucleus = (
    transcripts_nucleus.groupby(["cell", "gene"]).size().unstack(fill_value=0)
)
# Row sums: transcripts per cell (all transcripts / nuclear transcripts only).
cell_by_gene_tpc = cell_by_gene.sum(axis=1)
cell_by_gene_nucleus_tpc = cell_by_gene_nucleus.sum(axis=1)
# Outer join on the cell index keeps cells without nuclear transcripts (NaN at this point).
cyto_nuc = pd.merge(
    pd.DataFrame(cell_by_gene_tpc),
    pd.DataFrame(cell_by_gene_nucleus_tpc),
    how="outer",
    left_index=True,
    right_index=True,
)
cyto_nuc.columns = ["total_transcripts", "nuclear_transcripts"]

In [None]:
# Missing counts left by the outer merge become 0; counts are stored as integers.
cyto_nuc = cyto_nuc.fillna(0).astype(int)
# Index the Cellpose table by transcript_id so rows can be looked up by label.
tc.index = tc.transcript_id.values

In [None]:
cell_values = t_.cell

In [None]:
# NOTE(review): this fills the Series taken from t_.cell in place; whether t_ itself is
# modified depends on pandas' copy/view behaviour — confirm if that matters.
cell_values.fillna("Not_assigned-0", inplace=True)
# Store the (filled) cell labels on t_ under the name "cell_number".
t_["cell_number"] = cell_values

In [None]:
# Take an explicit copy: a column is assigned onto `overlap` further down, and
# writing to a filtered slice relies on pandas' copy/view behaviour.
overlap = t_[t_.overlaps_nucleus == 1].copy()

In [None]:
# Fetch the Cellpose rows that carry the same index labels as the nucleus-overlapping transcripts.
# NOTE(review): assumes every label of `overlap` exists in tc's index — confirm.
nuclei_associated = tc.loc[overlap.index.values]

In [None]:
# Positional assignment: row i of `overlap` receives the cell_id of row i of `nuclei_associated`.
overlap["associated_nucleus"] = nuclei_associated.cell_id.values
# Parallel arrays consumed by the tally loop in the next cell.
cell_numbers = overlap.cell_number.values
associated_nuclei = overlap.associated_nucleus.values

In [None]:
# Create a dictionary to store the most common value for each unique cell number
most_common_values = {}

# Iterate through the pairs of cell numbers and associated nuclei
for cell_number, nucleus in tqdm(zip(cell_numbers, associated_nuclei)):
    if cell_number not in most_common_values:
        most_common_values[cell_number] = Counter()

    most_common_values[cell_number][nucleus] += 1

# Calculate the most common nucleus for each unique cell number
result = {
    cell_number: counter.most_common(1)[0][0]
    for cell_number, counter in most_common_values.items()
}

In [None]:
# Split the cell -> nucleus mapping into parallel lists plus a running position.
keys = list(result)
values = list(result.values())
index = list(range(len(keys)))

In [None]:
# Number of distinct nuclei tallied for each cell.
nuclei_per = [len(nucleus_counts) for nucleus_counts in most_common_values.values()]

In [None]:
non_overlap = t_[t_.overlaps_nucleus == 0]

In [None]:
# Count the cells that occur among the non-overlapping transcripts but never
# among the nucleus-overlapping ones.
non_overlap_cells = set(np.unique(non_overlap.cell.values))
overlap_cells = set(np.unique(overlap.cell.values))
zeros = len(non_overlap_cells - overlap_cells)

In [None]:
# Add one zero entry for every cell counted in `zeros`.
nuclei_per.extend([0] * zeros)

In [None]:
# Histogram of nuclei per Baysor cell; counts above 10 are clipped to 10.
clipped_counts = np.clip(nuclei_per, 0, 10)
plt.hist(clipped_counts, bins=20)
plt.xlim(0, 10)
plt.xticks(list(range(10)))
plt.xlabel("Number of Cellpose Nuclei")
plt.ylabel("Number of Baysor Cells")
plt.title("D6")
plt.grid(False)
plt.show()

In [None]:
# First 1,000,000 rows. A slice (unlike an explicit list of positions) does not
# raise IndexError when the table is shorter and avoids building a 1M-element list.
t_subset = t.iloc[:1000000]

In [None]:
t_sub = t_subset[t_subset.overlaps_nucleus == 1]

In [None]:
t_sub_ = t_sub.dropna(subset=["cell"])

In [None]:
# unique_values = np.unique(t_sub_.cell.values)

# import random
# color_map = {value: (random.random(), random.random(), random.random()) for value in tqdm(unique_values)}

In [None]:
# colors1 = [color_map[value] for value in tqdm(t_sub_.cell.values)]

In [None]:
# plt.scatter(t_sub.x.values[:3000], t_sub.y.values[:3000], s=1, c=colors[:3000])
# plt.axis('equal')

In [None]:
# plt.scatter(t_.x.values[:3000], t_.y.values[:3000], s=1, c=t_.overlaps_nucleus.values[:3000])

# plt.axis('equal')

In [None]:
# Left-join the Cellpose table onto the subset by index; columns present in both tables
# receive pandas' default _x/_y suffixes (e.g. overlaps_nucleus_y, used in the next cell).
tc_sub = t_subset.merge(tc, left_index=True, right_index=True, how="left")

In [None]:
ts_sub2 = tc_sub[tc_sub.overlaps_nucleus_y == 1]

In [None]:
# unique_values = np.unique(ts_sub2.cell_id.values)

# import random
# color_map = {value: (random.random(), random.random(), random.random()) for value in tqdm(unique_values)}

In [None]:
# colors = [color_map[value] for value in tqdm(ts_sub2.cell_id.values)]

In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2)
# ax1.scatter(ts_sub2.x_location.values[:3000], ts_sub2.y_location.values[:3000], s=1, c=colors[:3000])
# ax2.scatter(t_sub_.x.values[:3000], t_sub_.y.values[:3000], s=1, c=colors1[:3000])
# ax1.axis('equal')
# ax2.axis('equal')
# plt.show()

In [None]:
# Create a dictionary to store the most common value for each unique cell number
# Per Baysor cell, tally how often each Cellpose cell_id appears alongside it.
most_common_values = {}

for cell_number, nucleus in tqdm(zip(ts_sub2.cell.values, ts_sub2.cell_id.values)):
    tally = most_common_values.get(cell_number)
    if tally is None:
        tally = most_common_values[cell_number] = Counter()
    tally[nucleus] += 1

# For every cell keep the cell_id with the highest tally.
result = {
    cell: nucleus_counts.most_common(1)[0][0]
    for cell, nucleus_counts in most_common_values.items()
}

In [None]:
from sklearn.metrics import adjusted_mutual_info_score

# For the first 20,000 rows of ts_sub2, compare the most common cell_id of each row's
# Baysor cell (predicted) with the cell_id actually recorded on that row (real).
predicted = [result.get(i) for i in ts_sub2.cell.values[:20000]]
real = ts_sub2.cell_id.values[:20000]

ami = adjusted_mutual_info_score(predicted, real)
print("Adjusted Mutual Information:", ami)

In [None]:
ol_d6 = sc.read(
    r"D:\amonell\timecourse\output-XETG00095__0011274__SI_d6__20230825__004851\adatas\preprocessed_and_filtered_02.h5ad"
)

In [None]:
rc = np.unique(tc.cell_id.values, return_counts=True)

In [None]:
rt = np.unique(transcripts_nucleus.cell.values, return_counts=True)

In [None]:
# The counts are sliced with [1:] to skip the first unique label, so the arg-max
# is relative to that slice; the labels must be sliced the same way, otherwise
# the label one position before the real maximum is returned.
rt[0][1:][np.argmax(rt[1][1:])]

In [None]:
ol_d6

In [None]:
# Number of unique cells with more than 20 counted transcripts.
int(np.count_nonzero(rt[1] > 20))

In [None]:
plt.hist(rt[1][1:], bins=100)
plt.show()

In [None]:
plt.hist(rc[1][1:], bins=1000)
plt.show()

In [None]:
plt.hist(ol_d6.obs["transcript_counts"])

In [None]:
plt.hist(np.unique(ts_sub2.cell_id, return_counts=True)[1], bins=100)
plt.show()

In [None]:
# The pipeline cells in this notebook write "01_preprocessed.h5ad"; the former
# name "preprocessed_01.h5ad" is never produced by them, so read the file that is.
experiment = sc.read(os.path.join(input_folders[1], "adatas", "01_preprocessed.h5ad"))

In [None]:
experiment

In [None]:
# Number of cells with more than 20 nuclear transcripts.
int(np.count_nonzero(experiment.obs.nuclear_transcripts.values > 20))

In [None]:
plt.hist(experiment.obs.nuclear_transcripts.values, bins=1000)
plt.show()

In [None]:
input_file = input_folders[1]

In [None]:
# Load both transcript tables of the run; the first CSV column becomes the index.
transcripts_cellpose = pd.read_csv(
    os.path.join(input_file, "transcripts_cellpose.csv"), index_col=0
)
transcripts = pd.read_csv(os.path.join(input_file, "transcripts.csv"), index_col=0)

# Drop transcripts without a cell. The explicit copy makes the column added
# below land on an independent frame rather than on a filtered slice.
transcripts = transcripts.dropna(subset=["cell"]).copy()
cell_by_gene = transcripts.groupby(["cell", "gene"]).size().unstack(fill_value=0)
transcripts_nucleus = transcripts[transcripts["overlaps_nucleus"] == 1]
cell_by_gene_nucleus = (
    transcripts_nucleus.groupby(["cell", "gene"]).size().unstack(fill_value=0)
)
# Row sums: transcripts per cell (all transcripts / nuclear transcripts only).
cell_by_gene_tpc = np.sum(cell_by_gene, axis=1)
cell_by_gene_nucleus_tpc = np.sum(cell_by_gene_nucleus, axis=1)
# Outer join keeps cells without nuclear transcripts; their NaN becomes 0 below.
cyto_nuc = pd.DataFrame(cell_by_gene_tpc).merge(
    pd.DataFrame(cell_by_gene_nucleus_tpc),
    how="outer",
    left_index=True,
    right_index=True,
)
cyto_nuc.columns = ["total_transcripts", "nuclear_transcripts"]

cyto_nuc = cyto_nuc.fillna(0).astype(int)
# Index the Cellpose table by transcript_id so rows can be looked up by label.
transcripts_cellpose.index = transcripts_cellpose.transcript_id.values
# Non-inplace fillna: filling a selected column in place is chained assignment
# and depends on pandas' copy/view behaviour.
cell_values = transcripts.cell.fillna("Not_assigned-0")
transcripts["cell_number"] = cell_values
# Copy before adding a column so the write does not target a filtered slice.
overlap = transcripts[transcripts.overlaps_nucleus == 1].copy()
nuclei_associated = transcripts_cellpose.loc[overlap.index.values]
overlap["associated_nucleus"] = nuclei_associated.cell_id.values
cell_numbers = overlap.cell_number.values
associated_nuclei = overlap.associated_nucleus.values

In [None]:

    
    # Per cell number, tally how often each nucleus id appears alongside it.
    most_common_values = {}

    for cell_number, nucleus in tqdm(zip(cell_numbers, associated_nuclei)):
        tally = most_common_values.get(cell_number)
        if tally is None:
            tally = most_common_values[cell_number] = Counter()
        tally[nucleus] += 1

    # For every cell number keep the nucleus with the highest tally.
    result = {
        cell: nucleus_counts.most_common(1)[0][0]
        for cell, nucleus_counts in most_common_values.items()
    }

In [None]:
    # Split the cell -> nucleus mapping into parallel lists plus a running position.
    keys = list(result)
    values = list(result.values())
    index = list(range(len(keys)))

    

In [None]:
# One row per cell: its label, its most common nucleus and a running position.
keydf = pd.DataFrame(
    list(zip(keys, values, index)), columns=["cell_number", "nucleus", "inds"]
)
# Attach the nucleus to cyto_nuc's rows, then renumber the rows so they can be
# used as positional indices into cyto_nuc / cell_by_gene.
merge_dic = cyto_nuc.merge(keydf, left_index=True, right_on="cell_number", how="left")
merge_dic["inds"] = list(range(len(merge_dic.index)))
groupby_most_common_nucleus = merge_dic.groupby("nucleus")

new_cyto_nuc = []
new_cell_by_gene = []
names = []
for group_name, group_data in tqdm(groupby_most_common_nucleus):
    indices = group_data.inds.values
    # The first cell of the group lends its name to the merged cell.
    names.append(group_data.cell_number.values[0])
    # Sum the rows of every cell that shares this nucleus.
    new_cyto_nuc.append(cyto_nuc.iloc[indices].values.sum(axis=0))
    new_cell_by_gene.append(cell_by_gene.iloc[indices].values.sum(axis=0))

new_cell_by_gene = pd.DataFrame(
    np.array(new_cell_by_gene), columns=cell_by_gene.columns, index=names
)
new_cyto_nuc = pd.DataFrame(
    np.array(new_cyto_nuc), index=names, columns=cyto_nuc.columns
)

# Merged count matrix as X, genes as var, per-cell transcript totals as obs.
anndata = sc.AnnData(
    new_cell_by_gene.values,
    var=pd.DataFrame(index=new_cell_by_gene.columns),
    obs=new_cyto_nuc,
)

# Store an independent copy: assigning anndata.X itself makes the "raw" layer
# share memory with X, so a later in-place change to X would silently alter it.
anndata.layers["raw"] = anndata.X.copy()
anndata.obs["cytoplasmic_transcripts"] = (
    anndata.obs["total_transcripts"] - anndata.obs["nuclear_transcripts"]
)
# Stored as a fraction (nuclear / total), despite the column name.
anndata.obs["nuclear_transcript_percentage"] = (
    anndata.obs["nuclear_transcripts"] / anndata.obs["total_transcripts"]
)
anndata.var["gene"] = anndata.var.index.values
anndata.obs["cell"] = anndata.obs.index.values
# Mean transcript position per original cell serves as the cell's coordinate.
cell_spatial = transcripts.groupby("cell")[["x", "y"]].mean()
anndata.obs = anndata.obs.merge(
    cell_spatial, how="left", left_index=True, right_index=True
)
anndata.obsm["X_spatial"] = anndata.obs[["x", "y"]].values
# Drop every variable whose name contains "BLANK" or "NegControl".
anndata = anndata[
    :,
    ~(
        (anndata.var.index.str.contains("BLANK"))
        | (anndata.var.index.str.contains("NegControl"))
    ),
]

In [None]:
anndata[anndata.obs["nuclear_transcripts"] > 10]

In [None]:
# Interactive QC: show the QC plots for every run, ask for cutoffs, filter and save.
for input_file in tqdm(input_folders):
    # The pipeline cells in this notebook write "01_preprocessed.h5ad"; the
    # former name "preprocessed_01.h5ad" is never produced by them.
    experiment = sc.read(os.path.join(input_file, "adatas", "01_preprocessed.h5ad"))

    # Densify explicitly when X offers toarray(). The former `experiment.X.A`
    # inside a bare `except:` hid every error and, where the `.A` shortcut is
    # unavailable, handed the sparse matrix itself to DataFrame.
    counts = experiment.X
    if hasattr(counts, "toarray"):
        counts = counts.toarray()
    df = pd.DataFrame(
        counts,
        columns=experiment.var.index.values,
        index=experiment.obs.index.values,
    )

    metadata = experiment.obs
    print("QC metrics for batch " + os.path.basename(input_file))

    plot_qc_feature(df, metadata, False)

    default_parameters = input("Do you want to use default filtering cutoffs (y/n)?")

    # Accept "N" and surrounding whitespace too; anything else keeps the defaults.
    if default_parameters.strip().lower() == "n":

        min_transcript_threshold = float(input("Min transcripts threshold: "))
        max_transcript_threshold = float(input("Max transcripts threshold: "))

        min_nuclear_transcripts = float(input("Min nuclear transcripts: "))
        max_nuclear_transcripts = float(input("Max nuclear transcripts: "))

        min_cyto_transcripts = float(input("Min cyto transcripts: "))
        max_cyto_transcripts = float(input("Max cyto transcripts: "))

        min_nuc_pct = float(input("Min nuclear transcripts / total transcripts: "))
        max_nuc_pct = float(input("Max nuclear transcripts / total transcripts: "))

        experiment = qc_before_clustering(
            experiment,
            min_transcript_threshold,
            max_transcript_threshold,
            min_nuclear_transcripts,
            max_nuclear_transcripts,
            min_cyto_transcripts,
            max_cyto_transcripts,
            min_nuc_pct,
            max_nuc_pct,
        )
    else:
        experiment = qc_before_clustering(experiment)
    experiment.write(
        os.path.join(input_file, "adatas", "preprocessed_and_filtered_02.h5ad")
    )
    # TODO: Add lines to save out figures into new analysis subfolder

In [None]:
# For every run: load the filtered AnnData, compute PCA, neighbours, Leiden
# clusters (stored as "original_leiden") and UMAP, then save the result.
for input_file in tqdm(input_folders):
    adatas_dir = os.path.join(input_file, "adatas")
    experiment = sc.read(
        os.path.join(adatas_dir, "preprocessed_and_filtered_02.h5ad")
    )
    sc.tl.pca(experiment)
    sc.pp.neighbors(experiment)
    sc.tl.leiden(experiment, key_added="original_leiden")
    sc.tl.umap(experiment)
    experiment.write(os.path.join(adatas_dir, "initial_umap_calculated_03.h5ad"))