In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy.spatial.distance import cdist

In [None]:
# Value substituted into the "clusters<N>" part of the input file name.
clusters = 100

# Per-transcript table; later cells read its 'cell_id', 'x_location',
# 'y_location' and 'overlaps_nucleus' columns.
spots_csv_path = f"./ms_brain_multisection2_spots_sigma3.0_stride1.0_clusters{clusters}-allinfo.csv"
df = pd.read_csv(spots_csv_path)

Compute the distance from each transcript to the nucleus boundary of its assigned cell, using the 10x nucleus boundary polygons and each transcript's `cell_id`.

In [None]:
# Import nuclei boundaries from 10x. The columns read below are 'cell_id',
# 'vertex_x' and 'vertex_y' (one row per boundary vertex).
nuclei_boundaries = pd.read_csv('nucleus_boundaries.csv')

# Number of points generated along each boundary edge when densifying the
# polygon (t = 0, 1/n, ..., (n - 1)/n between consecutive vertices).
num_interpolated_points = 5


def _densify_boundary(vertices, n_sub):
    """Return the boundary vertices plus evenly spaced points along each edge.

    Parameters
    ----------
    vertices : array-like of shape (n_vertices, 2)
        Consecutive (x, y) boundary vertices, in file order.
    n_sub : int
        Number of points generated per edge, at t = 0, 1/n_sub, ...,
        (n_sub - 1)/n_sub of the way from one vertex to the next.

    Returns
    -------
    numpy.ndarray of shape (n_points, 2)
        The densified boundary. With fewer than two vertices (or n_sub < 1)
        the input vertices are returned unchanged, possibly empty.

    Notes
    -----
    NOTE(review): no edge is added from the last vertex back to the first, so
    this assumes each polygon in the file repeats its first vertex at the end
    -- confirm against the 10x output format.
    """
    vertices = np.asarray(vertices, dtype=float).reshape(-1, 2)
    if len(vertices) < 2 or n_sub < 1:
        return vertices

    # Broadcast (n_edges, 1, 2) against (1, n_sub, 1) -> (n_edges, n_sub, 2)
    t = (np.arange(n_sub) / n_sub)[None, :, None]
    edge_start = vertices[:-1, None, :]
    edge_end = vertices[1:, None, :]
    interpolated = ((1 - t) * edge_start + t * edge_end).reshape(-1, 2)

    # t = 0 already reproduces every edge's start vertex, so only the final
    # vertex has to be appended to cover all original vertices.
    return np.concatenate((interpolated, vertices[-1:]), axis=0)


# Index the boundary vertices by cell once, instead of re-filtering the whole
# boundary table for every cell inside the loop.
boundary_vertices = {
    boundary_cell_id: vertex_rows[['vertex_x', 'vertex_y']].to_numpy(dtype=float)
    for boundary_cell_id, vertex_rows in nuclei_boundaries.groupby('cell_id')
}

# Collected (row index, signed distance) pairs for the new dataframe
new_data = []

# Group df by 'cell_id'
grouped_df = df.groupby('cell_id')

# Iterate over unique 'cell_id' values
for cell_id, group in tqdm(grouped_df):
    # -1 marks transcripts that are not assigned to any cell
    if cell_id == -1:
        continue

    # Cells without any boundary vertices have no defined distance; skip them
    # so their transcripts keep NaN instead of crashing the whole run.
    vertices = boundary_vertices.get(cell_id)
    if vertices is None or len(vertices) == 0:
        continue

    boundary_points = _densify_boundary(vertices, num_interpolated_points)

    # Euclidean distance from every transcript of this cell to every boundary point
    distance_matrix = cdist(
        group[['x_location', 'y_location']].to_numpy(dtype=float),
        boundary_points,
        metric='euclidean',
    )

    # Distance to the closest boundary point for each transcript
    min_distances = distance_matrix.min(axis=1)

    # Signed distance: positive where 'overlaps_nucleus' is 0, negative otherwise
    min_distances = np.where(group['overlaps_nucleus'].to_numpy() == 0, min_distances, -min_distances)

    # Keep the original row labels so the results can be aligned back onto df
    new_data.extend(zip(group.index, min_distances))

# Create a new dataframe from 'new_data', indexed by the original row labels
result_df = pd.DataFrame(new_data, columns=['index', 'distance_nuc_border']).set_index('index')

# Attach the distances to df by index alignment. Unlike pd.concat(axis=1), plain
# column assignment is idempotent: re-running this cell overwrites the column
# instead of appending a duplicate one. Rows without a result stay NaN.
df['distance_nuc_border'] = result_df['distance_nuc_border']

Set the sign of the `distance_nuc_border` column from the `overlaps_nucleus` column (negative inside the nucleus, positive outside).

In [31]:
# Sign convention: positive where 'overlaps_nucleus' is 0, negative otherwise.
# The sign is derived from the magnitude so this cell is idempotent: the cell
# that computes the distances already negates the overlapping transcripts, and
# negating the stored values a second time would flip them back to positive.
unsigned_distance = df['distance_nuc_border'].abs()
df['distance_nuc_border'] = np.where(df['overlaps_nucleus'] == 0, unsigned_distance, -unsigned_distance)

Save the CSV file with the distance to the nucleus membrane for each transcript.

In [83]:
# Write the table without the row index. The file name reuses `clusters` so
# the output always matches the input that was loaded (identical to the
# previous hard-coded name when clusters == 100).
df.to_csv(f"ms_brain_multisection2_spots_sigma3.0_stride1.0_clusters{clusters}-distance.csv", index=False)