In [None]:
!pip install pyFlowSOM

In [None]:
#Importing the relevant libraries for doing the analysis (there are probably still too many that you do not need)
import pandas as pd
from pyFlowSOM import map_data_to_nodes, som
import seaborn as sns 
import numpy as np
import scanpy as sc
import squidpy as sq
import os
import gc
import random
from matplotlib import rcParams
import matplotlib.pyplot as plt
from pathlib import Path
from skimage.io  import imread, imsave
import skimage.io
import anndata as ad
import seaborn as sns 
from scipy import stats
import warnings
import shutil
import math
import openpyxl

In [None]:
# Root folder of the preprocessing outputs
base_dir = "/data/preprocessing"

# Folder with the cell tables obtained after running deepcell or cellpose
# (currently the same folder as the root)
cell_table_dir = base_dir

# Folder in which this notebook saves its tables and plots
preprocessing_python_dir = os.path.join(
    base_dir,
    "annotations/",
)

In [None]:
# Load the cell size normalized table; the first CSV column is used as the row index
df = pd.read_csv(
    os.path.join(cell_table_dir, 'cell_table_transformed.csv'),
    index_col=0,
)

In [None]:
# Preview the loaded cell table (notebook display of the DataFrame)
df

In [None]:
# List the column names of the cell table; the channel names used in the
# following cells must match these names
df.columns

## Gating approach

In [None]:
# First gating level: the channels that compete for the broad label of each cell
columns_to_compare = [
    'SMA', 'CD31', 'CD163', 'CD68', 'CD8', 'CD45', 'PanCK', 'MPO', 'CD7',
]

# For every row, store the name of the channel holding the largest value
df['highest_value_column'] = df.loc[:, columns_to_compare].idxmax(axis="columns")

# Show the table including the new column
print(df)


In [None]:
# Second gating level: channels used to refine the rows whose first-level label is CD45
columns_to_compare = [
    "CD3e", "CD8", 'CD7', "CD14", "MPO", 'CD20', 'CD68', "CD163", "HLADRa",
]

# True for the rows whose "highest_value_column" is "CD45"
mask_cd45 = df['highest_value_column'].eq('CD45')

# CD45 rows: the label becomes the channel with the largest value among the channels above
df.loc[mask_cd45, 'type'] = df.loc[mask_cd45, columns_to_compare].idxmax(axis="columns")

# Every other row keeps the label already stored in "highest_value_column"
df.loc[~mask_cd45, 'type'] = df.loc[~mask_cd45, 'highest_value_column']

# Show the table including the new column
print(df)


In [None]:
# Third gating level: split the rows labelled "CD3e" in the "type" column
# according to the channel with the largest value among the columns below
columns_to_compare = ["CD4", "CD8", 'CD7']

# True for the rows whose second-level label ("type") is "CD3e"
mask_cd3e = df['type'] == 'CD3e'

# CD3e rows: the label becomes the name of the highest-valued channel.
# A '.tiff' suffix, if present, is stripped from the label; regex=False makes the
# '.' a literal dot rather than a regex wildcard (the default differs across pandas versions).
df.loc[mask_cd3e, 'cell_type'] = (
    df.loc[mask_cd3e, columns_to_compare]
    .idxmax(axis=1)
    .str.replace('.tiff', '', regex=False)
)

# For rows not labelled CD3e, copy the existing label from "type" to the new column
df.loc[~mask_cd3e, 'cell_type'] = df.loc[~mask_cd3e, 'type']

# Display the updated dataframe
print(df)


In [None]:
# Keep only the three columns needed for the mantis viewer
selected_columns = df.loc[:, ["fov", "label", "cell_type"]]

In [None]:
# Preview the fov / label / cell_type table
selected_columns

In [None]:
# Make sure the output folder exists, then write the table without header and
# without index so that it can be loaded in mantis viewer
os.makedirs(preprocessing_python_dir, exist_ok=True)
selected_columns.to_csv(
    os.path.join(preprocessing_python_dir, 'gating_types.csv'),
    index=False,
    header=False,
)

In [None]:
# List the distinct labels present in the cell_type column
selected_columns.cell_type.unique()

In [None]:
# Heatmap visualisation
# Specify the columns for the heatmap
columns_for_heatmap = ['SMA', 'CD4', 'CD31', 'CD163', 'CD68', 'CD8', 'CD3e', 'HLADRa', 'CD14', 'CD45',
       'PanCK', 'MPO', 'CD7', 'CD20', 'DCN'] # Replace with your actual column names

# Create a pivot table to prepare data for the heatmap:
# one row per cell_type, one column per channel (pivot_table aggregates with the mean by default)
heatmap_data = df.pivot_table(index='cell_type', values=columns_for_heatmap)

# Create a heatmap using seaborn, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, cmap="YlGnBu", annot=True, fmt=".2f", linewidths=.5, ax=ax)
ax.set_title("Heatmap for Categories from cell_type")
ax.set_xlabel("Columns")
ax.set_ylabel("Categories")

# Save BEFORE showing: once plt.show() has displayed the figure, a later plt.savefig()
# works on a new, empty figure and writes a blank image.
os.makedirs(preprocessing_python_dir, exist_ok=True)
fig.savefig(os.path.join(preprocessing_python_dir, 'heatmap.png'))
plt.show()

## SpatialSort annotations

### Format expression table

In [None]:
# Columns of the expression table: the file identifier followed by the marker channels
lineage_cols = [
    'file_id',
    "CD4", "CD8", "CD3e", "FoxP3",
    "SMA", "CD31", "CD45", "CD68",
    "CD163", "CD7", "Vimentin", "PanCK",
    "MPO", "DCN", "CD20", "HLADRa",
    "CD14",
]

In [None]:
# Store the fov column as a categorical
df["fov"] = df["fov"].astype("category")

# Map each fov category to its position (0, 1, 2, ...) in the category list
new_categories_dict = {
    fov_name: position
    for position, fov_name in enumerate(df["fov"].cat.categories)
}
print(new_categories_dict)

In [None]:
# Add a "file_id" column: the fov categorical with every category renamed to the
# integer assigned to it in new_categories_dict
df["file_id"] = df["fov"].cat.rename_categories(new_categories_dict)

In [None]:
# Write the expression table (file_id plus the marker channels) without the index
df.loc[:, lineage_cols].to_csv(
    os.path.join(preprocessing_python_dir, 'ssort_expression.csv'),
    index=False,
)

### Format location table

In [None]:
# Write the location table (file_id plus the two centroid coordinates) without the index
df.loc[:, ["file_id", "centroid-0", "centroid-1"]].to_csv(
    os.path.join(preprocessing_python_dir, 'ssort_location.csv'),
    index=False,
)

### Format neighborhood table

In [None]:
from scipy.spatial.distance import pdist

In [None]:
def get_dist_pairs(id, thr, data=None):
    """Return the pairs of cells of one file whose centroids are closer than ``thr``.

    Parameters
    ----------
    id
        Value of the "file_id" column selecting the rows to process.
        (The name shadows the builtin ``id``; it is kept for backward compatibility.)
    thr
        Distance threshold; a pair is kept when its Euclidean centroid distance is
        strictly below this value.
    data
        Table with the columns "file_id", "centroid-0" and "centroid-1".
        Defaults to the notebook-level ``df`` when omitted.

    Returns
    -------
    list of tuple
        ``(id, i, j)`` with ``i < j``. ``i`` and ``j`` are row positions within the
        selected subset (0 .. n-1, in the row order of ``data``), not index labels.
    """
    if data is None:
        data = df

    coords = data.loc[data["file_id"] == id, ["centroid-0", "centroid-1"]].to_numpy()
    n_cells = coords.shape[0]
    if n_cells < 2:
        # No pair can be formed from zero or one cell
        return []

    # Condensed distance vector: one entry per pair (i, j) with i < j, in row-major order
    distvec = pdist(coords)
    # Index pairs of the strict upper triangle, in the same row-major order as pdist
    first, second = np.triu_indices(n_cells, k=1)
    is_close = distvec < thr
    return [(id, int(i), int(j)) for i, j in zip(first[is_close], second[is_close])]


In [None]:
# Two cells are related when the Euclidean distance between their centroids is below this
# threshold. 70 is about the diagonal of a 50 x 50 square (50 * sqrt(2) ~ 70.7); it is a
# radial cut-off, not a limit of 50 on x and y separately.
# NOTE(review): the original note gives the unit as pixels - confirm against the centroid columns.
distthr = 70

# Gather the close pairs of every file into one flat list and build the table once.
# Naming the columns in the constructor keeps the table valid even when no pair is found.
close_pairs = [
    pair
    for file_id in df["file_id"].unique()
    for pair in get_dist_pairs(file_id, distthr)
]
all_inter = pd.DataFrame(close_pairs, columns=["file_id", "0", "1"])
all_inter.to_csv(os.path.join(preprocessing_python_dir, 'ssort_relation.csv'), index = False)

### Format marker table

In [None]:
# Read proposed table (semicolon-separated)
stringent_matrix_raw = pd.read_csv(os.path.join(preprocessing_python_dir,
                                                'scyan_clustering_stringent.csv'), sep = ";")
# Drop the first three columns and the last column
stringent_block = stringent_matrix_raw.iloc[:, 3:-1]
# Swap -1 and 0 (in the output 0 should be low and -1 unknown).
# Both masks are taken from the untouched values BEFORE anything is replaced, so a value
# changed by the first replacement cannot be changed back by the second one. DataFrame.mask
# returns new frames, so the result does not depend on to_numpy() handing out a writable view.
stringent_was_minus_one = stringent_block == -1
stringent_was_zero = stringent_block == 0
stringent_matrix = stringent_block.mask(stringent_was_minus_one, 0).mask(stringent_was_zero, -1)
stringent_matrix.to_csv(os.path.join(preprocessing_python_dir, 'ssort_prior_strict.csv'),
                         index = False)

In [None]:
# Simulate how the matrix is loaded by SpatialSort
# stm = pd.read_csv(os.path.join(preprocessing_python_dir, 'ssort_prior_strict.csv')).to_numpy()

In [None]:
# Read proposed table (semicolon-separated)
permissive_matrix_raw = pd.read_csv(os.path.join(preprocessing_python_dir,
                                                'scyan_clustering_less_stringent.csv'),
                                                sep = ";")
# Drop the first three columns and the last column
permissive_block = permissive_matrix_raw.iloc[:, 3:-1]
# Swap -1 and 0 (in the output 0 should be low and -1 unknown).
# Both masks are taken from the untouched values BEFORE anything is replaced, so a value
# changed by the first replacement cannot be changed back by the second one. DataFrame.mask
# returns new frames, so the result does not depend on to_numpy() handing out a writable view.
permissive_was_minus_one = permissive_block == -1
permissive_was_zero = permissive_block == 0
permissive_matrix = permissive_block.mask(permissive_was_minus_one, 0).mask(permissive_was_zero, -1)
permissive_matrix.to_csv(os.path.join(preprocessing_python_dir, 'ssort_prior_permissive.csv'),
                         index = False)

In a terminal, with `/data/preprocessing/annotations` as the working directory and a Python environment with SpatialSort installed, run:
```bash
mkdir ssort
SpatialSort infer --exp-csv 'ssort_expression.csv' --loc-csv 'ssort_location.csv' --rel-csv 'ssort_relation.csv' -k 20 -s 2 -t 1000 -o "ssort/"
```