# How to do filtering on MIBI data using BioProfiling.jl in Python
You will need an environment with both Julia and Python installed (e.g. one of these [Docker images](https://hub.docker.com/u/koalive)). This notebook relies on the `julia` package (PyJulia) in Python and the `PyCall` package in Julia.  
This example uses the TMA data for the spatial metabolic profiling manuscript and the cellpose segmentation masks.  
Example command to start a Jupyter server using the required Docker image:  
`` docker run -p 8889:8888 -v `pwd`:/home/jovyan -v "/Volumes/VERBATIM HD/Hamburg Project/gradient_correction/image_data_corrected":/data/ koalive/mibiprofiling:alpha ``

## Load Python libraries

In [None]:
import os
import pandas as pd
from plotnine import *

In [None]:
# Display the current working directory (relative paths used later, such as
# the "./filtered_images/" output folder, are resolved against it).
os.getcwd()

## Setting up BioProfiling.jl

In [None]:
# Just ignore the scary error
# (an error message may be printed while Julia starts up from Python).
# NOTE(review): compiled_modules=False is presumably the PyJulia workaround for
# Python builds that are statically linked to libpython — confirm for this image.
from julia.api import Julia
jl = Julia(compiled_modules=False)
# Julia modules made available to Python; BioProfiling is aliased as `bp`
# for the rest of the notebook.
from julia import DataFrames, Main
import julia.BioProfiling as bp

In [None]:
# Enable the `%%julia` cell magic used by the following cells.
%load_ext julia.magic

In [None]:
%%julia 
using DataFrames, BioProfiling, PyCall
# Convert a pandas DataFrame (received from Python as a PyObject) into a
# Julia DataFrame, one column at a time.
#   df_pd: pandas DataFrame passed from Python.
# Returns a Julia `DataFrame` holding one column per column of `df_pd`.
function pd_to_df(df_pd)
    df = DataFrame()
    for col in df_pd.columns
        # Use Python item access (`df_pd[col]`) rather than attribute access:
        # attribute access returns the DataFrame attribute instead of the
        # column when a column is named e.g. "size", "shape" or "index".
        df[!, col] = PyCall.py"$df_pd[$col].values"
        # `df[!, col]` reads the stored column without copying it
        if eltype(df[!, col]) == PyObject
            # Strings are not automatically converted
            df[!, col] = PyCall.py"list($df_pd[$col])"
        end
    end
    return df
end

In [None]:
%%julia
# Python-side wrapper class: it stores a string in `.s` and is used below as
# the Python representation of Julia `Symbol`s.
PyCall.py"""
class SymStr():
    def __init__(self, *args, **kwargs):
        self.s = str(*args, **kwargs)
    def __str__(self):
        return self.s.__str__()
    def __repr__(self):
        return f'SymStr("{self.__str__()}")'
"""

# Handle on the Python `SymStr` class defined above
sym_str_py_type = PyCall.py"SymStr";

# Julia -> Python: a `Symbol` is turned into a `SymStr` built from its string form
PyCall.PyObject( s :: Symbol ) = PyCall.py"SymStr($(string(s)))"o
# Python -> Julia: use the `.s` field when `po` is a `SymStr`, otherwise `po`
# itself; the result is converted to a string and wrapped in a `Symbol`
function PyCall.convert( ::Type{Symbol}, po :: PyCall.PyObject ) 
    sym_str = PyCall.pyisinstance( po, sym_str_py_type ) ? po.s : po;
    return Symbol(PyCall.convert(AbstractString, sym_str))
end
# Register `SymStr` -> `Symbol` as a type mapping with PyCall
PyCall.pytype_mapping(sym_str_py_type, Symbol)
# Finish the cell on `nothing` so that no value is returned for display
nothing

## Load data

In [None]:
# Per-cell intensities (cellpose segmentation) for a single field of view;
# every row is tagged with the field of view it was read from.
fov = "A1e"
markers = pd.read_csv(f"/data/{fov}_cell_table.csv", index_col=0).assign(fov=fov)

In [None]:
# Path of the nuclear image belonging to each cell's field of view
markers["url"] = [f"/data/{name}/nuclei_clahe.tiff" for name in markers["fov"]]

In [None]:
# Cell centre columns under the names BioProfiling looks for:
# "centroid-1" provides X and "centroid-0" provides Y.
for axis, centroid_column in (("X", "centroid-1"), ("Y", "centroid-0")):
    markers[f"AreaShape_Center_{axis}"] = markers[centroid_column]
    markers[f"AreaShape_Center_{axis}_1"] = markers[centroid_column]

In [None]:
# Per-cell product of the area and the nuclear channel value
markers["NuclearIntensity"] = markers["area"].mul(markers["nuclei_clahe.tiff"])

In [None]:
# Convert the pandas table to a Julia DataFrame and wrap it in a
# BioProfiling Experiment, which is used for all filtering below
markers_jl = Main.pd_to_df(markers)
xp = bp.Experiment(markers_jl)

In [None]:
# Features to leave out of the comparison; every other column is kept
cols_to_discard = [
    "chan_39.tiff",
    "chan_48.tiff",
    "chan_69.tiff",
    "chan_71.tiff",
    "Fe.tiff",
    "Noodle.tiff",
    "ASCT2_clahe.tiff",
    "membrane.tiff",
    "Au.tiff",
    "membrane_clahe.tiff",
    "GLUT1_clahe.tiff",
    "nuclei_clahe.tiff",
]


def _is_kept_feature(feature_name):
    """Return True for feature names that are not in the discard list."""
    return feature_name not in cols_to_discard


# The list of selectors/filters starts with the feature-name selector
filters = [bp.NameSelector(_is_kept_feature)]

In [None]:
# Histogram of cell areas; the dotted lines mark the 120 and 1500 thresholds
area_histogram = (
    ggplot(markers, aes(x="area"))
    + geom_histogram(binwidth=20)
    + geom_vline(xintercept=120, linetype="dotted")
    + geom_vline(xintercept=1500, linetype="dotted")
    + theme_minimal()
)
area_histogram

In [None]:
# Minimum-size filter, written as Julia code: keep cells with area > 120
min_area_filter = jl.eval("Filter(120, :area, compare = >)")
filters.append(min_area_filter)

# The negated ("inverse") filter selects the cells that are excluded,
# so that they can be visualized later
negf1 = bp.negation(min_area_filter)

In [None]:
# Maximum-size filter, built from Python this time (equivalent syntax):
# keep cells with area < 1500
max_area_filter = bp.Filter(
    1500,
    Main.Symbol("area"),
    compare=lambda value, threshold: value < threshold,
)
filters.append(max_area_filter)
# Inverse filter: which cells are excluded?
negf2 = bp.negation(max_area_filter)

In [None]:
# Histogram of NuclearIntensity on a log10 axis with a dotted line at 2e4
nuclear_intensity_histogram = (
    ggplot(markers, aes(x="NuclearIntensity"))
    + geom_histogram(bins=100)
    + scale_x_log10()
    + geom_vline(xintercept=2e4, linetype="dotted")
    + theme_minimal()
)
nuclear_intensity_histogram

In [None]:
# Keep cells whose NuclearIntensity is above the threshold.
# NOTE(review): the histogram of this feature draws its threshold line at 2e4
# whereas this filter uses 2e3 — confirm which value is intended.
nuclear_intensity_filter = bp.Filter(
    2e3,
    Main.Symbol("NuclearIntensity"),
    compare=lambda value, threshold: value > threshold,
)
filters.append(nuclear_intensity_filter)
# Inverse filter: which cells are excluded?
negf4 = bp.negation(nuclear_intensity_filter)

In [None]:
# Histogram of the Au channel on a log10 axis with a dotted line at 1000
au_histogram = (
    ggplot(markers, aes(x="Au.tiff"))
    + geom_histogram(bins=200)
    + scale_x_log10()
    + geom_vline(xintercept=1000, linetype="dotted")
    + theme_minimal()
)
au_histogram

In [None]:
# Keep cells whose Au.tiff value is below 1000
au_filter = bp.Filter(
    1000,
    Main.Symbol("Au.tiff"),
    compare=lambda value, threshold: value < threshold,
)
filters.append(au_filter)
# Inverse filter: which cells are excluded?
negf5 = bp.negation(au_filter)

### Show on images

In [None]:
# Preview the first five results of diagnostic_path for the cells excluded
# by the minimum-area filter, using the "url" column as image path
url_column = jl.eval(":url")
bp.diagnostic_path(xp, negf1, url_column)[:5]

In [None]:
# Output folder for the images of filtered-out cells
filterfolder = "./filtered_images/"
url_column = jl.eval(":url")

# One diagnostic image per inverse filter, numbered from 1 in list order
for filter_number, negated_filter in enumerate([negf1, negf2, negf4, negf5], start=1):
    # Nothing is filtered out by this filter: there is no image to produce
    if len(bp.filter_entries(xp, negated_filter)) == 0:
        continue

    bp.diagnostic_images(
        xp,
        negated_filter,
        url_column,
        path=filterfolder,
        center=True,
        show=False,
        saveimages=True,
        savelimit=1,  # How many example images do you want?
        keepsubfolders=1,
    )

    # Rename the saved image so that the next filter does not reuse its name
    saved_image = f"{filterfolder}{fov}/nuclei_clahe.tiff"
    renamed_image = f"{filterfolder}{fov}/new_f{filter_number}.tiff"
    os.rename(saved_image, renamed_image)

## Dataset filtering
Export filtered data as a CSV file

In [None]:
# Apply the filters defined above to every field of view found in /data and
# gather the rows of the selected cells into a single table, `all_markers`.
# Fields of view with fewer than 300 selected cells are left out entirely.

# Column layout used for the result if no field of view is kept
expected_columns = markers.columns
# Selected rows of each kept field of view; concatenated once after the loop
# instead of growing a DataFrame (and re-copying it) at every iteration
kept_tables = []

# List only folders in /data directory (sorted for a reproducible row order)
fovs = sorted(f for f in os.listdir("/data/") if os.path.isdir(f"/data/{f}"))
for fov in fovs:
    table_path = f"/data/{fov}_cell_table.csv"
    # A folder without a matching cell table is reported and skipped rather
    # than aborting the whole run
    if not os.path.isfile(table_path):
        print(f"{fov}: no cell table found at {table_path}, skipping")
        continue

    markers = pd.read_csv(table_path, index_col=0)

    # Define additional columns
    markers["fov"] = fov
    # path to nuclear images (identical for all cells of this field of view)
    markers["url"] = f"/data/{fov}/nuclei_clahe.tiff"
    # Used by BioProfiling to describe center of cells
    markers["AreaShape_Center_X"] = markers["centroid-1"]
    markers["AreaShape_Center_X_1"] = markers["centroid-1"]
    markers["AreaShape_Center_Y"] = markers["centroid-0"]
    markers["AreaShape_Center_Y_1"] = markers["centroid-0"]
    markers["NuclearIntensity"] = markers.area * markers["nuclei_clahe.tiff"]

    # Let BioProfiling.jl handle the rest
    xp = bp.Experiment(Main.pd_to_df(markers))

    # Apply filters
    bp.select_b(xp, filters)

    print(f"{fov}: {len(xp.selected_entries)}/{len(markers)} kept")

    if len(xp.selected_entries) < 300:
        print("Not enough cells, skipping")
        continue

    # selected_entries holds 1-based (Julia) row numbers; iloc is 0-based
    kept_tables.append(markers.iloc[xp.selected_entries - 1])

if kept_tables:
    all_markers = pd.concat(kept_tables, ignore_index=True)
else:
    all_markers = pd.DataFrame(columns=expected_columns)

In [None]:
# Export all features for selected cells only
# (the file is written with the DataFrame index as its first column, since
# to_csv is called with its default arguments)
all_markers.to_csv("/data/cell_table.csv")

In [None]:
# Number of kept cells per field of view
all_markers.fov.value_counts()

In [None]:
# Median number of kept cells across the fields of view in the export
all_markers.fov.value_counts().median()