In [1]:
# requires cell_annotation environment
import importlib
import io
import json
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Polygon

import annotation_utils
import anno_class
import core_class
import classifier_class

In [2]:
# Resolve the notebook and project directories only once per kernel session.
# After the chdir below, os.getcwd() is the project root, so recomputing
# root_dir when this cell is re-run would climb two more levels up.
if 'root_dir' not in globals():
    # notebook directory
    current_dir = os.getcwd()

    # project directory (two levels above the notebook directory)
    root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
os.chdir(root_dir)

# for importing segmentation utils; skip the append when the entry is already
# present so re-running the cell does not grow sys.path
notebooks_dir = os.path.join(root_dir, 'scripts', 'notebooks')
if notebooks_dir not in sys.path:
    sys.path.append(notebooks_dir)


In [3]:
kidney_dir = os.path.join(root_dir, 'example_data', 'kidney')

# channel names: first column of the header-less channelnames.txt
markers = pd.read_csv(os.path.join(kidney_dir, 'raw', 'channelnames.txt'), header = None)[0]

# normalized image stack; the path is built from root_dir (not os.getcwd()) so
# it does not depend on the working directory at the moment the cell is run
kidney = annotation_utils.read_ome_tiff(os.path.join(kidney_dir, 'processed', 'normalized_kidney.tif'))
kidney.shape # 18 colors, 2048 x 2048 pixels

(18, 2048, 2048)

In [4]:
# Map every marker name to [channel index, lower display bound, upper display bound];
# each channel starts with the full 0-255 range.
marker_info = {}
for channel_idx, marker_name in enumerate(markers):
    marker_info[marker_name] = [channel_idx, 0, 255]
marker_info # shows range of data to be viewed

{'DAPI': [0, 0, 255],
 'CD14': [1, 0, 255],
 'CD16': [2, 0, 255],
 'CD45RA': [3, 0, 255],
 'CD4': [4, 0, 255],
 'CD49a': [5, 0, 255],
 'HLA-DR': [6, 0, 255],
 'CD3': [7, 0, 255],
 'CD56': [8, 0, 255],
 'CD103': [9, 0, 255],
 'Va72': [10, 0, 255],
 'CD11c': [11, 0, 255],
 'CD69': [12, 0, 255],
 'CD8': [13, 0, 255],
 'Ki67': [14, 0, 255],
 'CD161': [15, 0, 255],
 'CD163': [16, 0, 255],
 'CD31': [17, 0, 255]}

In [5]:
# load back segmentations (outlines)
outlines_path = os.path.join(
    root_dir,
    'example_data', 'kidney', 'segmentations',
    'kidney_expanded_cell_geometries.json',
)

# read the whole file, then parse it as JSON
with open(outlines_path, 'r') as outlines_file:
    expanded_cell_outlines = json.loads(outlines_file.read())


In [6]:
# expanded_cell_outlines[list(expanded_cell_outlines.keys())[0]] # contains negative values in y - needs to be converted

In [7]:
# confirm outlines are as expected
def plot_polygons(polygon_dict):
    """Draw every outline in ``polygon_dict`` as a thin red, unfilled polygon.

    Parameters
    ----------
    polygon_dict : dict
        Maps an identifier to a pair ``(x, y)`` of equally long coordinate
        sequences. The identifiers are not used for drawing.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots()

    for xs, ys in polygon_dict.values():
        # one (x, y) row per vertex
        vertices = np.column_stack([xs, ys])
        ax.add_patch(Polygon(vertices, fill = None, edgecolor = 'r', linewidth = 0.5))

    # patches do not update the data limits view on their own
    ax.autoscale()
    ax.set_aspect('equal')
    plt.show()

# plot_polygons(expanded_cell_outlines)

In [8]:
expression_path = os.path.join(root_dir, 'example_data', 'kidney', 'summaries', 'exported_kidney_cells.csv')
exprs = (
    pd.read_csv(expression_path)
    # duplicate the centroid columns under the X_coord / Y_coord names
    # (specifically for core class, can modify)
    .assign(
        X_coord = lambda cells: cells['Centroid_X'],
        Y_coord = lambda cells: cells['Centroid_Y'],
    )
    # rows are addressed by their Object.ID from here on
    .set_index('Object.ID')
)

In [9]:
# Keep only the rows flagged 'keep' by the size QC. The explicit .copy() makes
# the filtered table an independent frame, so columns can be assigned to it
# later without pandas' SettingWithCopyWarning and without any ambiguity about
# whether `exprs` is affected.
exprs_filt = exprs.loc[exprs['size_qc'] == 'keep', :].copy()

print(exprs.shape)
print(exprs_filt.shape) # filtered out ~300 segmentations (12%)

(2576, 27)
(2254, 27)


In [10]:
# Collect the constructor inputs by keyword, then build the core object from them.
core_inputs = {
    'expression_data': exprs_filt,
    'image': kidney,
    'segments': expanded_cell_outlines,
    'core': 'A', # placeholder, if comparing multiple images (cores)
    'marker_info': marker_info,
}
my_core = core_class.core_data(**core_inputs)

In [11]:
# features of interest (use a subset of dim reduction, clustering for improved sampling)
# columns whose name matches any of these '|'-separated fragments are left out
excluded_fragments = 'orig|Cent|coord|area|Objec|DAPI|size'
is_excluded = exprs.columns.str.contains(excluded_fragments)
foi = exprs.columns[~is_excluded].tolist()
# foi

In [12]:
# include states for reproducibility
# Order matters here: features are selected first, then PCA and the three
# clusterings are run on the core object. Later cells read the columns
# 'kmeans_10', 'leiden_0.8' and 'gmm' from my_core.plot_df (presumably written
# by the run_kmeans / run_leiden / run_gmm calls below — confirm in core_class).
my_core.select_features(feats = foi)
my_core.run_pca()
# every stochastic step gets the same fixed random_state
my_core.run_kmeans(num_clust = 10, random_state = 5)
my_core.run_gmm(n_components = 10, random_state = 5)
my_core.run_leiden(resolution = 0.8, random_state = 5) 
my_core.approximate_bounds() # helps for visual inspection when training models

Running PCA...


In [13]:
def plot_multiple_markers(core_object, marker_list):
    """Show one image panel per marker, laid out in a near-square grid.

    Parameters
    ----------
    core_object
        Object exposing ``marker_info`` (marker -> ``[channel index, vmin, vmax]``)
        and ``image`` (indexed as ``image[channel, :, :]``).
    marker_list : sequence of str
        Markers to display; each must be a key of ``core_object.marker_info``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``marker_list`` is empty.
    KeyError
        If a marker is missing from ``core_object.marker_info``.
    """
    n = len(marker_list)
    if n == 0:
        raise ValueError("marker_list must contain at least one marker")

    # Determine grid size: cols = ceil(sqrt(n)), rows = ceil(n / cols).
    # Deriving rows from cols guarantees rows * cols >= n for every n; a square
    # grid with a single extra column is too small for e.g. n = 3, 7 or 8.
    cols = int(n**0.5)
    if cols * cols < n:
        cols += 1
    rows = -(-n // cols)  # ceiling division without importing math

    # Create figure and GridSpec (1.5 inches per panel in each direction)
    fig = plt.figure(figsize = (1.5*cols, 1.5*rows))
    gs = GridSpec(rows, cols, figure = fig)

    # Plot each marker, filling the grid row by row
    for i, marker in enumerate(marker_list):
        ax = fig.add_subplot(gs[i // cols, i % cols])
        channel, vmin, vmax = core_object.marker_info[marker]
        ax.imshow(core_object.image[channel, :, :],
                  vmin=vmin,
                  vmax=vmax,
                  cmap="magma")
        ax.set_title(marker)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# use image to adjust bounds
all_markers = list(marker_info)  # iterating a dict yields its keys, in insertion order
plot_multiple_markers(core_object = my_core, marker_list = all_markers)


2025-04-07 13:44:17.765 python3.13[20903:1362072] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-07 13:44:17.765 python3.13[20903:1362072] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [14]:
def update_marker_info(core_object, updates):
    """Overwrite the display bounds of selected markers on ``core_object``.

    Parameters
    ----------
    core_object
        Object whose ``marker_info`` dict maps marker -> ``[channel index, vmin, vmax]``.
    updates : dict
        Maps marker -> ``(vmin, vmax)``. The stored channel index is kept.

    Markers that are not present in ``marker_info`` are reported with a printed
    warning and otherwise ignored. Nothing is returned; ``marker_info`` is
    modified in place.
    """
    known = core_object.marker_info
    for marker, bounds in updates.items():
        vmin, vmax = bounds
        if marker not in known:
            print(f"Warning: Marker '{marker}' not found in marker_info.")
            continue
        channel_idx = known[marker][0]
        known[marker] = [channel_idx, vmin, vmax]

# Example usage:
# Per-marker display bounds as (vmin, vmax). update_marker_info keeps the
# channel index already stored in my_core.marker_info and replaces only these
# two values, which plot_multiple_markers passes to imshow as vmin / vmax.
updates = {
    'DAPI': (2, 80),
    'CD14': (0, 80),
    'CD16': (3, 100),
    'CD45RA': (5, 80),
    'CD4': (8, 80),
    'CD49a': (10, 80),
    'HLA-DR': (10, 100),
    'CD3': (8, 100),
    'CD56': (5, 85),
    'CD103': (5, 85),
    'Va72': (5, 80),
    'CD11c': (5, 80),
    'CD69': (10, 100),
    'CD8': (5, 100),
    'Ki67': (0, 45),
    'CD161': (8, 80),
    'CD163': (5, 80),
    'CD31': (8, 80)
}

# applied in place to my_core.marker_info
update_marker_info(my_core, updates)


In [15]:
# Spatial view: one scatter plot of the X/Y coordinates per clustering,
# coloured by cluster label.
for cluster_type in ['kmeans_10', 'leiden_0.8', 'gmm']:
    cluster_labels = my_core.plot_df[cluster_type]
    n_clusters = len(cluster_labels.unique())
    # one tab20 colour per distinct label
    palette = sns.color_palette("tab20", n_colors = n_clusters)

    fig, ax = plt.subplots()
    sns.scatterplot(data = my_core.plot_df,
                    x = 'X_coord',
                    y = 'Y_coord',
                    hue = cluster_type,
                    palette = palette,
                    s = 15,
                    edgecolor = 'black',
                    linewidth = 0.2,
                    ax = ax)

    ax.set_title(f'{cluster_type}')
    # legend sits outside the axes, to the right
    ax.legend(title = f'{cluster_type}', bbox_to_anchor = (1.05, 1), loc = 'upper left', markerscale = 3)
    fig.tight_layout()
    plt.show()

In [16]:
# PCA view: the same clusterings shown on the first two principal components.
for cluster_type in ['kmeans_10', 'leiden_0.8', 'gmm']:
    cluster_labels = my_core.plot_df[cluster_type]
    n_clusters = len(cluster_labels.unique())
    # one tab20 colour per distinct label
    palette = sns.color_palette("tab20", n_colors = n_clusters)

    fig, ax = plt.subplots()
    sns.scatterplot(data = my_core.plot_df,
                    x = 'PC1',
                    y = 'PC2',
                    hue = cluster_type,
                    palette = palette,
                    s = 15,
                    edgecolor = 'black',
                    linewidth = 0.2,
                    ax = ax)

    ax.set_title(f'{cluster_type}')
    # legend sits outside the axes, to the right
    ax.legend(title = f'{cluster_type}', bbox_to_anchor = (1.05, 1), loc = 'upper left', markerscale = 3)
    fig.tight_layout()
    plt.show()

In [399]:
# NOTE: plot_df is bound to my_core.expression_data itself (no copy is made
# here), so the column assignment below also shows up on the core's table.
plot_df = my_core.expression_data
# cluster labels as strings, assigned positionally (row order is assumed to
# match between my_core.plot_df and the expression table — TODO confirm)
leiden_labels = list(map(str, my_core.plot_df['leiden_0.8']))
plot_df.loc[:,'leiden_0.8'] = leiden_labels

In [401]:
# Mean expression per Leiden cluster, z-scored per marker, shown as a heatmap.
# (StandardScaler is imported in the notebook's import cell.)
cell_mean_cols = [col for col in plot_df.columns if 'Cell_Mean' in col]
df_subset = plot_df.loc[:, cell_mean_cols + ['leiden_0.8']]

# Group by 'leiden_0.8' and calculate mean for each group
grouped_df = df_subset.groupby('leiden_0.8').mean()

# Standardize every marker column across the cluster means
scaler = StandardScaler()
zscored_df = pd.DataFrame(
    scaler.fit_transform(grouped_df),
    index=grouped_df.index,
    columns=grouped_df.columns
)

# Explicit figure/axes instead of relying on pyplot's "current axes"
fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(
    zscored_df.T,  # markers as rows, clusters as columns
    cmap='vlag',  # diverging blue-white-red palette, centred on 0 via `center`
    center=0,
    linewidths=.5,
    cbar_kws={'label': 'Z-Score'},
    yticklabels=True,  # Force all row labels to show
    ax=ax
)

# Improve label visibility
ax.set_yticklabels(
    ax.get_yticklabels(),
    rotation=0,
    fontsize=8  # Adjust font size as needed
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=0,
    ha='center',
    fontsize=8
)
ax.set_title('Expression by Leiden_0.8', pad=10)
ax.set_xlabel('Leiden Cluster', labelpad=10)
ax.set_ylabel('Markers', labelpad=10)
fig.tight_layout()
plt.show()

In [17]:
# Start from an empty sampled_cells list on the core object; the next cell
# calls my_core.cell_sampler (presumably the method that fills it — confirm).
my_core.sampled_cells = []

In [18]:
# Sample cells for manual annotation from the Leiden clusters: at most 20 per
# cluster, with a fixed random_state so the selection is repeatable.
# (original note: selects 20 cells randomly for each cluster, 100 total for annotations)
sampler_settings = dict(
    cluster_col = 'leiden_0.8',
    max_sample = 20,
    keep_oob = False,
    tolerance = 1,
    random_state = 11,
    use_fps = True,
)
my_core.cell_sampler(**sampler_settings)

## T cell subsets
CD3+ CD4+ T cells: Helper T cells   
CD3+ CD8+ T cells: Cytotoxic T cells     
CD3+ CD4+ CD45RA+: Naive helper T cells     
CD3+ CD8+ CD45RA+: Naive cytotoxic T cells     
CD3+ CD69+: Activated T cells  
CD3+ CD103+: Tissue-resident memory T cells    
CD3+ CD161+: MAIT cells or other innate-like T cells  

## NK cells
CD3- CD56+: Natural killer cells  
CD3- CD56+ CD16+: Cytotoxic NK cells  

## Myeloid subsets
CD14+ CD16-: Classical monocytes  
CD14+ CD16+: Non-classical monocytes  
CD14+ HLA-DR+: Activated monocytes/macrophages  
CD11c+ HLA-DR+: Dendritic cells  
CD163+: M2-like macrophages  

## Other cell types
CD31+: Endothelial cells  
Ki67+: Proliferating cells (can be present in any cell type)  

## Specific subsets
CD4+ CD49a+: Tissue-resident helper T cells  
CD8+ CD103+: Tissue-resident cytotoxic T cells  
CD14+ CD16+ HLA-DR+: Intermediate monocytes  
CD11c+ CD103+: CD103+ dendritic cells  

In [47]:
# Markers passed to the annotation step as `show_markers`
annotation_markers = [
    'DAPI', 'CD31', 'CD103', 'CD3',
    'CD4', 'CD8', 'CD69', 'CD45RA',
    'CD11c', 'CD14', 'CD16', 'CD161',
    'CD163', 'HLA-DR', 'CD56', 'Va72',
]

# Labels passed to the annotation step as `cell_types`
annotation_labels = [
    'Stromal', 'CD4_T', 'CD8_T',
    'NK', 'Mac_CD163', 'MAIT',
    'Myeloid', 'DC', 'unclassified', 'bad',
]

my_core.annotate(show_markers = annotation_markers, cell_types = annotation_labels)

In [352]:
# One row per annotated object: the annotation keys become the index (named
# 'Object.ID') and the annotation values the single 'CellType' column.
df = (
    pd.DataFrame
    .from_dict(my_core.annotations, orient = 'index', columns = ['CellType'])
    .rename_axis('Object.ID')
)

# df.to_csv(os.path.join(root_dir, 'example_data', 'kidney', 'temp_annotations', 'annos.csv'))

In [367]:
# read back in annotations:
anno = pd.read_csv(os.path.join(root_dir, 'example_data', 'kidney', 'temp_annotations', 'annos.csv'))

# Collapse every spelling of the "unclassified" label into a single class:
# the misspelled 'unlcassified' as well as the lowercase 'unclassified' that the
# annotate cell offers. Otherwise the two would be treated as separate classes.
unclassified_spellings = ['unlcassified', 'unclassified']
anno.loc[anno['CellType'].isin(unclassified_spellings), 'CellType'] = 'Unclassified'

anno.set_index('Object.ID', inplace = True)

# {Object.ID: CellType}, replacing whatever annotations the core object held
overwrite_annos = anno['CellType'].to_dict()
my_core.annotations = overwrite_annos

In [368]:
#my_core.expression_data
# Build the classifier from the annotated core object; note the keyword is
# named `core_class` although the value is the core_data instance my_core.
my_classifier = classifier_class.classify_cells(core_class = my_core)


In [369]:
# Train on every expression column whose name contains 'Cell'
expression_columns = my_classifier.expression_data.columns
params = expression_columns.str.contains('Cell')  # boolean mask over the columns

my_classifier.train(
    use_params = expression_columns[params],
    use_smote = True,
    split = 0.5,
    use_imbalanced_rf = True,
)
my_classifier.fit()


Initial Class Counts:
annotations
Unclassified    26
CD8_T           24
Myeloid         23
CD4_T           14
NK               8
MAIT             8
Mac_CD163        1
Stromal          1
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Unclassified    26
CD8_T           24
Myeloid         23
CD4_T           14
MAIT             8
NK               8
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
Unclassified    13
CD8_T           12
Myeloid         12
CD4_T            7
MAIT             4
NK               4
Name: count, dtype: int64

Class counts in y_test after train_test_split:
annotations
Unclassified    13
CD8_T           12
Myeloid         11
CD4_T            7
MAIT             4
NK               4
Name: count, dtype: int64
min class count 4

Applying SMOTE to the entire *training* set with k_neighbors=3

SMOTE applied successfully to the entire *training* set.

Class counts in y_train after SMOTE:


In [372]:
# Gather the PCA coordinates, spatial coordinates and predicted label per cell.
# (If these are pandas Series, the DataFrame constructor aligns them on their index.)
classifier_plot = my_classifier.plot_data
predicted_labels = my_classifier.expression_data['predicted_annotation']
plot_data = pd.DataFrame({
    'pc1': classifier_plot['PC1'],
    'pc2': classifier_plot['PC2'],
    'x': classifier_plot['X_coord'],
    'y': classifier_plot['Y_coord'],
    'prediction': predicted_labels,
})

# Spatial scatter plot coloured by predicted cell type
fig, ax = plt.subplots(figsize = (8, 5))
scatter = sns.scatterplot(x = 'x', y = 'y', hue = 'prediction', data = plot_data, ax = ax)
ax.legend(loc = 'upper left', bbox_to_anchor = (1.05, 1), title = 'Cell Type')
fig.tight_layout(rect = [0, 0, 0.9, 1])  # keep the right 10% free for the legend
plt.show()

In [373]:
# PCA scatter plot (pc1 vs pc2) coloured by predicted cell type
fig, ax = plt.subplots(figsize = (8, 5))
scatter = sns.scatterplot(x = 'pc1', y = 'pc2', hue = 'prediction', data = plot_data, ax = ax)
ax.legend(loc = 'upper left', bbox_to_anchor = (1.05, 1), title = 'Cell Type')
fig.tight_layout(rect = [0, 0, 0.9, 1])  # keep the right 10% free for the legend
plt.show()

In [374]:
# Columns whose name contains 'prob'
classified = my_classifier.expression_data
is_prob_col = classified.columns.str.contains('prob')
probs = classified.loc[:, is_prob_col]

# One tab10 colour per distinct predicted label, in order of first appearance
predicted = classified['predicted_annotation']
unique_classes = predicted.unique()
class_colors = sns.color_palette("tab10", n_colors=len(unique_classes))
palette = {label: color for label, color in zip(unique_classes, class_colors)}

# Per-cell colour, with an empty name so no label is drawn for the colour strip
row_colors = predicted.map(palette).rename('')

In [375]:
# Clustered heatmap of the probability columns, one row per cell, with a thin
# colour strip showing each cell's predicted annotation.
clustermap_options = dict(
    annot = False,
    cmap = 'mako',
    fmt = ".3f",
    yticklabels = False,
    row_colors = row_colors,
    cbar_kws = {'orientation': 'horizontal'},
    figsize = (6, 6),  # compact overall figure size
    dendrogram_ratio = (0.3, 0.05),  # share of the figure given to the two dendrograms
    colors_ratio = 0.04,  # thin row colour strip
)
clustermap = sns.clustermap(probs, **clustermap_options)
heatmap_ax = clustermap.ax_heatmap

# One thick line per predicted class as legend entry
handles = []
for label, color in palette.items():
    handles.append(plt.Line2D([0], [0], color = color, label = label, lw = 5))

# Small legend anchored to the right of the heatmap axes
legend = heatmap_ax.legend(
    handles = handles,
    title = 'Cell Types',
    bbox_to_anchor = (1.3, 1),
    prop = {'size': 5},
    frameon = True,
    title_fontsize = 6,
)
plt.setp(heatmap_ax.get_xticklabels(), rotation = 45, ha = "right")

# Adjust colorbar tick labels
clustermap.ax_cbar.tick_params(labelsize = 10)

# Replace the default axis labels of the heatmap
heatmap_ax.set_ylabel('Cell', fontsize = 12)
heatmap_ax.set_xlabel('Cell Type Probabilities', fontsize=12)

# Minimal padding around elements
plt.tight_layout(h_pad = -5, w_pad = -2)
plt.show()

