In [1]:
# requires cell_annotation environment
import sys
import os
import io
import json
import importlib
import numpy as np
import collections
import scipy
import sklearn
from pySankey.sankey import sankey # move to plot_utils in future

import seaborn as sns
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.gridspec import GridSpec

from sklearn.preprocessing import StandardScaler

In [2]:
# reading in util functions:
# notebook directory
current_dir = os.getcwd()

# project directory
root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
os.chdir(root_dir)

# for importing utils
sys.path.append(os.path.join(root_dir, 'src', 'functions'))

import annotation_utils
import anno_class
import core_class_test
import classifier_class
import plot_utils

# Core class set-up

1. Reads in channel information, i.e. which markers are associated with each image slice and creates a dictionary containing each marker(key) and the value representing the range to visualize the image slice.

2. Reads in image (tif file), coordinates of segmentations, and expression information


In [3]:
markers = pd.read_csv(os.path.join(root_dir, 'example_data', 'anal_cancer', 'raw', 'channelnames.txt'), header = None)[0]
marker_info = {marker:[i, 0, 255] for i, marker in enumerate(markers)}

# Set up as list if wanting to store more than one TMA core at a time
summaries_dir = os.path.join(root_dir, 'example_data', 'anal_cancer', 'summaries')

core_dict = collections.defaultdict()
# set up in case more than one
for core in ['C-7']:
    core_coords = pd.read_csv(os.path.join(summaries_dir, f'{core}_coords.csv'))
    core_coords.set_index('Object.ID', inplace = True)
    
    core_exprs = pd.read_csv(os.path.join(summaries_dir, f'{core}_expression.csv'))
    core_exprs.set_index('Object.ID', inplace = True)
    core_exprs.loc[core_coords.index,'X_coord'] = core_coords['Centroid.X.um']
    core_exprs.loc[core_coords.index,'Y_coord'] = -1*core_coords['Centroid.Y.um'] # inverted y, for how images are displayed
    core_image = annotation_utils.read_ome_tiff(os.path.join(root_dir, 'example_data', 'anal_cancer', 'processed', f'{core}.ome.tif'))

    # first segmentation is bounding box of core, not needed - Qupath specific workflow
    core_segments = annotation_utils.read_geom_json(os.path.join(root_dir, 'example_data', 'anal_cancer','segmentations', f'{core}.geojson'))[1:]
    core_segs_restr = {}
    
    for feature in core_segments:
        core_segs_restr.update(annotation_utils.transform_geometry(feature)) # converts geom_json object into dictionary of coordinates

    core_dict[core] = core_class_test.core_data(expression_data = core_exprs, 
                                           image = core_image, # tif
                                           segments = core_segs_restr, # restructured segmentations
                                           core = core, # name
                                           marker_info = marker_info) # marker channels / slice

In [4]:
# visualization of what the expression data shows, each row is a cell ID, columns represent normalized expression data
c7_core = core_dict['C-7']
temp_df = c7_core.expression_data.iloc[1:10,c7_core.expression_data.columns.str.contains('Cell_Mean')].copy()
temp_df.columns = temp_df.columns.str.replace('_Cell_Mean', '')
temp_df.reset_index(inplace = True)
temp_df.drop(labels = 'Object.ID', axis = 1, inplace = True)
temp_df.round(3)

Unnamed: 0,DAPI,Ki67,CD31,FOXP3,CD56,CD34,CD4,CD20,CD45,CD163,...,PD.1,CD44,CD3e,CD45RO,CD68,Granzyme.B,HLA.DR,ICOS,HIF1A,CK17
0,5.797,0.734,1.94,2.411,2.221,0.594,2.564,0.0,1.811,1.236,...,0.027,0.459,0.0,0.0,0.387,0.0,0.014,0.277,1.194,1.061
1,6.171,3.429,1.843,2.427,2.396,1.127,2.251,0.014,1.922,1.461,...,0.052,1.465,0.115,0.024,0.513,0.019,0.08,0.31,0.908,0.678
2,5.107,0.61,1.935,2.479,2.756,0.724,2.505,0.012,2.05,2.179,...,0.085,1.136,0.115,0.128,0.256,0.012,0.023,0.302,1.023,1.37
3,5.079,0.751,1.943,2.374,2.024,0.515,2.196,0.0,1.509,1.206,...,0.178,0.613,0.007,0.01,0.172,0.0,0.014,0.182,1.052,0.559
4,4.782,0.385,1.903,1.994,2.039,0.476,2.081,0.006,2.01,1.292,...,0.013,0.934,0.004,0.001,0.343,0.001,0.013,0.113,0.895,0.338
5,5.44,0.711,2.043,2.435,2.291,0.631,1.997,0.0,3.369,1.823,...,0.131,0.775,0.069,0.542,0.36,0.017,0.049,0.373,1.537,0.526
6,5.982,0.967,1.953,2.469,2.356,0.529,2.403,0.01,1.969,1.962,...,0.032,0.177,0.19,0.037,0.373,0.022,0.07,0.282,1.041,0.579
7,6.111,0.395,2.226,1.803,1.203,2.189,1.421,0.0,1.401,0.298,...,0.0,0.038,0.031,0.003,0.245,0.0,0.045,0.062,0.576,0.346
8,6.491,0.802,1.99,2.628,2.425,1.454,1.877,0.05,5.731,1.751,...,1.222,2.884,3.039,2.95,1.496,0.404,0.179,0.382,0.969,0.726


In [5]:
# Example slices of image
n = 4
ranges = [[10, 255], # vmin and vmax of images to be shown
          [5, 80],
          [5, 30],
          [5, 30]]

fig, axes = plt.subplots(1, n, figsize = (n * 2.5, 3)) # Adjust figsize width multiplier (2.5) as needed
for i in range(n):
    ax = axes[i] # Get the specific subplot axis for this layer
    layer_data = core_image[i, :, :]
    im = ax.imshow(layer_data, cmap = 'viridis', vmin = ranges[i][0], vmax = ranges[i][1])

    ax.set_title(f'Layer {i}')
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [6]:
# visualize segmentations:
plot_utils.plot_segmentation(c7_core)

In [7]:
# features of interest (all markers) averaged across entire cell segmentation (Cell_Mean) from Qupath workflow
foi = list(core_exprs.columns[~core_exprs.columns.str.contains('DAPI|orig|Cent|coord|Membr|Nucl|Cyto')])
foi

['Ki67_Cell_Mean',
 'CD31_Cell_Mean',
 'FOXP3_Cell_Mean',
 'CD56_Cell_Mean',
 'CD34_Cell_Mean',
 'CD4_Cell_Mean',
 'CD20_Cell_Mean',
 'CD45_Cell_Mean',
 'CD163_Cell_Mean',
 'HLA.A_Cell_Mean',
 'LAG3_Cell_Mean',
 'CD8_Cell_Mean',
 'SMA_Cell_Mean',
 'PDL1_Cell_Mean',
 'CD21_Cell_Mean',
 'IDO1_Cell_Mean',
 'b.Catenin1_Cell_Mean',
 'CD14_Cell_Mean',
 'PD.1_Cell_Mean',
 'CD44_Cell_Mean',
 'CD3e_Cell_Mean',
 'CD45RO_Cell_Mean',
 'CD68_Cell_Mean',
 'Granzyme.B_Cell_Mean',
 'HLA.DR_Cell_Mean',
 'ICOS_Cell_Mean',
 'HIF1A_Cell_Mean',
 'CK17_Cell_Mean']

Functions for choosing which features to scale, apply for principal components analysis, and clsutering).

Approximate bounds adjusts range of viewing markers for easier annotation when provided through the window, but manual correction is often needed.

In [8]:
for key, core in core_dict.items():
    core.select_features(feats = foi)
    core.run_pca()
    core.run_leiden(resolution = 0.5, random_state = 5) 
    core.run_leiden(resolution = 0.8, random_state = 5)
    core.approximate_bounds()

Running PCA...


Sampling strategies can improve when you more broadly identify cell types of interest. Below I use Immune, stromal, and epithelia lineage markers to define larger lineage level clusters first. This clustering is based on the softmax expression for the markers in 'marker_dict' and kmeans where k is equal to the number of provided lineage markers.

In [9]:
# defining lineage level markers to guide subsclustering steps
marker_dict = {'immune': 'CD45_Cell_Mean', 
               'cd31_stroma':'CD31_Cell_Mean', 
               'sma_stroma':'SMA_Cell_Mean', 
               'epithelia':'Pan.Cytokeratin_Cell_Mean'}

c7_core.lineage_split(marker_dictionary = marker_dict, random_state = 5)

In [10]:
# visualizing broad lineage level clustering
plot_utils.cell_plot(core = c7_core, 
                     figsize = (6,4),
                     plot_type = 'PC',
                     col = 'kmeans_lineage', 
                     coloring_type = 'categorical', 
                     size = 3,
                     alpha = 1,
                     palette = 'tab10') #

In [11]:
# plotting softmeax expression on the segmentations itself supports non-overlapping lineages
for marker in list(marker_dict.values()):
    clr = marker.replace('_Cell_Mean', '_softmax')
    plot_utils.cell_plot(c7_core, 
                         plot_type = 'cell', 
                         coloring_type = 'continuous', 
                         col = clr, 
                         figsize = (4,3), 
                         size = 1)

Here we define higher resolutions for immune rich clusters which was based on the kmeans results of lineage level clustering. This function loops into each of the kmeans clusters, rescales the data, and calculates new 'sub' clustering with more specific resolutions. As immune cells were more diverse, I opted for higher clustering resolutions to improve the sampling approach.

In [12]:
resolution_dict = {'0': 0.3, # epithelia
                   '1': 0.5, # immune
                   '2': 0.5, # immune mixed
                   '3': 0.2} # stroma 2

c7_core.run_stratified_clustering(resolution_dict = resolution_dict, 
                                  cluster_method = 'leiden')

In [13]:
# leiden clustering alone overclusters epithelial cells
plot_utils.cell_plot(core = c7_core, 
                     figsize = (6,4),
                     plot_type = 'PC',
                     col = 'leiden_0.8', 
                     coloring_type = 'categorical', 
                     size = 3,
                     alpha = 1,
                     palette = 'tab20')

In [14]:
# stratitifed clustering improves resolution of immune cells without inflating epithelial clsuters
plot_utils.cell_plot(core = c7_core, 
                     figsize = (6,4),
                     plot_type = 'PC',
                     col = 'stratified_cluster', 
                     coloring_type = 'categorical', 
                     size = 3,
                     alpha = 1,
                     palette = 'tab20')

Below we use our stratified clsutering strategy to sample diverse cells that will be fed through the annotation window. From these a subset of high quality cells were annotated and used for training our models.  

In [15]:
c7_core.cell_sampler(cluster_col = 'stratified_cluster', 
                     max_sample = 75, 
                     keep_oob = False, 
                     tolerance = 1, 
                     use_fps = True, 
                     random_state = 16)

len(c7_core.sampled_cells)

1309

In [16]:
# example of what cells were sampled from the cell_sampler function visualized on the first 2 principal componenets
# used to visually support cell sampling comes from a diverse collection of cells

temp_df = c7_core.plot_df.copy()
temp_df.loc[:,'Sampled'] = False
temp_df.loc[c7_core.sampled_cells,'Sampled'] = True

temp_df = temp_df.sort_values('Sampled')

plt.figure(figsize = (4,3))
sns.scatterplot(data = temp_df,  
                x = 'PC1', 
                y = 'PC2', 
                s = 5,
                hue = 'Sampled', 
                palette = ['lightgray', 'red'])

plt.tight_layout()
plt.xticks([])
plt.yticks([])
plt.show()

In [6]:
# these were fine tuned to improve visability of cell types during annotation.
# currently in the core class, a dictionary of each marker exists, with a list where:
# 0th index is what slice of the image contains the specified marker
# 1st index is lower bound of what to visualize (in image plots)
# 2nd index is upper bound of what to visualize (in image plots)

c7_core.marker_info['DAPI'] = [0,5,250]
c7_core.marker_info['PanCK'] = [16,0,30]
c7_core.marker_info['CD31'] = [2,0,20]
c7_core.marker_info['SMA'] = [13,0,30]
c7_core.marker_info['CD45'] = [8,5,100]
c7_core.marker_info['CD3e'] = [22,0,20]
c7_core.marker_info['CD4'] = [6,0,20]
c7_core.marker_info['CD8'] = [12,0,30]
c7_core.marker_info['FOXP3'] = [3,5,20]
c7_core.marker_info['CD14'] = [19,0,20]
c7_core.marker_info['CD68'] = [24,3,20]
c7_core.marker_info['CD163'] = [9,0,15]
c7_core.marker_info['HLA-DR'] = [26,0,15]
c7_core.marker_info['CD56'] = [4,2,15]
c7_core.marker_info['CD20'] = [7,2,20]
c7_core.marker_info['CK17'] = [29,0,15]

c7_core.plot_marker(marker = 'PanCK') # adjust manually above and determine appropriate threshold by viewing on whole core

In [18]:
# cell annotations
c7_core.annotate(show_markers = ['DAPI', 'PanCK', 'CD31', 'SMA',
                                 'CD45', 'CD3e', 'CD4', 'CD8', 
                                 'FOXP3', 'CD14', 'CD68', 'CD163',
                                 'HLA-DR', 'CD56', 'CD20', 'CK17'], 
                   cell_types = ['Epi', 'Endothelia', 'Stroma', 
                                 'CD4_T', 'CD8_T','Treg', 'NK', 'Bcell',
                                 'Myeloid','Other_immune', 'Unknown'])

In [18]:
# annotations can be saved part way through after quitting the application and saved out / read back in
annotations_dir = os.path.join(root_dir, 'example_data', 'anal_cancer', 'temp_annotations')

result = pd.read_csv(os.path.join(annotations_dir, 'C7_annotations.csv'))
result.columns = ['Object.ID', 'Annotation']
result.value_counts('Annotation') # many more annotated than actually used in training

result = result.set_index('Object.ID')['Annotation'].to_dict()
c7_core.annotations = result 

# pd.DataFrame.from_dict(result, orient = 'index').to_csv(os.path.join(annotations_dir, 'C7_annotations.csv'))


# DELETE BELOW:
# pd.DataFrame.from_dict(c7_core.annotations, orient = 'index').to_csv('/Users/jabrand2/Desktop/c7_annotations.csv')
# pd.DataFrame.from_dict(c7_core.annotations, orient = 'index').to_csv('/Users/jabrand2/Desktop/c7_annotations2.csv')
# pd.DataFrame.from_dict(c7_core.annotations, orient = 'index').to_csv('/Users/jabrand2/Desktop/c7_annotations.csv')
# pd.DataFrame.from_dict(c7_core.annotations, orient = 'index').to_csv('/Users/jabrand2/Desktop/c7_annotations4.csv')

In [19]:
#file_loc = '/Users/jabrand2/Desktop/'
#result = annotation_utils.merge_annotations(dfs = [pd.read_csv(file_loc + 'c7_annotations.csv'),
#                                                   pd.read_csv(file_loc + 'c7_annotations2.csv'),
#                                                   pd.read_csv(file_loc + 'c7_annotations3.csv'), 
#                                                  pd.read_csv(file_loc + 'c7_annotations4.csv')], 
#                                            exclude_pattern = 'Other|Unknown|Bcell')  


result = pd.read_csv(os.path.join(annotations_dir, 'C7_annotations.csv'))
result.columns = ['Object.ID', 'Annotation']
result = result.set_index('Object.ID')['Annotation'].to_dict()

c7_core.annotations = result # CD31 stroma and SMA stroma renamed later on


User annotations after annotation were read back in and downsampled to balance classes. This was also used to support the minimum number of cells needed to achieve a good accuracy of cell types in the data. 

In [20]:
np.random.seed(5) # repdroduce sampling
cells_per_class = 40

# Initialize defaultdict with list as the default factory
new_dict = collections.defaultdict(list)

# Group IDs by cell type
for ids, cell_type in result.items():
    new_dict[cell_type].append(ids)  # Append IDs to the appropriate cell type list

# Downsample to n IDs per cell type (or keep all if <n)
downsampled_ids = []
for cell_type, ids in new_dict.items():
    sample_size = min(cells_per_class, len(ids))  # Use 40 or the total available if fewer than n
    selected_ids = np.random.choice(ids, size = sample_size, replace = False)
    downsampled_ids.extend(selected_ids.tolist())

# Create final downsampled dictionary
downsampled_annotations = {ids: c7_core.annotations[ids] for ids in downsampled_ids}

c7_core.annotations = downsampled_annotations

In [21]:
len(c7_core.annotations.keys())

270

In [22]:
c7_core.plot_df['annotations'] = 'not_annotated' 

# Convert annotations to a list if they aren't
annotations_list = list(c7_core.annotations.values()) #Convert

#Check that keys exist in index before hand!
keys_in_index = [key for key in c7_core.annotations.keys() if key in c7_core.plot_df.index]

# Assign values based on the keys
c7_core.plot_df.loc[keys_in_index, 'annotations'] = annotations_list[:len(keys_in_index)]


In [23]:
# this plots the cells that were annotated by the user back in the original segmentation data
plot_utils.cell_plot(core = c7_core, 
                     figsize = (6,4),
                     plot_type = 'cell', 
                     col = 'annotations', 
                     size = 4, 
                     coloring_type = 'categorical', 
                     palette = 'tab20')

When samples have been annotated, the core class is handed off to a new python class that handles model training, prediction, and cross validation. Options for different models include the use of imbalanced random forests which handle sampling of minority classes differently and the use of SMOTE, which generates synthetic minority classes in an attempt to better define features that identify those.

In [24]:
my_classifier = classifier_class.classify_cells(core_class = c7_core)

rf_feats = c7_core.expression_data.columns[c7_core.expression_data.columns.str.contains('Cell_Mean')]

In [25]:
cv = my_classifier.k_fold_cross_validation(use_params = rf_feats, 
                                           n_splits = 5, 
                                           random_state = 15, 
                                           use_imbalanced_rf = True, 
                                           use_smote = True, 
                                           n_trees = 200)

cv

Unnamed: 0,fold,class,f1,precision,recall,accuracy
0,1,CD4_T,0.75,0.75,0.75,0.888889
1,1,CD8_T,1.0,1.0,1.0,0.888889
2,1,Endothelia,0.941176,0.888889,1.0,0.888889
3,1,Epi,0.933333,1.0,0.875,0.888889
4,1,Myeloid,0.888889,0.8,1.0,0.888889
5,1,Stroma,0.769231,1.0,0.625,0.888889
6,1,Treg,0.923077,0.857143,1.0,0.888889
7,2,CD4_T,0.941176,0.888889,1.0,0.888889
8,2,CD8_T,1.0,1.0,1.0,0.888889
9,2,Endothelia,0.823529,0.777778,0.875,0.888889


In [48]:
print(np.mean(cv['accuracy'])) # 90.4% accuracy
print(np.std(cv['accuracy'])) # 2.7% sd

0.9037037037037036
0.027216552697590875


In [49]:
plot_utils.plot_metrics(cv, metrics = ['recall', 'precision', 'f1'])

In [50]:
# full model for classification
my_classifier.train(split = None, 
                    use_params = rf_feats, 
                    random_state = 19, 
                    use_imbalanced_rf = True, 
                    use_smote = True)
my_classifier.fit()


Initial Class Counts:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class counts in y_test after train_test_split:
Series([], Name: count, dtype: int64)
min class count 30

Applying SMOTE to the entire *training* set with k_neighbors=5

SMOTE applied successfully to the entire *training* set.

Class counts in y_train after SMOTE:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
Treg          40
CD8_T         40
Name:

In [51]:
my_classifier.plot_data['prediction'] = ''
c7_core.plot_df.loc[my_classifier.expression_data.index, 'prediction'] = my_classifier.expression_data['predicted_annotation']

In [52]:
plot_utils.cell_plot(core = c7_core, 
                     plot_type = 'cell', 
                     figsize = (6,4),
                     coloring_type = 'categorical', 
                     palette = 'tab20', 
                     size = 1.5,
                     col = 'prediction')

In [53]:
plot_utils.contingency_plot(core = c7_core, 
                            column1 = 'leiden_0.8',
                            column2 = 'prediction')

In [54]:
plot_utils.expression_heatmap(core = c7_core, 
                              cluster_col = 'leiden_0.8', 
                              cell_mean_substring = 'Cell_Mean', 
                              cmap = 'vlag', 
                              figsize = (4,6))

Higher clustering resolutions were applied to see if Tregs and CD4+ T cells could be separated. Leiden unable to do on its own. 

In [55]:
core.run_leiden(resolution = 1.0, random_state = 5)

plot_utils.expression_heatmap(core = c7_core, 
                              cluster_col = 'leiden_1.0', 
                              cell_mean_substring = 'Cell_Mean', 
                              cmap = 'vlag', 
                              figsize = (4,6))

In [56]:
plot_utils.expression_heatmap(core = c7_core, 
                              cluster_col = 'prediction', 
                              cell_mean_substring = 'Cell_Mean', 
                              cmap = 'vlag', 
                              figsize = (4,6))

In [57]:
my_classifier.expression_data['annotations'].value_counts()
my_classifier.expression_data['predicted_annotation'].value_counts()

predicted_annotation
Epi           8167
Myeloid        341
Endothelia     258
Stroma         232
CD8_T          211
CD4_T          135
Treg            71
Name: count, dtype: int64

In [59]:
# Used in figure 1C (2 fold cross validation)
my_classifier = classifier_class.classify_cells(core_class = c7_core)
my_classifier.train(split = 0.5, # half train half test 
                    use_params = rf_feats, 
                    random_state = 19, 
                    use_imbalanced_rf = False)
my_classifier.fit()



Initial Class Counts:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
CD4_T         20
CD8_T         20
Endothelia    20
Epi           20
Myeloid       20
Stroma        20
Treg          15
Name: count, dtype: int64

Class counts in y_test after train_test_split:
annotations
CD4_T         20
CD8_T         20
Endothelia    20
Epi           20
Myeloid       20
Stroma        20
Treg          15
Name: count, dtype: int64
Model Accuracy: 0.87

Classification Report:
              precision    recall  f1-score   support

       CD4_T       0.82      0.70      0.76        20
       CD8_T       0.91      1.00      0.9

In [37]:
df = my_classifier.expression_data.loc[:,['annotations', 'predicted_annotation']]
df.loc[:,'leiden_0.8'] = my_classifier.plot_data['leiden_0.8']
df = df.loc[df['annotations'] != '', :]
df = df.loc[my_classifier.plot_data['used_in_training'] == False,:]

In [38]:
plot_utils.contingency_plot(core = None, 
                            column1 = df['annotations'], 
                            column2 = df['predicted_annotation'], 
                           figsize = (3.5,3))

In [39]:
plot_utils.contingency_plot(core = None, 
                            column1 = df['annotations'], 
                            column2 = df['leiden_0.8'], 
                           figsize = (3.5,3))

In [45]:
plot_utils.expression_heatmap(core = c7_core, 
                              cluster_col = 'leiden_0.8', 
                              cell_mean_substring = 'Cell_Mean', 
                              cmap = 'vlag', 
                              figsize = (4,6))

In [43]:
#leiden mapping (0.8)
expression_key = {'0': 'epi', '1':'epi', '2':'epi', '3':'epi', 
                  '4':'epi', '5':'stroma', '6':'Myeloid', '7':'other',
                  '8':'other', '9':'CD8_T', '10':'epi', '11':'CD4_T/Treg', 
                  '12':'other'}

In [44]:
converted = df['leiden_0.8'].map(expression_key)
df.loc[:,'leiden_key'] = converted
plot_utils.contingency_plot(core = None, 
                            column1 = df['annotations'], 
                            column2 = df['leiden_key'],
                           figsize = (3.5,3))

In [43]:
# full model prediction
my_classifier.plot_data['prediction'] = ''
my_classifier.train(split = None, 
                    use_params = rf_feats, 
                    random_state = 19, 
                    use_imbalanced_rf = True, 
                    use_smote = True)

my_classifier.fit()
c7_core.plot_df.loc[my_classifier.expression_data.index, 'prediction'] = my_classifier.expression_data['predicted_annotation']

c7_core.plot_df.loc[:,'prediction'].value_counts()


Initial Class Counts:
annotations
Myeloid        40
SMA_Stroma     40
Epi            40
CD4_T          40
CD31_Stroma    40
CD8_T          40
Treg           30
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Myeloid        40
SMA_Stroma     40
Epi            40
CD4_T          40
CD31_Stroma    40
CD8_T          40
Treg           30
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
Myeloid        40
SMA_Stroma     40
Epi            40
CD4_T          40
CD31_Stroma    40
CD8_T          40
Treg           30
Name: count, dtype: int64

Class counts in y_test after train_test_split:
Series([], Name: count, dtype: int64)
min class count 30

Applying SMOTE to the entire *training* set with k_neighbors=5

SMOTE applied successfully to the entire *training* set.

Class counts in y_train after SMOTE:
annotations
Myeloid        40
SMA_Stroma     40
Epi            40
CD4_T          40
CD31_Stroma    40
Treg         

prediction
Epi            8168
Myeloid         353
CD31_Stroma     269
SMA_Stroma      218
CD8_T           214
CD4_T           123
Treg             70
Name: count, dtype: int64

In [95]:
# for consistent plotting when there are multiple 
my_pal = sns.color_palette('tab20').as_hex()

cmap = {'not_annotated':'gray', 'Epi':my_pal[0], 
        'Myeloid': my_pal[1], 'Endothelia':my_pal[2],
        'Stroma':my_pal[3], 'CD8_T':my_pal[4],
        'CD4_T':my_pal[5], 'Treg':my_pal[6]}

In [96]:
for i,clst in enumerate(np.unique(df['leiden_0.8'])):
    cmap[str(clst)] = my_pal[i+7] # 7 is offset for number used in past cmap

In [99]:
sankey(df['annotations'], 
       df['leiden_0.8'], 
       aspect = 5, fontsize = 10,
       colorDict = cmap, 
       rightColor = False)
plt.tight_layout()
plt.show()

In [100]:
plot_utils.cell_plot(core = c7_core, 
                     color_map = cmap,
                     plot_type = 'cell', 
                     figsize = (6,4),
                     coloring_type = 'categorical', 
                     palette = 'tab20', 
                     size = 1,
                     col = 'prediction')