In [1]:
# This notebook tests how well a cell-type classifier trained on annotated cells from one core
# transfers to another core (C-7 and N-12).
# Expression data from both cores is merged and the manual cell annotations are attached.

In [1]:
# requires cell_annotation environment
# standard library
import collections
import copy
import importlib
import io
import json
import os
import sys

# third-party
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Polygon
from pySankey.sankey import sankey
from sklearn.preprocessing import StandardScaler

In [2]:
# reading in util functions:
# Guarded so the cell is safe to re-run: after the first run the working
# directory is already the project root, and recomputing from os.getcwd()
# would climb two more levels every time the cell is executed again.
if 'root_dir' not in globals():
    # notebook directory
    current_dir = os.getcwd()

    # project directory (two levels above the notebook directory)
    root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
os.chdir(root_dir)

# for importing utils (added only once, even if the cell is re-run)
functions_dir = os.path.join(root_dir, 'src', 'functions')
if functions_dir not in sys.path:
    sys.path.append(functions_dir)

import annotation_utils
import anno_class
import core_class_test
import classifier_class
import plot_utils

In [3]:
# reading in the C-7 and N-12 cores; each is stored as its own core object and the two are merged manually further down

In [4]:
# every input for this dataset lives under one directory; build the path once
data_dir = os.path.join(root_dir, 'example_data', 'anal_cancer')

# channel names, read as a single headerless column (one marker per row)
markers = pd.read_csv(os.path.join(data_dir, 'raw', 'channelnames.txt'), header = None)[0]
# marker -> [position in channelnames.txt, 0, 255]
# NOTE(review): 0 and 255 look like display intensity limits — confirm against core_class_test.core_data
marker_info = {marker:[i, 0, 255] for i, marker in enumerate(markers)}

summaries_dir = os.path.join(data_dir, 'summaries')

# core name -> core_data object (a dict, so more than one TMA core can be held at a time)
cores = {}
for core in ['C-7', 'N-12']:
    # per-cell centroids and expression summaries, both indexed by Object.ID
    core_coords = pd.read_csv(os.path.join(summaries_dir, f'{core}_coords.csv')).set_index('Object.ID')
    core_exprs = pd.read_csv(os.path.join(summaries_dir, f'{core}_expression.csv')).set_index('Object.ID')

    # copy the centroids onto the expression table, matched on Object.ID
    core_exprs.loc[core_coords.index,'X_coord'] = core_coords['Centroid.X.um']
    core_exprs.loc[core_coords.index,'Y_coord'] = -1*core_coords['Centroid.Y.um'] # inverted y, for how images are displayed

    core_image = annotation_utils.read_ome_tiff(os.path.join(data_dir, 'processed', f'{core}.ome.tif'))

    # first segmentation is bounding box of core, not needed - Qupath specific workflow
    core_segments = annotation_utils.read_geom_json(os.path.join(data_dir, 'segmentations', f'{core}.geojson'))[1:]

    # converts each geom_json feature into a dictionary of coordinates and collects them in one dict
    core_segs_restr = {}
    for feature in core_segments:
        core_segs_restr.update(annotation_utils.transform_geometry(feature))

    cores[core] = core_class_test.core_data(expression_data = core_exprs,
                                           image = core_image, # tif
                                           segments = core_segs_restr, # restructured segmentations
                                           core = core, # name
                                           marker_info = marker_info) # marker channels / slice

Saving each as their own core

In [5]:
c7 = cores['C-7']
annotations_dir = os.path.join(root_dir, 'example_data', 'anal_cancer', 'temp_annotations')

# manual annotations for C-7; the file must have exactly two columns (cell ID, label)
c7_annotation_table = pd.read_csv(os.path.join(annotations_dir, 'C7_annotations.csv'))
c7_annotation_table.columns = ['Object.ID', 'Annotation']

# Object.ID -> annotation label, stored on the core object
c7.annotations = c7_annotation_table.set_index('Object.ID')['Annotation'].to_dict()

# cells per class; many more annotated than actually used in training
# (kept as the last expression of the cell so Jupyter actually displays it)
c7_annotation_table.value_counts('Annotation')

In [6]:
n12 = cores['N-12']
annotations_dir = os.path.join(root_dir, 'example_data', 'anal_cancer', 'temp_annotations')

# manual annotations for N-12; the file must have exactly two columns (cell ID, label)
n12_annotation_table = pd.read_csv(os.path.join(annotations_dir, 'N12_annotations.csv'))
n12_annotation_table.columns = ['Object.ID', 'Annotation']

# Object.ID -> annotation label, stored on the core object
n12.annotations = n12_annotation_table.set_index('Object.ID')['Annotation'].to_dict()

# cells per class; many more annotated than actually used in training
# (kept as the last expression of the cell so Jupyter actually displays it)
n12_annotation_table.value_counts('Annotation')

In [7]:
# subsample from each to create smaller dataset

In [7]:
# Start from a deep copy of C-7 so neither original core object is modified, then
# swap in the row-wise concatenation of both cores' tables. Only expression_data and
# plot_df are replaced; everything else on `merged` is still the C-7 copy.
merged = copy.deepcopy(c7) # using as example and will manually merge dataframes in each

# verify_integrity makes the concat fail loudly if an Object.ID occurs more than once;
# annotations are keyed by Object.ID, so a shared ID would silently mix up labels
merged.expression_data = pd.concat([c7.expression_data, n12.expression_data], axis = 0, verify_integrity = True)
merged.plot_df = pd.concat([c7.plot_df, n12.plot_df], axis = 0)

In [8]:
# ~ subset to about 40 if each exists
np.random.seed(5) # reproduce sampling
cells_per_class = 40


def downsample_annotations(annotations, n_per_class):
    """Randomly keep at most `n_per_class` cell IDs for every annotated cell type.

    Parameters
    ----------
    annotations : dict
        Maps cell ID -> cell type label.
    n_per_class : int
        Maximum number of IDs kept per cell type; a type with fewer IDs keeps all of them.

    Returns
    -------
    dict
        Cell ID -> cell type for the sampled cells only.

    Notes
    -----
    Draws from NumPy's global random state (np.random.choice), so the result
    depends on the seed set above and on the order in which cores are processed.
    """
    # Group IDs by cell type
    ids_by_type = collections.defaultdict(list)
    for cell_id, cell_type in annotations.items():
        ids_by_type[cell_type].append(cell_id)

    # Downsample to n IDs per cell type (or keep all if <n), without replacement
    sampled_ids = []
    for ids in ids_by_type.values():
        sample_size = min(n_per_class, len(ids))
        selected_ids = np.random.choice(ids, size = sample_size, replace = False)
        sampled_ids.extend(selected_ids.tolist())

    return {cell_id: annotations[cell_id] for cell_id in sampled_ids}


# sampling used from each, independently
# Each core is named explicitly (instead of "is it c7? otherwise n12"), and the order
# c7 -> n12 is kept so the seeded draws are the same as before.
core_dict = {name: downsample_annotations(core_obj.annotations, cells_per_class)
             for name, core_obj in (('c7', c7), ('n12', n12))}
    

In [9]:
# combined labels from both cores; if an ID appeared in both, the n12 entry would win
both_annotations = {**core_dict['c7'], **core_dict['n12']}

In [10]:
# use the combined (both-core) labels on the merged object and report how many cells are labelled
merged.annotations = both_annotations
print(len(merged.annotations))

554


In [11]:
# sanity check: rows x columns of the expression table restricted to the sampled, annotated cells
annotated_ids = list(both_annotations)
merged.expression_data.loc[annotated_ids, :].shape

(554, 89)

In [12]:
# classifier wrapper around the merged core object
my_classifier = classifier_class.classify_cells(core_class = merged)

# defining features for random forest: every expression column whose name contains 'Cell_Mean'
expression_columns = merged.expression_data.columns
rf_feats = expression_columns[expression_columns.str.contains('Cell_Mean')]

In [13]:
# 5-fold cross-validation on the merged annotations using the Cell_Mean features
cv = my_classifier.k_fold_cross_validation(
    use_params = rf_feats,
    n_splits = 5,
    random_state = 15,
    use_imbalanced_rf = True,
    use_smote = True,
    n_trees = 200,
)

In [14]:
# recall, precision and F1 taken from the cross-validation results
metrics_to_plot = ['recall', 'precision', 'f1']
plot_utils.plot_metrics(cv, metrics = metrics_to_plot)

In [15]:
# mean and spread of the cross-validation accuracies
cv_accuracy = cv['accuracy']
print(np.mean(cv_accuracy))
print(np.std(cv_accuracy)) # np.std default: population standard deviation (ddof = 0)

0.8809500409500408
0.02961974407384372


In [18]:
# train / test evaluation on the merged annotations (both cores)
# The labels are assigned *before* the classifier is built, as in the other training
# cells, so the classifier cannot pick up labels left on `merged` by a different cell.
merged.annotations = both_annotations
my_classifier = classifier_class.classify_cells(core_class = merged)

# split = 0.5: half of the annotated cells are held out (see the y_train / y_test counts printed below)
my_classifier.train(split = 0.5,
                    use_params = rf_feats, 
                    random_state = 15, 
                    use_imbalanced_rf = True, 
                    use_smote = True, 
                    n_trees = 200)
my_classifier.fit()


Initial Class Counts:
annotations
Myeloid       80
Stroma        80
Epi           80
Endothelia    80
CD8_T         80
CD4_T         70
Treg          70
Bcell         14
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Myeloid       80
Stroma        80
Epi           80
Endothelia    80
CD8_T         80
CD4_T         70
Treg          70
Bcell         14
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
Endothelia    40
CD8_T         40
Myeloid       40
Epi           40
Stroma        40
CD4_T         35
Treg          35
Bcell          7
Name: count, dtype: int64

Class counts in y_test after train_test_split:
annotations
Endothelia    40
CD8_T         40
Myeloid       40
Epi           40
Stroma        40
CD4_T         35
Treg          35
Bcell          7
Name: count, dtype: int64
min class count 7

Applying SMOTE to the entire *training* set with k_neighbors=5

SMOTE applied successfully to the entire *tra

In [21]:
# manual label next to predicted label for every cell
label_pairs = my_classifier.expression_data[['annotations', 'predicted_annotation']]

# drop rows whose annotation is the empty string
has_label = label_pairs['annotations'].ne('')
label_pairs = label_pairs.loc[has_label, :]

# of those, keep only cells not flagged as used in training
not_in_training = my_classifier.plot_data['used_in_training'].eq(False)
df = label_pairs.loc[not_in_training, :]

# annotation vs prediction table
plot_utils.contingency_plot(
    core = None,
    column1 = df['annotations'],
    column2 = df['predicted_annotation'],
    figsize = (3.5,3),
)


In [27]:
# train one test other (trained on n12, predicted on c7)
# labels exposed to the classifier come from the N-12 subsample only
merged.annotations = core_dict['n12']

my_classifier = classifier_class.classify_cells(core_class = merged)
expression_columns = merged.expression_data.columns
rf_feats = expression_columns[expression_columns.str.contains('Cell_Mean')]

# split = None: no hold-out, every labelled cell is used for training
# NOTE(review): n_trees is not passed here (earlier cells pass 200) — confirm the default is intended
train_settings = dict(
    split = None,
    use_params = rf_feats,
    random_state = 19,
    use_imbalanced_rf = True,
    use_smote = False,
)
my_classifier.train(**train_settings)

my_classifier.fit()


Initial Class Counts:
annotations
Treg          40
Myeloid       40
Epi           40
Stroma        40
Endothelia    40
CD8_T         40
CD4_T         30
Bcell         14
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Treg          40
Myeloid       40
Epi           40
Stroma        40
Endothelia    40
CD8_T         40
CD4_T         30
Bcell         14
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
Treg          40
Myeloid       40
Epi           40
Stroma        40
Endothelia    40
CD8_T         40
CD4_T         30
Bcell         14
Name: count, dtype: int64

Class counts in y_test after train_test_split:
Series([], Name: count, dtype: int64)


In [28]:
# predicted labels for the sampled C-7 cells
c7_ids = list(core_dict['c7'])
df = my_classifier.expression_data.loc[c7_ids, ['predicted_annotation']].copy()

# attach the manual labels aligned on Object.ID (not by position), so the pairing
# stays correct regardless of row order
df['annotations'] = pd.Series(core_dict['c7'], name = 'annotations')

# column order is kept as (predicted_annotation, annotations)
plot_utils.contingency_plot(core = None, 
                            column1 = df['annotations'], 
                            column2 = df['predicted_annotation'], 
                            figsize = (3.5,3))

In [39]:
# fraction of cells whose predicted label equals the manual annotation
is_correct = df['predicted_annotation'] == df['annotations']
acc = np.sum(is_correct) / len(df)
print(acc)

0.6185185185185185


In [41]:
# train one test other (trained on c7, predicted on n12)
# labels exposed to the classifier come from the C-7 subsample only
merged.annotations = core_dict['c7']

my_classifier = classifier_class.classify_cells(core_class = merged)
expression_columns = merged.expression_data.columns
rf_feats = expression_columns[expression_columns.str.contains('Cell_Mean')]

# split = None: no hold-out, every labelled cell is used for training
# NOTE(review): n_trees is not passed here (earlier cells pass 200) — confirm the default is intended
train_settings = dict(
    split = None,
    use_params = rf_feats,
    random_state = 19,
    use_imbalanced_rf = True,
    use_smote = False,
)
my_classifier.train(**train_settings)

my_classifier.fit()


Initial Class Counts:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class Counts after filtering classes with < 2 samples:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class counts in y_train after train_test_split:
annotations
Myeloid       40
Stroma        40
Epi           40
CD4_T         40
Endothelia    40
CD8_T         40
Treg          30
Name: count, dtype: int64

Class counts in y_test after train_test_split:
Series([], Name: count, dtype: int64)


In [42]:
# predicted labels for the sampled N-12 cells
n12_ids = list(core_dict['n12'])
df = my_classifier.expression_data.loc[n12_ids, ['predicted_annotation']].copy()

# attach the manual labels aligned on Object.ID (not by position), so the pairing
# stays correct regardless of row order
df['annotations'] = pd.Series(core_dict['n12'], name = 'annotations')

# column order is kept as (predicted_annotation, annotations)
plot_utils.contingency_plot(core = None, 
                            column1 = df['annotations'], 
                            column2 = df['predicted_annotation'], 
                            figsize = (3.5,3))

In [43]:
# fraction of cells whose predicted label equals the manual annotation
is_correct = df['predicted_annotation'] == df['annotations']
acc = np.sum(is_correct) / len(df)
print(acc)

0.676056338028169
