In [None]:
# Associate manual annotation of celltypes (x,y positions)
# to single cells
# Run training using CellSighter
# authors: Pacome Prompsy
# contact: pacome.prompsy@chuv.ch
# Guenova Lab
# CHUV (Centre Hospitalier Universitaire Vaudois), Lausanne, Suisse


In [4]:
import sys
sys.path.append(".")
import os
import argparse
import numpy as np
import json
import os
import pandas as pd
import numpy as np
from tifffile import imread
import random
import tifffile
from scipy.spatial.distance import cdist


In [106]:
output_dir = "../output/CellSighter"
cell_marker_file = "../annotation/cell_markers.csv"
tiff_dir = "../output/input/"
segmentation_dir  = "../output/segmentation/"
cell_table_dir  = "../output/cell_table/"

In [135]:
sample = "ROI-15"
annotator = "Christoph"
if len(annotator) > 0:
    name = sample + "-" + annotator
else:
    name = sample + "-" 
annotation_dir = "../output/CellSighter/manual_annotation_celltype/" + name

In [136]:
cell_type = pd.read_csv(os.path.join(cell_marker_file))

cell_types_unique = cell_type.cell_type[np.sort(np.unique(cell_type.cell_type, return_index=True)[1])]
cell_types_unique = np.flip(cell_types_unique)

marker_unique = cell_type.marker[np.sort(np.unique(cell_type.marker, return_index=True)[1])]
marker_unique = np.flip(marker_unique)

In [137]:
image = imread(os.path.join(tiff_dir, sample, "DAPI.tiff"))


In [138]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
base_dir = os.path.join(output_dir, "cell_classification")
if not os.path.exists(base_dir):
    os.makedirs(base_dir)
cellType_dir = os.path.join(base_dir, "CellTypes")
if not os.path.exists(cellType_dir):
    os.makedirs(cellType_dir)
cells_dir = os.path.join(cellType_dir, "cells")
if not os.path.exists(cells_dir):
    os.makedirs(cells_dir)
cells2labels_dir = os.path.join(cellType_dir, "cells2labels")
if not os.path.exists(cells2labels_dir):
    os.makedirs(cells2labels_dir)
data_dir = os.path.join(cellType_dir, "data")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
images_dir =  os.path.join(data_dir, "images")
if not os.path.exists(images_dir):
    os.makedirs(images_dir)    


In [139]:
# Load segmentation 
whole_cell = imread(os.path.join(segmentation_dir, sample + "_whole_cell.tiff"))

#x1 = polygon[0][1]
#x2 = polygon[1][1]
#y1 = polygon[0][0]
#y2 = polygon[1][0]

#if x1 == 0:
#    x2 = x2 + 250
#if x1 != 0:
#    x1 = x1 - 250
#if y1 == 0:
#    y2 = y2 + 250
#if y1 != 0:
#    y1 = y1 - 250

# Crop to the right corner region
#whole_cell_cropped = whole_cell[y1:y2, x1:x2]

# Save in the "cells" folder
np.savez(os.path.join(cells_dir, name +".npz"), data = whole_cell)

In [140]:
# Load the labels
csv_files = [f for f in os.listdir(annotation_dir) if f.endswith('.csv')]
dfs = []
for csv_file in csv_files:
    # Read the CSV file into a dataframe
    df = pd.read_csv(os.path.join(annotation_dir, csv_file))
    
    # Get the cell type from the file name
    cell_type = os.path.basename(csv_file).split('-')[0]
    
    # Add a column for the cell type
    df['cell_type'] = cell_type
    
    # Append the dataframe to the list of dataframes
    dfs.append(df)
result_df = pd.concat(dfs, ignore_index=True)


In [141]:
# Reading the cell centroids
cells = pd.read_csv(os.path.join(cell_table_dir, sample + "_cell_table_size_normalized.csv.gz"))


In [142]:
# Get a list of unique cell types in the result dataframe
cell_types = result_df['cell_type'].unique()


In [143]:
cell_types

array(['Fibroblast', 'Leukocyte', 'APC', 'Monocytic_Lineage',
       'Keratinocyte', 'T_helper', 'T_regulatory', 'T_cytotoxic',
       'Monocytes', 'Macrophages', 'Basophil', 'pDC', 'NKT', 'B_cell',
       'Lymphatic', 'Endothelial'], dtype=object)

In [144]:
cells["cell_type"] = ""

In [145]:
# Loop through each cell in the "cells" dataframe
for i, row in result_df.iterrows():
    # Extract the x and y coordinates of the cell centroid
    point_x, point_y, cell_class = row["axis-0"], row["axis-1"], row["cell_type"]
    
    # Initialize a dictionary to store the distances to the closest point in each cell type
    distances = {}
    
    # Loop through each cell type in the "result_df" dataframe
    
    # Filter the "result_df" dataframe to include only the points for the current cell type
    cells_locations = cells[['centroid-0', 'centroid-1', 'label']].values
    cells_locations = cells_locations[((cells_locations[...,0] > point_x - 200) & (cells_locations[...,0] < point_x + 200)) &
                             ((cells_locations[...,1] > point_y - 200) & (cells_locations[...,1] < point_y + 200))]
    
    if cells_locations.shape[0] > 0:
        # Compute the distances from the cell centroid to each point in the filtered dataframe
        cell_distances = cdist([[point_x, point_y]], cells_locations[...,0:2]).flatten()
        label = cells_locations[np.where(cell_distances==np.min(cell_distances))[0][0],2]
        
        # Add the closest cell type
        if np.min(cell_distances) < 150: 
            cells.loc[(cells.label == label),"cell_type"] = cell_class


In [146]:
np.unique(cells["cell_type"].values)

array(['', 'APC', 'B_cell', 'Basophil', 'Endothelial', 'Fibroblast',
       'Keratinocyte', 'Leukocyte', 'Lymphatic', 'Macrophages',
       'Monocytes', 'Monocytic_Lineage', 'NKT', 'T_cytotoxic', 'T_helper',
       'T_regulatory', 'pDC'], dtype=object)

In [147]:
name

'ROI-15-Christoph'

In [148]:
cells_df = cells[["fov","label", "cell_type"]]
cells_df.to_csv("../output/CellSighter/celltype/Predictions/" + name + "_true_labels.csv")

In [93]:
# Retrieve only cropped cells, set to -1 others.
unique_strings = set(cells["cell_type"])
string_to_int = {
 "":-1,
 "Macrophages":0,
 "APC":1,
 "B_cell":2,
 "T_regulatory":3,
 "Monocytes":4,
 "Neutrophils":5,
 "NKT":6,
 "Keratinocyte":7,
 "Leukocyte":8,
 "Endothelial":9,
 "T_cytotoxic":10,
 "pDC":11,
 "T_helper":12,
 "Fibroblast":13,
 "Lymphatic":14,
 "Basophil":15,
 "Monocytic_Lineage":16
                }
cell_type_int = [string_to_int[s] for s in cells["cell_type"]]
cells["cell_type_int"] = cell_type_int


In [94]:
cell_type_int

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 7,
 -1,
 -1,
 -1,
 -1,
 7,
 6,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 7,
 7,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 11,
 -1,
 -1,
 9,
 7,
 12,
 -1,
 -1,
 7,
 -1,
 -1,
 0,
 -1,
 9,
 7,
 -1,
 15,
 7,
 -1,
 -1,
 15,
 -1,
 7,
 0,
 10,
 7,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 7,
 9,
 -1,
 -1,
 7,
 12,
 7,
 7,
 -1,
 -1,
 -1,
 9,
 -1,
 -1,
 -1,
 9,
 -1,
 -1,
 7,
 -1,
 7,
 -1,
 -1,
 2,
 -1,
 -1,
 7,
 -1,
 7,
 -1,
 -1,
 7,
 7,
 -1,
 -1,
 7,
 -1,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 -1,
 -1,
 12,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 10,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 7,
 9,
 -1,
 12,
 9,
 9,
 -1,
 -1,
 -1,
 -1,
 7,
 7,
 7,
 -1,
 8,
 -1,
 -1,
 -1,
 12,
 -1,
 9,
 16,
 9,
 7,
 7,
 -1,
 7,
 7,
 7,
 -1,
 -1,
 10,
 9,
 -1,
 -1,
 0,
 7,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 1,
 7,
 9,
 7,
 -1,
 1,
 9,


In [95]:
len(np.array(cell_type_int)[np.array(cell_type_int) > 0])

489

In [96]:
len(labels)

642

In [100]:
sample

'ROI-17'

In [99]:
len(labels)

642

In [67]:
labels = np.zeros(int(max(cells.label)) + 1)
for i in range(len(labels)):
    labels[i] = -1
idx = [int(item) for item in cells["label"].to_list()]
labels[idx] = cells["cell_type_int"]

# Save as npz in the "data" folder
np.savez(os.path.join(cells2labels_dir,  name + ".npz"), data = labels)

In [40]:
cells2labels_dir = '../output/CellSighter/celltype/cell_classification/CellTypes/cells2labels'

In [70]:
"../output/CellSighter/celltype/cell_classification/CellTypes/cells2labels/ROI-15-Pacome.npz"

'ROI-07-Ionoss.npz'

In [33]:
# Copy full markers images

all_markers = []
# Load the markers 
for marker in marker_unique:
    # Load segmentation 
    marker_image = imread(os.path.join(tiff_dir, sample, marker + ".tiff"))

    # Crop to the right corner region
    # marker_cropped = marker_image[y1:y2, x1:x2]
    
    # Save in the "cells" folder
    all_markers.append(marker_image)
    

# Combine
all_markers = np.array(all_markers)
all_markers = np.transpose(all_markers, (1, 2, 0))

# Save as npz in the "data" folder

np.savez(os.path.join(images_dir,  name + ".npz"),  data = all_markers)

In [34]:
string_to_int

{'': -1,
 'Macrophages': 0,
 'APC': 1,
 'B_cell': 2,
 'T_regulatory': 3,
 'Monocytes': 4,
 'Neutrophils': 5,
 'NKT': 6,
 'Keratinocyte': 7,
 'Leukocyte': 8,
 'Endothelial': 9,
 'T_cytotoxic': 10,
 'pDC': 11,
 'T_helper': 12,
 'Fibroblast': 13,
 'Lymphatic': 14,
 'Basophil': 15,
 'Monocytic_Lineage': 16}

In [35]:
import glob
from pathlib import Path
import numpy as np
import scipy.ndimage as ndimage
from data.cell_crop import CellCrop
from PIL import Image
from skimage import io
from collections import Counter


ModuleNotFoundError: No module named 'data'

In [57]:
def find_most_occuring_number(array):
    # Flatten the 2D array into a 1D list
    flattened_array = [num for sublist in array for num in sublist]

    # Use Counter to count the occurrences of each number
    counter = Counter(flattened_array)

    # Find the most common number and its count
    most_common = counter.most_common(1)[0]

    return most_common[0]

2


In [45]:
obj = ndimage.find_objects(whole_cell == 1)[0]
obj

(slice(0, 28, None), slice(3804, 3854, None))

In [65]:
objs = ndimage.find_objects(whole_cell)

for cell_id in np.unique(whole_cell)[1:]:
    print(cell_id)
    print(find_most_occuring_number(whole_cell[objs[cell_id]]))

1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
28
28
29
29
30
30
31
31
32
32
33
33
34
34
35
35
36
36
37
37
38
38
39
39
40
40
41
41
42
42
43
43
44
44
45
45
46
46
47
47
48
48
49
49
50
50
51
51
52
52
53
53
54
54
55
55
56
56
57
57
58
58
59
59
60
60
61
61
62
62
63
63
64
64
65
65
66
66
67
67
68
68
69
69
70
70
71
71
72
72
73
73
74
74
75
75
76
76
77
77
78
78
79
79
80
80
81
81
82
82
83
83
84
84
85
85
86
86
87
87
88
88
89
89
90
90
91
91
92
92
93
93
94
94
95
95
96
96
97
97
98
98
99
99
100
100
101
101
102
102
103
103
104
104
105
105
106
106
107
107
108
108
109
109
110
110
111
111
112
112
113
113
114
114
115
115
116
116
117
117
118
118
119
119
120
120
121
121
122
122
123
123
124
124
125
125
126
126
127
127
128
128
129
129
130
130
131
131
132
132
133
133
134
134
135
135
136
136
137
137
138
138
139
139
140
140
141
141
142
142
143
143
144
144
145
145
146
146
147
147
148
148
149
149
150
150
151
151
152
152
15

TypeError: unhashable type: 'numpy.ndarray'

range(1, 100)