# Data Exploration

In [20]:
import pandas as pd
import random

## Danenberg et al

In [3]:
# Load the per-cell table of the Danenberg et al. dataset and preview its first rows.
cells_csv = 'Datasets/Danenberg_et_al/cells.csv'
cells = pd.read_csv(cells_csv)
cells.head()

Unnamed: 0,ImageNumber,ObjectNumber,metabric_id,meta_id,meta_description,print_order,colours,is_epithelial,isTumour,is_normal,...,CD31-vWF,CXCL12,HLA-ABC,panCK,c-Caspase3,DNA1,DNA2,Location_Center_X,Location_Center_Y,AreaShape_Area
0,1,1,MB-0282,Ep 6,CK^{med}ER^{lo},6,#40647A,True,True,0,...,0.49533,0.401104,9.28218,18.180412,0.163026,29.445302,54.989698,121.418605,4.023256,43
1,1,2,MB-0282,Ep 5,ER^{hi}CXCL12^{+},5,#99CCCC,True,True,0,...,0.552035,2.567602,1.417755,48.593726,0.310149,23.160734,42.596685,150.468531,6.083916,143
2,1,4,MB-0282,SI 3,CD4^{+} T cells & APCs,3,#F8B195,False,True,0,...,0.493669,0.117742,3.791699,3.404438,0.242591,16.884008,31.325295,221.930233,5.139535,129
3,1,5,MB-0282,Ep 6,CK^{med}ER^{lo},6,#40647A,True,True,0,...,0.373286,1.048929,2.364418,23.289196,0.2977,8.475436,15.654673,354.136364,5.890909,110
4,1,7,MB-0282,SI 2,CD4^{+} T cells,2,#FF50A2,False,True,0,...,0.287512,0.058795,4.690886,0.664581,0.078051,14.009167,28.587826,469.174603,7.15873,126


In [23]:
# Load the clinical table, report how many rows it has, then preview it.
clinical_csv = 'Datasets/Danenberg_et_al/clinical.csv'
clinical = pd.read_csv(clinical_csv)
print(clinical.shape[0])
clinical.head()


682


Unnamed: 0.1,Unnamed: 0,metabric_id,ERStatus,LymphNodesOrdinal,sizeOrdinal,Grade,ERBB2_pos,yearsToStatus,DeathBreast,isValidation,PAM50,IntClust,isDiscovery
0,0,MB-0000,pos,7+,20-50mm,3.0,False,11.540041,0,False,Normal-like,IntClust 4+,True
1,1,MB-0002,pos,0,5-10mm,3.0,False,6.951403,0,False,Luminal A,IntClust 4+,True
2,2,MB-0005,pos,1,10-20mm,2.0,False,13.445585,1,False,Luminal B,IntClust 3,True
3,3,MB-0006,pos,2-3,20-50mm,2.0,False,13.546886,0,False,Luminal B,IntClust 9,True
4,4,MB-0008,pos,7+,20-50mm,3.0,False,3.397673,1,False,Luminal B,IntClust 9,True


In [4]:
# Report the size of the raw data: distinct patient ids, distinct image
# numbers, and total number of cell rows.
# nunique(dropna=False) counts a missing value as its own entry, which matches
# len(series.unique()).
n_patients = cells['metabric_id'].nunique(dropna=False)
n_images = cells['ImageNumber'].nunique(dropna=False)
print("{} patients, {} images, and {} cells".format(n_patients, n_images, len(cells)))

718 patients, 794 images, and 1123466 cells


### Data inclusion

1) Exclude samples with only normal tissues and in-situ carcinoma

In [5]:
# Keep only the rows whose `isTumour` flag equals 1, then report what is left.
tumour_mask = cells['isTumour'] == 1
cells = cells[tumour_mask]

summary = "{} patients, {} images, and {} cells".format(
    cells['metabric_id'].nunique(dropna=False),
    cells['ImageNumber'].nunique(dropna=False),
    len(cells),
)
print(summary)

693 patients, 749 images, and 1066966 cells


2) Exclude patients with missing clinical information

In [6]:

# Attach the clinical columns to every cell row by patient id. The inner join
# drops cells whose `metabric_id` has no row in `clinical`.
# validate='many_to_one' makes pandas raise a MergeError if `metabric_id` is
# duplicated in `clinical`; without it such duplicates would silently
# multiply cell rows and inflate the counts printed below.
cells = pd.merge(
    cells,
    clinical,
    how='inner',
    on='metabric_id',
    validate='many_to_one',
)
print("{} patients, {} images, and {} cells".format(
    len(cells['metabric_id'].unique()), len(cells['ImageNumber'].unique()), len(cells)
)
)

682 patients, 736 images, and 1045835 cells


3) Exclude images with fewer than 500 cells

In [13]:
# Minimum number of cells an image must contain to be kept.
MIN_CELLS_PER_IMAGE = 500

# Number of cell rows per image; show the first ten as a sanity check.
cells_per_image = cells.groupby('ImageNumber').size()
print(cells_per_image[:10])

# Keep images with at least MIN_CELLS_PER_IMAGE cells. The comparison is `>=`
# (not `>`) so that only images with fewer than 500 cells are excluded, as
# this step states; an image with exactly 500 cells is retained.
images_to_keep = cells_per_image[cells_per_image >= MIN_CELLS_PER_IMAGE].index
cells = cells.loc[cells['ImageNumber'].isin(images_to_keep)]

# Restrict the clinical table to patients that still have cells after filtering.
clinical = clinical.loc[clinical['metabric_id'].isin(cells['metabric_id'].unique())]
print("{} patients, {} images, and {} cells".format(
    len(cells['metabric_id'].unique()), len(cells['ImageNumber'].unique()), len(cells)
)
)


ImageNumber
1     1624
2     1057
3     1648
4     1348
5     2018
6      944
7     1377
8     1535
9     1209
10    1373
dtype: int64
579 patients, 621 images, and 1016163 cells
579


## Partition into discovery and inner-validation sets

In [21]:
# Build a shuffled list of subset labels: 200 entries are labelled 2 and the
# remaining len(clinical) - 200 entries are labelled 1 (this assumes
# len(clinical) >= 200). The seed is fixed so the shuffle is reproducible.
# Presumably 1 = discovery and 2 = inner-validation, per the section heading
# -- TODO confirm against the code that consumes Subset_id.
random.seed(0)
n_label_2 = 200
n_label_1 = len(clinical) - n_label_2
Subset_id = [1] * n_label_1 + [2] * n_label_2
random.shuffle(Subset_id)


[2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 

### Summarize data partition for Danenberg_et_al (Fig.3.a)