# Curation of the Finotello dataset (already TPM-normalized)

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Load & check TPM data

In [None]:
# Read the TPM matrix recently re-processed by Dietrich and colleagues (2025, bioRxiv).
# The first CSV column becomes the index; the table is then flipped so that
# samples end up as rows and genes as columns.
tpm_data = pd.read_csv(
    '../data/bulk_data/Finotello_2019/finotello_tpm.csv',
    index_col=0,
).T
tpm_data

Unnamed: 0,UBE2Q2P2,SSX9,CXorf67,EFCAB8,SPATA31B1P,SDR16C6P,GTPBP6,EFCAB12,A1BG,A1CF,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,TPTEP1
pbmc_1,0.0,0.0,0.118865,0.0,0.0,0.0,57.280197,0.252188,10.863071,0.0,...,3.477319,3.44931,6.36162,16.47336,0.426082,7.47139,327.73759,17.070511,14.530079,5.15538
pbmc_10,0.081115,0.0,0.086782,0.0,0.0,0.0,72.275326,0.095742,14.785649,0.0,...,3.71275,3.55546,5.50518,14.92462,0.222251,6.381709,442.838401,12.744583,9.246122,13.326484
pbmc_12,0.0,0.0,0.188464,0.03157,0.0,0.0,50.997427,0.100608,7.492537,0.0,...,2.937518,3.70517,7.47136,16.866,0.409656,9.498991,364.59906,18.526458,13.210396,19.452058
pbmc_2,0.0,0.0,0.094006,0.004499,0.0,0.0,60.604133,0.050183,15.82718,0.0,...,2.454242,1.86107,3.35811,13.062773,0.405473,5.79988,219.37605,12.623465,5.652651,13.401427
pbmc_4,0.059691,0.212565,0.496705,0.3464,0.0,0.287881,104.4053,0.984864,10.54015,0.269025,...,2.88574,3.78,9.81738,30.148749,4.52006,12.56658,250.9182,34.72053,12.323414,45.098055
pbmc_5,0.613015,0.0,0.19609,0.385828,0.0,0.0,102.1565,0.513702,6.647064,0.0,...,3.758755,4.24224,10.6283,30.75677,1.14744,11.558017,372.00006,37.414882,13.270457,17.063871
pbmc_6,0.0,0.0,0.0,0.0,0.0,0.0,52.882543,0.17612,9.519831,0.0,...,4.2561,3.51173,4.68182,15.2459,0.213477,7.53623,458.76488,15.674012,9.39696,12.847275
pbmc_7,0.0,0.0,0.165406,0.048187,0.0,0.0,61.837424,0.044789,9.196318,0.0,...,2.472849,2.89564,5.98077,14.53583,0.520593,5.990022,290.344501,16.399724,10.407555,15.988808
pbmc_9,0.0,0.0,0.157926,0.041571,0.0,0.0,47.350889,0.098357,9.537508,0.0,...,3.869803,3.45627,5.65227,14.146395,0.508913,7.500722,380.691001,14.024112,8.51398,8.728245


In [3]:
# Per-sample TPM totals: they come close to, but do not reach, 10^6.
# GrooD can adjust for this automatically.
tpm_data.sum(axis=1)

pbmc_1     926540.616547
pbmc_10    913335.282857
pbmc_12    912615.006737
pbmc_2     850411.844518
pbmc_4     804431.915567
pbmc_5     849048.060168
pbmc_6     904090.712475
pbmc_7     879647.071209
pbmc_9     901550.535458
dtype: float64

In [4]:
# Rescale every sample (row) so that its TPM values sum to 10^6:
# divide each row by its own total, then multiply by one million.
tpm_scaled = tpm_data.div(tpm_data.sum(axis=1), axis=0) * 1e6
tpm_scaled

Unnamed: 0,UBE2Q2P2,SSX9,CXorf67,EFCAB8,SPATA31B1P,SDR16C6P,GTPBP6,EFCAB12,A1BG,A1CF,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,TPTEP1
pbmc_1,0.0,0.0,0.128289,0.0,0.0,0.0,61.821572,0.272182,11.724333,0.0,...,3.753013,3.722783,6.865992,17.779426,0.459863,8.063748,353.721773,18.423921,15.682075,5.564117
pbmc_10,0.088812,0.0,0.095017,0.0,0.0,0.0,79.1334,0.104827,16.188632,0.0,...,4.065046,3.892831,6.027556,16.34079,0.24334,6.987258,484.858528,13.953893,10.12347,14.59101
pbmc_12,0.0,0.0,0.20651,0.034593,0.0,0.0,55.880548,0.110241,8.209965,0.0,...,3.218793,4.059949,8.18676,18.480958,0.448882,10.408541,399.510262,20.300409,14.475322,21.314637
pbmc_2,0.0,0.0,0.110541,0.005291,0.0,0.0,71.264451,0.05901,18.611194,0.0,...,2.885945,2.188434,3.948804,15.360526,0.476796,6.820084,257.964481,14.843943,6.646957,15.758749
pbmc_4,0.074203,0.264242,0.617461,0.430614,0.0,0.357869,129.787615,1.224298,13.1026,0.334429,...,3.587302,4.698968,12.204115,37.478311,5.618947,15.621683,311.919748,43.161552,15.3194,56.061991
pbmc_5,0.722003,0.0,0.230953,0.454424,0.0,0.0,120.318866,0.605033,7.828843,0.0,...,4.427023,4.996466,12.517902,36.225005,1.351443,13.612913,438.137813,44.06686,15.629807,20.09765
pbmc_6,0.0,0.0,0.0,0.0,0.0,0.0,58.492519,0.194803,10.52973,0.0,...,4.707603,3.884267,5.178485,16.863241,0.236123,8.335701,507.432356,17.336769,10.393824,14.210161
pbmc_7,0.0,0.0,0.188037,0.05478,0.0,0.0,70.297993,0.050917,10.454554,0.0,...,2.811183,3.29182,6.799056,16.524616,0.59182,6.809574,330.06931,18.643527,11.831512,18.17639
pbmc_9,0.0,0.0,0.175172,0.046111,0.0,0.0,52.521614,0.109097,10.579005,0.0,...,4.292386,3.833695,6.269499,15.691184,0.564486,8.319802,422.26252,15.555547,9.443708,9.681371


In [None]:
# Write both TPM tables to disk: the matrix as loaded and the version rescaled to 10^6
tpm_exports = {
    '../data/bulk_data/Finotello_2019/final_data/Finotello_TPM_data.csv': tpm_data,
    '../data/bulk_data/Finotello_2019/final_data/Finotello_TPM_data_rescaled.csv': tpm_scaled,
}
for out_path, tpm_table in tpm_exports.items():
    tpm_table.to_csv(out_path)

## Load a differently formatted version of the cell type proportions

In [None]:
# Read the FACS proportions supplied by Dietrich and colleagues (cell type
# nomenclature slightly adjusted), flip the table so that samples are rows,
# and put the samples in the same order as the rows of tpm_data.
facs_data = (
    pd.read_csv('../data/bulk_data/Finotello_2019/finotello_facs_edit.csv', index_col=0)
    .T
    .loc[tpm_data.index.tolist(), :]
)
facs_data

Unnamed: 0,NK cells,B cells,Tregs,DC,Monocytes,Neutrophils,CD8 T cells,CD4 T cells,Other
pbmc_1,0.0675,0.0581,0.0175,0.016,0.2001,0.0245,0.1564,0.3796,0.0802
pbmc_10,0.0718,0.064,0.0084,0.0227,0.3586,0.0397,0.1839,0.1786,0.0722
pbmc_12,0.1409,0.0516,0.005,0.0426,0.2003,0.0268,0.2984,0.1533,0.0811
pbmc_2,0.1128,0.0296,0.0143,0.0234,0.2481,0.0484,0.1638,0.2552,0.1044
pbmc_4,0.1154,0.0384,0.0097,0.0357,0.1943,0.0367,0.2452,0.2247,0.0999
pbmc_5,0.0896,0.0606,0.0084,0.0424,0.2636,0.0332,0.3661,0.0853,0.0508
pbmc_6,0.1035,0.0645,0.0108,0.0242,0.3238,0.0443,0.2834,0.0996,0.0459
pbmc_7,0.1746,0.0403,0.0087,0.0329,0.2387,0.0505,0.3227,0.1274,0.0043
pbmc_9,0.1144,0.048,0.0085,0.0476,0.2815,0.0257,0.226,0.1289,0.1193


In [8]:
# Check that the cell type proportions of every sample sum to 100 %.
# The row sums are not exactly 1 (they range from 0.9999 to 1.0001, presumably
# because the source values are rounded to four decimals), so compare with a
# small absolute tolerance instead of relying on a visual check of the output.
facs_row_sums = facs_data.sum(axis=1)
assert np.allclose(facs_row_sums, 1.0, atol=1e-3), (
    f"FACS proportions do not sum to 1 for every sample:\n{facs_row_sums}"
)
facs_row_sums

pbmc_1     0.9999
pbmc_10    0.9999
pbmc_12    1.0000
pbmc_2     1.0000
pbmc_4     1.0000
pbmc_5     1.0000
pbmc_6     1.0000
pbmc_7     1.0001
pbmc_9     0.9999
dtype: float64

In [9]:
# Remove the neutrophil column, since most available PBMC references do not contain neutrophils
facs_no_neutrophils = facs_data.drop(columns='Neutrophils')
facs_no_neutrophils

Unnamed: 0,NK cells,B cells,Tregs,DC,Monocytes,CD8 T cells,CD4 T cells,Other
pbmc_1,0.0675,0.0581,0.0175,0.016,0.2001,0.1564,0.3796,0.0802
pbmc_10,0.0718,0.064,0.0084,0.0227,0.3586,0.1839,0.1786,0.0722
pbmc_12,0.1409,0.0516,0.005,0.0426,0.2003,0.2984,0.1533,0.0811
pbmc_2,0.1128,0.0296,0.0143,0.0234,0.2481,0.1638,0.2552,0.1044
pbmc_4,0.1154,0.0384,0.0097,0.0357,0.1943,0.2452,0.2247,0.0999
pbmc_5,0.0896,0.0606,0.0084,0.0424,0.2636,0.3661,0.0853,0.0508
pbmc_6,0.1035,0.0645,0.0108,0.0242,0.3238,0.2834,0.0996,0.0459
pbmc_7,0.1746,0.0403,0.0087,0.0329,0.2387,0.3227,0.1274,0.0043
pbmc_9,0.1144,0.048,0.0085,0.0476,0.2815,0.226,0.1289,0.1193


In [10]:
# Fold the neutrophil fraction into the 'Other' category:
# start from the neutrophil-free table and overwrite its 'Other' column
# with the sum of the original 'Other' and 'Neutrophils' proportions.
sum_props = facs_data['Other'].add(facs_data['Neutrophils'])
facs_summed_others = facs_no_neutrophils.assign(Other=sum_props)
facs_summed_others

Unnamed: 0,NK cells,B cells,Tregs,DC,Monocytes,CD8 T cells,CD4 T cells,Other
pbmc_1,0.0675,0.0581,0.0175,0.016,0.2001,0.1564,0.3796,0.1047
pbmc_10,0.0718,0.064,0.0084,0.0227,0.3586,0.1839,0.1786,0.1119
pbmc_12,0.1409,0.0516,0.005,0.0426,0.2003,0.2984,0.1533,0.1079
pbmc_2,0.1128,0.0296,0.0143,0.0234,0.2481,0.1638,0.2552,0.1528
pbmc_4,0.1154,0.0384,0.0097,0.0357,0.1943,0.2452,0.2247,0.1366
pbmc_5,0.0896,0.0606,0.0084,0.0424,0.2636,0.3661,0.0853,0.084
pbmc_6,0.1035,0.0645,0.0108,0.0242,0.3238,0.2834,0.0996,0.0902
pbmc_7,0.1746,0.0403,0.0087,0.0329,0.2387,0.3227,0.1274,0.0548
pbmc_9,0.1144,0.048,0.0085,0.0476,0.2815,0.226,0.1289,0.145


In [11]:
# Remove both the 'Neutrophils' and the 'Other' column, since neither is
# present in most available PBMC references
facs_no_neutrophils_others = facs_data.drop(columns=['Neutrophils', 'Other'])
facs_no_neutrophils_others

Unnamed: 0,NK cells,B cells,Tregs,DC,Monocytes,CD8 T cells,CD4 T cells
pbmc_1,0.0675,0.0581,0.0175,0.016,0.2001,0.1564,0.3796
pbmc_10,0.0718,0.064,0.0084,0.0227,0.3586,0.1839,0.1786
pbmc_12,0.1409,0.0516,0.005,0.0426,0.2003,0.2984,0.1533
pbmc_2,0.1128,0.0296,0.0143,0.0234,0.2481,0.1638,0.2552
pbmc_4,0.1154,0.0384,0.0097,0.0357,0.1943,0.2452,0.2247
pbmc_5,0.0896,0.0606,0.0084,0.0424,0.2636,0.3661,0.0853
pbmc_6,0.1035,0.0645,0.0108,0.0242,0.3238,0.2834,0.0996
pbmc_7,0.1746,0.0403,0.0087,0.0329,0.2387,0.3227,0.1274
pbmc_9,0.1144,0.048,0.0085,0.0476,0.2815,0.226,0.1289


In [12]:
# Remove only the 'Other' column, since it is not present in most available PBMC references
facs_no_others = facs_data.drop(columns='Other')
facs_no_others

Unnamed: 0,NK cells,B cells,Tregs,DC,Monocytes,Neutrophils,CD8 T cells,CD4 T cells
pbmc_1,0.0675,0.0581,0.0175,0.016,0.2001,0.0245,0.1564,0.3796
pbmc_10,0.0718,0.064,0.0084,0.0227,0.3586,0.0397,0.1839,0.1786
pbmc_12,0.1409,0.0516,0.005,0.0426,0.2003,0.0268,0.2984,0.1533
pbmc_2,0.1128,0.0296,0.0143,0.0234,0.2481,0.0484,0.1638,0.2552
pbmc_4,0.1154,0.0384,0.0097,0.0357,0.1943,0.0367,0.2452,0.2247
pbmc_5,0.0896,0.0606,0.0084,0.0424,0.2636,0.0332,0.3661,0.0853
pbmc_6,0.1035,0.0645,0.0108,0.0242,0.3238,0.0443,0.2834,0.0996
pbmc_7,0.1746,0.0403,0.0087,0.0329,0.2387,0.0505,0.3227,0.1274
pbmc_9,0.1144,0.048,0.0085,0.0476,0.2815,0.0257,0.226,0.1289


In [None]:
# Write every variant of the proportion table to its own CSV file
facs_exports = {
    '../data/bulk_data/Finotello_2019/final_data/Finotello_FACS_proportions_full.csv': facs_data,
    '../data/bulk_data/Finotello_2019/final_data/Finotello_FACS_proportions_no_neutrophils.csv': facs_no_neutrophils,
    '../data/bulk_data/Finotello_2019/final_data/Finotello_FACS_proportions_no_neutrophils_others.csv': facs_no_neutrophils_others,
    '../data/bulk_data/Finotello_2019/final_data/Finotello_FACS_proportions_no_others.csv': facs_no_others,
    '../data/bulk_data/Finotello_2019/final_data/Finotello_FACS_proportions_others_neutrophils_summarized.csv': facs_summed_others,
}
for out_path, facs_table in facs_exports.items():
    facs_table.to_csv(out_path)

The `facs_no_neutrophils_others` proportions are the ones used in the GrooD paper.