In [16]:
import pandas as pd
from scipy import stats
import numpy as np

In [17]:
# Names of every sample covered by this analysis, one per line.
samples = [
    'CMT-baseline1C_CAGATC',
    'ESO1-sorted-T-cells_S13_L007',
    'HM-baseline1C_CGATGT',
    'INY1-sorted-T-cells_S14_L007',
    'INY2-sorted-T-cells_S15_L007',
    'JSSBaseline-RNA_GTGAAA',
    'LEK-OT110712A_CCGTCC',
    'LEK-baseline_CGATGT',
    'MP-11-28-12RNA_S2',
    'PT0112-B_S3',
    'PT0285-B_S5',
    'PT0310_S9',
    'RAS-baseline_TGACCA',
    'SAR-11-14-12RNA_S1',
    'TR2-PBMC_S12',
    'SRR5233637',
    'SRR5233639',
    'TCGA-CZ-4862',
    'TCGA-CZ-5463',
    'TCGA-CZ-5985',
]

In [18]:
# Load the per-tool TCR nucleotide-level summary tables, one frame per tool.
MIXCR, TRUST4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../summary_data/original/MIXCR_TCR_nt.csv",
        "../summary_data/original/TRUST4_TCR_nt.csv",
    )
)

In [19]:
def summarize_capture(tool_df, tool_name):
    """Build the per-sample capture summary for one tool.

    The same pipeline is applied to every tool so the results can be stacked.

    Parameters
    ----------
    tool_df : pandas.DataFrame
        Table with at least the columns ``sample``, ``frequency_TCR`` and
        ``frequency_tool``. Two helper columns (``frequency_tool_TCR`` and
        ``frequency_TCR_copy``) are added to this frame in place.
    tool_name : str
        Label written to the ``tool`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per (``sample``, ``frequency_TCR``) pair, ordered by sample and
        then by descending ``frequency_TCR``, with the columns
        ``frequency_tool_TCR``, ``frequency_TCR_sum``, ``portion_TCR``,
        ``frequency_tool_TCR_cumulative``, ``portion_TCR_sum``,
        ``full_coverage``, ``portion_TCR_cum`` and ``tool``.
    """
    # A row contributes its frequency_TCR only when the tool reported a
    # non-zero frequency for it; otherwise it contributes 0.
    tool_df['frequency_tool_TCR'] = tool_df['frequency_TCR'].mask(
        tool_df['frequency_tool'] == 0, 0
    )
    # frequency_TCR is used as a grouping key below, so a copy is needed to be
    # able to sum it within each group.
    tool_df['frequency_TCR_copy'] = tool_df['frequency_TCR']

    # calculate the ability of capturing (non-cumulative)
    summary = (
        tool_df
        .groupby(['sample', 'frequency_TCR'])
        .agg(
            frequency_tool_TCR=('frequency_tool_TCR', 'sum'),
            frequency_TCR_sum=('frequency_TCR_copy', 'sum'),
        )
        .reset_index()
    )
    summary['portion_TCR'] = (summary['frequency_tool_TCR'] * 100) / summary['frequency_TCR_sum']
    summary = summary.fillna(0)

    # calculate the ability of capturing (cumulative)
    # NOTE: this running total is taken BEFORE the descending sort, i.e. while
    # rows are still in ascending frequency_TCR order within each sample.
    summary['frequency_tool_TCR_cumulative'] = (
        summary.groupby('sample')['frequency_tool_TCR'].cumsum()
    )
    # Order each sample's rows from the highest frequency_TCR to the lowest.
    # A two-key sort keeps the original row labels and does not depend on how
    # groupby.apply treats group keys in a given pandas version.
    summary = summary.sort_values(['sample', 'frequency_TCR'], ascending=[True, False])
    summary['portion_TCR_sum'] = summary.groupby('sample')['portion_TCR'].cumsum()
    # 100 per row seen so far: the value portion_TCR_sum would reach if every
    # row so far had portion_TCR == 100.
    summary['full_coverage'] = (summary.groupby('sample').cumcount() + 1) * 100
    summary['portion_TCR_cum'] = (summary['portion_TCR_sum'] * 100) / summary['full_coverage']
    summary['tool'] = tool_name
    return summary


## MIXCR
MIXCR_TCR = summarize_capture(MIXCR, 'MIXCR')

## TRUST4
TRUST4_TCR = summarize_capture(TRUST4, 'TRUST4')

capture_ability = pd.concat([MIXCR_TCR, TRUST4_TCR]).fillna(0)
capture_ability

Unnamed: 0,sample,frequency_TCR,frequency_tool_TCR,frequency_TCR_sum,portion_TCR,frequency_tool_TCR_cumulative,portion_TCR_sum,full_coverage,portion_TCR_cum,tool
633,CMT-baseline1C_CAGATC,0.011442,0.011442,0.011442,100.000000,0.063586,100.000000,100,100.000000,MIXCR
632,CMT-baseline1C_CAGATC,0.007747,0.007747,0.007747,100.000000,0.052144,200.000000,200,100.000000,MIXCR
631,CMT-baseline1C_CAGATC,0.006257,0.000000,0.006257,0.000000,0.044396,200.000000,300,66.666667,MIXCR
630,CMT-baseline1C_CAGATC,0.006117,0.006117,0.006117,100.000000,0.044396,300.000000,400,75.000000,MIXCR
629,CMT-baseline1C_CAGATC,0.005690,0.005690,0.005690,100.000000,0.038279,400.000000,500,80.000000,MIXCR
...,...,...,...,...,...,...,...,...,...,...
7366,TR2-PBMC_S12,0.000062,0.001241,0.008932,13.888889,0.008828,5277.805930,7400,71.321702,TRUST4
7365,TR2-PBMC_S12,0.000052,0.001602,0.011785,13.596491,0.007588,5291.402421,7500,70.552032,TRUST4
7364,TR2-PBMC_S12,0.000041,0.002233,0.016954,13.170732,0.005986,5304.573153,7600,69.797015,TRUST4
7363,TR2-PBMC_S12,0.000031,0.002016,0.026485,7.611241,0.003753,5312.184394,7700,68.989408,TRUST4


In [20]:
# Per-sample annotations: sample name -> (tissue_type, repertoire_type).
# Keeping them in one table means adding or correcting a sample is a one-line
# change, and each column is filled with a single lookup instead of one
# boolean-mask assignment per sample.
sample_annotations = {
    'CMT-baseline1C_CAGATC': ('melanoma', 'polyclonal'),
    'ESO1-sorted-T-cells_S13_L007': ('PBMC', 'monoclonal'),
    'HM-baseline1C_CGATGT': ('melanoma', 'polyclonal'),
    'INY1-sorted-T-cells_S14_L007': ('PBMC', 'monoclonal'),
    'INY2-sorted-T-cells_S15_L007': ('PBMC', 'monoclonal'),
    'JSSBaseline-RNA_GTGAAA': ('melanoma', 'polyclonal'),
    'LEK-OT110712A_CCGTCC': ('melanoma', 'polyclonal'),
    'LEK-baseline_CGATGT': ('melanoma', 'polyclonal'),
    'MP-11-28-12RNA_S2': ('PBMC', 'polyclonal'),
    'PT0112-B_S3': ('melanoma', 'polyclonal'),
    'PT0285-B_S5': ('melanoma', 'polyclonal'),
    'PT0310_S9': ('melanoma', 'polyclonal'),
    'RAS-baseline_TGACCA': ('melanoma', 'polyclonal'),
    'SAR-11-14-12RNA_S1': ('PBMC', 'polyclonal'),
    'TR2-PBMC_S12': ('PBMC', 'polyclonal'),
    'SRR5233639': ('lymph_node', 'polyclonal'),
    'SRR5233637': ('small_intestine', 'polyclonal'),
    'TCGA-CZ-4862': ('kidney', 'polyclonal'),
    'TCGA-CZ-5463': ('kidney', 'polyclonal'),
    'TCGA-CZ-5985': ('kidney', 'polyclonal'),
}

# add sample type
# Samples missing from the table are left as NaN.
capture_ability['tissue_type'] = capture_ability['sample'].map(
    {name: tissue for name, (tissue, _) in sample_annotations.items()}
)

# add repertoire type
capture_ability['repertoire_type'] = capture_ability['sample'].map(
    {name: repertoire for name, (_, repertoire) in sample_annotations.items()}
)

Streamline and save data for visualization

In [21]:
# Keep only the columns written to the visualization CSV, in output order.
export_columns = [
    'sample',
    'tissue_type',
    'repertoire_type',
    'frequency_TCR',
    'frequency_tool_TCR',
    'portion_TCR',
    'portion_TCR_cum',
    'tool',
]
capture_ability_data = capture_ability[export_columns]

# Write the table without the row index.
capture_ability_data.to_csv(
    '../summary_data/original/all_tools_capturing_ability_nt.csv',
    index=False,
)

capture_ability_data

Unnamed: 0,sample,tissue_type,repertoire_type,frequency_TCR,frequency_tool_TCR,portion_TCR,portion_TCR_cum,tool
633,CMT-baseline1C_CAGATC,melanoma,polyclonal,0.011442,0.011442,100.000000,100.000000,MIXCR
632,CMT-baseline1C_CAGATC,melanoma,polyclonal,0.007747,0.007747,100.000000,100.000000,MIXCR
631,CMT-baseline1C_CAGATC,melanoma,polyclonal,0.006257,0.000000,0.000000,66.666667,MIXCR
630,CMT-baseline1C_CAGATC,melanoma,polyclonal,0.006117,0.006117,100.000000,75.000000,MIXCR
629,CMT-baseline1C_CAGATC,melanoma,polyclonal,0.005690,0.005690,100.000000,80.000000,MIXCR
...,...,...,...,...,...,...,...,...
7366,TR2-PBMC_S12,PBMC,polyclonal,0.000062,0.001241,13.888889,71.321702,TRUST4
7365,TR2-PBMC_S12,PBMC,polyclonal,0.000052,0.001602,13.596491,70.552032,TRUST4
7364,TR2-PBMC_S12,PBMC,polyclonal,0.000041,0.002233,13.170732,69.797015,TRUST4
7363,TR2-PBMC_S12,PBMC,polyclonal,0.000031,0.002016,7.611241,68.989408,TRUST4
