In [1]:
import pandas as pd
import glob
import os
import numpy as np
from functools import reduce
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the all-tools TRB summary table (path is relative to this notebook)
# and end the cell with the frame so it is rendered as a preview.
data = pd.read_csv(filepath_or_buffer='../summary_data/original/all_tools_TRB.csv')
data

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,repertoire_type,class
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000010,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000004,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000008,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000022,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000017,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375331,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,3.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.008547,polyclonal,T_cell_poor_polyclonal
375332,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698,polyclonal,T_cell_poor_polyclonal
375333,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698,polyclonal,T_cell_poor_polyclonal
375334,sample14,CASSVEGYEQYF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698,polyclonal,T_cell_poor_polyclonal


In [3]:
def _tail_mass(sorted_values, first_kept):
    """Return ``sum(sorted_values[i:])`` for every start index ``i`` in ``first_kept``."""
    # Suffix sums with a trailing 0.0 so that an index one past the end
    # (no value reaches the threshold) maps to an empty tail.
    suffix = np.concatenate((np.cumsum(sorted_values[::-1])[::-1], [0.0]))
    return suffix[first_kept]


def compute_frequency(samples_data, tool, discard_zero_freq_samples,
                      n_thresholds=100000, output_path=None):
    """Write, per frequency threshold, the mean captured share of the truth repertoire.

    Only rows with ``frequency_TCR > 0`` are considered (clonotypes present in
    the ground truth). For every threshold ``th`` in
    ``np.linspace(0, 1, n_thresholds)`` and every sample, the clonotypes with
    ``frequency_TCR >= th`` are selected and the portion

        sum(frequency_TCR where frequency_<tool> > 0) / sum(frequency_TCR)

    is computed. The portions are averaged over samples and written as CSV
    lines ``th,observed_portion_frequency``.

    Parameters
    ----------
    samples_data : pandas.DataFrame
        Must contain the columns ``Sample``, ``frequency_TCR`` and
        ``frequency_<tool>``.
    tool : str
        Suffix of the observed-frequency column; also used in the default
        output file name and in the final ``done`` message.
    discard_zero_freq_samples : bool
        When exactly ``False``, a sample with no truth clonotype at a
        threshold contributes a portion of 0 to the average. For any other
        value such samples are left out of the average; if no sample remains,
        ``nan`` is written for that threshold.
    n_thresholds : int, optional
        Number of evenly spaced thresholds in [0, 1]. Defaults to 100000.
    output_path : str, optional
        Destination CSV. Defaults to
        ``../summary_data/original/capturing_per_sample/capturing_<tool>.csv``.
        NOTE: the default depends only on ``tool``, so successive calls for
        different samples overwrite each other unless a path is passed here.

    Raises
    ------
    KeyError
        If a required column is missing from ``samples_data``.

    Notes
    -----
    Sums are accumulated with numpy on sorted values, so results can differ
    from a row-by-row Python sum in the last floating-point digits.
    """
    if output_path is None:
        output_path = '../summary_data/original/capturing_per_sample/capturing_'+tool+'.csv'

    thresholds = np.linspace(0., 1.0, n_thresholds)

    # Excluding clonotypes not present in ground truth (NaN also fails `> 0`).
    truth_data = samples_data.loc[samples_data['frequency_TCR'] > 0]
    truth_sample = truth_data['Sample']
    freq_truth = truth_data['frequency_TCR'].to_numpy(dtype=float)
    is_observed = (truth_data['frequency_' + tool] > 0).to_numpy()

    portion_sum = np.zeros(len(thresholds))
    portion_count = np.zeros(len(thresholds))

    # Sample names come from the unfiltered frame so that samples without any
    # truth clonotype still take part in the zero-portion handling below.
    for s in samples_data['Sample'].unique():
        in_sample = (truth_sample == s).to_numpy()
        sample_freq = freq_truth[in_sample]

        # Sort once; each threshold then keeps a contiguous tail of the array.
        order = np.argsort(sample_freq, kind='stable')
        sorted_freq = sample_freq[order]
        sorted_captured = np.where(is_observed[in_sample][order], sorted_freq, 0.0)

        # First position whose truth frequency is >= th, for every th at once.
        first_kept = np.searchsorted(sorted_freq, thresholds, side='left')
        total = _tail_mass(sorted_freq, first_kept)
        captured = _tail_mass(sorted_captured, first_kept)

        has_truth = total > 0
        portion_sum[has_truth] += captured[has_truth] / total[has_truth]
        if discard_zero_freq_samples is False:
            # Samples with nothing above the threshold count as a 0 portion.
            portion_count += 1
        else:
            portion_count += has_truth

    # 0/0 yields nan when no sample contributes at a threshold.
    with np.errstate(divide='ignore', invalid='ignore'):
        mean_portions = portion_sum / portion_count

    # The context manager closes the file even if a write fails.
    with open(output_path, "w") as file:
        file.write("th,observed_portion_frequency\n")
        for th, portion in zip(thresholds, mean_portions):
            file.write(str(th)+","+str(portion)+"\n")

    print('done '+tool)

Samples that have no ground-truth clonotypes at a given threshold are discarded when computing the average captured portion.

In [4]:
# sample01: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample01 = data[data['Sample'].eq('sample01')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample01, tool, True)

done MIXCR
done IMREP
done TRUST4


In [5]:
# sample02: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample02 = data[data['Sample'].eq('sample02')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample02, tool, True)

done MIXCR
done IMREP
done TRUST4


In [6]:
# sample03: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample03 = data[data['Sample'].eq('sample03')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample03, tool, True)

done MIXCR
done IMREP
done TRUST4


In [7]:
# sample04: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample04 = data[data['Sample'].eq('sample04')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample04, tool, True)

done MIXCR
done IMREP
done TRUST4


In [8]:
# sample05: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample05 = data[data['Sample'].eq('sample05')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample05, tool, True)

done MIXCR
done IMREP
done TRUST4


In [9]:
# sample06: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample06 = data[data['Sample'].eq('sample06')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample06, tool, True)

done MIXCR
done IMREP
done TRUST4


In [10]:
# sample07: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample07 = data[data['Sample'].eq('sample07')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample07, tool, True)

done MIXCR
done IMREP
done TRUST4


In [11]:
# sample08: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample08 = data[data['Sample'].eq('sample08')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample08, tool, True)

done MIXCR
done IMREP
done TRUST4


In [12]:
# sample09: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample09 = data[data['Sample'].eq('sample09')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample09, tool, True)

done MIXCR
done IMREP
done TRUST4


In [13]:
# sample10: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample10 = data[data['Sample'].eq('sample10')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample10, tool, True)

done MIXCR
done IMREP
done TRUST4


In [14]:
# sample11: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample11 = data[data['Sample'].eq('sample11')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample11, tool, True)

done MIXCR
done IMREP
done TRUST4


In [15]:
# sample12: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample12 = data[data['Sample'].eq('sample12')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample12, tool, True)

done MIXCR
done IMREP
done TRUST4


In [16]:
# sample13: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample13 = data[data['Sample'].eq('sample13')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample13, tool, True)

done MIXCR
done IMREP
done TRUST4


In [17]:
# sample14: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample14 = data[data['Sample'].eq('sample14')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample14, tool, True)

done MIXCR
done IMREP
done TRUST4


In [18]:
# SRR5233639: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample15 = data[data['Sample'].eq('SRR5233639')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample15, tool, True)

done MIXCR
done IMREP
done TRUST4


In [19]:
# SRR5233637: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample16 = data[data['Sample'].eq('SRR5233637')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample16, tool, True)

done MIXCR
done IMREP
done TRUST4


In [20]:
# TCGA-CZ-4862: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample17 = data[data['Sample'].eq('TCGA-CZ-4862')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample17, tool, True)

done MIXCR
done IMREP
done TRUST4


In [21]:
# TCGA-CZ-5463: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample18 = data[data['Sample'].eq('TCGA-CZ-5463')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample18, tool, True)

done MIXCR
done IMREP
done TRUST4


In [22]:
# TCGA-CZ-5985: keep only this sample's rows, then run every tool on them.
# NOTE: compute_frequency names its output file after the tool only, so the
# files written here are replaced by the next call for the same tool.
sample19 = data[data['Sample'].eq('TCGA-CZ-5985')]

tools = ['MIXCR', 'IMREP', 'TRUST4']
for tool in tools:
    compute_frequency(sample19, tool, True)

done MIXCR
done IMREP
done TRUST4
