In [1]:
import pandas as pd
import glob
import os
import numpy as np
from functools import reduce
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the combined subsample table and keep only the columns the rest of the
# notebook works with (sample id, clonotype, truth frequency, tool name,
# per-tool frequency and the subsample read depth).
columns_to_keep = ['Sample','CDR3','frequency_TCR','tool','new_frequency_tool','est_reads']
df = pd.read_csv("../summary_data/subsample/subsample_complete.csv")[columns_to_keep]
df

Unnamed: 0,Sample,CDR3,frequency_TCR,tool,new_frequency_tool,est_reads
0,sample01,CAWRGDTAQQPQHF,0.000022,MIXCR,0.0,28000
1,sample01,CASSRDSPETQYF,0.000022,MIXCR,0.0,28000
2,sample01,CASSYSGRALGTGELFF,0.000022,MIXCR,0.0,28000
3,sample01,CASSPDGGLRSPLHF,0.000022,MIXCR,0.0,28000
4,sample01,CASTPRGTVTSNQPQHF,0.000044,MIXCR,0.0,28000
...,...,...,...,...,...,...
624115,sample03,CAISVSVPLGDEQFF,0.000000,CATT,0.0,310
624116,sample03,CASSLDSEQFF,0.000000,CATT,0.0,310
624117,sample03,CSVETGLALDTDTQYF,0.000000,CATT,0.0,310
624118,sample03,CASSPPGRGYTF,0.000000,CATT,0.0,310


In [3]:
# Reshape the long table (one row per clonotype and tool) into a wide one
# (one row per clonotype, one frequency column per tool).

def _per_tool_frame(tool_name):
    """Rows of `df` for one tool, with its frequency column named 'frequency_<tool>'."""
    rows = df[df['tool'] == tool_name]
    return rows.drop(columns='tool').rename(
        columns={'new_frequency_tool': 'frequency_' + tool_name})


def _outer_merge(left, right):
    """Outer-join two per-tool frames on the clonotype identity columns."""
    return pd.merge(left, right, how='outer',
                    on=['Sample','CDR3','frequency_TCR','est_reads'])


MIXCR = _per_tool_frame('MIXCR')
IMREP = _per_tool_frame('IMREP')
TRUST4 = _per_tool_frame('TRUST4')
CATT = _per_tool_frame('CATT')

# NOTE(review): 'frequency_TCR' is a float join key, so rows only line up when
# the value is bit-identical across tools -- confirm this holds for the input.
data = reduce(_outer_merge, [MIXCR, IMREP, TRUST4, CATT])
data

Unnamed: 0,Sample,CDR3,frequency_TCR,frequency_MIXCR,est_reads,frequency_IMREP,frequency_TRUST4,frequency_CATT
0,sample01,CAWRGDTAQQPQHF,0.000022,0.0,28000,0.0,0.0,0.0
1,sample01,CASSRDSPETQYF,0.000022,0.0,28000,0.0,0.0,0.0
2,sample01,CASSYSGRALGTGELFF,0.000022,0.0,28000,0.0,0.0,0.0
3,sample01,CASSPDGGLRSPLHF,0.000022,0.0,28000,0.0,0.0,0.0
4,sample01,CASTPRGTVTSNQPQHF,0.000044,0.0,28000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
156025,sample03,CAISVSVPLGDEQFF,0.000000,0.0,310,0.0,0.0,0.0
156026,sample03,CASSLDSEQFF,0.000000,0.0,310,0.0,0.0,0.0
156027,sample03,CSVETGLALDTDTQYF,0.000000,0.0,310,0.0,0.0,0.0
156028,sample03,CASSPPGRGYTF,0.000000,0.0,310,0.0,0.0,0.0


In [4]:
def compute_frequency(samples_data, tool, discard_zero_freq_samples,
                      output_path=None, n_thresholds=100000):
    """Write, per truth-frequency threshold, the mean captured frequency portion.

    For every threshold ``th`` on an evenly spaced grid over [0, 1] and every
    sample, the clonotypes with ``frequency_TCR >= th`` are selected (clonotypes
    with ``frequency_TCR <= 0`` are never considered). The sample's portion is
    the sum of the truth frequencies of the selected clonotypes that the tool
    reported (``frequency_<tool> > 0``) divided by the sum of the truth
    frequencies of all selected clonotypes. The portions are averaged over
    samples and written as one ``th,observed_portion_frequency`` CSV row.

    Parameters
    ----------
    samples_data : pandas.DataFrame
        Must contain the columns 'Sample', 'frequency_TCR' and
        'frequency_<tool>'.
    tool : str
        Tool name; selects the 'frequency_<tool>' column and the default
        output file name.
    discard_zero_freq_samples : bool
        If true, a sample with no truth clonotype at or above the threshold is
        left out of the average; otherwise it contributes a portion of 0.
    output_path : str, optional
        Destination CSV. Defaults to
        '../summary_data/subsample/capturing_per_reads/capturing_<tool>.csv'.
        The default depends only on ``tool``, so successive calls for
        different subsets overwrite each other unless a path is given.
    n_thresholds : int, optional
        Number of grid points passed to ``numpy.linspace`` (default 100000).

    Raises
    ------
    KeyError
        If a required column is missing. The per-sample data is prepared
        before the output file is opened, so an existing file is not
        truncated in that case.

    Notes
    -----
    If no sample contributes at a threshold, 'nan' is written for that row.
    """
    if output_path is None:
        output_path = '../summary_data/subsample/capturing_per_reads/capturing_' + tool + '.csv'

    data = samples_data.rename(columns={'frequency_TCR': 'frequency_truth'})
    data = data.rename(columns={'frequency_' + tool: 'frequency_observed'})

    # Clonotypes absent from the ground truth never take part in the metric.
    data = data.loc[data['frequency_truth'] > 0]

    # Per sample, keep the truth frequencies sorted ascending together with the
    # "captured" frequencies (truth frequency where the tool saw the clonotype,
    # 0 elsewhere). Every threshold then reduces to a suffix of these arrays.
    # Samples are visited in order of first appearance so the result does not
    # depend on set/hash ordering.
    per_sample = []
    for sample in samples_data['Sample'].unique():
        rows = data.loc[data['Sample'] == sample, ['frequency_truth', 'frequency_observed']]
        truth = rows['frequency_truth'].to_numpy(dtype=float)
        observed = rows['frequency_observed'].to_numpy(dtype=float)
        order = np.argsort(truth, kind='stable')
        truth = truth[order]
        # NaN in the observed column compares as "not observed".
        captured = np.where(observed[order] > 0, truth, 0.0)
        per_sample.append((truth, captured))

    with open(output_path, "w") as file:
        file.write("th,observed_portion_frequency\n")

        for th in np.linspace(0., 1.0, n_thresholds):
            s_portions = []

            for truth, captured in per_sample:
                # First index whose truth frequency is >= th.
                start = np.searchsorted(truth, th, side='left')
                s_true = truth[start:].sum()

                if s_true > 0:
                    s_portions.append(captured[start:].sum() / s_true)
                elif not discard_zero_freq_samples:
                    s_portions.append(0.)

            # Explicit NaN instead of relying on np.mean([]) and its warning.
            mean_portion = np.mean(s_portions) if s_portions else float('nan')
            file.write(str(th) + "," + str(mean_portion) + "\n")

    print('done ' + tool)

For each threshold, samples with no ground-truth clonotypes at or above that threshold are discarded before averaging the captured portion (`discard_zero_freq_samples=True`).

In [5]:
# Subsample depth: est_reads == 28000
# NOTE(review): compute_frequency writes to capturing_<tool>.csv regardless of
# the read depth, so this cell's files are overwritten by the next depth's run.
tools = ['MIXCR', 'IMREP', 'TRUST4', 'CATT']
df1 = data[data['est_reads'] == 28000]

for tool in tools:
    compute_frequency(df1, tool, True)

done MIXCR
done IMREP
done TRUST4
done CATT


In [6]:
# Subsample depth: est_reads == 13500
# NOTE(review): compute_frequency writes to capturing_<tool>.csv regardless of
# the read depth, so this cell overwrites the files of any earlier depth.
tools = ['MIXCR', 'IMREP', 'TRUST4', 'CATT']
df2 = data[data['est_reads'] == 13500]

for tool in tools:
    compute_frequency(df2, tool, True)

done MIXCR
done IMREP
done TRUST4
done CATT


In [7]:
# Subsample depth: est_reads == 2600
# NOTE(review): compute_frequency writes to capturing_<tool>.csv regardless of
# the read depth, so this cell overwrites the files of any earlier depth.
tools = ['MIXCR', 'IMREP', 'TRUST4', 'CATT']
df3 = data[data['est_reads'] == 2600]

for tool in tools:
    compute_frequency(df3, tool, True)

done MIXCR
done IMREP
done TRUST4
done CATT


In [8]:
# Subsample depth: est_reads == 1300
# NOTE(review): compute_frequency writes to capturing_<tool>.csv regardless of
# the read depth, so this cell overwrites the files of any earlier depth.
tools = ['MIXCR', 'IMREP', 'TRUST4', 'CATT']
df4 = data[data['est_reads'] == 1300]

for tool in tools:
    compute_frequency(df4, tool, True)

done MIXCR
done IMREP
done TRUST4
done CATT


In [9]:
# Subsample depth: est_reads == 310
# NOTE(review): compute_frequency writes to capturing_<tool>.csv regardless of
# the read depth, so this cell overwrites the files of any earlier depth.
tools = ['MIXCR', 'IMREP', 'TRUST4', 'CATT']
df5 = data[data['est_reads'] == 310]

for tool in tools:
    compute_frequency(df5, tool, True)

done MIXCR
done IMREP
done TRUST4
done CATT
