In [1]:
import pandas as pd
import glob
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore", module="matplotlib")

In [2]:
# Load the merged per-clonotype table. The rendered output shows one row per
# (sample, nucleotide_TCR) with the columns frequency_MIXCR, frequency_TCR,
# frequency_TRUST4, tissue and class.
# NOTE(review): the "Unnamed: 0" column looks like a saved index - confirm.
data = pd.read_csv(
    filepath_or_buffer="../summary_data/original/all_tools_capturing_ability_nt.csv"
)
data

Unnamed: 0,sample,nucleotide_TCR,frequency_MIXCR,frequency_TCR,frequency_TRUST4,tissue,class
0,CMT-baseline1C_CAGATC,ACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCA...,0.000016,0.000051,0.001273,melanoma,T_cell_poor_polyclonal
1,CMT-baseline1C_CAGATC,GTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTTTGCGCCA...,0.000016,0.000028,0.000849,melanoma,T_cell_poor_polyclonal
2,CMT-baseline1C_CAGATC,CACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTCTGCGCCAGCA...,0.000016,0.000582,0.002121,melanoma,T_cell_poor_polyclonal
3,CMT-baseline1C_CAGATC,TTGGAGATCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCT...,0.000024,0.001122,0.002121,melanoma,T_cell_poor_polyclonal
4,CMT-baseline1C_CAGATC,CTCAGGCTGCTGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCT...,0.000024,0.002540,0.000000,melanoma,T_cell_poor_polyclonal
...,...,...,...,...,...,...,...
379485,TR2-PBMC_S12,TTTCCCCTGACCCTGGAGTCTGCCAGGCCCTCACATACCTCTCAGT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
379486,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
379487,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
379488,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal


In [3]:
def _captured_mass_per_threshold(truth, observed, thresholds):
    """Return (observed mass, true mass) at or above each threshold for one sample.

    Parameters
    ----------
    truth : 1-D float array of ground-truth frequencies (all > 0).
    observed : 1-D float array, same length, frequencies reported by the tool.
    thresholds : 1-D float array of frequency thresholds.

    Returns
    -------
    (s_observed, s_true) : two arrays aligned with ``thresholds``.
        ``s_true[i]`` is the sum of ``truth`` over clonotypes with
        ``truth >= thresholds[i]``; ``s_observed[i]`` is the same sum restricted
        to clonotypes whose ``observed`` value is > 0.
    """
    order = np.argsort(truth, kind='stable')
    truth_sorted = truth[order]
    # A clonotype counts as captured when the tool reports any positive
    # frequency; NaN compares as False and is therefore "not captured".
    seen_sorted = np.where(observed[order] > 0, truth_sorted, 0.0)

    # suffix[i] = sum of the sorted values from position i to the end; the
    # trailing 0.0 is the "nothing at or above the threshold" case.
    true_suffix = np.append(np.cumsum(truth_sorted[::-1])[::-1], 0.0)
    seen_suffix = np.append(np.cumsum(seen_sorted[::-1])[::-1], 0.0)

    # side='left' gives the first sorted position whose value is >= threshold.
    first_kept = np.searchsorted(truth_sorted, thresholds, side='left')
    return seen_suffix[first_kept], true_suffix[first_kept]


def compute_frequency(samples_data, tool, discard_zero_freq_samples,
                      output_path=None, n_thresholds=10000):
    """Write the mean captured frequency portion per threshold for one tool.

    For each threshold ``th`` in ``np.linspace(0, 1, n_thresholds)`` and each
    sample, the clonotypes with ``frequency_TCR >= th`` (and > 0) are selected.
    The sample's portion is the summed ``frequency_TCR`` of the selected
    clonotypes that have ``frequency_<tool> > 0`` divided by the summed
    ``frequency_TCR`` of all selected clonotypes. The portions are averaged
    over samples and written as CSV rows ``th,observed_portion_frequency``.

    Parameters
    ----------
    samples_data : pandas.DataFrame
        Must contain the columns ``sample``, ``frequency_TCR`` and
        ``frequency_<tool>``.
    tool : str
        Tool suffix used to pick the observed column and the default file name.
    discard_zero_freq_samples : bool
        Samples with no selected clonotype at a threshold are left out of the
        average unless this is exactly ``False``, in which case they contribute
        a portion of 0.
    output_path : str, optional
        Destination CSV. Defaults to
        ``../summary_data/original/capturing_nt_ability/capturing_nt_ability_<tool>.csv``.
        The default depends only on ``tool``, so successive calls for different
        samples overwrite each other; pass a distinct path to keep them.
    n_thresholds : int, optional
        Number of evenly spaced thresholds between 0 and 1 (default 10000).

    Raises
    ------
    KeyError
        If a required column is missing (checked before the file is opened so
        an existing output is not truncated).

    Notes
    -----
    When no sample contributes at a threshold the written mean is ``nan``.
    """
    truth_col = 'frequency_TCR'
    observed_col = 'frequency_' + tool

    missing = [c for c in ('sample', truth_col, observed_col)
               if c not in samples_data.columns]
    if missing:
        raise KeyError('samples_data is missing column(s): ' + ', '.join(missing))

    if output_path is None:
        output_path = ('../summary_data/original/capturing_nt_ability/'
                       'capturing_nt_ability_' + tool + '.csv')

    # Sample names are collected before filtering so that a sample without any
    # ground-truth clonotype still counts when discard_zero_freq_samples is False.
    samples = samples_data['sample'].unique()

    # Exclude clonotypes not present in the ground truth.
    data = samples_data.loc[samples_data[truth_col] > 0,
                            ['sample', truth_col, observed_col]]

    thresholds = np.linspace(0., 1.0, n_thresholds)
    portion_sum = np.zeros(len(thresholds))
    portion_count = np.zeros(len(thresholds))

    for s in samples:
        rows = data.loc[data['sample'] == s]
        s_observed, s_true = _captured_mass_per_threshold(
            rows[truth_col].to_numpy(dtype=float),
            rows[observed_col].to_numpy(dtype=float),
            thresholds,
        )

        has_truth = s_true > 0
        portion_sum[has_truth] += s_observed[has_truth] / s_true[has_truth]

        if discard_zero_freq_samples is False:
            # Samples without truth at a threshold contribute a portion of 0.
            portion_count += 1
        else:
            portion_count[has_truth] += 1

    # Thresholds where no sample contributed get an explicit NaN.
    mean_portion = np.full(len(thresholds), np.nan)
    contributing = portion_count > 0
    mean_portion[contributing] = portion_sum[contributing] / portion_count[contributing]

    # The context manager closes the file even if a write fails.
    with open(output_path, "w") as file:
        file.write("th,observed_portion_frequency\n")
        for th, portion in zip(thresholds, mean_portion):
            file.write(str(th) + "," + str(portion) + "\n")

    print('done ' + tool)

Samples that have no ground-truth clonotypes at a given threshold are discarded when computing the average captured portion.

In [4]:
# Sample: INY1-sorted-T-cells_S14_L007
# Write this sample's per-threshold capture curve for each tool.
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data1 = data[data['sample'].eq('INY1-sorted-T-cells_S14_L007')]

tools = ['MIXCR', 'TRUST4']

for tool in tools:
    compute_frequency(data1, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [5]:
# Sample: INY2-sorted-T-cells_S15_L007
# Write this sample's per-threshold capture curve for each tool.
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data2 = data[data['sample'].eq('INY2-sorted-T-cells_S15_L007')]

tools = ['MIXCR', 'TRUST4']

for tool in tools:
    compute_frequency(data2, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [6]:
# Sample: ESO1-sorted-T-cells_S13_L007
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data3 = data[data['sample'].eq('ESO1-sorted-T-cells_S13_L007')]

for tool in tools:
    compute_frequency(data3, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [7]:
# Sample: SAR-11-14-12RNA_S1
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data4 = data[data['sample'].eq('SAR-11-14-12RNA_S1')]

for tool in tools:
    compute_frequency(data4, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [8]:
# Sample: TR2-PBMC_S12
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data5 = data[data['sample'].eq('TR2-PBMC_S12')]

for tool in tools:
    compute_frequency(data5, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [9]:
# Sample: SRR5233639
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data6 = data[data['sample'].eq('SRR5233639')]

for tool in tools:
    compute_frequency(data6, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [10]:
# Sample: MP-11-28-12RNA_S2
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data7 = data[data['sample'].eq('MP-11-28-12RNA_S2')]

for tool in tools:
    compute_frequency(data7, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [11]:
# Sample: CMT-baseline1C_CAGATC
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data8 = data[data['sample'].eq('CMT-baseline1C_CAGATC')]

for tool in tools:
    compute_frequency(data8, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [13]:
# Sample: HM-baseline1C_CGATGT
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data9 = data[data['sample'].eq('HM-baseline1C_CGATGT')]

for tool in tools:
    compute_frequency(data9, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [14]:
# Sample: JSSBaseline-RNA_GTGAAA
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data10 = data[data['sample'].eq('JSSBaseline-RNA_GTGAAA')]

for tool in tools:
    compute_frequency(data10, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [15]:
# Sample: LEK-OT110712A_CCGTCC
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data11 = data[data['sample'].eq('LEK-OT110712A_CCGTCC')]

for tool in tools:
    compute_frequency(data11, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [16]:
# Sample: LEK-baseline_CGATGT
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data12 = data[data['sample'].eq('LEK-baseline_CGATGT')]

for tool in tools:
    compute_frequency(data12, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [17]:
# Sample: PT0112-B_S3
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data13 = data[data['sample'].eq('PT0112-B_S3')]

for tool in tools:
    compute_frequency(data13, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [18]:
# Sample: PT0285-B_S5
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data14 = data[data['sample'].eq('PT0285-B_S5')]

for tool in tools:
    compute_frequency(data14, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [19]:
# Sample: PT0310_S9
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data15 = data[data['sample'].eq('PT0310_S9')]

for tool in tools:
    compute_frequency(data15, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [20]:
# Sample: RAS-baseline_TGACCA
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data16 = data[data['sample'].eq('RAS-baseline_TGACCA')]

for tool in tools:
    compute_frequency(data16, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [21]:
# Sample: SRR5233637
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data17 = data[data['sample'].eq('SRR5233637')]

for tool in tools:
    compute_frequency(data17, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [22]:
# Sample: TCGA-CZ-4862
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data18 = data[data['sample'].eq('TCGA-CZ-4862')]

for tool in tools:
    compute_frequency(data18, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [23]:
# Sample: TCGA-CZ-5463
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data19 = data[data['sample'].eq('TCGA-CZ-5463')]

for tool in tools:
    compute_frequency(data19, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4


In [24]:
# Sample: TCGA-CZ-5985
# Write this sample's per-threshold capture curve for each tool (`tools` comes
# from an earlier cell).
# NOTE(review): compute_frequency names its output file after the tool only,
# so this run replaces whatever an earlier run wrote to the same path.
data20 = data[data['sample'].eq('TCGA-CZ-5985')]

for tool in tools:
    compute_frequency(data20, tool, discard_zero_freq_samples=True)

done MIXCR
done TRUST4
