In [47]:
import pandas as pd
import numpy as np
import os
import glob 

Load TCR-Seq results

In [48]:
# Locate every TCR-Seq export (tab-separated) in the raw-data folder.
TCR_files = glob.glob("../raw_data/TCR_Seq/*.tsv")

# Read each export and tag its rows with the bare file name; that name is the
# provisional sample label until it is mapped to a proper sample ID.
data = [
    pd.read_csv(tsv_path, sep='\t').assign(sample=os.path.basename(tsv_path))
    for tsv_path in TCR_files
]

# Stack the per-file tables; each file keeps its own row index.
TCR = pd.concat(data, sort=False)

In [49]:
# One-to-one lookup: RNA-Seq sample ID (key) -> name of the TCR-Seq export
# file (value) that belongs to the same sample.
Sample_name_matchup_RNA_TCR_dict = {
    "ESO1-sorted-T-cells_S13_L007": "RearrangementDetails_ESO1_sorted_infusion.tsv",
    "INY1-sorted-T-cells_S14_L007": "RearrangementDetails_INY1_sorted_infusion.tsv",
    "INY2-sorted-T-cells_S15_L007": "RearrangementDetails_INY2_sorted_infusion.tsv",
    "TR2-PBMC_S12": "RearrangementDetails_TR-PBMC.tsv",
    "SAR-11-14-12RNA_S1": "SAR_11-14_PBMC.tsv",
    "MP-11-28-12RNA_S2": "MP_11-28_PBMC.tsv",
    "CMT-baseline1C_CAGATC": "Pt204_Baseline_TCR_seq.tsv",
    "HM-baseline1C_CGATGT": "Pt310_baseline_TCRseq.tsv",
    "PT0310_S9": "Pt310_on-tx_TCR_seq.tsv",
    "LEK-baseline_CGATGT": "Pt294_baseline_TCR_seq.tsv",
    "LEK-OT110712A_CCGTCC": "Pt294_on-tx_TCR_seq.tsv",
    "JSSBaseline-RNA_GTGAAA": "Pt_308_baseline_TCR_seq.tsv",
    "RAS-baseline_TGACCA": "Pt_325_baseline_TCR_seq.tsv",
    "PT0112-B_S3": "Pt_112_baseline_TCR_seq.tsv",
    "PT0285-B_S5": "Pt_285_baseline_TCR_seq.tsv",
    "SRR5233637": "SRR5233637.tsv",
    "SRR5233639": "SRR5233639.tsv",
    "TCGA-CZ-4862": "TCGA-CZ-4862.tsv",
    "TCGA-CZ-5463": "TCGA-CZ-5463.tsv",
    "TCGA-CZ-5985": "TCGA-CZ-5985.tsv",
}

# Relabel the TCR-Seq rows: wherever 'sample' still holds an export file name
# listed above, swap it for the matching RNA-Seq sample ID. Labels that are
# not listed are left as they are.
tcr_file_to_rna_sample = {
    tcr_file: rna_sample
    for rna_sample, tcr_file in Sample_name_matchup_RNA_TCR_dict.items()
}
TCR['sample'] = TCR['sample'].replace(tcr_file_to_rna_sample)

In [50]:
# Rename the TCR-Seq columns used downstream and keep only those columns.
TCR = TCR.rename(columns={'reads': 'reads_TCR', 'rearrangement': 'nucleotide_TCR'})
TCR = TCR[['reads_TCR', 'nucleotide_TCR', 'sample']]

# Samples included in the analysis; rows carrying any other label are dropped.
samples = ['CMT-baseline1C_CAGATC' , 'ESO1-sorted-T-cells_S13_L007',
           'HM-baseline1C_CGATGT' , 'INY1-sorted-T-cells_S14_L007',
           'INY2-sorted-T-cells_S15_L007' , 'JSSBaseline-RNA_GTGAAA',
           'LEK-OT110712A_CCGTCC' , 'LEK-baseline_CGATGT' , 'MP-11-28-12RNA_S2',
           'PT0112-B_S3' , 'PT0285-B_S5' , 'PT0310_S9' , 'RAS-baseline_TGACCA',
           'SAR-11-14-12RNA_S1' , 'TR2-PBMC_S12', 'SRR5233637', 'SRR5233639',
           'TCGA-CZ-4862', 'TCGA-CZ-5463', 'TCGA-CZ-5985']

# Frequency of a rearrangement = its reads / total reads of its sample.
# The total is taken over ALL rearrangements of the sample, i.e. before the
# low-read filter below, so the filter does not rescale the frequencies.
# A single grouped transform replaces the per-sample slice-assign-concat loop
# (no SettingWithCopyWarning, no repeated copying of the growing frame).
# reset_index: the stacked per-file frames carry repeated row labels.
TCR_frequency = TCR[TCR['sample'].isin(samples)].reset_index(drop=True)
reads_per_sample = TCR_frequency.groupby('sample')['reads_TCR'].transform('sum')
TCR_frequency['frequency_TCR'] = TCR_frequency['reads_TCR'] / reads_per_sample

# Keep rearrangements supported by more than one read, then collapse duplicate
# (sample, sequence) rows by summing their frequencies.
TCR_frequency = TCR_frequency[TCR_frequency['reads_TCR'] > 1]
TCR_frequency = (
    TCR_frequency
    .groupby(['sample', 'nucleotide_TCR'])['frequency_TCR']
    .sum()
    .reset_index()
)
TCR_frequency

Unnamed: 0,sample,nucleotide_TCR,frequency_TCR
0,CMT-baseline1C_CAGATC,AAAATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTG...,0.000025
1,CMT-baseline1C_CAGATC,AAAATCCGGTCCACAAAGCTGGAGGACTCAGCCGTGTACTTCTGTG...,0.000002
2,CMT-baseline1C_CAGATC,AACATGAGCCCTGAAGACAGCAGCATATATCTCTACAGCGTTGAAC...,0.000014
3,CMT-baseline1C_CAGATC,AACATGAGCCCTGAAGACAGCAGCATATATCTCTGCAGCACATGCC...,0.000040
4,CMT-baseline1C_CAGATC,AACATGAGCCCTGAAGACAGCAGCATATATCTCTGCAGCCCTCTCC...,0.000013
...,...,...,...
379485,TR2-PBMC_S12,TTTCCCCTGACCCTGGAGTCTGCCAGGCCCTCACATACCTCTCAGT...,0.000021
379486,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000021
379487,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000021
379488,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000021


Load MiXCR results

In [51]:
# Locate the MiXCR clone tables (tab-separated text files).
MIXCR_files = glob.glob("../raw_data/MIXCR/*.txt")

# Read every table into a list and concatenate once at the end.
# DataFrame.append is deprecated (removed in pandas 2.0) and re-copies the
# whole accumulated frame on every iteration.
mixcr_frames = []
for sample in MIXCR_files:
    df_sample = pd.read_csv(sample, delimiter="\t")
    # Label the table with the sample ID that occurs in its path. If several
    # IDs occur, the last one in dictionary order wins; if none occurs the
    # table gets no 'sample' value (NaN after concatenation).
    for sample_name in Sample_name_matchup_RNA_TCR_dict.keys():
        if sample_name in sample:
            df_sample['sample'] = sample_name
    mixcr_frames.append(df_sample)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
MIXCR = pd.concat(mixcr_frames, sort=False) if mixcr_frames else pd.DataFrame()

In [52]:
# Strip the "(score)" annotations from the V/D/J hit columns.
# regex=True is passed explicitly: without it newer pandas treats the pattern
# as a literal string and nothing would be removed. The pattern is greedy, so
# when several scored hits are listed everything from the first "(" to the
# last ")" is removed.
for hits_column in ('allVHitsWithScore', 'allDHitsWithScore', 'allJHitsWithScore'):
    MIXCR[hits_column] = MIXCR[hits_column].str.replace(r"\(.*\)", "", regex=True)

# Rename to the tool-independent column names used in the rest of the notebook.
MIXCR = MIXCR.rename(columns={
    'cloneFraction': 'frequency_tool',
    'cloneCount': 'reads_tool',
    'aaSeqCDR3': 'amino_acid',
    'targetSequences': 'nucleotide_tool',
    'allVHitsWithScore': 'V',
    'allDHitsWithScore': 'D',
    'allJHitsWithScore': 'J',
})

# Keep the columns needed for the analysis; .copy() so the 'tool' column is
# added to an independent frame rather than to a slice.
MIXCR = MIXCR[['frequency_tool','reads_tool','nucleotide_tool','V','J','sample']].copy()
MIXCR['tool'] = "MIXCR"

# Select TRB: both the V and the J hit must mention "TRB".
# na=False makes rows with a missing hit count as "not TRB" instead of
# putting NaN into the boolean mask.
is_trb = MIXCR['V'].str.contains("TRB", na=False) & MIXCR['J'].str.contains("TRB", na=False)
MIXCR = MIXCR[is_trb]

# The gene columns are no longer needed once the TRB filter is applied.
MIXCR = MIXCR[['frequency_tool','reads_tool','nucleotide_tool','sample','tool']]

# Keep clones supported by more than one read, then collapse duplicate
# (sample, sequence) rows by summing their frequencies.
MIXCR = MIXCR[MIXCR['reads_tool'] > 1]
MIXCR = MIXCR.groupby(['sample','nucleotide_tool','tool'])['frequency_tool'].sum().reset_index()
MIXCR

Unnamed: 0,sample,nucleotide_tool,tool,frequency_tool
0,CMT-baseline1C_CAGATC,TGCAGTGCCGGACTACTCGGCGAGCAGTACTTC,MIXCR,0.000016
1,CMT-baseline1C_CAGATC,TGCAGTGCTAGTAGGGGCCGGATCTATGGCTACACCTTC,MIXCR,0.000016
2,CMT-baseline1C_CAGATC,TGCGCCAGCAGCTTGCTGAATGAGCAGTTCTTC,MIXCR,0.000016
3,CMT-baseline1C_CAGATC,TGCGCCAGCAGCTTTCTCAGGGTTCCCGAGACCCAGTACTTC,MIXCR,0.000016
4,CMT-baseline1C_CAGATC,TGCGCCAGCAGGGTTCTTATGTCGGGGCCCAATGAGCAGTTCTTC,MIXCR,0.000016
...,...,...,...,...
8108,TR2-PBMC_S12,TGTGCTAGTGGTCAGACGACAGGGGCCGATACGCAGTATTTT,MIXCR,0.000039
8109,TR2-PBMC_S12,TGTGCTAGTGGTTTCGGGACTAGCGGGAGTATGGGCACCGGGGAGC...,MIXCR,0.000039
8110,TR2-PBMC_S12,TGTGCTAGTGGTTTGGCAGGGTACGAGCAGTACTTC,MIXCR,0.000039
8111,TR2-PBMC_S12,TGTGCTAGTGGTTTGGGACGGGAGGTGGATGAGCAGTTCTTC,MIXCR,0.000039


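Merge MiXCR & TCR-Seq dataframes: each MiXCR nucleotide sequence is matched to the first not-yet-matched TCR-Seq rearrangement of the same sample that contains it (case-insensitive)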

In [53]:
# Samples to compare, in the order in which they are processed below.
samples = [
    'CMT-baseline1C_CAGATC',
    'ESO1-sorted-T-cells_S13_L007',
    'HM-baseline1C_CGATGT',
    'INY1-sorted-T-cells_S14_L007',
    'INY2-sorted-T-cells_S15_L007',
    'JSSBaseline-RNA_GTGAAA',
    'LEK-OT110712A_CCGTCC',
    'LEK-baseline_CGATGT',
    'MP-11-28-12RNA_S2',
    'PT0112-B_S3',
    'PT0285-B_S5',
    'PT0310_S9',
    'RAS-baseline_TGACCA',
    'SAR-11-14-12RNA_S1',
    'TR2-PBMC_S12',
    'SRR5233637',
    'SRR5233639',
    'TCGA-CZ-4862',
    'TCGA-CZ-5463',
    'TCGA-CZ-5985',
]

def sequence_found(x, candidates=None):
    """Return the first candidate sequence that contains ``x`` and consume it.

    The comparison is a case-insensitive substring test. The matched
    candidate is removed from the list, so one candidate can be claimed by at
    most one query.

    Parameters
    ----------
    x : str
        Query nucleotide sequence. A non-string value (e.g. NaN) never
        matches and yields ``None``.
    candidates : list of str, optional
        Sequences to search; MUTATED in place when a match is found. When
        omitted, the module-level ``TCR_nt_list`` is used, which keeps the
        one-argument call ``sequence_found(x)`` working as before.

    Returns
    -------
    str or None
        The matched candidate exactly as stored in the list, or ``None`` when
        no candidate contains ``x``.
    """
    if candidates is None:
        # Backward-compatible default: the list prepared by the calling cell.
        candidates = TCR_nt_list
    if not isinstance(x, str):
        return None

    # Lower-case the query once instead of once per candidate.
    needle = x.lower()
    for position, seq_TCR in enumerate(candidates):
        # Containment already implies len(seq_TCR) >= len(x).
        if needle in seq_TCR.lower():
            # Safe to delete while iterating because we return immediately.
            del candidates[position]
            return seq_TCR
    return None

# Match MiXCR clones to TCR-Seq rearrangements one sample at a time.
# Per-sample frames are collected in a list and concatenated once, instead of
# re-copying a growing frame on every iteration.
mixcr_matched_frames = []

for sample in samples:
    # .copy(): the new column is added to an independent frame, not to a
    # slice of MIXCR (avoids SettingWithCopyWarning).
    MIXCR_nt = MIXCR.loc[MIXCR['sample'] == sample].copy()
    TCR_nt = TCR_frequency.loc[TCR_frequency['sample'] == sample]

    # sequence_found() searches this module-level list and removes each
    # rearrangement it hands out, so the list must be rebuilt per sample.
    TCR_nt_list = TCR_nt['nucleotide_TCR'].tolist()

    # For each MiXCR sequence: the TCR-Seq rearrangement that contains it,
    # or None when there is none.
    MIXCR_nt['nucleotide_TCR'] = MIXCR_nt['nucleotide_tool'].apply(sequence_found)

    mixcr_matched_frames.append(MIXCR_nt)

MIXCR_merge = pd.concat(mixcr_matched_frames, ignore_index=True, sort=False)

# Right merge: keep every TCR-Seq rearrangement; those MiXCR did not report
# get a MiXCR frequency of 0.
MIXCR_TCR_merge = MIXCR_merge.merge(TCR_frequency, how='right', on=['sample','nucleotide_TCR'])
MIXCR_TCR_merge = MIXCR_TCR_merge.rename(columns={'frequency_tool':'frequency_MIXCR'})
MIXCR_TCR_merge['frequency_MIXCR'] = MIXCR_TCR_merge['frequency_MIXCR'].fillna(0)
MIXCR_TCR_merge = MIXCR_TCR_merge[['sample','nucleotide_TCR','frequency_MIXCR','frequency_TCR']]

Load TRUST4 output

In [54]:
# Locate the TRUST4 reports (tab-separated).
TRUST4_files = glob.glob("../raw_data/TRUST4/*.tsv")

# Read every report into a list and concatenate once at the end.
# DataFrame.append is deprecated (removed in pandas 2.0) and re-copies the
# whole accumulated frame on every iteration.
trust4_frames = []
for sample in TRUST4_files:
    df_sample = pd.read_csv(sample, sep='\t')
    # Label the report with the sample ID that occurs in its path. If several
    # IDs occur, the last one in dictionary order wins; if none occurs the
    # report gets no 'sample' value (NaN after concatenation).
    for sample_name in Sample_name_matchup_RNA_TCR_dict.keys():
        if sample_name in sample:
            df_sample['sample'] = sample_name
    trust4_frames.append(df_sample)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
TRUST4 = pd.concat(trust4_frames, sort=False) if trust4_frames else pd.DataFrame()

In [55]:
# rename and select the columns that needed for analysis
TRUST4.rename(columns = {'frequency':'frequency_tool','#count':'reads_tool','CDR3aa':'amino_acid','CDR3nt':'nucleotide_tool'},inplace=True)
TRUST4 = TRUST4[['frequency_tool','reads_tool','nucleotide_tool','V','J','sample']]
TRUST4['tool'] = "TRUST4"

# select TRB 
TRUST4 = TRUST4[TRUST4['V'].str.contains("TRB") & TRUST4['J'].str.contains("TRB")]

# select the columns that needed for analysis
TRUST4 = TRUST4[['frequency_tool','reads_tool','nucleotide_tool','sample','tool']]

# select rows with reads greater than 1
TRUST4 = TRUST4[TRUST4['reads_tool'] > 1]
TRUST4 = TRUST4.groupby(['sample','nucleotide_tool','tool'])['frequency_tool'].sum().reset_index()
TRUST4

Unnamed: 0,sample,nucleotide_tool,tool,frequency_tool
0,CMT-baseline1C_CAGATC,TCAATGAATGTTGAGTTTAACATTCACCGGGGAGCTGTTTTTT,TRUST4,0.001697
1,CMT-baseline1C_CAGATC,TGCAGCGTACCGAGGGGGGGGAATGAGCAGTTCTTC,TRUST4,0.000849
2,CMT-baseline1C_CAGATC,TGCAGCGTCCCCGCGGGAGGGGGGGATACGCAGTATTTT,TRUST4,0.000849
3,CMT-baseline1C_CAGATC,TGCAGCGTTATCTTTCAGATGGGACAGGGAGGCATTCAGTACTTC,TRUST4,0.000849
4,CMT-baseline1C_CAGATC,TGCAGCGTTGATGATGATGGAGGGTTTGGTTCTGGAAACACCATAT...,TRUST4,0.000849
...,...,...,...,...
22634,TR2-PBMC_S12,TGTGTAATAACCGGGACACACCCCTTATCGGCGCAGTACTTC,TRUST4,0.000060
22635,TR2-PBMC_S12,TGTGTAGGCTCACCCCTCCACTTT,TRUST4,0.000090
22636,TR2-PBMC_S12,TGTGTCAGCAGTTACTCAAGGGATGGCTACACCTTC,TRUST4,0.000060
22637,TR2-PBMC_S12,TGTGTCAGCTCACCCATACTTGGGACAGGGGTAAATTGGAACACTG...,TRUST4,0.000060


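Merge TRUST4 & TCR-Seq dataframes: each TRUST4 nucleotide sequence is matched to the first not-yet-matched TCR-Seq rearrangement of the same sample that contains it (case-insensitive)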

In [56]:
# Samples to compare, in the order in which they are processed below.
samples = [
    'CMT-baseline1C_CAGATC',
    'ESO1-sorted-T-cells_S13_L007',
    'HM-baseline1C_CGATGT',
    'INY1-sorted-T-cells_S14_L007',
    'INY2-sorted-T-cells_S15_L007',
    'JSSBaseline-RNA_GTGAAA',
    'LEK-OT110712A_CCGTCC',
    'LEK-baseline_CGATGT',
    'MP-11-28-12RNA_S2',
    'PT0112-B_S3',
    'PT0285-B_S5',
    'PT0310_S9',
    'RAS-baseline_TGACCA',
    'SAR-11-14-12RNA_S1',
    'TR2-PBMC_S12',
    'SRR5233637',
    'SRR5233639',
    'TCGA-CZ-4862',
    'TCGA-CZ-5463',
    'TCGA-CZ-5985',
]

# Match TRUST4 clones to TCR-Seq rearrangements one sample at a time.
# Per-sample frames are collected in a list and concatenated once, instead of
# re-copying a growing frame on every iteration.
trust4_matched_frames = []

for sample in samples:
    # .copy(): the new column is added to an independent frame, not to a
    # slice of TRUST4 (avoids SettingWithCopyWarning).
    TRUST4_nt = TRUST4.loc[TRUST4['sample'] == sample].copy()
    TCR_nt = TCR_frequency.loc[TCR_frequency['sample'] == sample]

    # sequence_found() searches this module-level list and removes each
    # rearrangement it hands out, so the list must be rebuilt per sample.
    TCR_nt_list = TCR_nt['nucleotide_TCR'].tolist()

    # For each TRUST4 sequence: the TCR-Seq rearrangement that contains it,
    # or None when there is none.
    TRUST4_nt['nucleotide_TCR'] = TRUST4_nt['nucleotide_tool'].apply(sequence_found)

    trust4_matched_frames.append(TRUST4_nt)

TRUST4_merge = pd.concat(trust4_matched_frames, ignore_index=True, sort=False)

# Right merge: keep every TCR-Seq rearrangement; those TRUST4 did not report
# get a TRUST4 frequency of 0.
TRUST4_TCR_merge = TRUST4_merge.merge(TCR_frequency, how='right', on=['sample','nucleotide_TCR'])
TRUST4_TCR_merge = TRUST4_TCR_merge.rename(columns={'frequency_tool':'frequency_TRUST4'})
TRUST4_TCR_merge['frequency_TRUST4'] = TRUST4_TCR_merge['frequency_TRUST4'].fillna(0)
TRUST4_TCR_merge = TRUST4_TCR_merge[['sample','nucleotide_TCR','frequency_TRUST4','frequency_TCR']]

Merge the MiXCR and TRUST4 results into one table, keeping every TCR-Seq rearrangement

In [57]:
# Put both tools side by side. The TRUST4 table is the right-hand side of a
# right join, so each of its rows is kept; the shared TCR-Seq columns are the
# join keys.
tcr_key_columns = ['sample','nucleotide_TCR','frequency_TCR']
MIXCR_TRUST4 = pd.merge(
    MIXCR_TCR_merge,
    TRUST4_TCR_merge,
    how='right',
    on=tcr_key_columns,
)
MIXCR_TRUST4

Unnamed: 0,sample,nucleotide_TCR,frequency_MIXCR,frequency_TCR,frequency_TRUST4
0,CMT-baseline1C_CAGATC,ACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCA...,0.000016,0.000051,0.001273
1,CMT-baseline1C_CAGATC,GTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTTTGCGCCA...,0.000016,0.000028,0.000849
2,CMT-baseline1C_CAGATC,CACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTCTGCGCCAGCA...,0.000016,0.000582,0.002121
3,CMT-baseline1C_CAGATC,TTGGAGATCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCT...,0.000024,0.001122,0.002121
4,CMT-baseline1C_CAGATC,CTCAGGCTGCTGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCT...,0.000024,0.002540,0.000000
...,...,...,...,...,...
379485,TR2-PBMC_S12,TTTCCCCTGACCCTGGAGTCTGCCAGGCCCTCACATACCTCTCAGT...,0.000000,0.000021,0.000000
379486,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000
379487,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000
379488,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000


In [58]:
# Per-sample annotations: sample ID -> (tissue, repertoire class).
# One lookup table replaces 40 copy-pasted masked assignments, each of which
# scanned the whole frame.
SAMPLE_ANNOTATIONS = {
    'CMT-baseline1C_CAGATC':        ('melanoma',        'T_cell_poor_polyclonal'),
    'ESO1-sorted-T-cells_S13_L007': ('PBMC',            'T_cell_rich_monoclonal'),
    'HM-baseline1C_CGATGT':         ('melanoma',        'T_cell_poor_polyclonal'),
    'INY1-sorted-T-cells_S14_L007': ('PBMC',            'T_cell_rich_monoclonal'),
    'INY2-sorted-T-cells_S15_L007': ('PBMC',            'T_cell_rich_monoclonal'),
    'JSSBaseline-RNA_GTGAAA':       ('melanoma',        'T_cell_poor_polyclonal'),
    'LEK-OT110712A_CCGTCC':         ('melanoma',        'T_cell_poor_polyclonal'),
    'LEK-baseline_CGATGT':          ('melanoma',        'T_cell_poor_polyclonal'),
    'MP-11-28-12RNA_S2':            ('PBMC',            'T_cell_rich_polyclonal'),
    'PT0112-B_S3':                  ('melanoma',        'T_cell_poor_polyclonal'),
    'PT0285-B_S5':                  ('melanoma',        'T_cell_poor_polyclonal'),
    'PT0310_S9':                    ('melanoma',        'T_cell_poor_polyclonal'),
    'RAS-baseline_TGACCA':          ('melanoma',        'T_cell_poor_polyclonal'),
    'SAR-11-14-12RNA_S1':           ('PBMC',            'T_cell_rich_polyclonal'),
    'TR2-PBMC_S12':                 ('PBMC',            'T_cell_rich_polyclonal'),
    'SRR5233639':                   ('lymph_node',      'T_cell_rich_polyclonal'),
    'SRR5233637':                   ('small_intestine', 'T_cell_poor_polyclonal'),
    'TCGA-CZ-4862':                 ('kidney',          'T_cell_poor_polyclonal'),
    'TCGA-CZ-5463':                 ('kidney',          'T_cell_poor_polyclonal'),
    'TCGA-CZ-5985':                 ('kidney',          'T_cell_poor_polyclonal'),
}

tissue_by_sample = {name: tissue for name, (tissue, _) in SAMPLE_ANNOTATIONS.items()}
class_by_sample = {name: rep_class for name, (_, rep_class) in SAMPLE_ANNOTATIONS.items()}

# add tissue type (samples missing from the table get NaN)
MIXCR_TRUST4['tissue'] = MIXCR_TRUST4['sample'].map(tissue_by_sample)

# add repertoire type (samples missing from the table get NaN)
MIXCR_TRUST4['class'] = MIXCR_TRUST4['sample'].map(class_by_sample)

In [59]:
# Write the combined table to disk. The output folder is created first so
# that a missing directory cannot fail the export at the very last step.
summary_csv = '../summary_data/original/all_tools_capturing_ability_nt.csv'
os.makedirs(os.path.dirname(summary_csv), exist_ok=True)
MIXCR_TRUST4.to_csv(summary_csv, index=False)
MIXCR_TRUST4

Unnamed: 0,sample,nucleotide_TCR,frequency_MIXCR,frequency_TCR,frequency_TRUST4,tissue,class
0,CMT-baseline1C_CAGATC,ACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCA...,0.000016,0.000051,0.001273,melanoma,T_cell_poor_polyclonal
1,CMT-baseline1C_CAGATC,GTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTTTGCGCCA...,0.000016,0.000028,0.000849,melanoma,T_cell_poor_polyclonal
2,CMT-baseline1C_CAGATC,CACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTCTGCGCCAGCA...,0.000016,0.000582,0.002121,melanoma,T_cell_poor_polyclonal
3,CMT-baseline1C_CAGATC,TTGGAGATCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCT...,0.000024,0.001122,0.002121,melanoma,T_cell_poor_polyclonal
4,CMT-baseline1C_CAGATC,CTCAGGCTGCTGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCT...,0.000024,0.002540,0.000000,melanoma,T_cell_poor_polyclonal
...,...,...,...,...,...,...,...
379485,TR2-PBMC_S12,TTTCCCCTGACCCTGGAGTCTGCCAGGCCCTCACATACCTCTCAGT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
379486,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
379487,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
379488,TR2-PBMC_S12,TTTCCTCTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCT...,0.000000,0.000021,0.000000,PBMC,T_cell_rich_polyclonal
