In [1]:
import pandas as pd
import numpy as np

In [2]:
IMREP_df = pd.read_csv("../summary_data/additional/IMREP/IMREP_TRB_merged_extracted_features.csv")
TRUST4_df = pd.read_csv("../summary_data/additional/TRUST4/TRUST4_TRB_merged_extracted_features.csv")
MIXCR_df = pd.read_csv("../summary_data/additional/MIXCR/MIXCR_TRB_merged_extracted_features.csv")
CATT_df = pd.read_csv("../summary_data/additional/CATT/CATT_TRB_merged_extracted_features.csv")
TCR_df = pd.read_csv("../summary_data/additional/TCR_Seq/TCR_merged_extracted_features.csv")

In [3]:
# Select rows with nReads greater than 1
IMREP_df = IMREP_df[IMREP_df['nReads'] > 1]
TRUST4_df = TRUST4_df[TRUST4_df['nReads'] > 1]
MIXCR_df = MIXCR_df[MIXCR_df['nReads'] > 1]
CATT_df = CATT_df[CATT_df['nReads'] > 1]
TCR_df = TCR_df[TCR_df['nReads'] > 1]

In [4]:
# Rename nReads and frequencies columns according to tool for proper merging
IMREP_df = IMREP_df.rename(columns={"nReads": "nReads_IMREP"})
TRUST4_df = TRUST4_df.rename(columns={"nReads": "nReads_TRUST4"})
MIXCR_df = MIXCR_df.rename(columns={"nReads": "nReads_MIXCR"})
CATT_df = CATT_df.rename(columns={"nReads": "nReads_CATT"})
TCR_df = TCR_df.rename(columns={"nReads": "nReads_TCR"})

Complete dataframe across all samples and tools

In [5]:
# Merge dataframes based on two key combination: Sample and CDR3. Outer join ensures no data is lost for instances that do not have overlap
merge_IMREP_TRUST4 = pd.merge(IMREP_df, TRUST4_df, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR = pd.merge(MIXCR_df, merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR_CATT = pd.merge(CATT_df, merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_complete = pd.merge(TCR_df, merge_IMREP_TRUST4_MIXCR_CATT, how='outer', on=['Sample', 'CDR3']).fillna(0)

merge_complete

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_CATT,nReads_MIXCR,nReads_IMREP,nReads_TRUST4
0,addsample01,CAWSEGRDTGELFF,3.0,0.0,0.0,0.0,0.0
1,addsample01,CAISGLAGEETQYF,3.0,0.0,0.0,0.0,0.0
2,addsample01,CASSPGQAYEQYF,2.0,0.0,0.0,0.0,2.0
3,addsample01,CASSQEVPGTYEQYF,3.0,0.0,0.0,0.0,0.0
4,addsample01,CSAIGIAKETQYF,2.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
210211,addsample10,CASSYSPANNSPLHF,0.0,0.0,0.0,0.0,2.0
210212,addsample10,CASSGTSGGTYNEQFF,0.0,0.0,0.0,0.0,2.0
210213,addsample10,CASSLRTSGVTYNEQFF,0.0,0.0,0.0,0.0,2.0
210214,addsample10,CARLRDGRSAFF,0.0,0.0,0.0,0.0,2.0


In [6]:
# Add the tissue type
merge_complete.loc[merge_complete['Sample']=='addsample01','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample02','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample03','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample04','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample05','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample06','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample07','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample08','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample09','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='addsample10','tissue'] = 'PBMC'

In [7]:
# Add T cell rich or poor tissue type
merge_complete.loc[merge_complete['Sample']=='addsample01','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample02','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample03','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample04','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample05','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample06','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample07','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample08','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample09','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='addsample10','tissue_type'] = 'T_cell_rich'

In [8]:
# Calculate total number of reads in each sample
total_reads = merge_complete[['Sample','nReads_TCR','nReads_MIXCR','nReads_IMREP','nReads_TRUST4','nReads_CATT']].groupby('Sample').sum().rename(columns={'nReads_TCR':'total_reads_TCR','nReads_MIXCR':'total_reads_MIXCR','nReads_IMREP':'total_reads_IMREP','nReads_TRUST4':'total_reads_TRUST4','nReads_CATT':'total_reads_CATT'})
total_reads

Unnamed: 0_level_0,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,total_reads_CATT
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
addsample01,8268.0,342.0,268.0,1472.0,8704.0
addsample02,20897.0,788.0,511.0,2756.0,8849.0
addsample03,52379.0,124.0,108.0,799.0,6803.0
addsample04,224049.0,323.0,246.0,1480.0,9012.0
addsample05,59517.0,1139.0,649.0,3571.0,8168.0
addsample06,29586.0,1521.0,914.0,4352.0,10181.0
addsample07,115957.0,354.0,261.0,1296.0,7599.0
addsample08,68388.0,704.0,499.0,2815.0,8952.0
addsample09,453906.0,285.0,195.0,1346.0,7650.0
addsample10,104959.0,514.0,297.0,1907.0,9031.0


In [9]:
# Merge dataframes 
merge = pd.merge(merge_complete, total_reads, how='outer', on=['Sample']).fillna(0)

# Calculate frequency of CDR3 reads with respect to CDR3s that occur more than once
merge['frequency_TCR'] = merge['nReads_TCR'] / (merge['total_reads_TCR'] * 1.0)
merge['frequency_MIXCR'] = merge['nReads_MIXCR'] / (merge['total_reads_MIXCR'] * 1.0)
merge['frequency_IMREP'] = merge['nReads_IMREP'] / (merge['total_reads_IMREP'] * 1.0)
merge['frequency_TRUST4'] = merge['nReads_TRUST4'] / (merge['total_reads_TRUST4'] * 1.0)  
merge['frequency_CATT'] = merge['nReads_CATT'] / (merge['total_reads_CATT'] * 1.0) 
merge.fillna(0, inplace=True)
merge

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_CATT,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,total_reads_CATT,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,frequency_CATT
0,addsample01,CAWSEGRDTGELFF,3.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,342.0,268.0,1472.0,8704.0,0.000363,0.0,0.0,0.000000,0.0
1,addsample01,CAISGLAGEETQYF,3.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,342.0,268.0,1472.0,8704.0,0.000363,0.0,0.0,0.000000,0.0
2,addsample01,CASSPGQAYEQYF,2.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,8268.0,342.0,268.0,1472.0,8704.0,0.000242,0.0,0.0,0.001359,0.0
3,addsample01,CASSQEVPGTYEQYF,3.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,342.0,268.0,1472.0,8704.0,0.000363,0.0,0.0,0.000000,0.0
4,addsample01,CSAIGIAKETQYF,2.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,342.0,268.0,1472.0,8704.0,0.000242,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210211,addsample10,CASSYSPANNSPLHF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,514.0,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0
210212,addsample10,CASSGTSGGTYNEQFF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,514.0,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0
210213,addsample10,CASSLRTSGVTYNEQFF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,514.0,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0
210214,addsample10,CARLRDGRSAFF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,514.0,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0


Differentiate low_SDI and high_SDI repertoire

In [10]:
# Calculate Shannon-Wiener index
diversity_TCR = merge[['Sample','tissue','tissue_type','CDR3','nReads_TCR','total_reads_TCR','frequency_TCR']]
diversity_TCR = diversity_TCR[diversity_TCR.nReads_TCR != 0]
clonotype_count_TCR = diversity_TCR.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_TCR')

diversity_TCR['shannon_index_TCR'] = -(diversity_TCR['frequency_TCR']*np.log(diversity_TCR['frequency_TCR']))
shannon_TCR = diversity_TCR.groupby(['Sample']).agg({'shannon_index_TCR':'sum'}).reset_index().rename(columns={'':"shannon_index_TCR"})
shannon_TCR = pd.merge(shannon_TCR, clonotype_count_TCR, on=['Sample'])

# Define low_SDI sample as the shannon_index < 2, high_SDI sample as the shannon_index >= 2
shannon_TCR['repertoire_type'] = ['low_SDI' if x < 2 else 'high_SDI' for x  in shannon_TCR['shannon_index_TCR']]
repertoire_type = shannon_TCR[['Sample','repertoire_type']]
shannon_TCR

Unnamed: 0,Sample,shannon_index_TCR,clonotype_count_TCR,repertoire_type
0,addsample01,7.951096,3123,high_SDI
1,addsample02,7.824713,7038,high_SDI
2,addsample03,9.43415,17329,high_SDI
3,addsample04,9.857141,28783,high_SDI
4,addsample05,8.906202,12839,high_SDI
5,addsample06,8.385267,7626,high_SDI
6,addsample07,9.584192,23753,high_SDI
7,addsample08,9.626994,24307,high_SDI
8,addsample09,9.985733,46941,high_SDI
9,addsample10,9.477759,25514,high_SDI


In [11]:
# Generate metadata 
metadata = pd.merge(merge, repertoire_type, how='outer', on=['Sample'])
metadata.loc[:,'class'] = metadata["tissue_type"] +"_"+ metadata["repertoire_type"]

metadata.to_csv('../summary_data/additional/all_tools_TRB.csv', index=False)
metadata

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_CATT,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,...,total_reads_IMREP,total_reads_TRUST4,total_reads_CATT,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,frequency_CATT,repertoire_type,class
0,addsample01,CAWSEGRDTGELFF,3.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,...,268.0,1472.0,8704.0,0.000363,0.0,0.0,0.000000,0.0,high_SDI,T_cell_rich_high_SDI
1,addsample01,CAISGLAGEETQYF,3.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,...,268.0,1472.0,8704.0,0.000363,0.0,0.0,0.000000,0.0,high_SDI,T_cell_rich_high_SDI
2,addsample01,CASSPGQAYEQYF,2.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,8268.0,...,268.0,1472.0,8704.0,0.000242,0.0,0.0,0.001359,0.0,high_SDI,T_cell_rich_high_SDI
3,addsample01,CASSQEVPGTYEQYF,3.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,...,268.0,1472.0,8704.0,0.000363,0.0,0.0,0.000000,0.0,high_SDI,T_cell_rich_high_SDI
4,addsample01,CSAIGIAKETQYF,2.0,0.0,0.0,0.0,0.0,PBMC,T_cell_rich,8268.0,...,268.0,1472.0,8704.0,0.000242,0.0,0.0,0.000000,0.0,high_SDI,T_cell_rich_high_SDI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210211,addsample10,CASSYSPANNSPLHF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,...,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0,high_SDI,T_cell_rich_high_SDI
210212,addsample10,CASSGTSGGTYNEQFF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,...,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0,high_SDI,T_cell_rich_high_SDI
210213,addsample10,CASSLRTSGVTYNEQFF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,...,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0,high_SDI,T_cell_rich_high_SDI
210214,addsample10,CARLRDGRSAFF,0.0,0.0,0.0,0.0,2.0,PBMC,T_cell_rich,104959.0,...,297.0,1907.0,9031.0,0.000000,0.0,0.0,0.001049,0.0,high_SDI,T_cell_rich_high_SDI


Calculate Shannon-Wiener index and absolute error for each sample across different tools

In [12]:
# MIXCR
diversity_MIXCR = merge[['Sample','tissue','tissue_type','CDR3','nReads_MIXCR','total_reads_MIXCR','frequency_MIXCR']]
diversity_MIXCR = diversity_MIXCR[diversity_MIXCR.nReads_MIXCR != 0]
clonotype_count_MIXCR = diversity_MIXCR.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_MIXCR['shannon_index_tool'] = -(diversity_MIXCR['frequency_MIXCR']*np.log(diversity_MIXCR['frequency_MIXCR']))
shannon_MIXCR = diversity_MIXCR.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_MIXCR = pd.merge(shannon_MIXCR, clonotype_count_MIXCR, on=['Sample'])
shannon_MIXCR['tool'] = 'MIXCR'

# IMREP
diversity_IMREP = merge[['Sample','tissue','tissue_type','CDR3','nReads_IMREP','total_reads_IMREP','frequency_IMREP']]
diversity_IMREP = diversity_IMREP[diversity_IMREP.nReads_IMREP != 0]
clonotype_count_IMREP = diversity_IMREP.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_IMREP['shannon_index_tool'] = -(diversity_IMREP['frequency_IMREP']*np.log(diversity_IMREP['frequency_IMREP']))
shannon_IMREP = diversity_IMREP.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_IMREP = pd.merge(shannon_IMREP, clonotype_count_IMREP, on=['Sample'])
shannon_IMREP['tool'] = 'IMREP'

# TRUST4
diversity_TRUST4 = merge[['Sample','tissue','tissue_type','CDR3','nReads_TRUST4','total_reads_TRUST4','frequency_TRUST4']]
diversity_TRUST4 = diversity_TRUST4[diversity_TRUST4.nReads_TRUST4 != 0]
clonotype_count_TRUST4 = diversity_TRUST4.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_TRUST4['shannon_index_tool'] = -(diversity_TRUST4['frequency_TRUST4']*np.log(diversity_TRUST4['frequency_TRUST4']))
shannon_TRUST4 = diversity_TRUST4.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_TRUST4 = pd.merge(shannon_TRUST4, clonotype_count_TRUST4, on=['Sample'])
shannon_TRUST4['tool'] = 'TRUST4'

# CATT
diversity_CATT = merge[['Sample','tissue','tissue_type','CDR3','nReads_CATT','total_reads_CATT','frequency_CATT']]
diversity_CATT = diversity_CATT[diversity_CATT.nReads_CATT != 0]
clonotype_count_CATT = diversity_CATT.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_CATT['shannon_index_tool'] = -(diversity_CATT['frequency_CATT']*np.log(diversity_CATT['frequency_CATT']))
shannon_CATT = diversity_CATT.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_CATT = pd.merge(shannon_CATT, clonotype_count_CATT, on=['Sample'])
shannon_CATT['tool'] = 'CATT'

diversity = pd.concat([shannon_MIXCR,shannon_IMREP,shannon_TRUST4,shannon_CATT])
diversity = pd.merge(diversity, shannon_TCR, how='outer', on=['Sample'])
diversity = diversity.fillna(0)
tissue_type = merge[['Sample','tissue','tissue_type']].drop_duplicates(keep='first')
diversity = pd.merge(diversity, tissue_type, how='inner', on=['Sample'])
diversity['SDI_absolute_error'] = np.abs(diversity['shannon_index_TCR'] - diversity['shannon_index_tool'])
diversity.loc[:,'class'] = diversity["tissue_type"] +"_"+ diversity["repertoire_type"]

# Calculate clonality
diversity['clonality_TCR'] = 1 - (diversity['shannon_index_TCR']/np.log(diversity['clonotype_count_TCR']))
diversity['clonality_tool'] = 1 - (diversity['shannon_index_tool']/np.log(diversity['clonotype_count_tool']))
diversity['clonality_absolute_error'] = np.abs(diversity['clonality_TCR'] - diversity['clonality_tool'])

diversity.to_csv('../summary_data/additional/all_tools_TRB_diversity.csv', index=False)
diversity

Unnamed: 0,Sample,shannon_index_tool,clonotype_count_tool,tool,shannon_index_TCR,clonotype_count_TCR,repertoire_type,tissue,tissue_type,SDI_absolute_error,class,clonality_TCR,clonality_tool,clonality_absolute_error
0,addsample01,4.976815,150,MIXCR,7.951096,3123,high_SDI,PBMC,T_cell_rich,2.974281,T_cell_rich_high_SDI,0.011863,0.00675,0.005113
1,addsample01,4.709526,117,IMREP,7.951096,3123,high_SDI,PBMC,T_cell_rich,3.24157,T_cell_rich_high_SDI,0.011863,0.011055,0.000807
2,addsample01,5.902409,470,TRUST4,7.951096,3123,high_SDI,PBMC,T_cell_rich,2.048687,T_cell_rich_high_SDI,0.011863,0.040685,0.028822
3,addsample01,4.004364,760,CATT,7.951096,3123,high_SDI,PBMC,T_cell_rich,3.946732,T_cell_rich_high_SDI,0.011863,0.396326,0.384463
4,addsample02,4.664017,222,MIXCR,7.824713,7038,high_SDI,PBMC,T_cell_rich,3.160697,T_cell_rich_high_SDI,0.116758,0.136721,0.019964
5,addsample02,4.437385,156,IMREP,7.824713,7038,high_SDI,PBMC,T_cell_rich,3.387328,T_cell_rich_high_SDI,0.116758,0.121285,0.004527
6,addsample02,5.813776,739,TRUST4,7.824713,7038,high_SDI,PBMC,T_cell_rich,2.010938,T_cell_rich_high_SDI,0.116758,0.119831,0.003074
7,addsample02,4.489985,933,CATT,7.824713,7038,high_SDI,PBMC,T_cell_rich,3.334729,T_cell_rich_high_SDI,0.116758,0.343416,0.226659
8,addsample03,3.956479,54,MIXCR,9.43415,17329,high_SDI,PBMC,T_cell_rich,5.477671,T_cell_rich_high_SDI,0.0334,0.008149,0.025251
9,addsample03,3.815572,47,IMREP,9.43415,17329,high_SDI,PBMC,T_cell_rich,5.618578,T_cell_rich_high_SDI,0.0334,0.00898,0.024419


Calculate number of TCR derived reads per one million RNA-Seq reads

In [13]:
RNA_seq_reads = pd.read_csv("../summary_data/additional/RNA_Seq_reads.csv")
RNA_seq_reads

Unnamed: 0,Sample,total_reads_RNA_seq
0,addsample01,101924084
1,addsample02,104352202
2,addsample03,89333304
3,addsample04,120840832
4,addsample05,90101915
5,addsample06,97169398
6,addsample07,99240419
7,addsample08,107430151
8,addsample09,93539596
9,addsample10,103818947


In [14]:
reads = metadata[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_TCR','total_reads_MIXCR','total_reads_IMREP','total_reads_TRUST4','total_reads_CATT']]
reads = reads.drop_duplicates(keep='first')
reads

# Add the column of RNA-Seq reads
reads = pd.merge(reads, RNA_seq_reads, how='left', on=['Sample'])
reads

# Calculate number of TCR derived reads per one million RNA-Seq reads 
# MIXCR
MIXCR = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_MIXCR','total_reads_RNA_seq']]
MIXCR.rename(columns={'total_reads_MIXCR':'total_reads_tool'}, inplace=True)
MIXCR['tool'] = 'MIXCR'
MIXCR.loc[:,'TCR_derived_by_RNA_seq_tool'] = MIXCR['total_reads_tool']/MIXCR['total_reads_RNA_seq'] *1000000

# IMREP
IMREP = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_IMREP','total_reads_RNA_seq']]
IMREP.rename(columns={'total_reads_IMREP':'total_reads_tool'}, inplace=True)
IMREP['tool'] = 'IMREP'
IMREP.loc[:,'TCR_derived_by_RNA_seq_tool'] = IMREP['total_reads_tool']/IMREP['total_reads_RNA_seq']*1000000

# TRUST4
TRUST4 = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_TRUST4','total_reads_RNA_seq']]
TRUST4.rename(columns={'total_reads_TRUST4':'total_reads_tool'}, inplace=True)
TRUST4['tool'] = 'TRUST4'
TRUST4.loc[:,'TCR_derived_by_RNA_seq_tool'] = TRUST4['total_reads_tool']/TRUST4['total_reads_RNA_seq']*1000000

# CATT
CATT = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_CATT','total_reads_RNA_seq']]
CATT.rename(columns={'total_reads_CATT':'total_reads_tool'}, inplace=True)
CATT.loc[:,'tool'] = 'CATT'
CATT.loc[:,'TCR_derived_by_RNA_seq_tool'] = CATT['total_reads_tool']/CATT['total_reads_RNA_seq']*1000000

reads_count = pd.concat([MIXCR,IMREP,TRUST4,CATT])
reads_count.to_csv('../summary_data/additional/all_tools_TRB_reads.csv', index=False)

reads_count

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/us

Unnamed: 0,Sample,tissue,tissue_type,repertoire_type,class,total_reads_tool,total_reads_RNA_seq,tool,TCR_derived_by_RNA_seq_tool
0,addsample01,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,342.0,101924084,MIXCR,3.355439
1,addsample02,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,788.0,104352202,MIXCR,7.55135
2,addsample03,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,124.0,89333304,MIXCR,1.38806
3,addsample04,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,323.0,120840832,MIXCR,2.672938
4,addsample05,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,1139.0,90101915,MIXCR,12.641241
5,addsample06,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,1521.0,97169398,MIXCR,15.653076
6,addsample07,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,354.0,99240419,MIXCR,3.567095
7,addsample08,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,704.0,107430151,MIXCR,6.553095
8,addsample09,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,285.0,93539596,MIXCR,3.046838
9,addsample10,PBMC,T_cell_rich,high_SDI,T_cell_rich_high_SDI,514.0,103818947,MIXCR,4.950927
