In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the merged extracted-feature CSV for each source (IMREP, TRUST4, MIXCR
# and TCR_Seq). Paths are relative to the notebook's working directory; the
# unpacking order below matches the order of the paths.
IMREP_df, TRUST4_df, MIXCR_df, TCR_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../summary_data/original/IMREP/IMREP_TRB_merged_extracted_features.csv",
        "../summary_data/original/TRUST4/TRUST4_TRB_merged_extracted_features.csv",
        "../summary_data/original/MIXCR/MIXCR_TRB_merged_extracted_features.csv",
        "../summary_data/original/TCR_Seq/TCR_merged_extracted_features.csv",
    )
]

In [3]:
# Keep only the rows supported by more than one read (nReads > 1) in each table.
IMREP_df = IMREP_df.loc[IMREP_df['nReads'].gt(1)]
TRUST4_df = TRUST4_df.loc[TRUST4_df['nReads'].gt(1)]
MIXCR_df = MIXCR_df.loc[MIXCR_df['nReads'].gt(1)]
TCR_df = TCR_df.loc[TCR_df['nReads'].gt(1)]

In [4]:
# Suffix each table's nReads column with its source name so the four read-count
# columns stay distinct once the tables are merged on shared keys.
IMREP_df = IMREP_df.rename({"nReads": "nReads_IMREP"}, axis="columns")
TRUST4_df = TRUST4_df.rename({"nReads": "nReads_TRUST4"}, axis="columns")
MIXCR_df = MIXCR_df.rename({"nReads": "nReads_MIXCR"}, axis="columns")
TCR_df = TCR_df.rename({"nReads": "nReads_TCR"}, axis="columns")

Complete dataframe across all samples and tools

In [5]:
# Combine the four tables pairwise on the (Sample, CDR3) key pair.
# Outer joins keep every (Sample, CDR3) row seen in any table; the cells a
# table did not supply are filled with 0 after each join.
merge_IMREP_TRUST4 = IMREP_df.merge(TRUST4_df, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR = MIXCR_df.merge(merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_complete = TCR_df.merge(merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3']).fillna(0)

merge_complete

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0
...,...,...,...,...,...,...
375331,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,3.0
375332,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,2.0
375333,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,2.0
375334,sample14,CASSVEGYEQYF,0.0,0.0,0.0,2.0


In [6]:
# Add the tissue type
# A single lookup table replaces one masked assignment per sample: the mapping
# is easier to audit and the frame is scanned once instead of nineteen times.
TISSUE_BY_SAMPLE = {
    'sample01': 'PBMC',
    'sample02': 'PBMC',
    'sample03': 'PBMC',
    'sample04': 'PBMC',
    'sample05': 'PBMC',
    'sample06': 'melanoma',
    'sample07': 'melanoma',
    'sample08': 'melanoma',
    'sample09': 'melanoma',
    'sample10': 'melanoma',
    'sample11': 'melanoma',
    'sample12': 'melanoma',
    'sample13': 'melanoma',
    'sample14': 'melanoma',
    'SRR5233639': 'lymph_node',
    'SRR5233637': 'small_intestine',
    'TCGA-CZ-4862': 'kidney',
    'TCGA-CZ-5463': 'kidney',
    'TCGA-CZ-5985': 'kidney',
}

# Samples missing from the table get NaN, exactly as they did with the
# per-sample assignments.
merge_complete['tissue'] = merge_complete['Sample'].map(TISSUE_BY_SAMPLE)

In [7]:
# Add T cell rich or poor tissue type
# A single lookup table replaces one masked assignment per sample: the mapping
# is easier to audit and the frame is scanned once instead of nineteen times.
TISSUE_TYPE_BY_SAMPLE = {
    'sample01': 'T_cell_rich',
    'sample02': 'T_cell_rich',
    'sample03': 'T_cell_rich',
    'sample04': 'T_cell_rich',
    'sample05': 'T_cell_rich',
    'sample06': 'T_cell_poor',
    'sample07': 'T_cell_poor',
    'sample08': 'T_cell_poor',
    'sample09': 'T_cell_poor',
    'sample10': 'T_cell_poor',
    'sample11': 'T_cell_poor',
    'sample12': 'T_cell_poor',
    'sample13': 'T_cell_poor',
    'sample14': 'T_cell_poor',
    'SRR5233639': 'T_cell_rich',
    'SRR5233637': 'T_cell_poor',
    'TCGA-CZ-4862': 'T_cell_poor',
    'TCGA-CZ-5463': 'T_cell_poor',
    'TCGA-CZ-5985': 'T_cell_poor',
}

# Samples missing from the table get NaN, exactly as they did with the
# per-sample assignments.
merge_complete['tissue_type'] = merge_complete['Sample'].map(TISSUE_TYPE_BY_SAMPLE)

In [8]:
# Per-sample read totals: sum each nReads_* column within every Sample and
# expose the sums under matching total_reads_* names (indexed by Sample).
read_count_columns = ['nReads_TCR', 'nReads_MIXCR', 'nReads_IMREP', 'nReads_TRUST4']
total_reads = (
    merge_complete
    .groupby('Sample')[read_count_columns]
    .sum()
    .rename(columns={
        'nReads_TCR': 'total_reads_TCR',
        'nReads_MIXCR': 'total_reads_MIXCR',
        'nReads_IMREP': 'total_reads_IMREP',
        'nReads_TRUST4': 'total_reads_TRUST4',
    })
)
total_reads

Unnamed: 0_level_0,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SRR5233637,3047629.0,84.0,315.0,544.0
SRR5233639,3256697.0,1346.0,6143.0,9054.0
TCGA-CZ-4862,16784.0,0.0,50.0,710.0
TCGA-CZ-5463,806.0,0.0,16.0,366.0
TCGA-CZ-5985,12998.0,0.0,23.0,282.0
sample01,90577.0,132472.0,219185.0,264884.0
sample02,87762.0,41370.0,57700.0,89947.0
sample03,305953.0,52066.0,64914.0,99723.0
sample04,104779.0,158.0,1354.0,2006.0
sample05,18617.0,55.0,111.0,186.0


In [9]:
# Attach the per-sample totals to every (Sample, CDR3) row.
merge = merge_complete.merge(total_reads, how='outer', on=['Sample']).fillna(0)

# Frequency of each CDR3 = its read count divided by the sample total for the
# same source. The totals are built from the rows present in merge_complete.
for source_name in ('TCR', 'MIXCR', 'IMREP', 'TRUST4'):
    merge['frequency_' + source_name] = (
        merge['nReads_' + source_name] / merge['total_reads_' + source_name]
    )

# A total of 0 yields 0/0 = NaN; report those frequencies as 0.
merge = merge.fillna(0)
merge

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000010,0.0,0.0,0.000000
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000004,0.0,0.0,0.000000
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000008,0.0,0.0,0.000000
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000022,0.0,0.0,0.000000
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000017,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375331,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,3.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.008547
375332,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698
375333,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698
375334,sample14,CASSVEGYEQYF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698


Differentiate monoclonal and polyclonal repertoires

In [10]:
# Calculate Shannon-Wiener index per sample from the TCR columns:
#   H = -sum(p * ln(p)), with p taken from frequency_TCR and ln = np.log.

# Samples with an index below this value are labelled monoclonal, the rest polyclonal.
MONOCLONAL_SHANNON_THRESHOLD = 2

# Keep only clonotypes with a non-zero TCR read count (also avoids log(0)).
# .copy() makes this an independent frame so the column assignment below does
# not write to a slice of `merge` (SettingWithCopyWarning).
diversity_TCR = merge.loc[
    merge['nReads_TCR'] != 0,
    ['Sample', 'tissue', 'tissue_type', 'CDR3', 'nReads_TCR', 'total_reads_TCR', 'frequency_TCR'],
].copy()

# Number of remaining clonotype rows per sample.
clonotype_count_TCR = diversity_TCR.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_TCR')

# Per-clonotype term -p*ln(p); summing the terms within a sample gives its index.
diversity_TCR['shannon_index_TCR'] = -(diversity_TCR['frequency_TCR'] * np.log(diversity_TCR['frequency_TCR']))
shannon_TCR = diversity_TCR.groupby(['Sample'])['shannon_index_TCR'].sum().reset_index()
shannon_TCR = pd.merge(shannon_TCR, clonotype_count_TCR, on=['Sample'])

# Define monoclonal sample as the shannon_index < 2, polyclonal sample as the shannon_index >= 2
shannon_TCR['repertoire_type'] = [
    'monoclonal' if index_value < MONOCLONAL_SHANNON_THRESHOLD else 'polyclonal'
    for index_value in shannon_TCR['shannon_index_TCR']
]
repertoire_type = shannon_TCR[['Sample', 'repertoire_type']]
shannon_TCR

Unnamed: 0,Sample,shannon_index_TCR,clonotype_count_TCR,repertoire_type
0,SRR5233637,7.712411,27947,polyclonal
1,SRR5233639,10.059157,202869,polyclonal
2,TCGA-CZ-4862,7.485462,4237,polyclonal
3,TCGA-CZ-5463,2.957966,99,polyclonal
4,TCGA-CZ-5985,5.638249,1415,polyclonal
5,sample01,1.072675,3001,monoclonal
6,sample02,1.001202,3229,monoclonal
7,sample03,0.960516,7697,monoclonal
8,sample04,9.598323,26802,polyclonal
9,sample05,8.005115,5440,polyclonal


In [11]:
# Build the per-CDR3 metadata table: attach each sample's repertoire_type label
# to every row, then derive a combined "<tissue_type>_<repertoire_type>" class.
metadata = merge.merge(repertoire_type, how='outer', on=['Sample'])
metadata['class'] = metadata['tissue_type'] + "_" + metadata['repertoire_type']

# Save the table without the index column, then display it.
metadata.to_csv('../summary_data/original/all_tools_TRB.csv', index=False)
metadata

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,repertoire_type,class
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000010,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000004,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000008,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000022,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,0.000017,0.0,0.0,0.000000,polyclonal,T_cell_poor_polyclonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375331,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,3.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.008547,polyclonal,T_cell_poor_polyclonal
375332,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698,polyclonal,T_cell_poor_polyclonal
375333,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698,polyclonal,T_cell_poor_polyclonal
375334,sample14,CASSVEGYEQYF,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,0.000000,0.0,0.0,0.005698,polyclonal,T_cell_poor_polyclonal


Calculate Shannon-Wiener index and absolute error for each sample across different tools

In [12]:
def _tool_shannon(frame, tool):
    """Compute per-sample Shannon-Wiener index and clonotype count for one tool.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding 'Sample', 'tissue', 'tissue_type', 'CDR3' and the
        'nReads_<tool>', 'total_reads_<tool>', 'frequency_<tool>' columns.
    tool : str
        Column suffix of the tool, e.g. 'MIXCR'.

    Returns
    -------
    tuple of pandas.DataFrame
        (detected, clonotype_count, shannon) where
        - detected: rows with a non-zero read count for the tool, plus the
          per-clonotype term in 'shannon_index_tool';
        - clonotype_count: 'Sample' and 'clonotype_count_tool';
        - shannon: 'Sample', 'shannon_index_tool', 'clonotype_count_tool', 'tool'.
    """
    reads_col = 'nReads_' + tool
    freq_col = 'frequency_' + tool

    # Keep only clonotypes the tool reported (also avoids log(0)). .copy() makes
    # this an independent frame so the column assignment below does not write
    # to a slice of `frame` (SettingWithCopyWarning).
    detected = frame.loc[
        frame[reads_col] != 0,
        ['Sample', 'tissue', 'tissue_type', 'CDR3', reads_col, 'total_reads_' + tool, freq_col],
    ].copy()
    clonotype_count = detected.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

    # Per-clonotype term -p*ln(p); summing within a sample gives H = -sum(p*ln(p)).
    detected['shannon_index_tool'] = -(detected[freq_col] * np.log(detected[freq_col]))
    shannon = detected.groupby(['Sample'])['shannon_index_tool'].sum().reset_index()
    shannon = pd.merge(shannon, clonotype_count, on=['Sample'])
    shannon['tool'] = tool
    return detected, clonotype_count, shannon


# MIXCR
diversity_MIXCR, clonotype_count_MIXCR, shannon_MIXCR = _tool_shannon(merge, 'MIXCR')

# IMREP
diversity_IMREP, clonotype_count_IMREP, shannon_IMREP = _tool_shannon(merge, 'IMREP')

# TRUST4
diversity_TRUST4, clonotype_count_TRUST4, shannon_TRUST4 = _tool_shannon(merge, 'TRUST4')

# Stack the per-tool results (one row per sample and tool) and put the TCR
# index, clonotype count and repertoire label from shannon_TCR next to them.
# NOTE(review): a sample for which a tool has no non-zero row contributes no
# row for that (sample, tool) pair here, rather than a row with index 0.
diversity = pd.concat([shannon_MIXCR, shannon_IMREP, shannon_TRUST4])
diversity = pd.merge(diversity, shannon_TCR, how='outer', on=['Sample'])
# Values left missing by the outer merge become 0 in every column, including
# the non-numeric ones.
diversity = diversity.fillna(0)

# One (Sample, tissue, tissue_type) row per sample, used to annotate the results.
tissue_type = merge[['Sample', 'tissue', 'tissue_type']].drop_duplicates(keep='first')
diversity = pd.merge(diversity, tissue_type, how='inner', on=['Sample'])

# Absolute difference between the TCR index and the tool's index.
diversity['absolute_error'] = np.abs(diversity['shannon_index_TCR'] - diversity['shannon_index_tool'])
diversity.loc[:, 'class'] = diversity["tissue_type"] + "_" + diversity["repertoire_type"]
diversity.to_csv('../summary_data/original/all_tools_TRB_diversity.csv', index=False)

diversity

Unnamed: 0,Sample,shannon_index_tool,clonotype_count_tool,tool,shannon_index_TCR,clonotype_count_TCR,repertoire_type,tissue,tissue_type,absolute_error,class
0,SRR5233637,3.061608,24,MIXCR,7.712411,27947,polyclonal,small_intestine,T_cell_poor,4.650803,T_cell_poor_polyclonal
1,SRR5233637,4.562193,115,IMREP,7.712411,27947,polyclonal,small_intestine,T_cell_poor,3.150218,T_cell_poor_polyclonal
2,SRR5233637,4.309543,118,TRUST4,7.712411,27947,polyclonal,small_intestine,T_cell_poor,3.402869,T_cell_poor_polyclonal
3,SRR5233639,5.478183,353,MIXCR,10.059157,202869,polyclonal,lymph_node,T_cell_rich,4.580975,T_cell_rich_polyclonal
4,SRR5233639,7.548679,2342,IMREP,10.059157,202869,polyclonal,lymph_node,T_cell_rich,2.510478,T_cell_rich_polyclonal
5,SRR5233639,7.186224,2362,TRUST4,10.059157,202869,polyclonal,lymph_node,T_cell_rich,2.872934,T_cell_rich_polyclonal
6,sample01,0.829164,2592,MIXCR,1.072675,3001,monoclonal,PBMC,T_cell_rich,0.24351,T_cell_rich_monoclonal
7,sample01,1.163479,3682,IMREP,1.072675,3001,monoclonal,PBMC,T_cell_rich,0.090805,T_cell_rich_monoclonal
8,sample01,1.165598,4154,TRUST4,1.072675,3001,monoclonal,PBMC,T_cell_rich,0.092923,T_cell_rich_monoclonal
9,sample02,0.98874,1565,MIXCR,1.001202,3229,monoclonal,PBMC,T_cell_rich,0.012462,T_cell_rich_monoclonal
