In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every FutureWarning from here on; the filter is installed globally,
# so it applies to all cells executed after this one.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Read the extracted-feature tables: one CSV per tool directory
# (IMREP, TRUST4, MIXCR, CATT) plus the TCR_Seq table. The path order below
# matches the order of the names on the left-hand side.
IMREP_df, TRUST4_df, MIXCR_df, CATT_df, TCR_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../summary_data/original/IMREP/IMREP_TRB_merged_extracted_features.csv",
        "../summary_data/original/TRUST4/TRUST4_TRB_merged_extracted_features.csv",
        "../summary_data/original/MIXCR/MIXCR_TRB_merged_extracted_features.csv",
        "../summary_data/original/CATT/CATT_TRB_merged_extracted_features.csv",
        "../summary_data/original/TCR_Seq/TCR_merged_extracted_features.csv",
    )
)

In [3]:
# Keep only the rows backed by more than one read (nReads > 1)
IMREP_df = IMREP_df.loc[IMREP_df['nReads'].gt(1)]
TRUST4_df = TRUST4_df.loc[TRUST4_df['nReads'].gt(1)]
MIXCR_df = MIXCR_df.loc[MIXCR_df['nReads'].gt(1)]
CATT_df = CATT_df.loc[CATT_df['nReads'].gt(1)]
TCR_df = TCR_df.loc[TCR_df['nReads'].gt(1)]

In [4]:
# Give each table's nReads column a tool-specific name so the read counts stay
# distinguishable once the tables are merged on Sample and CDR3
IMREP_df = IMREP_df.rename({"nReads": "nReads_IMREP"}, axis="columns")
TRUST4_df = TRUST4_df.rename({"nReads": "nReads_TRUST4"}, axis="columns")
MIXCR_df = MIXCR_df.rename({"nReads": "nReads_MIXCR"}, axis="columns")
CATT_df = CATT_df.rename({"nReads": "nReads_CATT"}, axis="columns")
TCR_df = TCR_df.rename({"nReads": "nReads_TCR"}, axis="columns")

Complete dataframe across all samples and tools

In [5]:
# Outer-join the five tables on the (Sample, CDR3) key pair so that rows reported by
# only some of the tools are kept; values missing after each join are filled with 0.
_join_on_sample_cdr3 = {'how': 'outer', 'on': ['Sample', 'CDR3']}

merge_IMREP_TRUST4 = IMREP_df.merge(TRUST4_df, **_join_on_sample_cdr3).fillna(0)
merge_IMREP_TRUST4_MIXCR = MIXCR_df.merge(merge_IMREP_TRUST4, **_join_on_sample_cdr3).fillna(0)
merge_IMREP_TRUST4_MIXCR_CATT = CATT_df.merge(merge_IMREP_TRUST4_MIXCR, **_join_on_sample_cdr3).fillna(0)
merge_complete = TCR_df.merge(merge_IMREP_TRUST4_MIXCR_CATT, **_join_on_sample_cdr3).fillna(0)

merge_complete

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_CATT,nReads_MIXCR,nReads_IMREP,nReads_TRUST4
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0,0.0
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0,0.0
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0,0.0
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0,0.0
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
378937,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,0.0,3.0
378938,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,0.0,2.0
378939,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,0.0,2.0
378940,sample14,CASSVEGYEQYF,0.0,0.0,0.0,0.0,2.0


In [6]:
# Add the tissue type
# A single lookup table replaces one boolean-mask assignment per sample: the Sample
# column is scanned once, and annotating a new sample is a one-line change.
TISSUE_BY_SAMPLE = {
    'sample01': 'PBMC',
    'sample02': 'PBMC',
    'sample03': 'PBMC',
    'sample04': 'PBMC',
    'sample05': 'PBMC',
    'sample06': 'melanoma',
    'sample07': 'melanoma',
    'sample08': 'melanoma',
    'sample09': 'melanoma',
    'sample10': 'melanoma',
    'sample11': 'melanoma',
    'sample12': 'melanoma',
    'sample13': 'melanoma',
    'sample14': 'melanoma',
    'SRR5233639': 'lymph_node',
    'SRR5233637': 'small_intestine',
    'TCGA-CZ-4862': 'kidney',
    'TCGA-CZ-5463': 'kidney',
    'TCGA-CZ-5985': 'kidney',
}

# Samples that are not listed in the table end up with NaN in the tissue column.
merge_complete['tissue'] = merge_complete['Sample'].map(TISSUE_BY_SAMPLE)

In [7]:
# Add T cell rich or poor tissue type
# A single lookup table replaces one boolean-mask assignment per sample: the Sample
# column is scanned once, and annotating a new sample is a one-line change.
TISSUE_TYPE_BY_SAMPLE = {
    'sample01': 'T_cell_rich',
    'sample02': 'T_cell_rich',
    'sample03': 'T_cell_rich',
    'sample04': 'T_cell_rich',
    'sample05': 'T_cell_rich',
    'sample06': 'T_cell_poor',
    'sample07': 'T_cell_poor',
    'sample08': 'T_cell_poor',
    'sample09': 'T_cell_poor',
    'sample10': 'T_cell_poor',
    'sample11': 'T_cell_poor',
    'sample12': 'T_cell_poor',
    'sample13': 'T_cell_poor',
    'sample14': 'T_cell_poor',
    'SRR5233639': 'T_cell_rich',
    'SRR5233637': 'T_cell_poor',
    'TCGA-CZ-4862': 'T_cell_poor',
    'TCGA-CZ-5463': 'T_cell_poor',
    'TCGA-CZ-5985': 'T_cell_poor',
}

# Samples that are not listed in the table end up with NaN in the tissue_type column.
merge_complete['tissue_type'] = merge_complete['Sample'].map(TISSUE_TYPE_BY_SAMPLE)

In [8]:
# Sum the read counts of every tool within each sample
_total_column_for = {
    'nReads_TCR': 'total_reads_TCR',
    'nReads_MIXCR': 'total_reads_MIXCR',
    'nReads_IMREP': 'total_reads_IMREP',
    'nReads_TRUST4': 'total_reads_TRUST4',
    'nReads_CATT': 'total_reads_CATT',
}
total_reads = (
    merge_complete
    .groupby('Sample')[list(_total_column_for)]
    .sum()
    .rename(columns=_total_column_for)
)
total_reads

Unnamed: 0_level_0,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,total_reads_CATT
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SRR5233637,3047629.0,84.0,315.0,544.0,5616.0
SRR5233639,3256697.0,1346.0,6143.0,9054.0,5873.0
TCGA-CZ-4862,16784.0,0.0,50.0,710.0,642.0
TCGA-CZ-5463,806.0,0.0,16.0,366.0,845.0
TCGA-CZ-5985,12998.0,0.0,23.0,282.0,915.0
sample01,90577.0,132472.0,219185.0,264884.0,214716.0
sample02,87762.0,41370.0,57700.0,89947.0,72800.0
sample03,305953.0,52066.0,64914.0,99723.0,79629.0
sample04,104779.0,158.0,1354.0,2006.0,4050.0
sample05,18617.0,55.0,111.0,186.0,1724.0


In [9]:
# Attach each sample's per-tool read totals to every row of the complete table.
# No frame-wide fillna(0) here: the read counts were already zero-filled when the
# tables were merged, and a blanket fill would also write a numeric 0 into the text
# columns (tissue, tissue_type) of any sample that has no annotation.
merge = pd.merge(merge_complete, total_reads, how='outer', on=['Sample'])

# Calculate frequency of CDR3 reads with respect to CDR3s that occur more than once
# (the totals are sums over the rows that passed the nReads > 1 filter).
# When a tool has no reads at all in a sample the ratio is 0/0 = NaN; such
# frequencies are reported as 0.
for _tool in ('TCR', 'MIXCR', 'IMREP', 'TRUST4', 'CATT'):
    merge['frequency_' + _tool] = (
        merge['nReads_' + _tool] / merge['total_reads_' + _tool]
    ).fillna(0)
merge

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_CATT,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,total_reads_CATT,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,frequency_CATT
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,5616.0,0.000010,0.0,0.0,0.000000,0.0
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,5616.0,0.000004,0.0,0.0,0.000000,0.0
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,5616.0,0.000008,0.0,0.0,0.000000,0.0
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,5616.0,0.000022,0.0,0.0,0.000000,0.0
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,84.0,315.0,544.0,5616.0,0.000017,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378937,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,0.0,3.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,751.0,0.000000,0.0,0.0,0.008547,0.0
378938,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,751.0,0.000000,0.0,0.0,0.005698,0.0
378939,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,751.0,0.000000,0.0,0.0,0.005698,0.0
378940,sample14,CASSVEGYEQYF,0.0,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,113.0,152.0,351.0,751.0,0.000000,0.0,0.0,0.005698,0.0


Differentiate low_SDI and high_SDI repertoires (Shannon-Wiener index of the TCR data below 1 vs. at least 1)

In [10]:
# Calculate Shannon-Wiener index, H = -sum(p * ln(p)), of the TCR data for every sample
# Only rows with a non-zero TCR read count contribute. The explicit .copy() makes the
# filtered rows an independent frame, so adding a column to it below cannot raise
# pandas' SettingWithCopyWarning (emitted by pandas versions without copy-on-write).
diversity_TCR = merge.loc[
    merge['nReads_TCR'] != 0,
    ['Sample','tissue','tissue_type','CDR3','nReads_TCR','total_reads_TCR','frequency_TCR'],
].copy()
clonotype_count_TCR = diversity_TCR.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_TCR')

# Per-row term -p * ln(p); summing the terms within a sample gives the index.
diversity_TCR['shannon_index_TCR'] = -(diversity_TCR['frequency_TCR'] * np.log(diversity_TCR['frequency_TCR']))
# reset_index() already yields the columns Sample and shannon_index_TCR, so no rename is needed.
shannon_TCR = diversity_TCR.groupby(['Sample']).agg({'shannon_index_TCR': 'sum'}).reset_index()
shannon_TCR = pd.merge(shannon_TCR, clonotype_count_TCR, on=['Sample'])

# Define low SDI sample as the shannon_index < 1, high SDI sample as the shannon_index >= 1
# (a NaN index compares False against 1 and is therefore labelled high_SDI)
shannon_TCR['repertoire_type'] = np.where(shannon_TCR['shannon_index_TCR'] < 1, 'low_SDI', 'high_SDI')
repertoire_type = shannon_TCR[['Sample','repertoire_type']]
shannon_TCR

Unnamed: 0,Sample,shannon_index_TCR,clonotype_count_TCR,repertoire_type
0,SRR5233637,7.712411,27947,high_SDI
1,SRR5233639,10.059157,202869,high_SDI
2,TCGA-CZ-4862,7.485462,4237,high_SDI
3,TCGA-CZ-5463,2.957966,99,high_SDI
4,TCGA-CZ-5985,5.638249,1415,high_SDI
5,sample01,1.072675,3001,high_SDI
6,sample02,1.001202,3229,high_SDI
7,sample03,0.960516,7697,low_SDI
8,sample04,9.598323,26802,high_SDI
9,sample05,8.005115,5440,high_SDI


In [11]:
# Generate metadata
# Every row of the merged table gets its sample's repertoire type and a combined
# "<tissue_type>_<repertoire_type>" class label.
metadata = merge.merge(repertoire_type, how='outer', on=['Sample'])
metadata['class'] = metadata['tissue_type'] + '_' + metadata['repertoire_type']

metadata

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_CATT,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,...,total_reads_IMREP,total_reads_TRUST4,total_reads_CATT,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,frequency_CATT,repertoire_type,class
0,SRR5233637,CASSPRVTSGTYEQYF,32.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,...,315.0,544.0,5616.0,0.000010,0.0,0.0,0.000000,0.0,high_SDI,T_cell_poor_high_SDI
1,SRR5233637,CASSYSDRGGQPQHF,13.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,...,315.0,544.0,5616.0,0.000004,0.0,0.0,0.000000,0.0,high_SDI,T_cell_poor_high_SDI
2,SRR5233637,CASKVALGGETQYF,25.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,...,315.0,544.0,5616.0,0.000008,0.0,0.0,0.000000,0.0,high_SDI,T_cell_poor_high_SDI
3,SRR5233637,CASRAPGTGTLGSPLHF,66.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,...,315.0,544.0,5616.0,0.000022,0.0,0.0,0.000000,0.0,high_SDI,T_cell_poor_high_SDI
4,SRR5233637,CASSSGQGGPSTEAFF,52.0,0.0,0.0,0.0,0.0,small_intestine,T_cell_poor,3047629.0,...,315.0,544.0,5616.0,0.000017,0.0,0.0,0.000000,0.0,high_SDI,T_cell_poor_high_SDI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378937,sample14,CASSESPAFGEKLFF,0.0,0.0,0.0,0.0,3.0,melanoma,T_cell_poor,749686.0,...,152.0,351.0,751.0,0.000000,0.0,0.0,0.008547,0.0,high_SDI,T_cell_poor_high_SDI
378938,sample14,CASSWTGSQETQYF,0.0,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,...,152.0,351.0,751.0,0.000000,0.0,0.0,0.005698,0.0,high_SDI,T_cell_poor_high_SDI
378939,sample14,CASRTGLAGGIGELFF,0.0,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,...,152.0,351.0,751.0,0.000000,0.0,0.0,0.005698,0.0,high_SDI,T_cell_poor_high_SDI
378940,sample14,CASSVEGYEQYF,0.0,0.0,0.0,0.0,2.0,melanoma,T_cell_poor,749686.0,...,152.0,351.0,751.0,0.000000,0.0,0.0,0.005698,0.0,high_SDI,T_cell_poor_high_SDI


Calculate the Shannon-Wiener index, clonality, and their absolute errors for each sample across the different tools

In [12]:
def _tool_shannon_summary(clonotypes, tool):
    """Summarise one tool's repertoire diversity per sample.

    Parameters
    ----------
    clonotypes : pandas.DataFrame
        Table with the columns Sample, tissue, tissue_type, CDR3 and the
        tool-specific nReads_<tool>, total_reads_<tool>, frequency_<tool>.
    tool : str
        Suffix of the tool-specific columns, e.g. 'MIXCR'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(per_clonotype, clonotype_count, shannon)``:
        the rows with a non-zero read count for the tool, each with its
        -p * ln(p) term in ``shannon_index_tool``; the number of such rows per
        sample (``clonotype_count_tool``); and one row per sample holding
        ``shannon_index_tool`` (sum of the terms), ``clonotype_count_tool``
        and ``tool``.
    """
    reads_col = 'nReads_' + tool
    freq_col = 'frequency_' + tool

    # .copy() makes the filtered rows an independent frame, so the column added
    # below cannot raise pandas' SettingWithCopyWarning.
    per_clonotype = clonotypes.loc[
        clonotypes[reads_col] != 0,
        ['Sample', 'tissue', 'tissue_type', 'CDR3', reads_col, 'total_reads_' + tool, freq_col],
    ].copy()
    clonotype_count = (
        per_clonotype.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')
    )

    per_clonotype['shannon_index_tool'] = -(per_clonotype[freq_col] * np.log(per_clonotype[freq_col]))
    shannon = per_clonotype.groupby(['Sample']).agg({'shannon_index_tool': 'sum'}).reset_index()
    shannon = pd.merge(shannon, clonotype_count, on=['Sample'])
    shannon['tool'] = tool
    return per_clonotype, clonotype_count, shannon


# MIXCR
diversity_MIXCR, clonotype_count_MIXCR, shannon_MIXCR = _tool_shannon_summary(merge, 'MIXCR')

# IMREP
diversity_IMREP, clonotype_count_IMREP, shannon_IMREP = _tool_shannon_summary(merge, 'IMREP')

# TRUST4
diversity_TRUST4, clonotype_count_TRUST4, shannon_TRUST4 = _tool_shannon_summary(merge, 'TRUST4')

# CATT
diversity_CATT, clonotype_count_CATT, shannon_CATT = _tool_shannon_summary(merge, 'CATT')

# Stack the per-tool summaries and put the TCR values next to every (sample, tool) row.
diversity = pd.concat([shannon_MIXCR, shannon_IMREP, shannon_TRUST4, shannon_CATT])
diversity = pd.merge(diversity, shannon_TCR, how='outer', on=['Sample'])
# NOTE(review): because of the outer join, a sample present in shannon_TCR but in none
# of the tool summaries is kept as one row whose tool columns (including `tool` itself)
# become 0 here; such rows take part in the class-level statistics but match no tool
# name - confirm this is intended.
diversity = diversity.fillna(0)
tissue_type = merge[['Sample','tissue','tissue_type']].drop_duplicates(keep='first')
diversity = pd.merge(diversity, tissue_type, how='inner', on=['Sample'])
diversity['SDI_absolute_error'] = np.abs(diversity['shannon_index_TCR'] - diversity['shannon_index_tool'])
diversity['class'] = diversity["tissue_type"] +"_"+ diversity["repertoire_type"]

# Calculate clonality: 1 - H / ln(number of clonotypes)
# ln(1) == 0, so a sample in which only one clonotype was found gives 0/0 and its
# clonality (and clonality error) is NaN.
diversity['clonality_TCR'] = 1 - (diversity['shannon_index_TCR']/np.log(diversity['clonotype_count_TCR']))
diversity['clonality_tool'] = 1 - (diversity['shannon_index_tool']/np.log(diversity['clonotype_count_tool']))
diversity['clonality_absolute_error'] = np.abs(diversity['clonality_TCR'] - diversity['clonality_tool'])

diversity

Unnamed: 0,Sample,shannon_index_tool,clonotype_count_tool,tool,shannon_index_TCR,clonotype_count_TCR,repertoire_type,tissue,tissue_type,SDI_absolute_error,class,clonality_TCR,clonality_tool,clonality_absolute_error
0,SRR5233637,3.061608,24,MIXCR,7.712411,27947,high_SDI,small_intestine,T_cell_poor,4.650803,T_cell_poor_high_SDI,0.246692,0.036640,0.210052
1,SRR5233637,4.562193,115,IMREP,7.712411,27947,high_SDI,small_intestine,T_cell_poor,3.150218,T_cell_poor_high_SDI,0.246692,0.038512,0.208180
2,SRR5233637,4.309543,118,TRUST4,7.712411,27947,high_SDI,small_intestine,T_cell_poor,3.402869,T_cell_poor_high_SDI,0.246692,0.096662,0.150031
3,SRR5233637,1.342190,144,CATT,7.712411,27947,high_SDI,small_intestine,T_cell_poor,6.370222,T_cell_poor_high_SDI,0.246692,0.729932,0.483239
4,SRR5233639,5.478183,353,MIXCR,10.059157,202869,high_SDI,lymph_node,T_cell_rich,4.580975,T_cell_rich_high_SDI,0.176850,0.066187,0.110662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,TCGA-CZ-5985,2.174110,22,TRUST4,5.638249,1415,high_SDI,kidney,T_cell_poor,3.464139,T_cell_poor_high_SDI,0.222834,0.296642,0.073808
68,TCGA-CZ-5985,1.160764,25,CATT,5.638249,1415,high_SDI,kidney,T_cell_poor,4.477485,T_cell_poor_high_SDI,0.222834,0.639388,0.416554
69,sample07,0.000000,1,IMREP,4.680209,3548,high_SDI,melanoma,T_cell_poor,4.680209,T_cell_poor_high_SDI,0.427437,,
70,sample07,0.970585,4,TRUST4,4.680209,3548,high_SDI,melanoma,T_cell_poor,3.709624,T_cell_poor_high_SDI,0.427437,0.299871,0.127566


In [13]:
# Mean and standard deviation of the SDI absolute error per class:
# first over all rows of `diversity`, then separately for each tool.
_sdi_stats = ["mean", "std"]
mean_class_absolute_error = diversity.groupby("class")["SDI_absolute_error"].agg(_sdi_stats)
display(mean_class_absolute_error)

tools = ['MIXCR','IMREP','TRUST4','CATT']
for tool in tools:
    print(tool)
    df_tool = diversity[diversity['tool'].eq(tool)]
    mean_class_absolute_error = df_tool.groupby("class")["SDI_absolute_error"].agg(_sdi_stats)
    display(mean_class_absolute_error)

Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,4.135492,1.328268
T_cell_rich_high_SDI,2.860745,2.153281
T_cell_rich_low_SDI,0.586547,0.309431


MIXCR


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,4.50895,1.077323
T_cell_rich_high_SDI,3.116482,2.754672
T_cell_rich_low_SDI,0.168253,


IMREP


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,4.012572,1.101935
T_cell_rich_high_SDI,2.441691,1.785227
T_cell_rich_low_SDI,0.914802,


TRUST4


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,3.486723,1.179376
T_cell_rich_high_SDI,2.35712,1.683547
T_cell_rich_low_SDI,0.646845,


CATT


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,4.648633,1.633713
T_cell_rich_high_SDI,3.527686,2.690731
T_cell_rich_low_SDI,0.616287,


In [14]:
# Mean and standard deviation of the clonality absolute error per class:
# first over all rows of `diversity`, then separately for each tool.
_clonality_stats = ["mean", "std"]
mean_class_absolute_error = diversity.groupby("class")["clonality_absolute_error"].agg(_clonality_stats)
display(mean_class_absolute_error)

tools = ['MIXCR','IMREP','TRUST4','CATT']
for tool in tools:
    print(tool)
    df_tool = diversity[diversity['tool'].eq(tool)]
    mean_class_absolute_error = df_tool.groupby("class")["clonality_absolute_error"].agg(_clonality_stats)
    display(mean_class_absolute_error)

Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,0.200018,0.15576
T_cell_rich_high_SDI,0.093158,0.117073
T_cell_rich_low_SDI,0.076447,0.04894


MIXCR


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,0.153529,0.122182
T_cell_rich_high_SDI,0.040845,0.040177
T_cell_rich_low_SDI,0.007157,


IMREP


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,0.165713,0.1262
T_cell_rich_high_SDI,0.074892,0.077709
T_cell_rich_low_SDI,0.122362,


TRUST4


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,0.115997,0.097975
T_cell_rich_high_SDI,0.05605,0.05867
T_cell_rich_low_SDI,0.086915,


CATT


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,0.347892,0.157213
T_cell_rich_high_SDI,0.200846,0.184265
T_cell_rich_low_SDI,0.089354,
