In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the per-tool TRA feature tables of the 150bp run, one CSV per tool.
IMREP_df, TRUST4_df, MIXCR_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../summary_data/150bp/IMREP_TRA_merged_extracted_features.csv",
        "../summary_data/150bp/TRUST4_TRA_merged_extracted_features.csv",
        "../summary_data/150bp/MIXCR_TRA_merged_extracted_features.csv",
    )
)

In [4]:
# Keep only the rows backed by more than one read (nReads > 1).
IMREP_df = IMREP_df.loc[IMREP_df['nReads'].gt(1)]
TRUST4_df = TRUST4_df.loc[TRUST4_df['nReads'].gt(1)]
MIXCR_df = MIXCR_df.loc[MIXCR_df['nReads'].gt(1)]

In [5]:
# Suffix each table's read-count column with its tool name so the three
# columns stay distinct after the tables are merged.
IMREP_df = IMREP_df.rename({"nReads": "nReads_IMREP"}, axis="columns")
TRUST4_df = TRUST4_df.rename({"nReads": "nReads_TRUST4"}, axis="columns")
MIXCR_df = MIXCR_df.rename({"nReads": "nReads_MIXCR"}, axis="columns")

In [6]:
# Build one table with a row for every (Sample, CDR3) pair reported by any tool.
# The joins are outer joins keyed on Sample + CDR3, so a pair seen by only some
# tools is kept; the NaN left where a tool has no row for a pair becomes 0.
merge_IMREP_TRUST4 = IMREP_df.merge(TRUST4_df, how='outer', on=['Sample', 'CDR3'])
merge_IMREP_TRUST4 = merge_IMREP_TRUST4.fillna(0)

merge_IMREP_TRUST4_MIXCR = MIXCR_df.merge(merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3'])
merge_IMREP_TRUST4_MIXCR = merge_IMREP_TRUST4_MIXCR.fillna(0)

merge_IMREP_TRUST4_MIXCR

Unnamed: 0,Sample,CDR3,nReads_MIXCR,nReads_IMREP,nReads_TRUST4
0,CMT-baseline1C_CAGATC,CAVRDWAGGFKTIF,2.0,0.0,5.0
1,CMT-baseline1C_CAGATC,CIVLGGSQGNLIF,2.0,2.0,8.0
2,CMT-baseline1C_CAGATC,CAVTGASKIIF,4.0,0.0,9.0
3,CMT-baseline1C_CAGATC,CAASTGGFKTIF,3.0,0.0,0.0
4,CMT-baseline1C_CAGATC,CAEILNSGGSNYKLTF,2.0,0.0,4.0
...,...,...,...,...,...
17541,TR2-PBMC_S12,CLVGDPSGGYNKLIF,0.0,0.0,2.0
17542,TR2-PBMC_S12,CAALMDSSYKLIF,0.0,0.0,2.0
17543,TR2-PBMC_S12,CLVGPLTGTASKLTF,0.0,0.0,4.0
17544,TR2-PBMC_S12,CAGAAGTASKLTF,0.0,0.0,2.0


In [7]:
# Add the sample type.
# A single lookup table replaces one boolean-mask assignment per sample, so
# adding or correcting a sample is a one-line change and the column is filled
# in one pass. Samples missing from the table get NaN in `sample_type`.
SAMPLE_TYPE_BY_SAMPLE = {
    'CMT-baseline1C_CAGATC': 'melanoma',
    'ESO1-sorted-T-cells_S13_L007': 'PBMC',
    'HM-baseline1C_CGATGT': 'melanoma',
    'INY1-sorted-T-cells_S14_L007': 'PBMC',
    'INY2-sorted-T-cells_S15_L007': 'PBMC',
    'JSSBaseline-RNA_GTGAAA': 'melanoma',
    'LEK-OT110712A_CCGTCC': 'melanoma',
    'LEK-baseline_CGATGT': 'melanoma',
    'MP-11-28-12RNA_S2': 'PBMC',
    'PT0112-B_S3': 'melanoma',
    'PT0285-B_S5': 'melanoma',
    'PT0310_S9': 'melanoma',
    'RAS-baseline_TGACCA': 'melanoma',
    'SAR-11-14-12RNA_S1': 'PBMC',
    'TR2-PBMC_S12': 'PBMC',
}
merge_IMREP_TRUST4_MIXCR['sample_type'] = merge_IMREP_TRUST4_MIXCR['Sample'].map(SAMPLE_TYPE_BY_SAMPLE)

In [8]:
# Sum the read counts within each sample, separately for each tool.
total_reads = (
    merge_IMREP_TRUST4_MIXCR
    .groupby('Sample')[['nReads_MIXCR', 'nReads_IMREP', 'nReads_TRUST4']]
    .sum()
    .rename(columns={
        'nReads_MIXCR': 'total_reads_MIXCR',
        'nReads_IMREP': 'total_reads_IMREP',
        'nReads_TRUST4': 'total_reads_TRUST4',
    })
)
total_reads

Unnamed: 0_level_0,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CMT-baseline1C_CAGATC,58.0,154.0,508.0
ESO1-sorted-T-cells_S13_L007,185894.0,228248.0,335248.0
HM-baseline1C_CGATGT,2.0,15.0,7.0
INY1-sorted-T-cells_S14_L007,48760.0,57538.0,83325.0
INY2-sorted-T-cells_S15_L007,67676.0,74871.0,124841.0
JSSBaseline-RNA_GTGAAA,98.0,196.0,320.0
LEK-OT110712A_CCGTCC,182.0,184.0,522.0
LEK-baseline_CGATGT,22.0,37.0,102.0
MP-11-28-12RNA_S2,158.0,489.0,299.0
PT0112-B_S3,4.0,12.0,35.0


In [9]:
# Attach the per-sample, per-tool read totals to every row of that sample.
# `total_reads` is indexed by Sample, so the join key is matched against its index.
merge = merge_IMREP_TRUST4_MIXCR.merge(total_reads, how='outer', on=['Sample'])
# NOTE: fillna(0) applies to every column, so a missing `sample_type` also becomes 0.
merge = merge.fillna(0)

In [10]:
# Per-tool CDR3 frequency: the row's read count divided by the sample's total
# read count for the same tool.
# NOTE: a sample whose total for a tool is 0 yields NaN (0 / 0) in that tool's
# frequency column.
merge['frequency_MIXCR'] = merge['nReads_MIXCR'].div(merge['total_reads_MIXCR'])
merge['frequency_IMREP'] = merge['nReads_IMREP'].div(merge['total_reads_IMREP'])
merge['frequency_TRUST4'] = merge['nReads_TRUST4'].div(merge['total_reads_TRUST4'])

merge

Unnamed: 0,Sample,CDR3,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,sample_type,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_MIXCR,frequency_IMREP,frequency_TRUST4
0,CMT-baseline1C_CAGATC,CAVRDWAGGFKTIF,2.0,0.0,5.0,melanoma,58.0,154.0,508.0,0.034483,0.000000,0.009843
1,CMT-baseline1C_CAGATC,CIVLGGSQGNLIF,2.0,2.0,8.0,melanoma,58.0,154.0,508.0,0.034483,0.012987,0.015748
2,CMT-baseline1C_CAGATC,CAVTGASKIIF,4.0,0.0,9.0,melanoma,58.0,154.0,508.0,0.068966,0.000000,0.017717
3,CMT-baseline1C_CAGATC,CAASTGGFKTIF,3.0,0.0,0.0,melanoma,58.0,154.0,508.0,0.051724,0.000000,0.000000
4,CMT-baseline1C_CAGATC,CAEILNSGGSNYKLTF,2.0,0.0,4.0,melanoma,58.0,154.0,508.0,0.034483,0.000000,0.007874
...,...,...,...,...,...,...,...,...,...,...,...,...
17541,TR2-PBMC_S12,CLVGDPSGGYNKLIF,0.0,0.0,2.0,PBMC,1084.0,3650.0,5196.0,0.000000,0.000000,0.000385
17542,TR2-PBMC_S12,CAALMDSSYKLIF,0.0,0.0,2.0,PBMC,1084.0,3650.0,5196.0,0.000000,0.000000,0.000385
17543,TR2-PBMC_S12,CLVGPLTGTASKLTF,0.0,0.0,4.0,PBMC,1084.0,3650.0,5196.0,0.000000,0.000000,0.000770
17544,TR2-PBMC_S12,CAGAAGTASKLTF,0.0,0.0,2.0,PBMC,1084.0,3650.0,5196.0,0.000000,0.000000,0.000385


In [11]:
# Save the merged table (read counts, per-sample totals and frequencies for all
# tools) without the row index.
merge.to_csv(path_or_buf='../summary_data/150bp/all_tools_TRA.csv', index=False)

Calculate normalized Shannon-Wiener index for each sample across different tools

In [14]:
def shannon_by_sample(table, tool):
    """Compute the per-sample Shannon-Wiener index for one tool.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns 'Sample', 'nReads_<tool>' and
        'frequency_<tool>'.
    tool : str
        Tool name used as the column suffix and written to the 'tool' column
        (e.g. 'MIXCR').

    Returns
    -------
    pandas.DataFrame
        One row per sample that has at least one row with a non-zero read
        count for this tool, with the columns 'Sample', 'shannon_index_tool',
        'clonotype_count_tool', 'normalized_shannon_index_tool' and 'tool'.
        The normalized index is NaN for a sample with a single clonotype,
        because log(1) == 0 makes the division 0 / 0.
    """
    # Rows with zero reads were not reported by this tool; dropping them also
    # keeps log(0) out of the entropy terms.
    detected = table[table['nReads_' + tool] != 0]
    clonotype_count = (
        detected.groupby(['Sample'], sort=False)
        .size()
        .reset_index(name='clonotype_count_tool')
    )

    # Entropy contribution of each clonotype: -p * ln(p).
    per_clonotype = detected.copy()
    per_clonotype['shannon_index_tool'] = -(
        detected['frequency_' + tool] * np.log(detected['frequency_' + tool])
    )

    shannon = (
        per_clonotype.groupby(['Sample'])
        .agg({'shannon_index_tool': 'sum'})
        .reset_index()
    )
    shannon = pd.merge(shannon, clonotype_count, on=['Sample'])
    # Normalize by the maximum possible entropy, ln(number of clonotypes).
    shannon['normalized_shannon_index_tool'] = (
        shannon['shannon_index_tool'] / np.log(shannon['clonotype_count_tool'])
    )
    shannon['tool'] = tool
    return shannon


shannon_MIXCR = shannon_by_sample(merge, 'MIXCR')
shannon_IMREP = shannon_by_sample(merge, 'IMREP')
shannon_TRUST4 = shannon_by_sample(merge, 'TRUST4')

# Stack the three per-tool tables and attach each sample's type.
diversity = pd.concat([shannon_MIXCR, shannon_IMREP, shannon_TRUST4])
sample_type = merge[['Sample', 'sample_type']].drop_duplicates(keep='first')
# The NaN normalized index of single-clonotype samples is reported as 0.
diversity = diversity.fillna(0)
diversity = pd.merge(diversity, sample_type, how='inner', on=['Sample'])
diversity.to_csv('../summary_data/150bp/all_tools_TRA_diversity.csv', index=False)

diversity

Unnamed: 0,Sample,shannon_index_tool,clonotype_count_tool,normalized_shannon_index_tool,tool,sample_type
0,CMT-baseline1C_CAGATC,3.092193,23,0.98619,MIXCR,melanoma
1,CMT-baseline1C_CAGATC,4.125943,65,0.988395,IMREP,melanoma
2,CMT-baseline1C_CAGATC,4.938231,160,0.973017,TRUST4,melanoma
3,ESO1-sorted-T-cells_S13_L007,0.535771,2215,0.069553,MIXCR,PBMC
4,ESO1-sorted-T-cells_S13_L007,0.824119,2953,0.103136,IMREP,PBMC
5,ESO1-sorted-T-cells_S13_L007,0.958934,3427,0.117813,TRUST4,PBMC
6,HM-baseline1C_CGATGT,0.0,1,0.0,MIXCR,melanoma
7,HM-baseline1C_CGATGT,0.764754,3,0.696109,IMREP,melanoma
8,HM-baseline1C_CGATGT,1.078992,3,0.982141,TRUST4,melanoma
9,INY1-sorted-T-cells_S14_L007,0.780964,1414,0.107657,MIXCR,PBMC
