In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the per-tool CDR3 feature tables of the 150bp data set.
# Order of the paths matches the order of the names on the left-hand side.
IMREP_df, TRUST4_df, MIXCR_df, TCR_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../summary_data/150bp/IMREP_TRB_merged_extracted_features.csv",
        "../summary_data/150bp/TRUST4_TRB_merged_extracted_features.csv",
        "../summary_data/150bp/MIXCR_TRB_merged_extracted_features.csv",
        "../summary_data/150bp/TCR_merged_extracted_features.csv",
    )
)

In [3]:
# Rename the TCR sample names so they match the names used by the other tables.
# The dictionary is a 1:1 matchup: key = new sample name, value = name currently
# stored in TCR_df['Sample'].
Sample_name_matchup_RNA_TCR_dict = {
    "ESO1-sorted-T-cells_S13_L007" : "RearrangementDetails_ESO1_sorted_infusion",
    "INY1-sorted-T-cells_S14_L007" : "RearrangementDetails_INY1_sorted_infusion",
    "INY2-sorted-T-cells_S15_L007" : "RearrangementDetails_INY2_sorted_infusion",
    "TR2-PBMC_S12" : "RearrangementDetails_TR-PBMC",
    "SAR-11-14-12RNA_S1" : "SAR_11-14_PBMC",
    "MP-11-28-12RNA_S2" : "MP_11-28_PBMC",
    "CMT-baseline1C_CAGATC" : "Pt204_Baseline_TCR_seq",
    "HM-baseline1C_CGATGT" : "Pt310_baseline_TCRseq",
    "PT0310_S9" : "Pt310_on-tx_TCR_seq",
    "LEK-baseline_CGATGT" : "Pt294_baseline_TCR_seq",
    "LEK-OT110712A_CCGTCC" : "Pt294_on-tx_TCR_seq",
    "JSSBaseline-RNA_GTGAAA" : "Pt_308_baseline_TCR_seq",
    "RAS-baseline_TGACCA" : "Pt_325_baseline_TCR_seq",
    "PT0112-B_S3" : "Pt_112_baseline_TCR_seq",
    "PT0285-B_S5" : "Pt_285_baseline_TCR_seq"
}

# Series.replace expects {old value: new value}, i.e. the inverse of the dictionary above.
# Sample names that are not listed are left unchanged.
tcr_to_rna_sample_name = {
    tcr_name: rna_name
    for rna_name, tcr_name in Sample_name_matchup_RNA_TCR_dict.items()
}
TCR_df['Sample'] = TCR_df['Sample'].replace(tcr_to_rna_sample_name)

In [4]:
# Keep only the rows supported by more than one read (nReads > 1) in every table
IMREP_df, TRUST4_df, MIXCR_df, TCR_df = (
    frame[frame['nReads'] > 1]
    for frame in (IMREP_df, TRUST4_df, MIXCR_df, TCR_df)
)

In [5]:
# Suffix each table's nReads column with its source name so the columns stay
# distinguishable after the tables are merged
IMREP_df, TRUST4_df, MIXCR_df, TCR_df = (
    frame.rename(columns={"nReads": "nReads_" + source})
    for source, frame in (
        ("IMREP", IMREP_df),
        ("TRUST4", TRUST4_df),
        ("MIXCR", MIXCR_df),
        ("TCR", TCR_df),
    )
)

In [6]:
# Combine the four tables into one frame covering all samples and tools.
# Rows are matched on the (Sample, CDR3) pair. Outer joins keep the rows that
# exist in only one of the two inputs; the values they lack are filled with 0.
merge_IMREP_TRUST4 = (
    IMREP_df
    .merge(TRUST4_df, how='outer', on=['Sample', 'CDR3'])
    .fillna(0)
)
merge_IMREP_TRUST4_MIXCR = (
    MIXCR_df
    .merge(merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3'])
    .fillna(0)
)
merge_IMREP_TRUST4_MIXCR_TCR = (
    TCR_df
    .merge(merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3'])
    .fillna(0)
)

merge_IMREP_TRUST4_MIXCR_TCR

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4
0,MP-11-28-12RNA_S2,CASSETGAETQYF,4.0,0.0,0.0,0.0
1,MP-11-28-12RNA_S2,CASSFETVNNSPLHF,2.0,0.0,0.0,0.0
2,MP-11-28-12RNA_S2,CAISESGGSSYNEQFF,2.0,0.0,0.0,0.0
3,MP-11-28-12RNA_S2,CASSVSYRGRFGYTF,4.0,0.0,0.0,0.0
4,MP-11-28-12RNA_S2,CASTRWGDNEQFF,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...
151597,TR2-PBMC_S12,CATRQGAINEQFF,0.0,0.0,0.0,3.0
151598,TR2-PBMC_S12,CASSTAENRPQHF,0.0,0.0,0.0,2.0
151599,TR2-PBMC_S12,CASSHERTEGTEAFF,0.0,0.0,0.0,2.0
151600,TR2-PBMC_S12,CASSVGTPTDTLYF,0.0,0.0,0.0,2.0


In [7]:
# Add the sample type via a single lookup table (sample name -> type).
# Samples that are not listed end up with a missing (NaN) sample_type.
sample_type_by_sample = {
    'CMT-baseline1C_CAGATC': 'melanoma',
    'ESO1-sorted-T-cells_S13_L007': 'PBMC',
    'HM-baseline1C_CGATGT': 'melanoma',
    'INY1-sorted-T-cells_S14_L007': 'PBMC',
    'INY2-sorted-T-cells_S15_L007': 'PBMC',
    'JSSBaseline-RNA_GTGAAA': 'melanoma',
    'LEK-OT110712A_CCGTCC': 'melanoma',
    'LEK-baseline_CGATGT': 'melanoma',
    'MP-11-28-12RNA_S2': 'PBMC',
    'PT0112-B_S3': 'melanoma',
    'PT0285-B_S5': 'melanoma',
    'PT0310_S9': 'melanoma',
    'RAS-baseline_TGACCA': 'melanoma',
    'SAR-11-14-12RNA_S1': 'PBMC',
    'TR2-PBMC_S12': 'PBMC',
}
merge_IMREP_TRUST4_MIXCR_TCR['sample_type'] = (
    merge_IMREP_TRUST4_MIXCR_TCR['Sample'].map(sample_type_by_sample)
)

In [8]:
# Sum the read counts of every source within each sample; the result is indexed
# by Sample and its columns are renamed from nReads_* to total_reads_*
total_reads_column_names = {
    'nReads_TCR': 'total_reads_TCR',
    'nReads_MIXCR': 'total_reads_MIXCR',
    'nReads_IMREP': 'total_reads_IMREP',
    'nReads_TRUST4': 'total_reads_TRUST4',
}
total_reads = (
    merge_IMREP_TRUST4_MIXCR_TCR
    .groupby('Sample')[list(total_reads_column_names)]
    .sum()
    .rename(columns=total_reads_column_names)
)
total_reads

Unnamed: 0_level_0,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMT-baseline1C_CAGATC,906121.0,91.0,231.0,689.0
ESO1-sorted-T-cells_S13_L007,90577.0,132472.0,219185.0,264884.0
HM-baseline1C_CGATGT,1257571.0,0.0,2.0,19.0
INY1-sorted-T-cells_S14_L007,87762.0,41370.0,57700.0,89947.0
INY2-sorted-T-cells_S15_L007,305953.0,52066.0,64914.0,99723.0
JSSBaseline-RNA_GTGAAA,1408590.0,150.0,259.0,657.0
LEK-OT110712A_CCGTCC,1157845.0,310.0,334.0,1009.0
LEK-baseline_CGATGT,1769522.0,59.0,71.0,306.0
MP-11-28-12RNA_S2,18617.0,55.0,111.0,186.0
PT0112-B_S3,1006220.0,12.0,31.0,63.0


In [9]:
# Attach the per-sample read totals to every (Sample, CDR3) row.
# total_reads is indexed by Sample, so the join key is resolved against that index level.
merge = (
    merge_IMREP_TRUST4_MIXCR_TCR
    .merge(total_reads, how='outer', on='Sample')
    .fillna(0)
)

In [10]:
# Calculate the frequency of each CDR3 within its sample, separately per source:
# reads of the clonotype divided by the sample's total reads for that source
# (totals only include the CDR3 rows kept in this table, i.e. nReads > 1).
#
# A source that reported nothing for a sample has a total of 0, so a plain
# division evaluates 0/0 and yields NaN. Zero totals are therefore masked before
# dividing and the resulting NaNs are set to 0, which matches the
# "0 = not detected" convention established by the fillna(0) calls in the merges.
for tool in ['TCR', 'MIXCR', 'IMREP', 'TRUST4']:
    total = merge['total_reads_' + tool]
    merge['frequency_' + tool] = (
        merge['nReads_' + tool] / total.where(total != 0)
    ).fillna(0.0)

merge

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,sample_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4
0,MP-11-28-12RNA_S2,CASSETGAETQYF,4.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000215,0.0,0.0,0.000000
1,MP-11-28-12RNA_S2,CASSFETVNNSPLHF,2.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000107,0.0,0.0,0.000000
2,MP-11-28-12RNA_S2,CAISESGGSSYNEQFF,2.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000107,0.0,0.0,0.000000
3,MP-11-28-12RNA_S2,CASSVSYRGRFGYTF,4.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000215,0.0,0.0,0.000000
4,MP-11-28-12RNA_S2,CASTRWGDNEQFF,2.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000107,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151597,SAR-11-14-12RNA_S1,CASSEWGPGTGDSYEQYF,0.0,0.0,0.0,3.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.001496
151598,SAR-11-14-12RNA_S1,CASSVPGQGSSMYF,0.0,0.0,0.0,2.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.000997
151599,SAR-11-14-12RNA_S1,CASSPRGDEQFF,0.0,0.0,0.0,2.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.000997
151600,SAR-11-14-12RNA_S1,CASSLASVLQPQHF,0.0,0.0,0.0,2.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.000997


In [11]:
# Differentiate monoclonal and polyclonal samples based on the TCR read counts

# A sample is called monoclonal when its normalized Shannon-Wiener index is
# below this threshold, polyclonal otherwise (index >= threshold)
MONOCLONAL_MAX_NORMALIZED_SHANNON = 0.2

# Keep only the clonotypes that have TCR reads (nReads_TCR != 0)
diversity = merge[['Sample','sample_type','CDR3','nReads_TCR','total_reads_TCR','frequency_TCR']]
diversity = diversity[diversity.nReads_TCR != 0]

# Number of remaining clonotype rows per sample
clonotype_count = diversity.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count')

# Per-clonotype Shannon term -p * ln(p). p > 0 for every row here because rows
# without TCR reads were dropped above, so the logarithm is always defined.
diversity1 = diversity.copy()
diversity1['shannon_index'] = -(diversity1['frequency_TCR'] * np.log(diversity1['frequency_TCR']))

# Shannon-Wiener index of a sample = sum of its per-clonotype terms
shannon = diversity1.groupby(['Sample']).agg({'shannon_index': 'sum'}).reset_index()
shannon = pd.merge(shannon, clonotype_count, on=['Sample'])

# Normalize by ln(clonotype_count). For a sample with a single clonotype the
# normalizer is ln(1) = 0 and the plain division gives 0/0 = NaN, which would
# fall through the threshold test below and be labelled polyclonal. Such a
# sample has no diversity at all, so its normalized index is set to 0.
log_clonotype_count = np.log(shannon['clonotype_count'])
shannon['normalized_shannon_wiener_index'] = (
    shannon['shannon_index'] / log_clonotype_count.where(log_clonotype_count > 0)
).fillna(0.0)

# Classify every sample with the threshold defined at the top of the cell
shannon['clonal_type'] = np.where(
    shannon['normalized_shannon_wiener_index'] < MONOCLONAL_MAX_NORMALIZED_SHANNON,
    'monoclonal',
    'polyclonal',
)

clonal_type = shannon[['Sample','clonal_type']]
clonal_type

Unnamed: 0,Sample,clonal_type
0,CMT-baseline1C_CAGATC,polyclonal
1,ESO1-sorted-T-cells_S13_L007,monoclonal
2,HM-baseline1C_CGATGT,polyclonal
3,INY1-sorted-T-cells_S14_L007,monoclonal
4,INY2-sorted-T-cells_S15_L007,monoclonal
5,JSSBaseline-RNA_GTGAAA,polyclonal
6,LEK-OT110712A_CCGTCC,polyclonal
7,LEK-baseline_CGATGT,polyclonal
8,MP-11-28-12RNA_S2,polyclonal
9,PT0112-B_S3,polyclonal


In [12]:
# Generate the metadata table: every row of `merge` plus the clonal type of its sample
metadata = merge.merge(clonal_type, how='outer', on='Sample')

# Write the table to disk without the row index, then display it
metadata.to_csv('../summary_data/150bp/all_tools_TRB.csv', index=False)
metadata

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,sample_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,clonal_type
0,MP-11-28-12RNA_S2,CASSETGAETQYF,4.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000215,0.0,0.0,0.000000,polyclonal
1,MP-11-28-12RNA_S2,CASSFETVNNSPLHF,2.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000107,0.0,0.0,0.000000,polyclonal
2,MP-11-28-12RNA_S2,CAISESGGSSYNEQFF,2.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000107,0.0,0.0,0.000000,polyclonal
3,MP-11-28-12RNA_S2,CASSVSYRGRFGYTF,4.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000215,0.0,0.0,0.000000,polyclonal
4,MP-11-28-12RNA_S2,CASTRWGDNEQFF,2.0,0.0,0.0,0.0,PBMC,18617.0,55.0,111.0,186.0,0.000107,0.0,0.0,0.000000,polyclonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151597,SAR-11-14-12RNA_S1,CASSEWGPGTGDSYEQYF,0.0,0.0,0.0,3.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.001496,polyclonal
151598,SAR-11-14-12RNA_S1,CASSVPGQGSSMYF,0.0,0.0,0.0,2.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.000997,polyclonal
151599,SAR-11-14-12RNA_S1,CASSPRGDEQFF,0.0,0.0,0.0,2.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.000997,polyclonal
151600,SAR-11-14-12RNA_S1,CASSLASVLQPQHF,0.0,0.0,0.0,2.0,PBMC,104779.0,158.0,1354.0,2006.0,0.000000,0.0,0.0,0.000997,polyclonal
