In [1]:
import pandas as pd
import numpy as np

In [2]:
IMREP_df = pd.read_csv("../summary_data/original/IMREP/IMREP_TRA_merged_extracted_features.csv")
TRUST4_df = pd.read_csv("../summary_data/original/TRUST4/TRUST4_TRA_merged_extracted_features.csv")
MIXCR_df = pd.read_csv("../summary_data/original/MIXCR/MIXCR_TRA_merged_extracted_features.csv")
TCR_df = pd.read_csv("../summary_data/original/TCR_Seq/TCR_merged_extracted_features.csv")

In [3]:
# Rename TCR SAMPLE names based on 1:1 matchup in python dictionary
Sample_name_matchup_RNA_TCR_dict = {
    "ESO1-sorted-T-cells_S13_L007" : "RearrangementDetails_ESO1_sorted_infusion",
    "INY1-sorted-T-cells_S14_L007" : "RearrangementDetails_INY1_sorted_infusion",
    "INY2-sorted-T-cells_S15_L007" : "RearrangementDetails_INY2_sorted_infusion",
    "TR2-PBMC_S12" : "RearrangementDetails_TR-PBMC",
    "SAR-11-14-12RNA_S1" : "SAR_11-14_PBMC",
    "MP-11-28-12RNA_S2" : "MP_11-28_PBMC",
    "CMT-baseline1C_CAGATC" : "Pt204_Baseline_TCR_seq",
    "HM-baseline1C_CGATGT" : "Pt310_baseline_TCRseq",
    "PT0310_S9" : "Pt310_on-tx_TCR_seq",
    "LEK-baseline_CGATGT" : "Pt294_baseline_TCR_seq",
    "LEK-OT110712A_CCGTCC" : "Pt294_on-tx_TCR_seq",
    "JSSBaseline-RNA_GTGAAA" : "Pt_308_baseline_TCR_seq",
    "RAS-baseline_TGACCA" : "Pt_325_baseline_TCR_seq",
    "PT0112-B_S3" : "Pt_112_baseline_TCR_seq",
    "PT0285-B_S5" : "Pt_285_baseline_TCR_seq"
}

for matchup in Sample_name_matchup_RNA_TCR_dict:
    TCR_df.loc[(TCR_df['Sample'] == Sample_name_matchup_RNA_TCR_dict[matchup]), 'Sample' ] = matchup

In [4]:
# Select rows with nReads greater than 1
IMREP_df = IMREP_df[IMREP_df['nReads'] > 1]
TRUST4_df = TRUST4_df[TRUST4_df['nReads'] > 1]
MIXCR_df = MIXCR_df[MIXCR_df['nReads'] > 1]
TCR_df = TCR_df[TCR_df['nReads'] > 1]

In [5]:
# Rename nReads and frequencies columns according to tool for proper merging
IMREP_df = IMREP_df.rename(columns={"nReads": "nReads_IMREP"})
TRUST4_df = TRUST4_df.rename(columns={"nReads": "nReads_TRUST4"})
MIXCR_df = MIXCR_df.rename(columns={"nReads": "nReads_MIXCR"})
TCR_df = TCR_df.rename(columns={"nReads": "nReads_TCR"})

Complete dataframe across all samples and tools

In [6]:
# Merge dataframes based on two key combination: Sample and CDR3. Outer join ensures no data is lost for instances that do not have overlap
merge_IMREP_TRUST4 = pd.merge(IMREP_df, TRUST4_df, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR = pd.merge(MIXCR_df, merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_complete = pd.merge(TCR_df, merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3']).fillna(0)

merge_complete

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4
0,MP-11-28-12RNA_S2,CASSVNPGGYNEQFF,6.0,0.0,0.0,0.0
1,MP-11-28-12RNA_S2,CASIRTRNEKLFF,2.0,0.0,0.0,0.0
2,MP-11-28-12RNA_S2,CASSPGAANTEAFF,2.0,0.0,0.0,0.0
3,MP-11-28-12RNA_S2,CASRNQGLNTEAFF,2.0,0.0,0.0,0.0
4,MP-11-28-12RNA_S2,CASSLTGNRAYNEQFF,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...
387135,TR2-PBMC_S12,CAGVGGSNYKLTF,0.0,0.0,0.0,2.0
387136,TR2-PBMC_S12,CALTRRGNQGGKLIF,0.0,0.0,0.0,2.0
387137,TR2-PBMC_S12,CLNDMRF,0.0,0.0,0.0,10.0
387138,TR2-PBMC_S12,CAPRLSTGRRALTF,0.0,0.0,0.0,2.0


In [7]:
# Add the tissue type
merge_complete.loc[merge_complete['Sample']=='CMT-baseline1C_CAGATC','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='ESO1-sorted-T-cells_S13_L007','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='HM-baseline1C_CGATGT','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='INY1-sorted-T-cells_S14_L007','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='INY2-sorted-T-cells_S15_L007','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='JSSBaseline-RNA_GTGAAA','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='LEK-OT110712A_CCGTCC','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='LEK-baseline_CGATGT','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='MP-11-28-12RNA_S2','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='PT0112-B_S3','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='PT0285-B_S5','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='PT0310_S9','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='RAS-baseline_TGACCA','tissue'] = 'melanoma'
merge_complete.loc[merge_complete['Sample']=='SAR-11-14-12RNA_S1','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='TR2-PBMC_S12','tissue'] = 'PBMC'
merge_complete.loc[merge_complete['Sample']=='SRR5233639','tissue'] = 'lymph_node'
merge_complete.loc[merge_complete['Sample']=='SRR5233637','tissue'] = 'small_intestine'
merge_complete.loc[merge_complete['Sample']=='TCGA-CZ-4862','tissue'] = 'kidney'
merge_complete.loc[merge_complete['Sample']=='TCGA-CZ-5463','tissue'] = 'kidney'
merge_complete.loc[merge_complete['Sample']=='TCGA-CZ-5985','tissue'] = 'kidney'

In [8]:
# Add T cell rich or poor tissue type
merge_complete.loc[merge_complete['Sample']=='CMT-baseline1C_CAGATC','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='ESO1-sorted-T-cells_S13_L007','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='HM-baseline1C_CGATGT','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='INY1-sorted-T-cells_S14_L007','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='INY2-sorted-T-cells_S15_L007','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='JSSBaseline-RNA_GTGAAA','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='LEK-OT110712A_CCGTCC','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='LEK-baseline_CGATGT','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='MP-11-28-12RNA_S2','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='PT0112-B_S3','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='PT0285-B_S5','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='PT0310_S9','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='RAS-baseline_TGACCA','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='SAR-11-14-12RNA_S1','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='TR2-PBMC_S12','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='SRR5233639','tissue_type'] = 'T_cell_rich'
merge_complete.loc[merge_complete['Sample']=='SRR5233637','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='TCGA-CZ-4862','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='TCGA-CZ-5463','tissue_type'] = 'T_cell_poor'
merge_complete.loc[merge_complete['Sample']=='TCGA-CZ-5985','tissue_type'] = 'T_cell_poor'

In [9]:
# Calculate total number of reads in each sample
total_reads = merge_complete[['Sample','nReads_TCR','nReads_MIXCR','nReads_IMREP','nReads_TRUST4']].groupby('Sample').sum().rename(columns={'nReads_TCR':'total_reads_TCR','nReads_MIXCR':'total_reads_MIXCR','nReads_IMREP':'total_reads_IMREP','nReads_TRUST4':'total_reads_TRUST4'})
total_reads

Unnamed: 0_level_0,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMT-baseline1C_CAGATC,906121.0,58.0,154.0,508.0
ESO1-sorted-T-cells_S13_L007,90577.0,185894.0,228248.0,335248.0
HM-baseline1C_CGATGT,1257571.0,2.0,15.0,7.0
INY1-sorted-T-cells_S14_L007,87762.0,48760.0,57538.0,83325.0
INY2-sorted-T-cells_S15_L007,305953.0,67676.0,74871.0,124841.0
JSSBaseline-RNA_GTGAAA,1408590.0,98.0,196.0,320.0
LEK-OT110712A_CCGTCC,1157845.0,182.0,184.0,522.0
LEK-baseline_CGATGT,1769522.0,22.0,37.0,102.0
MP-11-28-12RNA_S2,18617.0,158.0,489.0,299.0
PT0112-B_S3,1006220.0,4.0,12.0,35.0


In [10]:
# Merge dataframes 
merge = pd.merge(merge_complete, total_reads, how='outer', on=['Sample']).fillna(0)

# Calculate frequency of CDR3 reads with respect to CDR3s that occur more than once
merge['frequency_TCR'] = merge['nReads_TCR'] / (merge['total_reads_TCR'] * 1.0)
merge['frequency_MIXCR'] = merge['nReads_MIXCR'] / (merge['total_reads_MIXCR'] * 1.0)
merge['frequency_IMREP'] = merge['nReads_IMREP'] / (merge['total_reads_IMREP'] * 1.0)
merge['frequency_TRUST4'] = merge['nReads_TRUST4'] / (merge['total_reads_TRUST4'] * 1.0)  
merge.fillna(0, inplace=True)
merge

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4
0,MP-11-28-12RNA_S2,CASSVNPGGYNEQFF,6.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000322,0.0,0.0,0.000000
1,MP-11-28-12RNA_S2,CASIRTRNEKLFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000
2,MP-11-28-12RNA_S2,CASSPGAANTEAFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000
3,MP-11-28-12RNA_S2,CASRNQGLNTEAFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000
4,MP-11-28-12RNA_S2,CASSLTGNRAYNEQFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387135,TCGA-CZ-5985,CAVHTGTASKLTF,0.0,0.0,0.0,4.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.051948
387136,TCGA-CZ-5985,CAVATGAQKLVF,0.0,0.0,0.0,2.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.025974
387137,TCGA-CZ-5985,CVVKDDYKLSF,0.0,0.0,0.0,5.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.064935
387138,TCGA-CZ-5985,CVVNLRVNTGFQKLVF,0.0,0.0,0.0,15.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.194805


Differentiate monoclonal and polyclonal repertoire

In [11]:
# Calculate Normalized Shannon-Wiener index
diversity_TCR = merge[['Sample','tissue','tissue_type','CDR3','nReads_TCR','total_reads_TCR','frequency_TCR']]
diversity_TCR = diversity_TCR[diversity_TCR.nReads_TCR != 0]
clonotype_count_TCR = diversity_TCR.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_TCR')

diversity_TCR['shannon_index_TCR'] = -(diversity_TCR['frequency_TCR']*np.log(diversity_TCR['frequency_TCR']))
shannon_TCR = diversity_TCR.groupby(['Sample']).agg({'shannon_index_TCR':'sum'}).reset_index().rename(columns={'':"shannon_index_TCR"})
shannon_TCR = pd.merge(shannon_TCR, clonotype_count_TCR, on=['Sample'])

# Define monoclonal sample as the shannon_index < 1.5, polyclonal sample as the shannon_index >= 1.5
shannon_TCR['repertoire_type'] = ['monoclonal' if x < 1.5 else 'polyclonal' for x  in shannon_TCR['shannon_index_TCR']]
repertoire_type = shannon_TCR[['Sample','repertoire_type']]
shannon_TCR

Unnamed: 0,Sample,shannon_index_TCR,clonotype_count_TCR,repertoire_type
0,CMT-baseline1C_CAGATC,8.172351,9354,polyclonal
1,ESO1-sorted-T-cells_S13_L007,1.072675,3001,monoclonal
2,HM-baseline1C_CGATGT,4.680209,3548,polyclonal
3,INY1-sorted-T-cells_S14_L007,1.001202,3229,monoclonal
4,INY2-sorted-T-cells_S15_L007,0.960516,7697,monoclonal
5,JSSBaseline-RNA_GTGAAA,8.035551,12844,polyclonal
6,LEK-OT110712A_CCGTCC,7.651713,19018,polyclonal
7,LEK-baseline_CGATGT,7.207395,16417,polyclonal
8,MP-11-28-12RNA_S2,8.005115,5440,polyclonal
9,PT0112-B_S3,7.730309,6178,polyclonal


In [12]:
# Generate metadata 
metadata = pd.merge(merge, repertoire_type, how='outer', on=['Sample'])
metadata.loc[:,'class'] = metadata["tissue_type"] +"_"+ metadata["repertoire_type"]

metadata.to_csv('../summary_data/original/all_tools_TRA.csv', index=False)
metadata

Unnamed: 0,Sample,CDR3,nReads_TCR,nReads_MIXCR,nReads_IMREP,nReads_TRUST4,tissue,tissue_type,total_reads_TCR,total_reads_MIXCR,total_reads_IMREP,total_reads_TRUST4,frequency_TCR,frequency_MIXCR,frequency_IMREP,frequency_TRUST4,repertoire_type,class
0,MP-11-28-12RNA_S2,CASSVNPGGYNEQFF,6.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000322,0.0,0.0,0.000000,polyclonal,T_cell_rich_polyclonal
1,MP-11-28-12RNA_S2,CASIRTRNEKLFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000,polyclonal,T_cell_rich_polyclonal
2,MP-11-28-12RNA_S2,CASSPGAANTEAFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000,polyclonal,T_cell_rich_polyclonal
3,MP-11-28-12RNA_S2,CASRNQGLNTEAFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000,polyclonal,T_cell_rich_polyclonal
4,MP-11-28-12RNA_S2,CASSLTGNRAYNEQFF,2.0,0.0,0.0,0.0,PBMC,T_cell_rich,18617.0,158.0,489.0,299.0,0.000107,0.0,0.0,0.000000,polyclonal,T_cell_rich_polyclonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387135,TCGA-CZ-5985,CAVHTGTASKLTF,0.0,0.0,0.0,4.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.051948,polyclonal,T_cell_poor_polyclonal
387136,TCGA-CZ-5985,CAVATGAQKLVF,0.0,0.0,0.0,2.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.025974,polyclonal,T_cell_poor_polyclonal
387137,TCGA-CZ-5985,CVVKDDYKLSF,0.0,0.0,0.0,5.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.064935,polyclonal,T_cell_poor_polyclonal
387138,TCGA-CZ-5985,CVVNLRVNTGFQKLVF,0.0,0.0,0.0,15.0,kidney,T_cell_poor,12998.0,0.0,437.0,77.0,0.000000,0.0,0.0,0.194805,polyclonal,T_cell_poor_polyclonal


Calculate normalized Shannon-Wiener index and relative error for each sample across different tools

In [13]:
# MIXCR
diversity_MIXCR = merge[['Sample','tissue','tissue_type','CDR3','nReads_MIXCR','total_reads_MIXCR','frequency_MIXCR']]
diversity_MIXCR = diversity_MIXCR[diversity_MIXCR.nReads_MIXCR != 0]
clonotype_count_MIXCR = diversity_MIXCR.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_MIXCR['shannon_index_tool'] = -(diversity_MIXCR['frequency_MIXCR']*np.log(diversity_MIXCR['frequency_MIXCR']))
shannon_MIXCR = diversity_MIXCR.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_MIXCR = pd.merge(shannon_MIXCR, clonotype_count_MIXCR, on=['Sample'])
shannon_MIXCR['tool'] = 'MIXCR'

# IMREP
diversity_IMREP = merge[['Sample','tissue','tissue_type','CDR3','nReads_IMREP','total_reads_IMREP','frequency_IMREP']]
diversity_IMREP = diversity_IMREP[diversity_IMREP.nReads_IMREP != 0]
clonotype_count_IMREP = diversity_IMREP.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_IMREP['shannon_index_tool'] = -(diversity_IMREP['frequency_IMREP']*np.log(diversity_IMREP['frequency_IMREP']))
shannon_IMREP = diversity_IMREP.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_IMREP = pd.merge(shannon_IMREP, clonotype_count_IMREP, on=['Sample'])
shannon_IMREP['tool'] = 'IMREP'

# TRUST4
diversity_TRUST4 = merge[['Sample','tissue','tissue_type','CDR3','nReads_TRUST4','total_reads_TRUST4','frequency_TRUST4']]
diversity_TRUST4 = diversity_TRUST4[diversity_TRUST4.nReads_TRUST4 != 0]
clonotype_count_TRUST4 = diversity_TRUST4.groupby(['Sample'], sort=False).size().reset_index(name='clonotype_count_tool')

diversity_TRUST4['shannon_index_tool'] = -(diversity_TRUST4['frequency_TRUST4']*np.log(diversity_TRUST4['frequency_TRUST4']))
shannon_TRUST4 = diversity_TRUST4.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
shannon_TRUST4 = pd.merge(shannon_TRUST4, clonotype_count_TRUST4, on=['Sample'])
shannon_TRUST4['tool'] = 'TRUST4'

diversity = pd.concat([shannon_MIXCR,shannon_IMREP,shannon_TRUST4])
diversity = pd.merge(diversity, shannon_TCR, how='outer', on=['Sample'])
diversity = diversity.fillna(0)
tissue_type = merge[['Sample','tissue','tissue_type']].drop_duplicates(keep='first')
diversity = pd.merge(diversity, tissue_type, how='inner', on=['Sample'])
diversity['absolute_error'] = np.abs(diversity['shannon_index_TCR'] - diversity['shannon_index_tool'])
diversity.loc[:,'class'] = diversity["tissue_type"] +"_"+ diversity["repertoire_type"]
diversity.to_csv('../summary_data/original/all_tools_TRA_diversity.csv', index=False)

diversity

Unnamed: 0,Sample,shannon_index_tool,clonotype_count_tool,tool,shannon_index_TCR,clonotype_count_TCR,repertoire_type,tissue,tissue_type,absolute_error,class
0,CMT-baseline1C_CAGATC,3.092193,23,MIXCR,8.172351,9354,polyclonal,melanoma,T_cell_poor,5.080157,T_cell_poor_polyclonal
1,CMT-baseline1C_CAGATC,4.125943,65,IMREP,8.172351,9354,polyclonal,melanoma,T_cell_poor,4.046408,T_cell_poor_polyclonal
2,CMT-baseline1C_CAGATC,4.938231,160,TRUST4,8.172351,9354,polyclonal,melanoma,T_cell_poor,3.234119,T_cell_poor_polyclonal
3,ESO1-sorted-T-cells_S13_L007,0.535771,2215,MIXCR,1.072675,3001,monoclonal,PBMC,T_cell_rich,0.536904,T_cell_rich_monoclonal
4,ESO1-sorted-T-cells_S13_L007,0.824119,2953,IMREP,1.072675,3001,monoclonal,PBMC,T_cell_rich,0.248555,T_cell_rich_monoclonal
5,ESO1-sorted-T-cells_S13_L007,0.958934,3427,TRUST4,1.072675,3001,monoclonal,PBMC,T_cell_rich,0.113741,T_cell_rich_monoclonal
6,HM-baseline1C_CGATGT,0.0,1,MIXCR,4.680209,3548,polyclonal,melanoma,T_cell_poor,4.680209,T_cell_poor_polyclonal
7,HM-baseline1C_CGATGT,0.764754,3,IMREP,4.680209,3548,polyclonal,melanoma,T_cell_poor,3.915455,T_cell_poor_polyclonal
8,HM-baseline1C_CGATGT,1.078992,3,TRUST4,4.680209,3548,polyclonal,melanoma,T_cell_poor,3.601217,T_cell_poor_polyclonal
9,INY1-sorted-T-cells_S14_L007,0.780964,1414,MIXCR,1.001202,3229,monoclonal,PBMC,T_cell_rich,0.220238,T_cell_rich_monoclonal


Calculate number of TCR derived reads per one million RNA-Seq reads

In [14]:
RNA_seq_reads = pd.read_csv("../summary_data/original/RNA_Seq_reads.csv")
RNA_seq_reads

Unnamed: 0,Sample,total_reads_RNA_seq
0,CMT-baseline1C_CAGATC,82476159
1,ESO1-sorted-T-cells_S13_L007,104984482
2,HM-baseline1C_CGATGT,72397468
3,INY1-sorted-T-cells_S14_L007,73892845
4,INY2-sorted-T-cells_S15_L007,71654976
5,JSSBaseline-RNA_GTGAAA,85492431
6,LEK-OT110712A_CCGTCC,68584414
7,LEK-baseline_CGATGT,63320771
8,MP-11-28-12RNA_S2,40524817
9,PT0112-B_S3,55727841


In [15]:
reads = metadata[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_TCR','total_reads_MIXCR','total_reads_IMREP','total_reads_TRUST4']]
reads = reads.drop_duplicates(keep='first')
reads

# Add the column of RNA-Seq reads
reads = pd.merge(reads, RNA_seq_reads, how='left', on=['Sample'])
reads

# Calculate number of TCR derived reads per one million RNA-Seq reads 
# MIXCR
MIXCR = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_MIXCR','total_reads_RNA_seq']]
MIXCR.rename(columns={'total_reads_MIXCR':'total_reads_tool'}, inplace=True)
MIXCR['tool'] = 'MIXCR'
MIXCR.loc[:,'TCR_derived_by_RNA_seq_tool'] = MIXCR['total_reads_tool']/MIXCR['total_reads_RNA_seq'] *1000000

# IMREP
IMREP = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_IMREP','total_reads_RNA_seq']]
IMREP.rename(columns={'total_reads_IMREP':'total_reads_tool'}, inplace=True)
IMREP['tool'] = 'IMREP'
IMREP.loc[:,'TCR_derived_by_RNA_seq_tool'] = IMREP['total_reads_tool']/IMREP['total_reads_RNA_seq']*1000000

# TRUST4
TRUST4 = reads[['Sample','tissue','tissue_type','repertoire_type','class','total_reads_TRUST4','total_reads_RNA_seq']]
TRUST4.rename(columns={'total_reads_TRUST4':'total_reads_tool'}, inplace=True)
TRUST4['tool'] = 'TRUST4'
TRUST4.loc[:,'TCR_derived_by_RNA_seq_tool'] = TRUST4['total_reads_tool']/TRUST4['total_reads_RNA_seq']*1000000

reads_count = pd.concat([MIXCR,IMREP,TRUST4])
reads_count.to_csv('../summary_data/original/all_tools_TRA_reads.csv', index=False)

reads_count

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.

Unnamed: 0,Sample,tissue,tissue_type,repertoire_type,class,total_reads_tool,total_reads_RNA_seq,tool,TCR_derived_by_RNA_seq_tool
0,MP-11-28-12RNA_S2,PBMC,T_cell_rich,polyclonal,T_cell_rich_polyclonal,158.0,40524817,MIXCR,3.898845
1,CMT-baseline1C_CAGATC,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,58.0,82476159,MIXCR,0.703234
2,LEK-baseline_CGATGT,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,22.0,63320771,MIXCR,0.347437
3,LEK-OT110712A_CCGTCC,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,182.0,68584414,MIXCR,2.653664
4,HM-baseline1C_CGATGT,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,2.0,72397468,MIXCR,0.027625
5,PT0310_S9,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,720.0,80622502,MIXCR,8.930509
6,PT0112-B_S3,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,4.0,55727841,MIXCR,0.071777
7,PT0285-B_S5,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,3.0,107919183,MIXCR,0.027799
8,JSSBaseline-RNA_GTGAAA,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,98.0,85492431,MIXCR,1.1463
9,RAS-baseline_TGACCA,melanoma,T_cell_poor,polyclonal,T_cell_poor_polyclonal,13.0,55931661,MIXCR,0.232426
