In [30]:
import pandas as pd
import numpy as np
from scipy import stats
from functools import reduce

Create the metadata

In [31]:
IMREP_df = pd.read_csv("../summary_data/original/IMREP/IMREP_TRB_merged_extracted_features.csv")
TRUST4_df = pd.read_csv("../summary_data/original/TRUST4/TRUST4_TRB_merged_extracted_features.csv")
MIXCR_df = pd.read_csv("../summary_data/original/MIXCR/MIXCR_TRB_merged_extracted_features.csv")
TCR_df = pd.read_csv("../summary_data/original/TCR_Seq/TCR_merged_extracted_features.csv")

In [32]:
# Rename TCR SAMPLE names based on 1:1 matchup in python dictionary
Sample_name_matchup_RNA_TCR_dict = {
    "ESO1-sorted-T-cells_S13_L007" : "RearrangementDetails_ESO1_sorted_infusion",
    "INY1-sorted-T-cells_S14_L007" : "RearrangementDetails_INY1_sorted_infusion",
    "INY2-sorted-T-cells_S15_L007" : "RearrangementDetails_INY2_sorted_infusion",
    "TR2-PBMC_S12" : "RearrangementDetails_TR-PBMC",
    "SAR-11-14-12RNA_S1" : "SAR_11-14_PBMC",
    "MP-11-28-12RNA_S2" : "MP_11-28_PBMC",
    "CMT-baseline1C_CAGATC" : "Pt204_Baseline_TCR_seq",
    "HM-baseline1C_CGATGT" : "Pt310_baseline_TCRseq",
    "PT0310_S9" : "Pt310_on-tx_TCR_seq",
    "LEK-baseline_CGATGT" : "Pt294_baseline_TCR_seq",
    "LEK-OT110712A_CCGTCC" : "Pt294_on-tx_TCR_seq",
    "JSSBaseline-RNA_GTGAAA" : "Pt_308_baseline_TCR_seq",
    "RAS-baseline_TGACCA" : "Pt_325_baseline_TCR_seq",
    "PT0112-B_S3" : "Pt_112_baseline_TCR_seq",
    "PT0285-B_S5" : "Pt_285_baseline_TCR_seq"
}

for matchup in Sample_name_matchup_RNA_TCR_dict:
    TCR_df.loc[(TCR_df['Sample'] == Sample_name_matchup_RNA_TCR_dict[matchup]), 'Sample' ] = matchup

In [33]:
# Select rows with nReads greater than 1
IMREP_df = IMREP_df[IMREP_df['nReads'] > 1]
TRUST4_df = TRUST4_df[TRUST4_df['nReads'] > 1]
MIXCR_df = MIXCR_df[MIXCR_df['nReads'] > 1]
TCR_df = TCR_df[TCR_df['nReads'] > 1]

In [34]:
# Rename nReads and frequencies columns according to tool for proper merging
IMREP_df = IMREP_df.rename(columns={"nReads": "nReads_IMREP"})
TRUST4_df = TRUST4_df.rename(columns={"nReads": "nReads_TRUST4"})
MIXCR_df = MIXCR_df.rename(columns={"nReads": "nReads_MIXCR"})
TCR_df = TCR_df.rename(columns={"nReads": "nReads_TCR"})

In [35]:
# Merge dataframes based on two key combination: Sample and CDR3. Outer join ensures no data is lost for instances that do not have overlap
merge_IMREP_TRUST4 = pd.merge(IMREP_df, TRUST4_df, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR = pd.merge(MIXCR_df, merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_complete = pd.merge(TCR_df, merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3']).fillna(0)

In [36]:
# Select monoclonal repertoires
monoclonal = merge_complete.loc[merge_complete['Sample'].isin(['ESO1-sorted-T-cells_S13_L007','INY1-sorted-T-cells_S14_L007','INY2-sorted-T-cells_S15_L007'])]

In [37]:
# Calculate total number of reads in each sample
total_reads = monoclonal[['Sample','nReads_TCR','nReads_MIXCR','nReads_IMREP','nReads_TRUST4']].groupby('Sample').sum().rename(columns={'nReads_TCR':'total_reads_TCR','nReads_MIXCR':'total_reads_MIXCR','nReads_IMREP':'total_reads_IMREP','nReads_TRUST4':'total_reads_TRUST4'})

In [38]:
# Merge dataframes 
merge = pd.merge(monoclonal, total_reads, how='outer', on=['Sample']).fillna(0)

# Calculate frequency of CDR3 reads with respect to CDR3s that occur more than once
merge['frequency_TCR'] = merge['nReads_TCR'] / (merge['total_reads_TCR'] * 1.0)
merge['frequency_MIXCR'] = merge['nReads_MIXCR'] / (merge['total_reads_MIXCR'] * 1.0)
merge['frequency_IMREP'] = merge['nReads_IMREP'] / (merge['total_reads_IMREP'] * 1.0)
merge['frequency_TRUST4'] = merge['nReads_TRUST4'] / (merge['total_reads_TRUST4'] * 1.0)  
merge.fillna(0, inplace=True)

In [39]:
diversity_original = merge[['Sample','CDR3','frequency_TCR']]
diversity_original = diversity_original[diversity_original.frequency_TCR != 0]
diversity_original['shannon_index_TCR'] = -(diversity_original['frequency_TCR']*np.log(diversity_original['frequency_TCR']))
shannon_original = diversity_original.groupby(['Sample']).agg({'shannon_index_TCR':'sum'}).reset_index().rename(columns={'':"shannon_index_TCR"})
shannon_original = shannon_original[['Sample', 'shannon_index_TCR']]
shannon_original

Unnamed: 0,Sample,shannon_index_TCR
0,ESO1-sorted-T-cells_S13_L007,1.072675
1,INY1-sorted-T-cells_S14_L007,1.001202
2,INY2-sorted-T-cells_S15_L007,0.960516


In [40]:
# MIXCR
MIXCR = merge[['Sample','CDR3','nReads_MIXCR','total_reads_MIXCR','frequency_MIXCR','frequency_TCR']]
MIXCR.rename(columns={'nReads_MIXCR': 'nReads_tool', 'total_reads_MIXCR': 'total_reads_tool','frequency_MIXCR':'frequency_tool'}, inplace=True)
MIXCR.loc[:,'tool'] = 'MIXCR'

# IMREP
IMREP = merge[['Sample','CDR3','nReads_IMREP','total_reads_IMREP','frequency_IMREP','frequency_TCR']]
IMREP.rename(columns={'nReads_IMREP': 'nReads_tool', 'total_reads_IMREP': 'total_reads_tool','frequency_IMREP':'frequency_tool'}, inplace=True)
IMREP.loc[:,'tool'] = 'IMREP'

# TRUST4
TRUST4 = merge[['Sample','CDR3','nReads_TRUST4','total_reads_TRUST4','frequency_TRUST4','frequency_TCR']]
TRUST4.rename(columns={'nReads_TRUST4': 'nReads_tool', 'total_reads_TRUST4': 'total_reads_tool','frequency_TRUST4':'frequency_tool'}, inplace=True)
TRUST4.loc[:,'tool'] = 'TRUST4'

df = pd.concat([MIXCR,IMREP,TRUST4], sort=False)
df

Unnamed: 0,Sample,CDR3,nReads_tool,total_reads_tool,frequency_tool,frequency_TCR,tool
0,ESO1-sorted-T-cells_S13_L007,CASSLSGGINEQFF,0.0,132472.0,0.00000,0.000022,MIXCR
1,ESO1-sorted-T-cells_S13_L007,CASNPLRDAYNEQFF,0.0,132472.0,0.00000,0.000022,MIXCR
2,ESO1-sorted-T-cells_S13_L007,CASSLGADGGELFF,0.0,132472.0,0.00000,0.000022,MIXCR
3,ESO1-sorted-T-cells_S13_L007,CASSQEEGLTYGYTF,0.0,132472.0,0.00000,0.000022,MIXCR
4,ESO1-sorted-T-cells_S13_L007,CASSLEPTGGKAFF,0.0,132472.0,0.00000,0.000022,MIXCR
...,...,...,...,...,...,...,...
28838,INY2-sorted-T-cells_S15_L007,CASSYAANTGELSF,2.0,99723.0,0.00002,0.000000,TRUST4
28839,INY2-sorted-T-cells_S15_L007,CASSLTHNEQFF,2.0,99723.0,0.00002,0.000000,TRUST4
28840,INY2-sorted-T-cells_S15_L007,CASPGGHSPLHF,2.0,99723.0,0.00002,0.000000,TRUST4
28841,INY2-sorted-T-cells_S15_L007,CASIPKGGARPGEQFF,2.0,99723.0,0.00002,0.000000,TRUST4


Reduce the number of reads from MIXCR, IMREP, and TRUST4 to 28,000

In [41]:
samples = ['ESO1-sorted-T-cells_S13_L007','INY1-sorted-T-cells_S14_L007','INY2-sorted-T-cells_S15_L007']

df1 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 10000*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df1 =pd.concat([df1, sample])
    
df1.loc[:,'est_reads'] = 28000

In [42]:
tools = ['MIXCR','IMREP','TRUST4']
shannon1 = pd.DataFrame()
for tool in tools:
    subsample = df1.loc[df1['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df1[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon1 =pd.concat([shannon1, shannon])

Reduce the number of reads from MIXCR, IMREP, and TRUST4 to 13,500

In [43]:
samples = ['ESO1-sorted-T-cells_S13_L007','INY1-sorted-T-cells_S14_L007','INY2-sorted-T-cells_S15_L007']
df2 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 5000*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df2 =pd.concat([df2, sample])
df2.loc[:,'est_reads'] = 13500

In [44]:
tools = ['MIXCR','IMREP','TRUST4']
shannon2 = pd.DataFrame()
for tool in tools:
    subsample = df2.loc[df2['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df2[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon2 =pd.concat([shannon2, shannon])

Reduce the number of reads from MIXCR, IMREP, and TRUST4 to 2600

In [45]:
samples = ['ESO1-sorted-T-cells_S13_L007','INY1-sorted-T-cells_S14_L007','INY2-sorted-T-cells_S15_L007']
df3 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 1000*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df3 =pd.concat([df3, sample])
df3.loc[:,'est_reads'] = 2600

In [46]:
tools = ['MIXCR','IMREP','TRUST4']
shannon3 = pd.DataFrame()
for tool in tools:
    subsample = df3.loc[df3['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df3[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon3 =pd.concat([shannon3, shannon])

Reduce the number of reads from MIXCR, IMREP, and TRUST4 to 1300

In [47]:
samples = ['ESO1-sorted-T-cells_S13_L007','INY1-sorted-T-cells_S14_L007','INY2-sorted-T-cells_S15_L007']
df4 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 500*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df4 =pd.concat([df4, sample])
df4.loc[:,'est_reads'] = 1300

In [48]:
tools = ['MIXCR','IMREP','TRUST4']
shannon4 = pd.DataFrame()
for tool in tools:
    subsample = df4.loc[df4['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df4[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon4 =pd.concat([shannon4, shannon])

Reduce the number of reads from MIXCR, IMREP, and TRUST4 to 260

In [49]:
samples = ['ESO1-sorted-T-cells_S13_L007','INY1-sorted-T-cells_S14_L007','INY2-sorted-T-cells_S15_L007']
df5 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 100*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df5 =pd.concat([df5, sample])
df5.loc[:,'est_reads'] = 260

In [50]:
df_T_cell_poor_monoclonal = df5[['Sample', 'CDR3','frequency_TCR','tool','new_nReads_tool','new_total_reads_tool','new_frequency_tool']]
df_T_cell_poor_monoclonal.loc[:,'Sample'] = df_T_cell_poor_monoclonal['Sample'].replace({'ESO1-sorted-T-cells_S13_L007': 'insilco_sample1', 'INY1-sorted-T-cells_S14_L007': 'insilco_sample2', 'INY2-sorted-T-cells_S15_L007':'insilco_sample3'})
df_T_cell_poor_monoclonal.loc[:,'tissue_type'] = 'T_cell_poor'
df_T_cell_poor_monoclonal.loc[:,'repertoire_type'] = 'monoclonal'
df_T_cell_poor_monoclonal.loc[:,'class'] = 'T_cell_poor_monoclonal'

MIXCR = df_T_cell_poor_monoclonal.loc[df_T_cell_poor_monoclonal['tool'] == 'MIXCR']
MIXCR = MIXCR.rename({'new_nReads_tool':'nReads_MIXCR','new_total_reads_tool':'total_reads_MIXCR','new_frequency_tool':'frequency_MIXCR'}, axis=1)
MIXCR = MIXCR.drop(['tool'], axis=1)

IMREP = df_T_cell_poor_monoclonal.loc[df_T_cell_poor_monoclonal['tool'] == 'IMREP']
IMREP = IMREP.rename({'new_nReads_tool':'nReads_IMREP','new_total_reads_tool':'total_reads_IMREP','new_frequency_tool':'frequency_IMREP'}, axis=1)
IMREP = IMREP.drop(['tool'], axis=1)

TRUST4 = df_T_cell_poor_monoclonal.loc[df_T_cell_poor_monoclonal['tool'] == 'TRUST4']
TRUST4 = TRUST4.rename({'new_nReads_tool':'nReads_TRUST4','new_total_reads_tool':'total_reads_TRUST4','new_frequency_tool':'frequency_TRUST4'}, axis=1)
TRUST4 = TRUST4.drop(['tool'], axis=1)

dfs = [MIXCR,IMREP,TRUST4]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Sample','CDR3','frequency_TCR','tissue_type','repertoire_type','class'],how='outer'), dfs)
df_merged

Unnamed: 0,Sample,CDR3,frequency_TCR,nReads_MIXCR,total_reads_MIXCR,frequency_MIXCR,tissue_type,repertoire_type,class,nReads_IMREP,total_reads_IMREP,frequency_IMREP,nReads_TRUST4,total_reads_TRUST4,frequency_TRUST4
0,insilco_sample1,CASSLSGGINEQFF,0.000022,0,271,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,271,0.0,0,271,0.0
1,insilco_sample1,CASNPLRDAYNEQFF,0.000022,0,271,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,271,0.0,0,271,0.0
2,insilco_sample1,CASSLGADGGELFF,0.000022,0,271,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,271,0.0,0,271,0.0
3,insilco_sample1,CASSQEEGLTYGYTF,0.000022,0,271,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,271,0.0,0,271,0.0
4,insilco_sample1,CASSLEPTGGKAFF,0.000022,0,271,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,271,0.0,0,271,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28838,insilco_sample3,CASSYAANTGELSF,0.000000,0,261,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,261,0.0,0,261,0.0
28839,insilco_sample3,CASSLTHNEQFF,0.000000,0,261,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,261,0.0,0,261,0.0
28840,insilco_sample3,CASPGGHSPLHF,0.000000,0,261,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,261,0.0,0,261,0.0
28841,insilco_sample3,CASIPKGGARPGEQFF,0.000000,0,261,0.0,T_cell_poor,monoclonal,T_cell_poor_monoclonal,0,261,0.0,0,261,0.0


In [51]:
df_merged.to_csv('../summary_data/subsample/T_cell_poor_monoclonal.csv', index=False)

In [52]:
tools = ['MIXCR','IMREP','TRUST4']
shannon5 = pd.DataFrame()
for tool in tools:
    subsample = df5.loc[df5['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df5[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon5 =pd.concat([shannon5, shannon])

In [53]:
subsample_combined = pd.concat([df1, df2, df3, df4, df5])
clonotype_count = subsample_combined[['Sample','clonotype_count_tool','tool','est_reads']].drop_duplicates(keep='first', inplace=False)

In [54]:
subsample_combined = subsample_combined.drop(['clonotype_count_tool'], axis=1)
subsample_combined.to_csv('../summary_data/subsample/subsample_complete.csv', index=False)

In [55]:
shannon_combined = pd.concat([shannon1, shannon2, shannon3, shannon4, shannon5])
diversity_combined = pd.merge(shannon_original, shannon_combined, how='outer', on=['Sample'])
diversity_combined = pd.merge(diversity_combined, clonotype_count, how='outer', on=['Sample','tool','est_reads'])
diversity_combined['absolute_error'] = np.abs(diversity_combined['shannon_index_tool'] - diversity_combined['shannon_index_TCR'])
diversity_combined

Unnamed: 0,Sample,shannon_index_TCR,shannon_index_tool,tool,new_total_reads_tool,est_reads,clonotype_count_tool,absolute_error
0,ESO1-sorted-T-cells_S13_L007,1.072675,0.448437,MIXCR,28349,28000,219,0.624237
1,ESO1-sorted-T-cells_S13_L007,1.072675,0.547504,IMREP,28349,28000,369,0.525171
2,ESO1-sorted-T-cells_S13_L007,1.072675,0.51891,TRUST4,28349,28000,316,0.553765
3,ESO1-sorted-T-cells_S13_L007,1.072675,0.38909,MIXCR,13889,13500,31,0.683585
4,ESO1-sorted-T-cells_S13_L007,1.072675,0.459795,IMREP,13889,13500,93,0.61288
5,ESO1-sorted-T-cells_S13_L007,1.072675,0.461263,TRUST4,13889,13500,109,0.611412
6,ESO1-sorted-T-cells_S13_L007,1.072675,0.372418,MIXCR,2739,2600,3,0.700256
7,ESO1-sorted-T-cells_S13_L007,1.072675,0.406998,IMREP,2739,2600,13,0.665677
8,ESO1-sorted-T-cells_S13_L007,1.072675,0.396686,TRUST4,2739,2600,11,0.675989
9,ESO1-sorted-T-cells_S13_L007,1.072675,0.366791,MIXCR,1362,1300,1,0.705884


In [56]:
diversity_combined.to_csv('../summary_data/subsample/subsample_diversity.csv', index=False)

In [57]:
diversity_T_cell_poor_monoclonal = diversity_combined.loc[diversity_combined['est_reads']==260]
diversity_T_cell_poor_monoclonal.loc[:,'Sample'] = diversity_T_cell_poor_monoclonal['Sample'].replace({'ESO1-sorted-T-cells_S13_L007': 'insilco_sample1', 'INY1-sorted-T-cells_S14_L007': 'insilco_sample2', 'INY2-sorted-T-cells_S15_L007':'insilco_sample3'})
diversity_T_cell_poor_monoclonal.loc[:,'tissue_type'] = 'T_cell_poor'
diversity_T_cell_poor_monoclonal.loc[:,'repertoire_type'] = 'monoclonal'
diversity_T_cell_poor_monoclonal.loc[:,'class'] = 'T_cell_poor_monoclonal'
diversity_T_cell_poor_monoclonal = diversity_T_cell_poor_monoclonal.drop(['est_reads'], axis=1)
diversity_T_cell_poor_monoclonal = diversity_T_cell_poor_monoclonal.drop(['new_total_reads_tool'], axis=1)
diversity_T_cell_poor_monoclonal

Unnamed: 0,Sample,shannon_index_TCR,shannon_index_tool,tool,clonotype_count_tool,absolute_error,tissue_type,repertoire_type,class
12,insilco_sample1,1.072675,0.367031,MIXCR,1,0.705644,T_cell_poor,monoclonal,T_cell_poor_monoclonal
13,insilco_sample1,1.072675,0.365682,IMREP,1,0.706992,T_cell_poor,monoclonal,T_cell_poor_monoclonal
14,insilco_sample1,1.072675,0.365682,TRUST4,1,0.706992,T_cell_poor,monoclonal,T_cell_poor_monoclonal
27,insilco_sample2,1.001202,0.367879,MIXCR,1,0.633323,T_cell_poor,monoclonal,T_cell_poor_monoclonal
28,insilco_sample2,1.001202,0.36336,IMREP,1,0.637842,T_cell_poor,monoclonal,T_cell_poor_monoclonal
29,insilco_sample2,1.001202,0.364596,TRUST4,1,0.636605,T_cell_poor,monoclonal,T_cell_poor_monoclonal
42,insilco_sample3,0.960516,0.452226,MIXCR,3,0.50829,T_cell_poor,monoclonal,T_cell_poor_monoclonal
43,insilco_sample3,0.960516,0.384446,IMREP,2,0.576071,T_cell_poor,monoclonal,T_cell_poor_monoclonal
44,insilco_sample3,0.960516,0.38619,TRUST4,2,0.574326,T_cell_poor,monoclonal,T_cell_poor_monoclonal


In [58]:
diversity_T_cell_poor_monoclonal.to_csv('../summary_data/subsample/T_cell_poor_monoclonal_diversity.csv', index=False)