In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from functools import reduce

Create the metadata

In [2]:
IMREP_df = pd.read_csv("../summary_data/original/IMREP/IMREP_TRB_merged_extracted_features.csv")
TRUST4_df = pd.read_csv("../summary_data/original/TRUST4/TRUST4_TRB_merged_extracted_features.csv")
MIXCR_df = pd.read_csv("../summary_data/original/MIXCR/MIXCR_TRB_merged_extracted_features.csv")
CATT_df = pd.read_csv("../summary_data/original/CATT/CATT_TRB_merged_extracted_features.csv")
TCR_df = pd.read_csv("../summary_data/original/TCR_Seq/TCR_merged_extracted_features.csv")

In [3]:
# Select rows with nReads greater than 1
IMREP_df = IMREP_df[IMREP_df['nReads'] > 1]
TRUST4_df = TRUST4_df[TRUST4_df['nReads'] > 1]
MIXCR_df = MIXCR_df[MIXCR_df['nReads'] > 1]
CATT_df = CATT_df[CATT_df['nReads'] > 1]
TCR_df = TCR_df[TCR_df['nReads'] > 1]

In [4]:
# Rename nReads and frequencies columns according to tool for proper merging
IMREP_df = IMREP_df.rename(columns={"nReads": "nReads_IMREP"})
TRUST4_df = TRUST4_df.rename(columns={"nReads": "nReads_TRUST4"})
MIXCR_df = MIXCR_df.rename(columns={"nReads": "nReads_MIXCR"})
CATT_df = CATT_df.rename(columns={"nReads": "nReads_CATT"})
TCR_df = TCR_df.rename(columns={"nReads": "nReads_TCR"})

In [5]:
# Merge dataframes based on two key combination: Sample and CDR3. Outer join ensures no data is lost for instances that do not have overlap
merge_IMREP_TRUST4 = pd.merge(IMREP_df, TRUST4_df, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR = pd.merge(MIXCR_df, merge_IMREP_TRUST4, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_IMREP_TRUST4_MIXCR_CATT = pd.merge(CATT_df, merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3']).fillna(0)
merge_complete = pd.merge(TCR_df, merge_IMREP_TRUST4_MIXCR_CATT, how='outer', on=['Sample', 'CDR3']).fillna(0)

In [6]:
# Select T cell rich low SDI samples
low_SDI = merge_complete.loc[merge_complete['Sample'].isin(['sample01','sample02','sample03'])]

In [7]:
# Calculate total number of reads in each sample
total_reads = low_SDI[['Sample','nReads_TCR','nReads_MIXCR','nReads_IMREP','nReads_TRUST4','nReads_CATT']].groupby('Sample').sum().rename(columns={'nReads_TCR':'total_reads_TCR','nReads_MIXCR':'total_reads_MIXCR','nReads_IMREP':'total_reads_IMREP','nReads_TRUST4':'total_reads_TRUST4','nReads_CATT':'total_reads_CATT'})

In [8]:
# Merge dataframes 
merge = pd.merge(low_SDI, total_reads, how='outer', on=['Sample']).fillna(0)

# Calculate frequency of CDR3 reads with respect to CDR3s that occur more than once
merge['frequency_TCR'] = merge['nReads_TCR'] / (merge['total_reads_TCR'] * 1.0)
merge['frequency_MIXCR'] = merge['nReads_MIXCR'] / (merge['total_reads_MIXCR'] * 1.0)
merge['frequency_IMREP'] = merge['nReads_IMREP'] / (merge['total_reads_IMREP'] * 1.0)
merge['frequency_TRUST4'] = merge['nReads_TRUST4'] / (merge['total_reads_TRUST4'] * 1.0)  
merge['frequency_CATT'] = merge['nReads_CATT'] / (merge['total_reads_CATT'] * 1.0)
merge.fillna(0, inplace=True)

In [9]:
diversity_original = merge[['Sample','CDR3','frequency_TCR']]
diversity_original = diversity_original[diversity_original.frequency_TCR != 0]
diversity_original['shannon_index_TCR'] = -(diversity_original['frequency_TCR']*np.log(diversity_original['frequency_TCR']))
shannon_original = diversity_original.groupby(['Sample']).agg({'shannon_index_TCR':'sum'}).reset_index().rename(columns={'':"shannon_index_TCR"})
shannon_original = shannon_original[['Sample', 'shannon_index_TCR']]
shannon_original

Unnamed: 0,Sample,shannon_index_TCR
0,sample01,1.072675
1,sample02,1.001202
2,sample03,0.960516


In [10]:
# MIXCR
MIXCR = merge[['Sample','CDR3','nReads_MIXCR','total_reads_MIXCR','frequency_MIXCR','frequency_TCR']]
MIXCR.rename(columns={'nReads_MIXCR': 'nReads_tool', 'total_reads_MIXCR': 'total_reads_tool','frequency_MIXCR':'frequency_tool'}, inplace=True)
MIXCR.loc[:,'tool'] = 'MIXCR'

# IMREP
IMREP = merge[['Sample','CDR3','nReads_IMREP','total_reads_IMREP','frequency_IMREP','frequency_TCR']]
IMREP.rename(columns={'nReads_IMREP': 'nReads_tool', 'total_reads_IMREP': 'total_reads_tool','frequency_IMREP':'frequency_tool'}, inplace=True)
IMREP.loc[:,'tool'] = 'IMREP'

# TRUST4
TRUST4 = merge[['Sample','CDR3','nReads_TRUST4','total_reads_TRUST4','frequency_TRUST4','frequency_TCR']]
TRUST4.rename(columns={'nReads_TRUST4': 'nReads_tool', 'total_reads_TRUST4': 'total_reads_tool','frequency_TRUST4':'frequency_tool'}, inplace=True)
TRUST4.loc[:,'tool'] = 'TRUST4'

# CATT
CATT = merge[['Sample','CDR3','nReads_CATT','total_reads_CATT','frequency_CATT','frequency_TCR']]
CATT.rename(columns={'nReads_CATT': 'nReads_tool', 'total_reads_CATT': 'total_reads_tool','frequency_CATT':'frequency_tool'}, inplace=True)
CATT.loc[:,'tool'] = 'CATT'

df = pd.concat([MIXCR,IMREP,TRUST4,CATT], sort=False)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,Sample,CDR3,nReads_tool,total_reads_tool,frequency_tool,frequency_TCR,tool
0,sample01,CAWRGDTAQQPQHF,0.0,132472.0,0.000000,0.000022,MIXCR
1,sample01,CASSRDSPETQYF,0.0,132472.0,0.000000,0.000022,MIXCR
2,sample01,CASSYSGRALGTGELFF,0.0,132472.0,0.000000,0.000022,MIXCR
3,sample01,CASSPDGGLRSPLHF,0.0,132472.0,0.000000,0.000022,MIXCR
4,sample01,CASTPRGTVTSNQPQHF,3.0,132472.0,0.000023,0.000044,MIXCR
...,...,...,...,...,...,...,...
31201,sample03,CAISVSVPLGDEQFF,0.0,79629.0,0.000000,0.000000,CATT
31202,sample03,CASSLDSEQFF,0.0,79629.0,0.000000,0.000000,CATT
31203,sample03,CSVETGLALDTDTQYF,0.0,79629.0,0.000000,0.000000,CATT
31204,sample03,CASSPPGRGYTF,0.0,79629.0,0.000000,0.000000,CATT


Reduce the number of reads from MIXCR, IMREP, TRUST4, CATT to 28,000

In [11]:
samples = ['sample01','sample02','sample03']

df1 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 7600*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df1 =pd.concat([df1, sample])
    
df1.loc[:,'est_reads'] = 28000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [12]:
tools = ['MIXCR','IMREP','TRUST4','CATT']
shannon1 = pd.DataFrame()
for tool in tools:
    subsample = df1.loc[df1['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df1[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon1 =pd.concat([shannon1, shannon])

Reduce the number of reads from MIXCR, IMREP, TRUST4, CATT to 13,500

In [13]:
samples = ['sample01','sample02','sample03']

df2 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 3750*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df2 =pd.concat([df2, sample])
df2.loc[:,'est_reads'] = 13500

In [14]:
tools = ['MIXCR','IMREP','TRUST4','CATT']
shannon2 = pd.DataFrame()
for tool in tools:
    subsample = df2.loc[df2['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df2[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon2 =pd.concat([shannon2, shannon])

Reduce the number of reads from MIXCR, IMREP, TRUST4, CATT to 2600

In [15]:
samples = ['sample01','sample02','sample03']

df3 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 750*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df3 =pd.concat([df3, sample])
df3.loc[:,'est_reads'] = 2600

In [16]:
tools = ['MIXCR','IMREP','TRUST4','CATT']
shannon3 = pd.DataFrame()
for tool in tools:
    subsample = df3.loc[df3['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df3[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon3 =pd.concat([shannon3, shannon])

Reduce the number of reads from MIXCR, IMREP, TRUST4, CATT to 1300

In [17]:
samples = ['sample01','sample02','sample03']

df4 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 380*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df4 =pd.concat([df4, sample])
df4.loc[:,'est_reads'] = 1300

In [18]:
tools = ['MIXCR','IMREP','TRUST4','CATT']
shannon4 = pd.DataFrame()
for tool in tools:
    subsample = df4.loc[df4['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df4[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon4 =pd.concat([shannon4, shannon])

Reduce the number of reads from MIXCR, IMREP, TRUST4, CATT to 310

In [19]:
samples = ['sample01','sample02','sample03']

df5 = pd.DataFrame()
for sample in samples: 
    sample = df.loc[df['Sample']==sample]
    sample.loc[:,'new_nReads_tool'] = 88*sample['frequency_tool']
    sample.loc[:,'new_nReads_tool'] = sample['new_nReads_tool'].round().astype(int)
    sample.loc[:,'new_total_reads_tool'] = sample['new_nReads_tool'].sum()
    sample.loc[:,'new_frequency_tool'] = sample['new_nReads_tool'] / (sample['new_total_reads_tool'] * 1.0)
    tool_count = sample[sample.new_nReads_tool != 0]
    clonotype_count = tool_count.groupby(['tool'], sort=False).size().reset_index(name='clonotype_count_tool')
    sample = pd.merge(sample, clonotype_count, on=['tool'])
    df5 =pd.concat([df5, sample])
df5.loc[:,'est_reads'] = 310

In [20]:
tools = ['MIXCR','IMREP','TRUST4','CATT']
shannon5 = pd.DataFrame()
for tool in tools:
    subsample = df5.loc[df5['tool']==tool]
    diversity = subsample[subsample.new_frequency_tool != 0]
    diversity.loc[:,'shannon_index_tool'] = -(diversity['new_frequency_tool']*np.log(diversity['new_frequency_tool']))
    shannon = diversity.groupby(['Sample']).agg({'shannon_index_tool':'sum'}).reset_index().rename(columns={'':"shannon_index_tool"})
    shannon = shannon[['Sample', 'shannon_index_tool']]
    shannon.loc[:,'tool'] = tool
    new_reads = df5[['Sample','tool','new_total_reads_tool','est_reads']].drop_duplicates(keep='first', inplace=False)
    shannon = pd.merge(shannon, new_reads, how='left', on=['Sample','tool'])
    shannon5 =pd.concat([shannon5, shannon])

In [21]:
subsample_combined = pd.concat([df1, df2, df3, df4, df5])
clonotype_count = subsample_combined[['Sample','clonotype_count_tool','tool','est_reads']].drop_duplicates(keep='first', inplace=False)

In [22]:
subsample_combined = subsample_combined.drop(['clonotype_count_tool'], axis=1)
subsample_combined.to_csv('../summary_data/subsample/subsample_complete.csv', index=False)

In [23]:
shannon_combined = pd.concat([shannon1, shannon2, shannon3, shannon4, shannon5])
diversity_combined = pd.merge(shannon_original, shannon_combined, how='outer', on=['Sample'])
diversity_combined = pd.merge(diversity_combined, clonotype_count, how='outer', on=['Sample','tool','est_reads'])
diversity_combined['SDI_absolute_error'] = np.abs(diversity_combined['shannon_index_tool'] - diversity_combined['shannon_index_TCR'])

diversity_combined.to_csv('../summary_data/subsample/subsample_diversity.csv', index=False)
diversity_combined

Unnamed: 0,Sample,shannon_index_TCR,shannon_index_tool,tool,new_total_reads_tool,est_reads,clonotype_count_tool,SDI_absolute_error
0,sample01,1.072675,0.388575,MIXCR,28309,28000,111,0.6841
1,sample01,1.072675,0.446116,IMREP,28309,28000,194,0.626559
2,sample01,1.072675,0.440557,TRUST4,28309,28000,199,0.632118
3,sample01,1.072675,0.439513,CATT,28309,28000,126,0.633161
4,sample01,1.072675,0.354839,MIXCR,13769,13500,10,0.717836
5,sample01,1.072675,0.406557,IMREP,13769,13500,64,0.666117
6,sample01,1.072675,0.402618,TRUST4,13769,13500,67,0.670056
7,sample01,1.072675,0.410509,CATT,13769,13500,32,0.662166
8,sample01,1.072675,0.348609,MIXCR,2721,2600,1,0.724065
9,sample01,1.072675,0.364779,IMREP,2721,2600,7,0.707896
