In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Sample identifiers processed by every tool below: fourteen numbered samples
# (sample01 .. sample14) followed by two SRR and three TCGA identifiers.
samples = [f'sample{number:02d}' for number in range(1, 15)] + [
    'SRR5233639',
    'SRR5233637',
    'TCGA-CZ-4862',
    'TCGA-CZ-5463',
    'TCGA-CZ-5985',
]

In [3]:
columns = ['th', 'observed_portion_frequency', 'sample']

# Load the per-sample MIXCR capture tables and stack them into one long frame.
# The frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies every accumulated row on each
# iteration, and seeding it with an empty frame relies on the deprecated
# concat-with-empty dtype handling.
frames = []
for sample in samples:
    df_temp_aa = pd.read_csv('../summary_data/original/capturing_per_sample/capturing_MIXCR_'+sample+'.csv')
    # Rescale the threshold by 100 (presumably fraction -> percent; confirm).
    df_temp_aa['th'] = df_temp_aa['th']*100
    # Remember which sample each row came from.
    df_temp_aa['sample'] = sample
    frames.append(df_temp_aa)

if frames:
    df_MIXCR = pd.concat(frames, ignore_index=True, sort=False)
    # Keep the expected columns first (created as NaN if a CSV lacks them),
    # followed by any additional columns found in the CSV files.
    extra = [c for c in df_MIXCR.columns if c not in columns]
    df_MIXCR = df_MIXCR.reindex(columns=columns + extra)
else:
    # No samples: fall back to an empty frame with the expected columns.
    df_MIXCR = pd.DataFrame(columns=columns)

# The tool label is constant for the whole frame, so it is assigned once.
df_MIXCR['tool'] = 'MIXCR'

In [4]:
columns = ['th', 'observed_portion_frequency', 'sample']

# Load the per-sample IMREP capture tables and stack them into one long frame.
# The frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies every accumulated row on each
# iteration, and seeding it with an empty frame relies on the deprecated
# concat-with-empty dtype handling.
frames = []
for sample in samples:
    df_temp_aa = pd.read_csv('../summary_data/original/capturing_per_sample/capturing_IMREP_'+sample+'.csv')
    # Rescale the threshold by 100 (presumably fraction -> percent; confirm).
    df_temp_aa['th'] = df_temp_aa['th']*100
    # Remember which sample each row came from.
    df_temp_aa['sample'] = sample
    frames.append(df_temp_aa)

if frames:
    df_IMREP = pd.concat(frames, ignore_index=True, sort=False)
    # Keep the expected columns first (created as NaN if a CSV lacks them),
    # followed by any additional columns found in the CSV files.
    extra = [c for c in df_IMREP.columns if c not in columns]
    df_IMREP = df_IMREP.reindex(columns=columns + extra)
else:
    # No samples: fall back to an empty frame with the expected columns.
    df_IMREP = pd.DataFrame(columns=columns)

# The tool label is constant for the whole frame, so it is assigned once.
df_IMREP['tool'] = 'IMREP'

In [5]:
columns = ['th', 'observed_portion_frequency', 'sample']

# Load the per-sample TRUST4 capture tables and stack them into one long frame.
# The frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies every accumulated row on each
# iteration, and seeding it with an empty frame relies on the deprecated
# concat-with-empty dtype handling.
frames = []
for sample in samples:
    df_temp_aa = pd.read_csv('../summary_data/original/capturing_per_sample/capturing_TRUST4_'+sample+'.csv')
    # Rescale the threshold by 100 (presumably fraction -> percent; confirm).
    df_temp_aa['th'] = df_temp_aa['th']*100
    # Remember which sample each row came from.
    df_temp_aa['sample'] = sample
    frames.append(df_temp_aa)

if frames:
    df_TRUST4 = pd.concat(frames, ignore_index=True, sort=False)
    # Keep the expected columns first (created as NaN if a CSV lacks them),
    # followed by any additional columns found in the CSV files.
    extra = [c for c in df_TRUST4.columns if c not in columns]
    df_TRUST4 = df_TRUST4.reindex(columns=columns + extra)
else:
    # No samples: fall back to an empty frame with the expected columns.
    df_TRUST4 = pd.DataFrame(columns=columns)

# The tool label is constant for the whole frame, so it is assigned once.
df_TRUST4['tool'] = 'TRUST4'

In [6]:
columns = ['th', 'observed_portion_frequency', 'sample']

# Load the per-sample CATT capture tables and stack them into one long frame.
# The frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies every accumulated row on each
# iteration, and seeding it with an empty frame relies on the deprecated
# concat-with-empty dtype handling.
frames = []
for sample in samples:
    df_temp_aa = pd.read_csv('../summary_data/original/capturing_per_sample/capturing_CATT_'+sample+'.csv')
    # Rescale the threshold by 100 (presumably fraction -> percent; confirm).
    df_temp_aa['th'] = df_temp_aa['th']*100
    # Remember which sample each row came from.
    df_temp_aa['sample'] = sample
    frames.append(df_temp_aa)

if frames:
    df_CATT = pd.concat(frames, ignore_index=True, sort=False)
    # Keep the expected columns first (created as NaN if a CSV lacks them),
    # followed by any additional columns found in the CSV files.
    extra = [c for c in df_CATT.columns if c not in columns]
    df_CATT = df_CATT.reindex(columns=columns + extra)
else:
    # No samples: fall back to an empty frame with the expected columns.
    df_CATT = pd.DataFrame(columns=columns)

# The tool label is constant for the whole frame, so it is assigned once.
df_CATT['tool'] = 'CATT'

In [7]:
# Stack the four per-tool frames (MIXCR, IMREP, TRUST4, CATT) into one table with a
# fresh 0..n-1 index. The former `df_combined['th'] = df_combined['th']` line was a
# self-assignment with no effect and has been removed.
df_combined = pd.concat([df_MIXCR,df_IMREP,df_TRUST4,df_CATT], ignore_index=True, sort=False)

In [8]:
df = pd.read_csv("../summary_data/original/all_tools_TRB_diversity.csv")

# Sample-level annotation columns that get attached to every capture row.
metadata_columns = ['class', 'tissue_type', 'repertoire_type']

# De-duplicated annotation table keyed by 'sample' (renamed from 'Sample' so the
# key matches the column name used in df_combined).
tissue_type = (
    df[['Sample'] + metadata_columns]
    .drop_duplicates(keep='first')
    .rename(columns={'Sample': 'sample'})
)

# df_combined is overwritten by the merge, so annotation columns left over from a
# previous run of this cell are dropped first; otherwise a re-run would produce
# suffixed duplicates (class_x / class_y, ...). validate='many_to_one' makes the
# merge raise if a sample carries more than one distinct annotation row, instead
# of silently duplicating capture rows. The inner join still drops samples that
# have no annotation.
df_combined = pd.merge(
    df_combined.drop(columns=metadata_columns, errors='ignore'),
    tissue_type,
    how='inner',
    on=['sample'],
    validate='many_to_one',
)

In [9]:
# Restrict the combined table to the rows whose threshold `th` equals 0;
# all statistics below are computed on this subset.
portion = df_combined[df_combined['th'].eq(0)]
portion

Unnamed: 0,th,observed_portion_frequency,sample,tool,class,tissue_type,repertoire_type
0,0.0,0.927752,sample01,MIXCR,T_cell_rich_low_SDI,T_cell_rich,low_SDI
100000,0.0,0.931384,sample01,IMREP,T_cell_rich_low_SDI,T_cell_rich,low_SDI
200000,0.0,0.934840,sample01,TRUST4,T_cell_rich_low_SDI,T_cell_rich,low_SDI
300000,0.0,0.932676,sample01,CATT,T_cell_rich_low_SDI,T_cell_rich,low_SDI
400000,0.0,0.920729,sample02,MIXCR,T_cell_rich_low_SDI,T_cell_rich,low_SDI
...,...,...,...,...,...,...,...
7100000,0.0,0.527295,TCGA-CZ-5463,CATT,T_cell_poor_high_SDI,T_cell_poor,high_SDI
7200000,0.0,0.000000,TCGA-CZ-5985,MIXCR,T_cell_poor_high_SDI,T_cell_poor,high_SDI
7300000,0.0,0.057547,TCGA-CZ-5985,IMREP,T_cell_poor_high_SDI,T_cell_poor,high_SDI
7400000,0.0,0.092014,TCGA-CZ-5985,TRUST4,T_cell_poor_high_SDI,T_cell_poor,high_SDI


In [10]:
# Mean and standard deviation of the observed portion frequency for every
# (class, tool) pair. The std is NaN where a group holds a single value
# (see the T_cell_poor_low_SDI rows in the output).
frequency_by_class_and_tool = portion.groupby(["class", "tool"])["observed_portion_frequency"]
mean_portion = frequency_by_class_and_tool.agg(["mean", "std"])
display(mean_portion)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
class,tool,Unnamed: 2_level_1,Unnamed: 3_level_1
T_cell_poor_high_SDI,CATT,0.187147,0.151146
T_cell_poor_high_SDI,IMREP,0.135212,0.144851
T_cell_poor_high_SDI,MIXCR,0.065509,0.075887
T_cell_poor_high_SDI,TRUST4,0.196882,0.176464
T_cell_poor_low_SDI,CATT,0.759611,
T_cell_poor_low_SDI,IMREP,0.759302,
T_cell_poor_low_SDI,MIXCR,0.771268,
T_cell_poor_low_SDI,TRUST4,0.767794,
T_cell_rich_high_SDI,CATT,0.106577,0.095847
T_cell_rich_high_SDI,IMREP,0.119767,0.118781


## T_cell_rich_low_SDI samples

In [11]:
# th == 0 rows belonging to the T_cell_rich_low_SDI class.
T_cell_rich_low_SDI = portion[portion['class'].eq('T_cell_rich_low_SDI')]

In [12]:
# MIXCR vs IMREP within T_cell_rich_low_SDI: independent two-sample t-test
# (scipy default, equal variances) on the observed portion frequency.
# NOTE(review): every tool is run on the same `samples`, so a paired test
# (stats.ttest_rel on sample-aligned values) may be the better fit - confirm.
df_group1, df_group2 = (
    T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'IMREP')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.5075701861322702, pvalue=0.6384616941484779)

In [13]:
# MIXCR vs TRUST4 within T_cell_rich_low_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'TRUST4')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.8679121976592222, pvalue=0.4344108573086754)

In [14]:
# MIXCR vs CATT within T_cell_rich_low_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.5637620092029205, pvalue=0.603022740115076)

In [15]:
# IMREP vs TRUST4 within T_cell_rich_low_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'].eq(tool)]
    for tool in ('IMREP', 'TRUST4')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.33832907158905223, pvalue=0.7521279345465594)

In [16]:
# IMREP vs CATT within T_cell_rich_low_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'].eq(tool)]
    for tool in ('IMREP', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.048734834568821765, pvalue=0.9634669486376922)

In [17]:
# TRUST4 vs CATT within T_cell_rich_low_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'].eq(tool)]
    for tool in ('TRUST4', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=0.29298957308320356, pvalue=0.7841011112316416)

## T_cell_rich_high_SDI samples

In [18]:
# th == 0 rows belonging to the T_cell_rich_high_SDI class.
T_cell_rich_high_SDI = portion[portion['class'].eq('T_cell_rich_high_SDI')]

In [19]:
# MIXCR vs IMREP within T_cell_rich_high_SDI: independent two-sample t-test
# (scipy default, equal variances) on the observed portion frequency.
# NOTE(review): every tool is run on the same `samples`, so a paired test
# (stats.ttest_rel on sample-aligned values) may be the better fit - confirm.
df_group1, df_group2 = (
    T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'IMREP')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.42493445290693493, pvalue=0.6927461842203302)

In [20]:
# MIXCR vs TRUST4 within T_cell_rich_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'TRUST4')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.606398128234843, pvalue=0.5769811381094867)

In [21]:
# MIXCR vs CATT within T_cell_rich_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.30437465565421395, pvalue=0.7760204631367096)

In [22]:
# IMREP vs TRUST4 within T_cell_rich_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'].eq(tool)]
    for tool in ('IMREP', 'TRUST4')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.17850995808555148, pvalue=0.8669989664933072)

In [23]:
# IMREP vs CATT within T_cell_rich_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'].eq(tool)]
    for tool in ('IMREP', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=0.1496905438172644, pvalue=0.888253114583344)

In [24]:
# TRUST4 vs CATT within T_cell_rich_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'].eq(tool)]
    for tool in ('TRUST4', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=0.33666085830138415, pvalue=0.7532942813115686)

## T_cell_poor_high_SDI samples

In [25]:
# th == 0 rows belonging to the T_cell_poor_high_SDI class.
T_cell_poor_high_SDI = portion[portion['class'].eq('T_cell_poor_high_SDI')]

In [26]:
# MIXCR vs IMREP within T_cell_poor_high_SDI: independent two-sample t-test
# (scipy default, equal variances) on the observed portion frequency.
# NOTE(review): every tool is run on the same `samples`, so a paired test
# (stats.ttest_rel on sample-aligned values) may be the better fit - confirm.
df_group1, df_group2 = (
    T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'IMREP')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-1.4765704459871538, pvalue=0.15396263739846128)

In [27]:
# MIXCR vs TRUST4 within T_cell_poor_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'TRUST4')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-2.3691403257946027, pvalue=0.02702705241260024)

In [28]:
# MIXCR vs CATT within T_cell_poor_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'].eq(tool)]
    for tool in ('MIXCR', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-2.491421008939769, pvalue=0.020752080825163558)

In [29]:
# IMREP vs TRUST4 within T_cell_poor_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'].eq(tool)]
    for tool in ('IMREP', 'TRUST4')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.935741049280507, pvalue=0.35956528236497365)

In [30]:
# IMREP vs CATT within T_cell_poor_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'].eq(tool)]
    for tool in ('IMREP', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=-0.8593766961524805, pvalue=0.3994003677136857)

In [31]:
# TRUST4 vs CATT within T_cell_poor_high_SDI: independent two-sample t-test
# on the observed portion frequency.
df_group1, df_group2 = (
    T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'].eq(tool)]
    for tool in ('TRUST4', 'CATT')
)

stats.ttest_ind(
    df_group1['observed_portion_frequency'],
    df_group2['observed_portion_frequency'],
)

Ttest_indResult(statistic=0.14513469749976585, pvalue=0.885926001052608)