In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
df = pd.read_csv("../summary_data/original/all_tools_TRB.csv")
df = df.loc[~(df['frequency_TCR'] == 0)]
df2 = pd.read_csv("../summary_data/original/all_tools_TRB_diversity.csv")

In [3]:
## Calculate TCR-Seq confirmed clonotypes

# MIXCR
MIXCR = df[df.nReads_MIXCR != 0]
clonotype_count_MIXCR = MIXCR.groupby(['Sample'], sort=False).size().reset_index(name='confirmed_clonotype_count_tool')
clonotype_count_MIXCR['tool'] = 'MIXCR'

# IMREP
IMREP = df[df.nReads_IMREP != 0]
clonotype_count_IMREP = IMREP.groupby(['Sample'], sort=False).size().reset_index(name='confirmed_clonotype_count_tool')
clonotype_count_IMREP['tool'] = 'IMREP'

# TRUST4
TRUST4 = df[df.nReads_TRUST4 != 0]
clonotype_count_TRUST4 = TRUST4.groupby(['Sample'], sort=False).size().reset_index(name='confirmed_clonotype_count_tool')
clonotype_count_TRUST4['tool'] = 'TRUST4'

# CATT
CATT = df[df.nReads_CATT != 0]
clonotype_count_CATT = CATT.groupby(['Sample'], sort=False).size().reset_index(name='confirmed_clonotype_count_tool')
clonotype_count_CATT['tool'] = 'CATT'

confirmed_clonotype_count = pd.concat([clonotype_count_MIXCR,clonotype_count_IMREP,clonotype_count_TRUST4,clonotype_count_CATT])
clonotype_count = df2[['Sample','tissue','class','clonotype_count_tool','tool']]
clonotype = pd.merge(confirmed_clonotype_count, clonotype_count, how='inner', on=['Sample','tool'])

clonotype

Unnamed: 0,Sample,confirmed_clonotype_count_tool,tool,tissue,class,clonotype_count_tool
0,SRR5233637,24,MIXCR,small_intestine,T_cell_poor_high_SDI,24
1,SRR5233639,340,MIXCR,lymph_node,T_cell_rich_high_SDI,353
2,sample01,473,MIXCR,PBMC,T_cell_rich_low_SDI,2592
3,sample02,170,MIXCR,PBMC,T_cell_rich_low_SDI,1565
4,sample03,307,MIXCR,PBMC,T_cell_rich_low_SDI,1012
...,...,...,...,...,...,...
67,sample10,36,CATT,melanoma,T_cell_poor_high_SDI,103
68,sample11,6,CATT,melanoma,T_cell_poor_high_SDI,71
69,sample12,6,CATT,melanoma,T_cell_poor_high_SDI,105
70,sample13,15,CATT,melanoma,T_cell_poor_low_SDI,242


In [4]:
mean_confirmed_clonotype = clonotype.groupby("class")["confirmed_clonotype_count_tool"].agg(["mean", "std"])
display(mean_confirmed_clonotype)
tools = ['MIXCR','IMREP','TRUST4','CATT']              
for tool in tools:
    print(tool)
    df_tool = clonotype.loc[clonotype['tool'] == tool]
    mean_confirmed_clonotype_tool = df_tool.groupby("class")["confirmed_clonotype_count_tool"].agg(["mean", "std"]) 
    display(mean_confirmed_clonotype_tool)

Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,30.659091,29.999181
T_cell_poor_low_SDI,15.5,1.290994
T_cell_rich_high_SDI,456.75,671.706392
T_cell_rich_low_SDI,448.416667,161.019174


MIXCR


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,19.875,13.043197
T_cell_poor_low_SDI,17.0,
T_cell_rich_high_SDI,128.333333,183.962859
T_cell_rich_low_SDI,316.666667,151.731122


IMREP


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,25.583333,27.586091
T_cell_poor_low_SDI,14.0,
T_cell_rich_high_SDI,666.666667,975.836735
T_cell_rich_low_SDI,478.333333,159.678844


TRUST4


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,42.416667,39.566879
T_cell_poor_low_SDI,16.0,
T_cell_rich_high_SDI,701.0,979.815799
T_cell_rich_low_SDI,538.0,160.94409


CATT


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,31.166667,28.625587
T_cell_poor_low_SDI,15.0,
T_cell_rich_high_SDI,331.0,440.809483
T_cell_rich_low_SDI,460.666667,168.921086


## T_cell_rich_low_SDI samples

In [5]:
T_cell_rich_low_SDI = clonotype.loc[clonotype['class'] == 'T_cell_rich_low_SDI']

In [6]:
df_group1 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'IMREP']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-1.271224361326518, pvalue=0.272542170089371)

In [7]:
df_group1 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'TRUST4']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-1.7331675748412383, pvalue=0.15809582408501627)

In [8]:
df_group1 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-1.0984510391726001, pvalue=0.33368414870440843)

In [9]:
df_group1 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'IMREP']
df_group2 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'TRUST4']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.45583685539221214, pvalue=0.6721572368224253)

In [10]:
df_group1 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'IMREP']
df_group2 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=0.1316409659006896, pvalue=0.9016241071699981)

In [11]:
df_group1 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'TRUST4']
df_group2 = T_cell_rich_low_SDI[T_cell_rich_low_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=0.574088617520101, pvalue=0.596646935114547)

## T_cell_rich_high_SDI samples

In [12]:
T_cell_rich_high_SDI = clonotype.loc[clonotype['class'] == 'T_cell_rich_high_SDI']

In [13]:
df_group1 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'IMREP']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.9389695167540955, pvalue=0.4009085124598878)

In [14]:
df_group1 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'TRUST4']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.994936274539376, pvalue=0.3760804584211256)

In [15]:
df_group1 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.7348989710095004, pvalue=0.5031609625972828)

In [16]:
df_group1 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'IMREP']
df_group2 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'TRUST4']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.043003024462574785, pvalue=0.967760151214584)

In [17]:
df_group1 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'IMREP']
df_group2 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=0.5429608210695903, pvalue=0.6159962382352773)

In [18]:
df_group1 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'TRUST4']
df_group2 = T_cell_rich_high_SDI[T_cell_rich_high_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=0.5964763400871438, pvalue=0.5829742750149459)

## T_cell_poor_high_SDI samples

In [19]:
T_cell_poor_high_SDI = clonotype.loc[clonotype['class'] == 'T_cell_poor_high_SDI']

In [20]:
df_group1 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'IMREP']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.5426209509956582, pvalue=0.5940463874370556)

In [21]:
df_group1 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'TRUST4']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-1.5441691388793204, pvalue=0.1399464277774178)

In [22]:
df_group1 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'MIXCR']
df_group2 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-1.0390062709182442, pvalue=0.3125581752542148)

In [23]:
df_group1 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'IMREP']
df_group2 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'TRUST4']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-1.2089445718795617, pvalue=0.2395172890990509)

In [24]:
df_group1 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'IMREP']
df_group2 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=-0.486517402263252, pvalue=0.6314118645577844)

In [25]:
df_group1 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'TRUST4']
df_group2 = T_cell_poor_high_SDI[T_cell_poor_high_SDI['tool'] == 'CATT']

stats.ttest_ind(df_group1['confirmed_clonotype_count_tool'],df_group2['confirmed_clonotype_count_tool'])

Ttest_indResult(statistic=0.7979991018708762, pvalue=0.433400843231587)