In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Load the per-clonotype table and keep only rows whose frequency_TCR is not 0.
# (Comparison with `!= 0` also keeps rows where frequency_TCR is missing.)
df = (
    pd.read_csv("../summary_data/original/all_tools_TRB.csv")
    .loc[lambda table: table['frequency_TCR'] != 0]
)
# Second table; later cells read Sample, tissue, class, clonotype_count_tool and tool from it.
df2 = pd.read_csv("../summary_data/original/all_tools_TRB_diversity.csv")

In [3]:
## Calculate TCR-Seq confirmed clonotypes

def count_confirmed_clonotypes(clonotypes, tool):
    """Count, per sample, the rows of ``clonotypes`` with a non-zero read count for ``tool``.

    Parameters
    ----------
    clonotypes : pandas.DataFrame
        Table with a ``Sample`` column and an ``nReads_<tool>`` column.
    tool : str
        Tool name; selects the ``nReads_<tool>`` column and labels the output rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sample``, ``confirmed_clonotype_count_tool`` and ``tool``.
        Samples appear in first-occurrence order (``sort=False``). A sample with
        no qualifying row produces no output row at all (not a zero count).

    Notes
    -----
    Rows are selected with ``!= 0``, so a missing (NaN) read count is also kept.
    """
    detected = clonotypes[clonotypes[f'nReads_{tool}'] != 0]
    return (
        detected.groupby(['Sample'], sort=False)
        .size()
        .reset_index(name='confirmed_clonotype_count_tool')
        .assign(tool=tool)
    )

# One helper call per tool, stacked in the order MIXCR, IMREP, TRUST4, CATT.
confirmed_clonotype_count = pd.concat(
    [count_confirmed_clonotypes(df, tool) for tool in ('MIXCR', 'IMREP', 'TRUST4', 'CATT')]
)
clonotype_count = df2[['Sample','tissue','class','clonotype_count_tool','tool']]
# Inner join: a (Sample, tool) pair missing from either side is dropped, which
# includes any sample for which a tool has no non-zero-read row in `df`.
# NOTE(review): confirm that excluding those pairs from the downstream
# statistics is intended.
clonotype = pd.merge(confirmed_clonotype_count, clonotype_count, how='inner', on=['Sample','tool'])

clonotype

Unnamed: 0,Sample,confirmed_clonotype_count_tool,tool,tissue,class,clonotype_count_tool
0,SRR5233637,24,MIXCR,small_intestine,T_cell_poor_high_SDI,24
1,SRR5233639,340,MIXCR,lymph_node,T_cell_rich_high_SDI,353
2,sample01,473,MIXCR,PBMC,T_cell_rich_low_SDI,2592
3,sample02,170,MIXCR,PBMC,T_cell_rich_low_SDI,1565
4,sample03,307,MIXCR,PBMC,T_cell_rich_low_SDI,1012
...,...,...,...,...,...,...
67,sample10,36,CATT,melanoma,T_cell_poor_high_SDI,103
68,sample11,6,CATT,melanoma,T_cell_poor_high_SDI,71
69,sample12,6,CATT,melanoma,T_cell_poor_high_SDI,105
70,sample13,15,CATT,melanoma,T_cell_poor_low_SDI,242


In [4]:
# Mean and standard deviation of clonotype_count_tool per sample class:
# first over all tools pooled, then separately for each tool.
# NOTE(review): the variable names say "confirmed", but the aggregated column is
# `clonotype_count_tool`, not `confirmed_clonotype_count_tool` -- confirm which
# one is intended.
summary_stats = ["mean", "std"]
mean_confirmed_clonotype = clonotype.groupby("class")["clonotype_count_tool"].agg(summary_stats)
display(mean_confirmed_clonotype)

tools = ['MIXCR', 'IMREP', 'TRUST4', 'CATT']
for tool in tools:
    print(tool)
    is_tool = clonotype['tool'] == tool
    # std is NaN for any class that has a single row for this tool.
    mean_confirmed_clonotype_tool = (
        clonotype.loc[is_tool]
        .groupby("class")["clonotype_count_tool"]
        .agg(summary_stats)
    )
    display(mean_confirmed_clonotype_tool)

Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,58.045455,52.270742
T_cell_poor_low_SDI,157.25,65.703247
T_cell_rich_high_SDI,690.083333,851.657897
T_cell_rich_low_SDI,3480.166667,1239.397422


MIXCR


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,24.875,15.084879
T_cell_poor_low_SDI,96.0,
T_cell_rich_high_SDI,145.0,181.353246
T_cell_rich_low_SDI,1723.0,801.762434


IMREP


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,37.583333,39.619688
T_cell_poor_low_SDI,116.0,
T_cell_rich_high_SDI,965.666667,1216.758124
T_cell_rich_low_SDI,3862.333333,466.422913


TRUST4


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,66.75,59.29606
T_cell_poor_low_SDI,175.0,
T_cell_rich_high_SDI,1037.666667,1192.201465
T_cell_rich_low_SDI,4450.666667,723.176558


CATT


Unnamed: 0_level_0,mean,std
class,Unnamed: 1_level_1,Unnamed: 2_level_1
T_cell_poor_high_SDI,91.916667,53.467846
T_cell_poor_low_SDI,242.0,
T_cell_rich_high_SDI,612.0,551.881328
T_cell_rich_low_SDI,3884.666667,745.962689


## T_cell_rich_low_SDI samples

In [5]:
# Rows of `clonotype` whose class is T_cell_rich_low_SDI; the following cells
# compare tools pairwise on this subset with stats.ttest_ind.
# NOTE(review): ttest_ind assumes independent groups and, by default, equal
# variances; if the tools were run on the same samples a paired test may fit
# better, and the six pairwise p-values are not corrected for multiple
# comparisons -- confirm the intended design.
T_cell_rich_low_SDI = clonotype[clonotype['class'].eq('T_cell_rich_low_SDI')]

In [6]:
# Independent two-sample t-test (scipy defaults): MIXCR vs IMREP
# clonotype_count_tool within the T_cell_rich_low_SDI subset.
mixcr_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
imrep_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'IMREP', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, imrep_counts)

Ttest_indResult(statistic=-3.9948052758136705, pvalue=0.016199965821544975)

In [7]:
# Independent two-sample t-test (scipy defaults): MIXCR vs TRUST4
# clonotype_count_tool within the T_cell_rich_low_SDI subset.
mixcr_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
trust4_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, trust4_counts)

Ttest_indResult(statistic=-4.375609256369988, pvalue=0.011915708603934079)

In [8]:
# Independent two-sample t-test (scipy defaults): MIXCR vs CATT
# clonotype_count_tool within the T_cell_rich_low_SDI subset.
mixcr_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
catt_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, catt_counts)

Ttest_indResult(statistic=-3.4189161832010964, pvalue=0.02680684138575569)

In [9]:
# Independent two-sample t-test (scipy defaults): IMREP vs TRUST4
# clonotype_count_tool within the T_cell_rich_low_SDI subset.
imrep_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'IMREP', 'clonotype_count_tool']
trust4_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']

stats.ttest_ind(imrep_counts, trust4_counts)

Ttest_indResult(statistic=-1.1841627225785132, pvalue=0.3019064069859213)

In [10]:
# Independent two-sample t-test (scipy defaults): IMREP vs CATT
# clonotype_count_tool within the T_cell_rich_low_SDI subset.
imrep_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'IMREP', 'clonotype_count_tool']
catt_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(imrep_counts, catt_counts)

Ttest_indResult(statistic=-0.043968404093680975, pvalue=0.9670369715406371)

In [11]:
# Independent two-sample t-test (scipy defaults): TRUST4 vs CATT
# clonotype_count_tool within the T_cell_rich_low_SDI subset.
trust4_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']
catt_counts = T_cell_rich_low_SDI.loc[T_cell_rich_low_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(trust4_counts, catt_counts)

Ttest_indResult(statistic=0.9435759567400029, pvalue=0.3988134626125689)

## T_cell_rich_high_SDI samples

In [12]:
# Rows of `clonotype` whose class is T_cell_rich_high_SDI; the following cells
# compare tools pairwise on this subset with stats.ttest_ind.
# NOTE(review): ttest_ind assumes independent groups and, by default, equal
# variances; if the tools were run on the same samples a paired test may fit
# better, and the six pairwise p-values are not corrected for multiple
# comparisons -- confirm the intended design.
T_cell_rich_high_SDI = clonotype[clonotype['class'].eq('T_cell_rich_high_SDI')]

In [13]:
# Independent two-sample t-test (scipy defaults): MIXCR vs IMREP
# clonotype_count_tool within the T_cell_rich_high_SDI subset.
mixcr_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
imrep_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'IMREP', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, imrep_counts)

Ttest_indResult(statistic=-1.1554524742721475, pvalue=0.3122253886558171)

In [14]:
# Independent two-sample t-test (scipy defaults): MIXCR vs TRUST4
# clonotype_count_tool within the T_cell_rich_high_SDI subset.
mixcr_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
trust4_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, trust4_counts)

Ttest_indResult(statistic=-1.2821324667954137, pvalue=0.26906112753823963)

In [15]:
# Independent two-sample t-test (scipy defaults): MIXCR vs CATT
# clonotype_count_tool within the T_cell_rich_high_SDI subset.
mixcr_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
catt_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, catt_counts)

Ttest_indResult(statistic=-1.3924033833498795, pvalue=0.2362124306443352)

In [16]:
# Independent two-sample t-test (scipy defaults): IMREP vs TRUST4
# clonotype_count_tool within the T_cell_rich_high_SDI subset.
imrep_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'IMREP', 'clonotype_count_tool']
trust4_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']

stats.ttest_ind(imrep_counts, trust4_counts)

Ttest_indResult(statistic=-0.07320757871656497, pvalue=0.945155533749319)

In [17]:
# Independent two-sample t-test (scipy defaults): IMREP vs CATT
# clonotype_count_tool within the T_cell_rich_high_SDI subset.
imrep_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'IMREP', 'clonotype_count_tool']
catt_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(imrep_counts, catt_counts)

Ttest_indResult(statistic=0.45848668252886726, pvalue=0.6704074619471836)

In [18]:
# Independent two-sample t-test (scipy defaults): TRUST4 vs CATT
# clonotype_count_tool within the T_cell_rich_high_SDI subset.
trust4_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']
catt_counts = T_cell_rich_high_SDI.loc[T_cell_rich_high_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(trust4_counts, catt_counts)

Ttest_indResult(statistic=0.561203375730923, pvalue=0.6046091608141135)

## T_cell_poor_high_SDI samples

In [19]:
# Rows of `clonotype` whose class is T_cell_poor_high_SDI; the following cells
# compare tools pairwise on this subset with stats.ttest_ind.
# NOTE(review): ttest_ind assumes independent groups and, by default, equal
# variances; if the tools were run on the same samples a paired test may fit
# better, and the six pairwise p-values are not corrected for multiple
# comparisons -- confirm the intended design.
T_cell_poor_high_SDI = clonotype[clonotype['class'].eq('T_cell_poor_high_SDI')]

In [20]:
# Independent two-sample t-test (scipy defaults): MIXCR vs IMREP
# clonotype_count_tool within the T_cell_poor_high_SDI subset.
mixcr_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
imrep_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'IMREP', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, imrep_counts)

Ttest_indResult(statistic=-0.8601556044059201, pvalue=0.40101513700513103)

In [21]:
# Independent two-sample t-test (scipy defaults): MIXCR vs TRUST4
# clonotype_count_tool within the T_cell_poor_high_SDI subset.
mixcr_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
trust4_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, trust4_counts)

Ttest_indResult(statistic=-1.9396599329431308, pvalue=0.06825625697855864)

In [22]:
# Independent two-sample t-test (scipy defaults): MIXCR vs CATT
# clonotype_count_tool within the T_cell_poor_high_SDI subset.
mixcr_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'MIXCR', 'clonotype_count_tool']
catt_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(mixcr_counts, catt_counts)

Ttest_indResult(statistic=-3.4283331929405594, pvalue=0.002997322586991822)

In [23]:
# Independent two-sample t-test (scipy defaults): IMREP vs TRUST4
# clonotype_count_tool within the T_cell_poor_high_SDI subset.
imrep_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'IMREP', 'clonotype_count_tool']
trust4_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']

stats.ttest_ind(imrep_counts, trust4_counts)

Ttest_indResult(statistic=-1.4167728994111142, pvalue=0.17055496281284893)

In [24]:
# Independent two-sample t-test (scipy defaults): IMREP vs CATT
# clonotype_count_tool within the T_cell_poor_high_SDI subset.
imrep_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'IMREP', 'clonotype_count_tool']
catt_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(imrep_counts, catt_counts)

Ttest_indResult(statistic=-2.8283109981761516, pvalue=0.00978481366332312)

In [25]:
# Independent two-sample t-test (scipy defaults): TRUST4 vs CATT
# clonotype_count_tool within the T_cell_poor_high_SDI subset.
trust4_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'TRUST4', 'clonotype_count_tool']
catt_counts = T_cell_poor_high_SDI.loc[T_cell_poor_high_SDI['tool'] == 'CATT', 'clonotype_count_tool']

stats.ttest_ind(trust4_counts, catt_counts)

Ttest_indResult(statistic=-1.0918975776181423, pvalue=0.2866919459760997)