In [None]:
import scanpy as sc
from glob import glob
import pandas as pd
import numpy as np
import seaborn as sns
import anndata
import scipy
import re
import os
import matplotlib
import math
import random
import itertools
import gseapy as gp
import statsmodels.stats.multitest
from statannot import add_stat_annotation
from matplotlib import pyplot as plt
from matplotlib import rcParams
from matplotlib.legend import Legend
import matplotlib.gridspec as gridspec

import generalfunctions as gf
import populationfunctions as pf
import airrfunctions as airr
import dgexfunctions as dgexfunc

%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.options.display.max_seq_items = 2000

sc.set_figure_params(scanpy=True, dpi=300, dpi_save=300, frameon=True, vector_friendly=True, fontsize=12, 
                         color_map='Dark2', format='pdf', transparent=True, ipython_format='png2x')

rcParams.update({'font.size': 8})
rcParams.update({'font.family': 'Helvetica'})
rcParams['pdf.fonttype'] = 42
rcParams['ps.fonttype'] = 42
rcParams['svg.fonttype'] = 'none'
rcParams['figure.facecolor'] = (1,1,1,1)

import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.core.display import display, HTML
display(HTML("""
<style>
#notebook-container {
    width: 100%
}
 
.code_cell {
   flex-direction: row !important;
}
 
.code_cell .input {
    width: 60%
}
 
.code_cell .output_wrapper {
    width: 40%
}
</style>
"""))

In [None]:
## load celltype data

# File name of the precomputed UMAP embedding for each cell compartment.
embFiles = dict(tcell_filtered='umap_n-0055_md-0.80_s-2.28.npy',
                bcell_filtered='umap_n-0028_md-0.40_s-1.39.npy',
                myeloid_filtered='umap_n-0064_md-0.10_s-1.61.npy')

# File name of the cluster labels (stored below as obs['leiden']) for each compartment.
clustFiles = dict(tcell_filtered='scvi_cugraph_leiden_nbr100_res0.6.npy',
                  bcell_filtered='scvi_cugraph_leiden_nbr100_res0.6.npy',
                  myeloid_filtered='scvi_cugraph_leiden_nbr30_res0.8.npy')


celltype = 'tcell_filtered'
path = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/scvi_outputs/'

adata = sc.read_h5ad('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/h5adfiles/PembroRT_immune_R100_final.h5ad')

# All per-compartment inputs live under <path>/<celltype>/.
celltype_dir = path+'/'+celltype+'/'

# Subset the full object to the barcodes saved for this compartment.
BC = np.load(celltype_dir+'barcodes.npy',allow_pickle=True)
adata = adata[list(BC)].copy()
adata

emb = np.load(celltype_dir+embFiles[celltype])
clust = np.load(celltype_dir+clustFiles[celltype])

# Attach the embedding and the cluster labels (as a categorical of strings).
adata.obsm['X_umap'] = emb
adata.obs['leiden'] = list(map(str,clust))
adata.obs['leiden'] = adata.obs['leiden'].astype('category')

# Bring in clinical metadata, restricted to columns obs does not already have,
# joined per cell on obs['cohort'] == metadata['Patient_Number'].
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20210624.csv',index_col=None,header=0)
existing_obs_columns = set(adata.obs.columns)
cols = [name for name in metadata.columns if name not in existing_obs_columns]
metadata = metadata[cols]
obs_flat = adata.obs.reset_index()
obs_merged = obs_flat.merge(metadata,left_on='cohort',right_on='Patient_Number',how='left')
# reset_index() exposed the cell index as the 'index' column; restore it.
adata.obs = obs_merged.set_index('index')

# Marker size for scatter plots, inversely proportional to the number of cells.
dotsize = 2*(120000/len(adata))

sc.pl.umap(adata,color='leiden',legend_loc='on data',size=dotsize,show=False)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/umap_by_leiden.png',dpi=600,bbox_inches='tight')

# Snapshot the object in .raw before normalising, then normalise each cell to
# 10,000 counts and log1p-transform.
adata.raw = adata
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
sc.pp.log1p(adata)

In [None]:
## diffusion component embedding

BC = np.load(path+'/'+celltype+'/barcodes.npy',allow_pickle=True)
adata = adata[list(BC)].copy()

# Drop clusters '00', '01' and '07'. Take an explicit copy instead of leaving a
# view: .obsm is assigned just below, and writing to an AnnData view forces an
# implicit copy (whose warning is suppressed by the global warnings filter).
excluded_clusters = ['00','01','07']
adata = adata[~adata.obs.leiden.isin(excluded_clusters)].copy()

# Precomputed diffusion-map coordinates for the filtered cells.
# NOTE(review): assumes the rows of this array are in the same order as the
# cells remaining after the filter above -- confirm against how it was saved.
emb = np.load(path+'/'+celltype+'/diffmap_nbr100_filtered.npy',allow_pickle=True)
adata.obsm['X_diffmap'] = emb


# Component pairs to plot (one grid row each) and clusters (one grid column each).
c = ['1,2','1,3','1,6','2,6','3,6','2,3']
st = adata.obs.leiden.unique().tolist()
# Row-major (component pair, cluster) order, matching np.ravel(axs) below.
forder = list(itertools.product(c,st))

sns.set_style("white", rc={"font.family":"Helvetica","axes.grid":False})
sns.set_context("paper", rc={"font.size":4,"axes.titlesize":4,"axes.labelsize":4,"font.family":"Helvetica","xtick.labelsize":4,"ytick.labelsize":4})
fig,axs = plt.subplots(nrows=len(c),ncols=len(st),sharex=True,sharey=True,figsize=(len(st),len(c)))

# One panel per (component pair, cluster): only that cluster is highlighted.
for f,ax in zip(forder,np.ravel(axs)):
    sc.pl.diffmap(adata,color='leiden',components=f[0],groups=f[1],projection='2d',size=dotsize,ax=ax,title=f[1],show=False)


#### <font size="6"> TCR </font>
---------------------------------------------------------

In [None]:
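# ## Add transition/detection flags for each TCR and write them to TCRtable_annotation_expansion_detection.csv - run only once (kept commented out; the next cell loads the saved CSV)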

# adata.obs.drop(columns='combined_tcr',inplace=True)

# df = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/TCR/TCRtable_annotation_expansion.csv',header=0,index_col=0)

# cols = [x for x in df.columns if x not in adata.obs.columns]
# df = df[cols]

# adata.obs = adata.obs.merge(df,how='left',left_index=True,right_index=True)

# tmp = adata[~pd.isnull(adata.obs.medianExpansion)].obs.copy()
# len(tmp)
# tmp['combined_tcr_v2'] = [x+'_'+y for x,y in zip(tmp.cohort,tmp.combined_tcr)]

# Base = tmp[(tmp.treatment=='Base')]['combined_tcr_v2'].unique().tolist()
# PD1 = tmp[(tmp.treatment=='PD1')]['combined_tcr_v2'].unique().tolist()
# RTPD1 = tmp[(tmp.treatment=='RTPD1')]['combined_tcr_v2'].unique().tolist()
# expBase = tmp[(tmp.treatment=='Base')&(tmp.medianExpansion=='expanded')]['combined_tcr_v2'].unique().tolist()
# notexpBase = tmp[(tmp.treatment=='Base')&(tmp.medianExpansion=='not_expanded')]['combined_tcr_v2'].unique().tolist()
# expPD1 = tmp[(tmp.treatment=='PD1')&(tmp.medianExpansion=='expanded')]['combined_tcr_v2'].unique().tolist()
# notexpPD1 = tmp[(tmp.treatment=='PD1')&(tmp.medianExpansion=='not_expanded')]['combined_tcr_v2'].unique().tolist()
# expRTPD1 = tmp[(tmp.treatment=='RTPD1')&(tmp.medianExpansion=='expanded')]['combined_tcr_v2'].unique().tolist()
# notexpRTPD1 = tmp[(tmp.treatment=='RTPD1')&(tmp.medianExpansion=='not_expanded')]['combined_tcr_v2'].unique().tolist()

# D1 = set(Base)
# D2 = set(PD1)
# D3 = set(RTPD1)
# E1 = set(expBase)
# E2 = set(expPD1)
# E3 = set(expRTPD1)
# NE1 = set(notexpBase)
# NE2 = set(notexpPD1)
# NE3 = set(notexpRTPD1)

# E1_E2_NE3 = E1&E2-E3
# E1_E2_E3 = E1&E2&E3
# E1_NE2_NE3 = E1-E2-E3
# E1_E2 = E1&E2
# E1_E3 = E1&E3
# E1_x2_x3 = E1-D2-D3
# D1_E2 = D1&E2
# D1_E3 = D1&E3
# D1_E2_E3 = D1&E2&E3
# D1_E2_NE3 = D1&E2-E3
# x1_E2 = E2-D1
# NE1_E2 = E2-E1
# x1_E2_x3 = E2-D1-D3
# x1_E2_D3 = E2-D1&D3
# NE1_NE2_E3 = E3-E2-E1 
# NE1_E3 = E3-E1
# NE2_E3 = E3-E2
# E2_E3 = E2&E3
# x1_x2_E3 = E3-D1-D2 
# x1_E3 = E3-D1
# x2_E3 = E3-D2
# D1_D2_E3 = D1&D2&E3
# D2_E3 = D2&E3
# x1_D2_E3 = E3&D2-D1
# D1_D2_D3 = D1&D2&D3
# D1_D2_x3 = D1&D2-D3
# D1_x2_D3 = D1&D3-D2
# x1_D2_D3 = D2&D3-D1

# opts = ['E1_E2_NE3','E1_E2_E3','E1_NE2_NE3','E1_E2','E1_E3','E1_x2_x3','D1_E2','D1_E3',
#        'D1_E2_E3','D1_E2_NE3','x1_E2','NE1_E2','x1_E2_x3','x1_E2_D3','NE1_NE2_E3',
#        'NE1_E3','NE2_E3','E2_E3','x1_x2_E3','x1_E3','x2_E3','D1_D2_E3','D2_E3','x1_D2_E3','D1_D2_D3',
#        'D1_D2_x3','D1_x2_D3','x1_D2_D3','D1','D2','D3','E1','E2','E3','NE1','NE2','NE3']

# for o in opts:
    
#     tmp[o] = pd.DataFrame('N',index=tmp.index,columns=[o])
    
#     for t in eval(o):
#         tmp.loc[(tmp.combined_tcr_v2==t),o] = 'Y'
        
#     print(o)
    
# tmp.to_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/TCR/TCRtable_annotation_expansion_detection.csv')

In [None]:
# Replace the existing combined_tcr column with the annotation table written
# by the (commented-out) cell above, then colour the UMAP by expansion status.
adata.obs = adata.obs.drop(columns='combined_tcr')

df = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/TCR/TCRtable_annotation_expansion_detection.csv',header=0,index_col=0)

# Only add columns that obs does not already carry.
existing_obs_columns = set(adata.obs.columns)
cols = [name for name in df.columns if name not in existing_obs_columns]
df = df[cols]

# Left join on the cell index: cells absent from the table get NaN.
obs_with_tcr = adata.obs.merge(df,how='left',left_index=True,right_index=True)
adata.obs = obs_with_tcr
# adata.obs.to_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/metadata_collection/tcell_filtered_TCR_obs.csv')

sc.pl.umap(
    adata,
    color=['medianExpansion'],
    groups=['expanded','not_expanded'],
    palette=['tab:blue','tab:gray','tab:orange'],
    size=dotsize,
    show=False,
)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/expansion_umap.png',dpi=600,bbox_inches='tight')

In [None]:
## density umap of expanded vs non-expanded cells

# Colour per medianExpansion category (including the 'nan' label).
stColors = dict(expanded='b',not_expanded='r',nan='w')

fig,axs = gf.umap_density(adata,groupby='medianExpansion',colors=stColors,dotsize=0.002,figsize=(6,2))

analysis_dir = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype
plt.savefig(analysis_dir+'/expansion_umap_density.png',dpi=600,bbox_inches='tight')

In [None]:
# Diffusion-map component pairs to plot, one stacked panel per pair.
c = ['1,2','1,3','1,6','2,6','3,6','2,3']

sns.set_style("white", rc={"font.family":"Helvetica","axes.grid":False})
sns.set_context("paper", rc={"font.size":4,"axes.titlesize":4,"axes.labelsize":4,"font.family":"Helvetica","xtick.labelsize":4,"ytick.labelsize":4})
fig,axs = plt.subplots(nrows=len(c),ncols=1,sharex=True,sharey=True,figsize=(1,len(c)))

# Colour every panel by expansion status; markers are 20x smaller than on the UMAPs.
for f,ax in zip(c,np.ravel(axs)):
    panel_kwargs = dict(
        color='medianExpansion',
        groups=['expanded','not_expanded'],
        palette=['tab:blue','tab:gray','tab:orange'],
        components=f,
        projection='2d',
        size=dotsize/20,
        ax=ax,
        show=False,
    )
    _= sc.pl.diffmap(adata,**panel_kwargs)


In [None]:
## density umap of newly expanded cells
grps = ['E1','NE1_E2','NE1_NE2_E3']

# Colour per flag value ('Y' / 'N' / 'nan') of the obs column being plotted.
stColors = dict(Y='b',N='r',nan='w')

analysis_dir = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype

# One density figure per flag column; each is saved and closed before the next.
for g in grps:
    fig,axs = gf.umap_density(adata,groupby=g,colors=stColors,dotsize=0.002,figsize=(6,2))
    plt.savefig(analysis_dir+'/'+g+'_umap.png',dpi=600,bbox_inches='tight')
    plt.close()

In [None]:
## percent sharing of TCRs between patients

# Keep only cells whose TRA_cdr3 entry is not the string 'nan'.
has_tra = adata.obs.TRA_cdr3!='nan'
adata = adata[has_tra].copy()

# NOTE(review): no `rf` alias appears among the imports at the top of this
# notebook -- confirm which module provides receptor_sharing.
sharing_kwargs = dict(calc=True,groupby='combined_tcr',fontsize=2,figsize=(6,4),vmax=1)
fig,axs,df = rf.receptor_sharing(adata,**sharing_kwargs)
# fig,axs,df = rf.receptor_sharing(df=df,calc=False,groupby='combined_tcr',fontsize=2,figsize=(6,4),vmax=1)

analysis_dir = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype
plt.savefig(analysis_dir+'/TCR_sharing_heatmap.svg')

In [None]:
## T-cell clonality
# Keep only cells whose TRA_cdr3 entry is not the string 'nan'.
has_tra = adata.obs.TRA_cdr3!='nan'
adata = adata[has_tra].copy()

# fig,axs = tf.clonality_boxplot(adata,df=None,groupby=None,rep='cohort',xcat='treatment',hcat='pCR',
#                              xorder=['Base','PD1','RTPD1'],horder=['R','NR'],
#                              metrics = ['percent_cells','percent_tcrs','shannon','gini'],
#                              show_stats=True,calc_pct=True,drop_na=True,fontsize=4)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_by_tx_boxplot.svg')
# plt.close()

# NOTE(review): no `tf` alias appears among the imports at the top of this
# notebook -- confirm which module provides the clonality helpers.
# Clonality metrics per cohort, treatment on the x axis, split by pCR.
lineplot_kwargs = dict(
    df=None,
    rep='cohort',
    xcat='treatment',
    hcat='pCR',
    xorder=['Base','PD1','RTPD1'],
    horder=['R','NR'],
    metrics=['percent_cells','percent_tcrs','shannon','gini'],
    show_stats=True,
    calc_pct=True,
    drop_na=True,
    fontsize=4,
)
fig,axs = tf.clonality_lineplot(adata,**lineplot_kwargs)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_by_tx_lineplot.svg')

In [None]:
## T-cell clonality by cluster
# Keep only cells whose TRA_cdr3 entry is not the string 'nan'.
has_tra = adata.obs.TRA_cdr3!='nan'
adata = adata[has_tra].copy()

# Same metrics as the overall clonality plot, additionally grouped by leiden cluster.
boxplot_kwargs = dict(
    df=None,
    groupby='leiden',
    rep='cohort',
    xcat='treatment',
    hcat='pCR',
    xorder=['Base','PD1','RTPD1'],
    horder=['R','NR'],
    metrics=['percent_cells','percent_tcrs','shannon','gini'],
    show_stats=True,
    calc_pct=True,
    drop_na=True,
    fontsize=4,
)
fig,axs = tf.clonality_boxplot(adata,**boxplot_kwargs)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_by_leiden_by_tx_boxplot.svg')
# plt.close()


In [None]:
## clonality of newly expanded

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
tx = ['Base','PD1','RTPD1']
pt = adata.obs.cohort.unique().tolist()
metrics = ['percent_cells','percent_tcrs','shannon','gini']

# obs flag column that marks the newly expanded cells at each treatment.
exp_dict = dict(Base='E1',PD1='NE1_E2',RTPD1='NE1_NE2_E3')

# One row per (cohort, treatment); metrics are filled in where cells exist.
idx = pd.MultiIndex.from_product([pt,tx],names=['cohort','treatment'])
df = pd.DataFrame(index=idx,columns=metrics)

for i in df.index:

    # All cells of this cohort at this treatment.
    in_sample = (adata.obs['cohort']==i[0])&(adata.obs['treatment']==i[1])
    total = adata[in_sample]
    totalsize = len(total)
    sample_has_cells = totalsize>0

    if sample_has_cells:
        df.loc[i,'percent_cells'] = tf.exp_pct(total,new=True,normalization='cells')
        df.loc[i,'percent_tcrs'] = tf.exp_pct(total,new=True,normalization='tcr')

    # Restrict to the cells flagged 'Y' for this treatment's expansion column.
    dt = exp_dict[i[1]]
    total = adata[in_sample&(adata.obs[dt]=='Y')]

    # NOTE(review): this guard still tests the size of the unfiltered sample,
    # not of the 'Y' subset passed to shannon/gini, so those helpers can be
    # called on an empty selection -- confirm that is intended.
    if sample_has_cells:
        df.loc[i,'shannon'] = tf.shannon(total)
        df.loc[i,'gini'] = tf.gini(total)

df = df.reset_index()

# NOTE(review): this metadata file (KHG20201119) differs from the one loaded in
# the data-loading cell (KHG20210624) -- confirm which version is intended.
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

# Drop every row that has a missing value in any column.
df = df.dropna(how='any')

# Arguments shared by the box plot and the line plot of the precomputed table.
shared_plot_kwargs = dict(
    df=df,
    rep='cohort',
    xcat='treatment',
    hcat='pCR',
    xorder=['Base','PD1','RTPD1'],
    horder=['R','NR'],
    metrics=['percent_cells','percent_tcrs','shannon','gini'],
    show_stats=True,
    calc_pct=False,
    drop_na=True,
    fontsize=4,
)

fig,axs = tf.clonality_boxplot(adata,groupby=None,**shared_plot_kwargs)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_newexpansion_boxplot.svg')
# plt.close()

fig,axs = tf.clonality_lineplot(adata,**shared_plot_kwargs)
# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_newexpansion_lineplot.svg')


In [None]:
## comparing therapy ability to induce expansion of pre-existing cells versus expansion of newly-recruited cells

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
pt = adata.obs.cohort.unique().tolist()
dt = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
normalization = ['cells','clonotype']
# A (cohort, detection) entry is only filled when its denominator exceeds this size.
cellthresh = 4

# detection category -> (treatment,
#                        obs flag whose 'Y' cells form the denominator,
#                        {obs flag: required value} selecting the numerator)
# NOTE(review): 'RTPD1_pre' requires D2=='Y' only here, whereas the leiden
# distribution cell uses (D1=='Y')|(D2=='Y') -- confirm which is intended.
detection_rules = {
    'PD1_pre':   ('PD1',  'E2',{'D1':'Y','NE1_E2':'Y'}),
    'RTPD1_pre': ('RTPD1','E3',{'D2':'Y','NE1_E3':'Y','NE2_E3':'Y'}),
    'PD1_new':   ('PD1',  'E2',{'D1':'N'}),
    'RTPD1_new': ('RTPD1','E3',{'D1':'N','D2':'N'}),
}

# Loop-invariant inputs, read/built once instead of once per normalization.
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
xorder = ['R','NR']
horder = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
combs = list(itertools.combinations(horder,2))

sns.set_style("white", rc={"font.family":"Helvetica","axes.grid":False})
sns.set_context("paper", rc={"font.size":4,"axes.titlesize":4,"axes.labelsize":4,"font.family":"Helvetica","xtick.labelsize":4,"ytick.labelsize":4})
fig,axs = plt.subplots(nrows=1,ncols=2,sharex=False,sharey=False,figsize=(2,1))

# One panel per normalization: percentages over cells, then over unique clonotypes.
for n,ax in zip(normalization,np.ravel(axs)):

    df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt],names=['cohort','detection']),columns=['percent'])

    for i in df.index:

        tx_name,pool_flag,required = detection_rules[i[1]]

        # Denominator: this cohort's cells at the treatment, flagged 'Y' in pool_flag.
        total = adata[(adata.obs.cohort==i[0])&(adata.obs.treatment==tx_name)&(adata.obs[pool_flag]=='Y')]

        # Numerator: the subset of the denominator meeting every required flag value.
        conditions = [(total.obs[flag]==value) for flag,value in required.items()]
        keep = conditions[0]
        for condition in conditions[1:]:
            keep = keep&condition
        exp = total[keep]

        # For the clonotype normalization, count unique clonotypes instead of cells.
        if (n=='clonotype'):
            total = total.obs.combined_tcr_v2.unique().tolist()
            exp = exp.obs.combined_tcr_v2.unique().tolist()

        if (len(total)>cellthresh):
            df.loc[i,'percent'] = (len(exp)/len(total))*100

    df.reset_index(inplace=True)
    df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

    _= sns.boxplot(x='pCR',y='percent',hue='detection',data=df,color='w',linewidth=0.5,fliersize=0,palette='colorblind',order=xorder,hue_order=horder,boxprops=dict(alpha=0.2),ax=ax)

#     test_results = add_stat_annotation(ax,x='pCR',y='percent',hue='detection',data=df,order=xorder,hue_order=horder,
#                                    box_pairs = [tuple((x,y) for y in c) for c in combs for x in xorder],
#                                    test='Mann-Whitney',comparisons_correction=None,text_format='simple',loc='outside',verbose=0,
#                                    fontsize=4,linewidth=0.5,line_height=0.01,text_offset=0.01)

    _= sns.stripplot(x='pCR',y='percent',hue='detection',dodge=True,jitter=0.1,data=df,size=2,order=xorder,hue_order=horder,ax=ax)
    _= ax.legend(fontsize=2,title_fontsize=2, markerscale=0.05)
    _= ax.set_ylim(-2,102)
    _= ax.set_ylabel('% of total expanded pool')
    _= ax.set_xlabel('')
    _= ax.set_title(n)

    # One-sided Wilcoxon signed-rank test among responders (pCR=='R'):
    # is RTPD1_new greater than PD1_new?
    # NOTE(review): the element-wise pairing assumes both selections list the
    # same cohorts in the same row order -- confirm after the metadata merge.
    p1 = df[(df.pCR=='R')&(df.detection=='RTPD1_new')]['percent'].to_numpy(dtype=np.float64)
    p0 = df[(df.pCR=='R')&(df.detection=='PD1_new')]['percent'].to_numpy(dtype=np.float64)
    both_present = ~np.isnan(p1)&~np.isnan(p0)
    p1F = p1[both_present]
    p0F = p0[both_present]

    # Inside a for-loop a bare expression is neither echoed nor stored, so the
    # test result is printed explicitly, labelled with the normalization.
    print(n,scipy.stats.wilcoxon(p1F,p0F,alternative='greater'))

plt.tight_layout()

plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_prevsnew_expansion_boxplot_nostats.svg')


In [None]:
## leiden cluster distribution of cells in each expansion category

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
pt = adata.obs.cohort.unique().tolist()
dt = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
st = sorted(adata.obs.leiden.unique().tolist())
# A (cohort, detection) entry is only filled when its denominator exceeds this size.
cellthresh = 4

df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt,st],names=['cohort','detection','leiden']),columns=['percent'])

# Per-cell boolean mask for each detection category (cohort is applied in the loop).
obs = adata.obs
expanded_at_pd1 = (obs.treatment=='PD1')&(obs['E2']=='Y')
expanded_at_rtpd1 = (obs.treatment=='RTPD1')&(obs['E3']=='Y')
detection_masks = {
    'PD1_pre': expanded_at_pd1&(obs['D1']=='Y')&(obs['NE1_E2']=='Y'),
    'RTPD1_pre': expanded_at_rtpd1&((obs['D1']=='Y')|(obs['D2']=='Y'))&(obs['NE1_E3']=='Y')&(obs['NE2_E3']=='Y'),
    'PD1_new': expanded_at_pd1&(obs['D1']=='N'),
    'RTPD1_new': expanded_at_rtpd1&(obs['D1']=='N')&(obs['D2']=='N'),
}

for i in df.index:

    # Denominator: this cohort's cells in the detection category;
    # numerator: those of them that sit in leiden cluster i[2].
    total = adata[(obs.cohort==i[0])&detection_masks[i[1]]]
    exp = total[(total.obs.leiden==i[2])]

    if (len(total)>cellthresh):
        df.loc[i,'percent'] = (len(exp)/len(total))*100

df = df.reset_index()

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

# Only rows with pCR == 'R' are plotted.
tmp = df[(df.pCR=='R')]


fig,axs = pf.pct_boxplot(adata,df=tmp,groupby=None,xcat='detection',hcat='leiden',ycat='percent',
                xorder=['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new'],horder=st,
                show_stats=False,calc_pct=False,drop_na=True,ylim=None)

# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_prevsnew_expansion_leiden_boxplot.svg')


# combs = list(itertools.combinations(dt,2))
# for s in st:
#     for c in combs:
#         p1 = tmp[(tmp.leiden==s)&(tmp.detection==c[1])]['percent'].to_numpy(dtype=np.float64)
#         p0 = tmp[(tmp.leiden==s)&(tmp.detection==c[0])]['percent'].to_numpy(dtype=np.float64)
#         p1F = p1[~np.isnan(p1)&~np.isnan(p0)]
#         p0F = p0[~np.isnan(p1)&~np.isnan(p0)]

#         S,P = scipy.stats.wilcoxon(p1F,p0F,alternative='greater')
#         if (P<=0.05):
#             print(s+' '+c[1]+' greater than '+c[0])
#         S,P = scipy.stats.wilcoxon(p1F,p0F,alternative='less')
#         if (P<=0.05):
#             print(s+' '+c[1]+' less than '+c[0])

In [None]:
## clonality of TCRs expanding from pre-existing TCRs vs expanding from new TCRs

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
pt = adata.obs.cohort.unique().tolist()
dt = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
metrics = ['shannon','gini']

df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt],names=['cohort','detection']),columns=metrics)

# Per-cell boolean mask for each detection category (cohort is applied in the loop).
obs = adata.obs
expanded_at_pd1 = (obs.treatment=='PD1')&(obs['E2']=='Y')
expanded_at_rtpd1 = (obs.treatment=='RTPD1')&(obs['E3']=='Y')
detection_masks = {
    'PD1_pre': expanded_at_pd1&(obs['D1']=='Y')&(obs['NE1_E2']=='Y'),
    'RTPD1_pre': expanded_at_rtpd1&(obs['D2']=='Y')&(obs['NE1_E3']=='Y')&(obs['NE2_E3']=='Y'),
    'PD1_new': expanded_at_pd1&(obs['D1']=='N'),
    'RTPD1_new': expanded_at_rtpd1&(obs['D1']=='N')&(obs['D2']=='N'),
}

for i in df.index:

    total = adata[(obs.cohort==i[0])&detection_masks[i[1]]]
    totalsize = len(total)

    # Metrics are only computed for non-empty selections; others stay NaN.
    if (totalsize>0):
        df.loc[i,'shannon'] = tf.shannon(total)
        df.loc[i,'gini'] = tf.gini(total)

df = df.reset_index()

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

boxplot_kwargs = dict(
    df=df,
    groupby=None,
    xcat='pCR',
    hcat='detection',
    xorder=['R','NR'],
    horder=['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new'],
    metrics=metrics,
    show_stats=False,
    calc_pct=False,
    drop_na=True,
    fontsize=4,
    figsize=(2,1),
)
fig,axs = tf.clonality_boxplot(adata,**boxplot_kwargs)

analysis_dir = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype
plt.savefig(analysis_dir+'/TCR_prevsnew_expanded_clonality_boxplot_nostats.svg')

# p1 = df[(df.pCR=='R')&(df.detection=='RTPD1_new')][m].to_numpy(dtype=np.float64)
# p0 = df[(df.pCR=='R')&(df.detection=='PD1_new')][m].to_numpy(dtype=np.float64)
# p1F = p1[~np.isnan(p1)&~np.isnan(p0)]
# p0F = p0[~np.isnan(p1)&~np.isnan(p0)]

# scipy.stats.wilcoxon(p1F,p0F,alternative='greater')


In [None]:
## clonality of TCRs that were seen previously vs new TCRs

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
pt = adata.obs.cohort.unique().tolist()
dt = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
metrics = ['percent_cells','percent_tcrs','shannon','gini']

df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt],names=['cohort','detection']),columns=metrics)

# Per-cell boolean mask for each detection category (cohort is applied in the
# loop); unlike the expanded-only cells, no E2/E3 expansion flag is required.
obs = adata.obs
at_pd1 = obs.treatment=='PD1'
at_rtpd1 = obs.treatment=='RTPD1'
detection_masks = {
    'PD1_pre': at_pd1&(obs['D1']=='Y'),
    'RTPD1_pre': at_rtpd1&(obs['D2']=='Y'),
    'PD1_new': at_pd1&(obs['D1']=='N'),
    'RTPD1_new': at_rtpd1&(obs['D1']=='N')&(obs['D2']=='N'),
}

for i in df.index:

    total = adata[(obs.cohort==i[0])&detection_masks[i[1]]]
    totalsize = len(total)

    # Metrics are only computed for non-empty selections; others stay NaN.
    if (totalsize>0):
        df.loc[i,'percent_cells'] = tf.exp_pct(total,new=False,normalization='cells')
        df.loc[i,'percent_tcrs'] = tf.exp_pct(total,new=False,normalization='tcr')
        df.loc[i,'shannon'] = tf.shannon(total)
        df.loc[i,'gini'] = tf.gini(total)

df = df.reset_index()

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

boxplot_kwargs = dict(
    df=df,
    groupby=None,
    xcat='pCR',
    hcat='detection',
    xorder=['R','NR'],
    horder=['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new'],
    metrics=metrics,
    show_stats=False,
    calc_pct=False,
    drop_na=True,
    fontsize=4,
    figsize=(4,1),
)
fig,axs = tf.clonality_boxplot(adata,**boxplot_kwargs)

analysis_dir = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype
plt.savefig(analysis_dir+'/TCR_prevsnew_all_clonality_boxplot_nostats.svg')

# p1 = df[(df.pCR=='R')&(df.detection=='RTPD1_new')][m].to_numpy(dtype=np.float64)
# p0 = df[(df.pCR=='R')&(df.detection=='PD1_new')][m].to_numpy(dtype=np.float64)
# p1F = p1[~np.isnan(p1)&~np.isnan(p0)]
# p0F = p0[~np.isnan(p1)&~np.isnan(p0)]

# scipy.stats.wilcoxon(p1F,p0F,alternative='greater')

In [None]:
## leiden distribution of TCRs that were seen previously vs new TCRs

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
pt = adata.obs.cohort.unique().tolist()
dt = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
st = sorted(adata.obs.leiden.unique().tolist())

df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt,st],names=['cohort','detection','leiden']),columns=['percent'])

# Per-cell boolean mask for each detection category (cohort is applied in the loop).
obs = adata.obs
at_pd1 = obs.treatment=='PD1'
at_rtpd1 = obs.treatment=='RTPD1'
detection_masks = {
    'PD1_pre': at_pd1&(obs['D1']=='Y'),
    'RTPD1_pre': at_rtpd1&(obs['D2']=='Y'),
    'PD1_new': at_pd1&(obs['D1']=='N'),
    'RTPD1_new': at_rtpd1&(obs['D1']=='N')&(obs['D2']=='N'),
}

for i in df.index:

    total = adata[(obs.cohort==i[0])&detection_masks[i[1]]]
    totalsize = len(total)

    # Percentage of the selection that falls in leiden cluster i[2];
    # empty selections are left as NaN.
    if (totalsize>0):
        in_cluster = total[(total.obs.leiden==i[2])]
        df.loc[i,'percent'] = (len(in_cluster)/len(total))*100

df = df.reset_index()

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

# Only rows with pCR == 'R' are plotted.
tmp = df[(df.pCR=='R')]

boxplot_kwargs = dict(
    df=tmp,
    groupby=None,
    xcat='detection',
    hcat='leiden',
    ycat='percent',
    xorder=['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new'],
    horder=st,
    show_stats=False,
    calc_pct=False,
    drop_na=True,
    ylim=None,
)
fig,axs = pf.pct_boxplot(adata,**boxplot_kwargs)

# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_prevsnew_boxplot.svg')


# combs = list(itertools.combinations(dt,2))
# for s in st:
#     for c in combs:
#         p1 = tmp[(tmp.leiden==s)&(tmp.detection==c[1])]['percent'].to_numpy(dtype=np.float64)
#         p0 = tmp[(tmp.leiden==s)&(tmp.detection==c[0])]['percent'].to_numpy(dtype=np.float64)
#         p1F = p1[~np.isnan(p1)&~np.isnan(p0)]
#         p0F = p0[~np.isnan(p1)&~np.isnan(p0)]

#         S,P = scipy.stats.wilcoxon(p1F,p0F,alternative='greater')
#         if (P<=0.05):
#             print(s+' '+c[1]+' greater than '+c[0])
#         S,P = scipy.stats.wilcoxon(p1F,p0F,alternative='less')
#         if (P<=0.05):
#             print(s+' '+c[1]+' less than '+c[0])

In [None]:
## clonality of expansion from pre-existing TCRs vs expansion from new TCRs, separated by cluster

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
pt = adata.obs.cohort.unique().tolist()
dt = ['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new']
st = adata.obs.leiden.unique().tolist()
metrics = ['shannon','gini']

df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt,st],names=['cohort','detection','leiden']),columns=metrics)

# Per-cell boolean mask for each detection category (cohort and cluster are
# applied in the loop).
obs = adata.obs
expanded_at_pd1 = (obs.treatment=='PD1')&(obs['E2']=='Y')
expanded_at_rtpd1 = (obs.treatment=='RTPD1')&(obs['E3']=='Y')
detection_masks = {
    'PD1_pre': expanded_at_pd1&(obs['D1']=='Y')&(obs['NE1_E2']=='Y'),
    'RTPD1_pre': expanded_at_rtpd1&(obs['D2']=='Y')&(obs['NE1_E3']=='Y')&(obs['NE2_E3']=='Y'),
    'PD1_new': expanded_at_pd1&(obs['D1']=='N'),
    'RTPD1_new': expanded_at_rtpd1&(obs['D1']=='N')&(obs['D2']=='N'),
}

for i in df.index:

    # Cells of this cohort, in this detection category, in this leiden cluster.
    total = adata[(obs.cohort==i[0])&detection_masks[i[1]]&(obs.leiden==i[2])]
    totalsize = len(total)

    # Metrics are only computed for non-empty selections; others stay NaN.
    if (totalsize>0):
        df.loc[i,'shannon'] = tf.shannon(total)
        df.loc[i,'gini'] = tf.gini(total)

df = df.reset_index()

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

boxplot_kwargs = dict(
    df=df,
    groupby='leiden',
    xcat='pCR',
    hcat='detection',
    xorder=['R','NR'],
    horder=['PD1_pre','RTPD1_pre','PD1_new','RTPD1_new'],
    metrics=metrics,
    show_stats=False,
    calc_pct=False,
    drop_na=True,
    fontsize=4,
)
fig,axs = tf.clonality_boxplot(adata,**boxplot_kwargs)

# plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_prevsnew_clonality_by_leiden_boxplot.svg')


#     p1 = tmp[(tmp.pCR=='R')&(tmp.detection=='RTPD1_new')][m].to_numpy(dtype=np.float64)
#     p0 = tmp[(tmp.pCR=='R')&(tmp.detection=='PD1_new')][m].to_numpy(dtype=np.float64)
#     p1F = p1[~np.isnan(p1)&~np.isnan(p0)]
#     p0F = p0[~np.isnan(p1)&~np.isnan(p0)]
#     if (np.sum(p0F)>0)|(np.sum(p1F)>0):
#         print(s)
#         scipy.stats.wilcoxon(p1F,p0F,alternative='greater')

In [None]:
## Jensen-Shannon Divergence to track clonotype usage sharing across treatments

adata = adata[(adata.obs.TRA_cdr3!='nan')].copy()
# (first, second) treatment pairs to compare.
tx = [('Base','PD1'),('PD1','RTPD1'),('Base','RTPD1')]
pt = adata.obs.cohort.unique().tolist()

# One JSD value per (cohort, transition).
idx = pd.MultiIndex.from_product([pt,tx],names=['cohort','transition'])
df = pd.DataFrame(index=idx,columns=['JSD'])

for i in df.index:

    # All cells of this cohort; the helper splits them by the 'treatment' column.
    total = adata[(adata.obs.cohort==i[0])]
    firstTx,secondTx = i[1]

    df.loc[i,'JSD'] = tf.JSD(total,first=firstTx,second=secondTx,category='treatment',topn=None)

df = df.reset_index()

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

boxplot_kwargs = dict(
    df=df,
    groupby=None,
    xcat='transition',
    hcat='pCR',
    ycat='JSD',
    xorder=tx,
    horder=['R','NR'],
    show_stats=False,
    calc_pct=False,
    drop_na=True,
    ylim=(0.4,1.1),
    figsize=(1,1),
)
fig,axs = pf.pct_boxplot(adata,**boxplot_kwargs)

analysis_dir = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype
plt.savefig(analysis_dir+'/JSD_boxplot.svg')

# One-sided Wilcoxon signed-rank test among responders (pCR=='R'):
# is JSD for PD1->RTPD1 greater than for Base->PD1?
# NOTE(review): the element-wise pairing assumes both selections list the same
# cohorts in the same row order -- confirm after the metadata merge.
responders = df.pCR=='R'
p1 = df[responders&(df.transition==('PD1','RTPD1'))]['JSD'].to_numpy(dtype=np.float64)
p0 = df[responders&(df.transition==('Base','PD1'))]['JSD'].to_numpy(dtype=np.float64)
both_present = ~np.isnan(p1)&~np.isnan(p0)
p1F = p1[both_present]
p0F = p0[both_present]
scipy.stats.wilcoxon(p1F,p0F,alternative='greater')


In [None]:
## JSD by cluster: same transition comparison as above, computed separately inside each leiden cluster

# Drop cells whose TRA CDR3 entry is the literal string 'nan'
adata = adata[adata.obs.TRA_cdr3 != 'nan'].copy()

tx = [('Base','PD1'),('PD1','RTPD1'),('Base','RTPD1')]
pt = adata.obs.cohort.unique().tolist()
st = sorted(adata.obs.leiden.unique().tolist())

# Empty results table with one row per (patient, transition, cluster)
idx = pd.MultiIndex.from_product([pt,tx,st],names=['cohort','transition','leiden'])
df = pd.DataFrame(index=idx,columns=['JSD'])

for key in df.index:
    patient, (earlier, later), cluster = key
    subset = adata[(adata.obs.cohort == patient) & (adata.obs.leiden == cluster)]
    df.loc[key,'JSD'] = tf.JSD(subset,
                               first=earlier,
                               second=later,
                               category='treatment',
                               topn=None)

df = df.reset_index()

# Attach clinical metadata (provides the pCR column used as hue)
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/PEMBRORT_CLINICAL_METADATA_FORSCSEQ_KHG20201119.csv',index_col=None,header=0)
df = df.merge(metadata,left_on='cohort',right_on='Patient_Number')

fig,axs = pf.pct_boxplot(adata,
                         df=df,
                         groupby='leiden',
                         xcat='transition',
                         hcat='pCR',
                         ycat='JSD',
                         xorder=tx,
                         horder=['R','NR'],
                         show_stats=True,
                         calc_pct=False,
                         drop_na=True,
                         ylim=(-0.1,1.1))

plt.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/JSD_by_leiden_by_tx_boxplot.svg')


## Clonality for R1 vs R2

In [None]:
## Re-import TCR annotations and attach the CODEX response group to every cell

# Remove the existing clonotype column so it is taken from the annotation table below.
# errors='ignore' keeps the cell runnable when the column is already absent.
adata.obs = adata.obs.drop(columns='combined_tcr', errors='ignore')

# Bring in every annotation-table column that obs does not already have (matched on the obs index)
df = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/TCR/TCRtable_annotation_expansion_detection.csv',header=0,index_col=0)
cols = [x for x in df.columns if x not in adata.obs.columns]
df = df[cols]
adata.obs = adata.obs.merge(df,how='left',left_index=True,right_index=True)

metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Submission-Cell-Aug2022/Tables/Supplementary_Table_CODEX_response_groups.csv', index_col= 0, header= 0)

# Key used to look up the response group: first three characters of `batch`,
# extended with 'T' + the fifth character when the batch name contains 'h03'.
adata.obs['short_pt'] = adata.obs['batch'].apply(lambda x: x[:3] if 'h03' not in x else f"{x[:3]}T{x[4]}")

# Drop any response_group left by a previous run of this cell; merging onto an existing column
# would otherwise produce 'response_group_x' / 'response_group_y' and break later cells.
adata.obs = adata.obs.drop(columns='response_group', errors='ignore')
adata.obs = adata.obs.merge(metadata['response_group'], how='left', left_on='short_pt', right_index=True)

adata.obs.head()

In [None]:
## T-cell clonality per treatment, coloured by response group (boxplot + lineplot)

# Keep only cells that have a TRA CDR3 entry
adata = adata[adata.obs['TRA_cdr3'].notna()].copy()

treatment_order = ['Base','PD1','RTPD1']
group_order = ['R1','R2','NR']

# Settings shared by both figures
clonality_settings = dict(rep='cohort',
                          xcat='treatment',
                          hcat='response_group',
                          receptor_column='combined_tcr',
                          drop_na=True,
                          fontsize=4)

fig,axs = airr.clonality_boxplot(adata,
                                 groupby=None,
                                 xorder=list(treatment_order),
                                 horder=list(group_order),
                                 show_stats=True,
                                 **clonality_settings)
fig.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_by_tx_boxplot.pdf', bbox_inches='tight')

fig,axs = airr.clonality_lineplot(adata,
                                  xorder=list(treatment_order),
                                  horder=list(group_order),
                                  **clonality_settings)
fig.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_by_tx_lineplot.pdf', bbox_inches='tight')

In [None]:
## T-cell clonality per treatment and response group, one panel group per leiden cluster

# Keep only cells that have a TRA CDR3 entry
adata = adata[adata.obs['TRA_cdr3'].notna()].copy()

by_cluster_settings = dict(groupby='leiden',
                           rep='cohort',
                           xcat='treatment',
                           hcat='response_group',
                           xorder=['Base','PD1','RTPD1'],
                           horder=['R1','R2','NR'],
                           receptor_column='combined_tcr',
                           show_stats=True,
                           drop_na=True,
                           fontsize=4)

fig,axs = airr.clonality_boxplot(adata, **by_cluster_settings)
fig.savefig('/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/'+celltype+'/TCR_clonality_by_leiden_by_tx_boxplot.pdf', bbox_inches='tight')

In [None]:
## comparing therapy ability to induce expansion of pre-existing cells versus expansion of newly-recruited cells

adata = adata[~pd.isnull(adata.obs['TRA_cdr3'])].copy()
pt = adata.obs.cohort.unique().tolist()
dt = [
        'PD1_pre_nonexp',
        'PD1_pre_exp',
        'RTPD1_pre_nonexp',
        'RTPD1_pre_exp',
        'PD1_new',
        'RTPD1_new_new'
    ]
normalization = ['cells','clonotype']
cellthresh = 4   # a patient/category is only scored when the denominator exceeds this count

# detection label -> (treatment of the denominator pool,
#                     obs flag column that must be 'Y' for a cell to be in that pool,
#                     {obs column: required value} selecting the numerator inside the pool)
# 'RTPD1_new' is kept for completeness although it is not listed in `dt` above.
detection_rules = {
    'PD1_pre_nonexp':   ('PD1',   'E2', {'D1': 'Y', 'NE1': 'Y'}),
    'PD1_pre_exp':      ('PD1',   'E2', {'E1': 'Y'}),
    'RTPD1_pre_nonexp': ('RTPD1', 'E3', {'D2': 'Y', 'NE1': 'Y', 'NE2': 'Y'}),
    'RTPD1_pre_exp':    ('RTPD1', 'E3', {'E2': 'Y'}),
    'PD1_new':          ('PD1',   'E2', {'D1': 'N'}),
    'RTPD1_new':        ('RTPD1', 'E3', {'D2': 'N'}),
    'RTPD1_new_new':    ('RTPD1', 'E3', {'D1': 'N', 'D2': 'N'}),
}

# The response-group table is the same for both normalizations, so read it once
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Submission-Cell-Aug2022/Tables/Supplementary_Table_CODEX_response_groups.csv', index_col= 0, header= 0)

obs = adata.obs

fig,axs = plt.subplots(nrows= 1,
                       ncols= 2,
                       sharex= False,
                       sharey= False,
                       gridspec_kw= {'hspace':0.2, 'wspace':0.3},
                       figsize= (5, 1.5))

# One panel per normalization: percentages of cells, then percentages of unique clonotypes
for n,ax in zip(normalization, axs.flat):

    # float dtype so the plotting calls receive a numeric column (rows never filled stay NaN)
    df = pd.DataFrame(index=pd.MultiIndex.from_product([pt,dt],names=['cohort','detection']),
                      columns=['percent'],
                      dtype=float)

    for i in df.index:

        cohort, label = i
        # An unknown label raises KeyError here instead of silently reusing the previous row's counts
        treatment, pool_flag, conditions = detection_rules[label]

        # Denominator: this patient's cells at `treatment` flagged 'Y' in `pool_flag`
        pool = obs[(obs.cohort==cohort)&(obs.treatment==treatment)&(obs[pool_flag]=='Y')]

        # Numerator: pool members meeting every condition of this detection category
        hits = pool
        for col, val in conditions.items():
            hits = hits[hits[col]==val]

        if (n=='clonotype'):
            n_total = len(pool.combined_tcr_v2.unique())
            n_hits = len(hits.combined_tcr_v2.unique())
        else:
            n_total = len(pool)
            n_hits = len(hits)

        if (n_total>cellthresh):
            df.loc[i,'percent'] = (n_hits/n_total)*100

    df.reset_index(inplace=True)

    # Map 'PatientXX' cohort names onto the 'hXX' keys of the response-group table
    df['short_pt'] = df['cohort'].str.replace("Patient", "h", regex=False)
    df = df.merge(metadata['response_group'], how='left', left_on='short_pt', right_index=True)

    xorder = ['R1','R2','NR']
    horder = dt
    combs = list(itertools.combinations(horder,2))   # only used by the commented-out annotation below

    _= sns.boxplot(x= 'response_group',
                   y= 'percent',
                   hue= 'detection',
                   data= df,
                   color= 'w',
                   linewidth= 0.5,
                   fliersize= 0,
                   palette= 'colorblind',
                   order= xorder,
                   hue_order= horder,
                   boxprops= dict(alpha=0.2),
                   ax= ax)

    # test_results = add_stat_annotation(ax,x='response_group',y='percent',hue='detection',data=df,order=xorder,hue_order=horder,
    #                                box_pairs = [tuple((x,y) for y in c) for c in combs for x in xorder],
    #                                test='Mann-Whitney',comparisons_correction=None,text_format='simple',loc='outside',verbose=0,
    #                                fontsize=4,linewidth=0.5,line_height=0.01,text_offset=0.01)

    # Overlay the individual patients on top of the boxes
    _= sns.stripplot(x= 'response_group',
                     y= 'percent',
                     hue= 'detection',
                     dodge= True,
                     jitter= 0.1,
                     data= df,
                     size= 2,
                     order= xorder,
                     hue_order= horder,
                     legend= False,
                     ax= ax)

    _= ax.legend(fontsize=2, title_fontsize=2, markerscale=0.05)
    # _= ax.get_legend().remove()
    _= ax.set_ylim(-2, 102)
    _= ax.set_ylabel('% of total expanded pool', fontsize= 8)
    _= ax.set_xlabel('')
    _= ax.set_xticklabels(xorder, fontsize= 8)
    _= ax.set_title(n, fontsize= 8)

    # p1 = df[(df.pCR=='R')&(df.detection=='RTPD1_new')]['percent'].to_numpy(dtype=np.float64)
    # p0 = df[(df.pCR=='R')&(df.detection=='PD1_new')]['percent'].to_numpy(dtype=np.float64)
    # p1F = p1[~np.isnan(p1)&~np.isnan(p0)]
    # p0F = p0[~np.isnan(p1)&~np.isnan(p0)]

    # scipy.stats.wilcoxon(p1F,p0F,alternative='greater')

fig.savefig(f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/R1R2_TCR_prevsnew_expansion_boxplot.pdf', bbox_inches='tight')


In [None]:
## Cataloguing where expanded clonotypes originate

adata = adata[~pd.isnull(adata.obs['TRA_cdr3'])].copy()
pt = adata.obs.cohort.unique().tolist()

# float dtype so the fractions stay numeric for the later groupby/median steps
df = pd.DataFrame(  index= pt,
                    columns= ['PD1_total', 'RTPD1_total',
                                'PD1_Base_exp', 'PD1_Base_nonexp', 'PD1_Base_nodet',
                                'RTPD1_PD1_exp_Base_exp', 'RTPD1_PD1_exp_Base_nonexp', 'RTPD1_PD1_exp_Base_nodet',
                                'RTPD1_PD1_nonexp_Base_exp', 'RTPD1_PD1_nonexp_Base_nonexp', 'RTPD1_PD1_nonexp_Base_nodet',
                                'RTPD1_PD1_nodet_Base_exp', 'RTPD1_PD1_nodet_Base_nonexp', 'RTPD1_PD1_nodet_Base_nodet'],
                    dtype= float
                )

# Counting unit: an obs column name counts unique values of that column, None counts cells
# clonotype = None
clonotype = 'combined_tcr'

# Column-name fragment -> obs query for the three states at each earlier timepoint
# (flag columns E*/NE*/D* come from the TCR annotation table merged into adata.obs)
base_states = {'Base_exp': "E1 == 'Y'", 'Base_nonexp': "NE1 == 'Y'", 'Base_nodet': "D1 == 'N'"}
pd1_states = {'PD1_exp': "E2 == 'Y'", 'PD1_nonexp': "NE2 == 'Y'", 'PD1_nodet': "D2 == 'N'"}


def _catalog_count(obs_subset, expr=None):
    """Count rows of `obs_subset` (optionally filtered by the query `expr`).

    Returns the number of unique `clonotype` values when the global `clonotype`
    names a column, otherwise the number of rows (cells).
    """
    if expr is not None:
        obs_subset = obs_subset.query(expr)
    if clonotype is None:
        return len(obs_subset)
    return len(obs_subset[clonotype].unique())


for p in pt:

    print(p)

    # Rows of this patient flagged D1 == 'Y' / D2 == 'Y'; only used to require a non-empty set
    n_base = _catalog_count(adata.obs.query(" cohort == @p & D1 == 'Y' "))
    n_pd1 = _catalog_count(adata.obs.query(" cohort == @p & D2 == 'Y' "))

    ## PD1 expanded
    pool = adata.obs.query(" cohort == @p & E2 == 'Y' ")
    total = _catalog_count(pool)
    df.loc[p, 'PD1_total'] = total

    if total > 0 and n_base > 0:

        runningSum = 0
        for base_name, base_expr in base_states.items():
            count = _catalog_count(pool, base_expr)
            df.loc[p, f'PD1_{base_name}'] = count / total
            runningSum += count

        # Sanity check: the three states should account for every member of the pool
        if runningSum == total:
            print("PD1 Good")
        else:
            print("PD1 Bad")


    ## RTPD1 expanded
    pool = adata.obs.query(" cohort == @p & E3 == 'Y' ")
    total = _catalog_count(pool)
    df.loc[p, 'RTPD1_total'] = total

    if total > 0 and n_base > 0 and n_pd1 > 0:

        runningSum = 0
        for pd1_name, pd1_expr in pd1_states.items():
            for base_name, base_expr in base_states.items():
                count = _catalog_count(pool, f"{pd1_expr} & {base_expr}")
                df.loc[p, f'RTPD1_{pd1_name}_{base_name}'] = count / total
                runningSum += count

        # Sanity check: the nine state combinations should account for every member of the pool
        if runningSum == total:
            print("RTPD1 Good")
        else:
            print("RTPD1 Bad")

# Map 'PatientXX' row labels onto the 'hXX' keys of the response-group table
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Submission-Cell-Aug2022/Tables/Supplementary_Table_CODEX_response_groups.csv', index_col= 0, header= 0)
df['short_pt'] = [x.replace("Patient", "h") for x in df.index]
df = df.merge(metadata['response_group'], how='left', left_on='short_pt', right_index=True)

df.head()

In [None]:
def _stacked_median_bars(frame, cols, cmap_name, title, outfile, legend_fontsize):
    """Plot one stacked bar per response group and save the figure.

    frame           : per-patient table containing `cols` and a 'response_group' column.
    cols            : columns to stack, listed bottom to top.
    cmap_name       : matplotlib colormap name; segment i is drawn with colour index i.
    title           : axes title.
    outfile         : path the figure is written to.
    legend_fontsize : font size for legend entries and legend title.

    Returns (fig, ax).
    """
    # Per-group median across patients (NaN entries are skipped), rescaled so each bar sums to 1
    h = frame[cols].astype(float).groupby(frame['response_group']).median()
    h = h.div(h.sum(axis=1), axis=0)

    cmap = plt.get_cmap(cmap_name)
    # Bar positions follow the number of groups actually present instead of assuming three
    positions = np.arange(1, len(h) + 1)
    bottom = np.zeros(len(h))

    fig, ax = plt.subplots(nrows= 1,
                           ncols= 1,
                           sharex= False,
                           sharey= False,
                           figsize= (4,2))

    for i,c in enumerate(cols):
        heights = h[c].to_numpy(na_value= 0, dtype=float).flatten()
        _= ax.bar(x= positions,
                  height= heights,
                  width= 0.5,
                  bottom= bottom,
                  label= c,
                  color= cmap(i),
                  linewidth= 0)
        # New array each time so the one already handed to bar() is never modified
        bottom = bottom + heights

    _= ax.set_xticks(positions)
    _= ax.set_xticklabels(h.index.tolist(), fontsize= 8)
    _= ax.set_yticklabels([])
    _= ax.set_ylabel('median fraction of clonotypes', fontsize= 8)
    _= ax.legend(loc= 'upper left', fontsize=legend_fontsize, title_fontsize=legend_fontsize, markerscale=0.05)
    _= ax.set_title(title, fontsize=8)

    fig.savefig(outfile, bbox_inches='tight')
    return fig, ax


## PD1
cols = ['PD1_Base_exp', 'PD1_Base_nonexp', 'PD1_Base_nodet']
fig, axs = _stacked_median_bars(df, cols, 'tab10', 'PD1 expanded',
                                f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/PD1_TCR_catalog.pdf',
                                legend_fontsize= 4)


## RTPD1 condensed
condensed = ['RTPD1_PD1_exp', 'RTPD1_PD1_nonexp', 'RTPD1_PD1_nodet']

# Collapse the Base_* breakdown of each PD1 state by summing its columns.
# min_count=1 keeps a patient with no values as NaN; a plain sum would turn it into 0
# and drag the group median down, unlike the full plot below where such patients are skipped.
df2 = pd.DataFrame({c: df[[x for x in df.columns if c in x]].astype(float).sum(axis= 1, min_count= 1)
                    for c in condensed})
df2['response_group'] = df.loc[df2.index, 'response_group']

fig, axs = _stacked_median_bars(df2, condensed, 'Set1', 'RTPD1 expanded',
                                f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/RTPD1_TCR_catalog_v1.pdf',
                                legend_fontsize= 2)


## RTPD1 full
cols = [
        'RTPD1_PD1_exp_Base_exp', 'RTPD1_PD1_exp_Base_nonexp', 'RTPD1_PD1_exp_Base_nodet',
        'RTPD1_PD1_nonexp_Base_exp', 'RTPD1_PD1_nonexp_Base_nonexp', 'RTPD1_PD1_nonexp_Base_nodet',
        'RTPD1_PD1_nodet_Base_exp', 'RTPD1_PD1_nodet_Base_nonexp', 'RTPD1_PD1_nodet_Base_nodet'
        ]
fig, axs = _stacked_median_bars(df, cols, 'tab20b_r', 'RTPD1 expanded',
                                f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/RTPD1_TCR_catalog_v2.pdf',
                                legend_fontsize= 2)

In [None]:
# Mann-Whitney U tests between response groups R1 and R2, using only patients with no missing value

# Test 1: the single column PD1_Base_exp
tmp = df.dropna(how='any')
cat = "PD1_Base_exp"
is_r1 = tmp['response_group'] == 'R1'
is_r2 = tmp['response_group'] == 'R2'
p1 = tmp.loc[is_r1, cat].to_numpy(dtype= float)
p0 = tmp.loc[is_r2, cat].to_numpy(dtype= float)

scipy.stats.mannwhitneyu(p1, p0)

# Test 2: per-patient sum over every column whose name contains 'PD1_exp'
tmp = df.dropna(how='any')
cat = [x for x in tmp.columns if 'PD1_exp' in x]
is_r1 = tmp['response_group'] == 'R1'
is_r2 = tmp['response_group'] == 'R2'
p1 = tmp.loc[is_r1, cat].sum(axis= 1).to_numpy(dtype= float)
p0 = tmp.loc[is_r2, cat].sum(axis= 1).to_numpy(dtype= float)

scipy.stats.mannwhitneyu(p1, p0)

In [None]:
## Cataloguing where expanded clonotypes originate and what their phenotype is

adata = adata[~pd.isnull(adata.obs['TRA_cdr3'])].copy()

pt = adata.obs.cohort.unique().tolist()

grps = ['PD1_total', 'RTPD1_total',
        'PD1_Base_exp', 'PD1_Base_nonexp', 'PD1_Base_nodet',
        'RTPD1_PD1_exp_Base_exp', 'RTPD1_PD1_exp_Base_nonexp', 'RTPD1_PD1_exp_Base_nodet',
        'RTPD1_PD1_nonexp_Base_exp', 'RTPD1_PD1_nonexp_Base_nonexp', 'RTPD1_PD1_nonexp_Base_nodet',
        'RTPD1_PD1_nodet_Base_exp', 'RTPD1_PD1_nodet_Base_nonexp', 'RTPD1_PD1_nodet_Base_nodet']

# leiden clusters that get their own columns
clust = ['00', '02', '03', '04', '05', '06']

# One '<group>_<cluster>' column per combination, plus the two overall totals
cols = [f'{x}_{y}' for x in grps for y in clust]
cols = ['PD1_total', 'RTPD1_total'] + cols

# float dtype so the fractions stay numeric for the later groupby/median steps
df = pd.DataFrame(index= pt,
                  columns= cols,
                  dtype= float)

# Counting unit: an obs column name counts unique values of that column, None counts cells
clonotype = None
# clonotype = 'combined_tcr'

# Column-name fragment -> obs query for the three states at each earlier timepoint
base_states = {'Base_exp': "E1 == 'Y'", 'Base_nonexp': "NE1 == 'Y'", 'Base_nodet': "D1 == 'N'"}
pd1_states = {'PD1_exp': "E2 == 'Y'", 'PD1_nonexp': "NE2 == 'Y'", 'PD1_nodet': "D2 == 'N'"}


def _cluster_catalog_count(obs_subset, expr=None):
    """Count rows of `obs_subset` (optionally filtered by the query `expr`).

    Returns the number of unique `clonotype` values when the global `clonotype`
    names a column, otherwise the number of rows (cells).
    """
    if expr is not None:
        obs_subset = obs_subset.query(expr)
    if clonotype is None:
        return len(obs_subset)
    return len(obs_subset[clonotype].unique())


for p in pt:

    print(p)

    # Rows of this patient flagged D1 == 'Y' / D2 == 'Y'; only used to require a non-empty set
    n_base = _cluster_catalog_count(adata.obs.query(" cohort == @p & D1 == 'Y' "))
    n_pd1 = _cluster_catalog_count(adata.obs.query(" cohort == @p & D2 == 'Y' "))

    ## PD1 expanded
    pool = adata.obs.query(" cohort == @p & E2 == 'Y' ")
    total = _cluster_catalog_count(pool)
    df.loc[p, 'PD1_total'] = total

    if total > 0 and n_base > 0:

        for c in clust:

            in_cluster = pool.query(" leiden == @c ")

            # Every fraction is relative to the whole pool, not to the cluster
            df.loc[p, f'PD1_total_{c}'] = _cluster_catalog_count(in_cluster) / total

            for base_name, base_expr in base_states.items():
                df.loc[p, f'PD1_{base_name}_{c}'] = _cluster_catalog_count(in_cluster, base_expr) / total


    ## RTPD1 expanded
    pool = adata.obs.query(" cohort == @p & E3 == 'Y' ")
    total = _cluster_catalog_count(pool)
    df.loc[p, 'RTPD1_total'] = total

    if total > 0 and n_base > 0 and n_pd1 > 0:

        for c in clust:

            in_cluster = pool.query(" leiden == @c ")

            # Every fraction is relative to the whole pool, not to the cluster
            df.loc[p, f'RTPD1_total_{c}'] = _cluster_catalog_count(in_cluster) / total

            for pd1_name, pd1_expr in pd1_states.items():
                for base_name, base_expr in base_states.items():
                    count = _cluster_catalog_count(in_cluster, f"{pd1_expr} & {base_expr}")
                    df.loc[p, f'RTPD1_{pd1_name}_{base_name}_{c}'] = count / total

# Map 'PatientXX' row labels onto the 'hXX' keys of the response-group table
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Submission-Cell-Aug2022/Tables/Supplementary_Table_CODEX_response_groups.csv', index_col= 0, header= 0)
df['short_pt'] = [x.replace("Patient", "h") for x in df.index]
df = df.merge(metadata['response_group'], how='left', left_on='short_pt', right_index=True)

df.head()


In [None]:
def _stacked_cluster_bars(frame, cols, title, outfile):
    """Plot one stacked bar per response group, one segment per cluster column, and save it.

    frame   : per-patient table containing `cols` and a 'response_group' column.
    cols    : columns to stack, bottom to top; each name must end in '_<cluster number>'.
    title   : axes title.
    outfile : path the figure is written to.

    Segment colours are looked up in adata.uns['tcell_leiden_nbr100_res0.6_colors'] by cluster number.
    Returns (fig, ax).
    """
    # Per-group median across patients (NaN entries are skipped), rescaled so each bar sums to 1
    h = frame[cols].astype(float).groupby(frame['response_group']).median()
    h = h.div(h.sum(axis=1), axis=0)

    palette = adata.uns['tcell_leiden_nbr100_res0.6_colors']
    # Bar positions follow the number of groups actually present instead of assuming three
    positions = np.arange(1, len(h) + 1)
    bottom = np.zeros(len(h))

    fig, ax = plt.subplots(nrows= 1,
                           ncols= 1,
                           sharex= False,
                           sharey= False,
                           figsize= (4,2))

    for c in cols:
        # Parse the whole numeric suffix; the last character alone would mis-colour clusters >= 10
        cluster_number = int(c.rsplit('_', 1)[-1])
        heights = h[c].to_numpy(na_value= 0, dtype=float).flatten()
        _= ax.bar(x= positions,
                  height= heights,
                  width= 0.5,
                  bottom= bottom,
                  label= c,
                  color= palette[cluster_number],
                  linewidth= 0)
        # New array each time so the one already handed to bar() is never modified
        bottom = bottom + heights

    _= ax.set_xticks(positions)
    _= ax.set_xticklabels(h.index.tolist(), fontsize= 8)
    _= ax.set_yticklabels([])
    _= ax.set_ylim((0, 1.1))
    _= ax.set_ylabel('median fraction of cells', fontsize= 8)
    _= ax.legend(loc= 'upper left', fontsize=4, title_fontsize=4, markerscale=0.05)
    _= ax.set_title(title, fontsize=8)

    fig.savefig(outfile, bbox_inches='tight')
    return fig, ax


clust = ['00', '02', '03', '04', '05', '06']

## Cluster composition of the PD1 pool
cols = [f'PD1_total_{x}' for x in clust]
fig, axs = _stacked_cluster_bars(df, cols, 'PD1 expanded',
                                 f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/PD1_TCR_catalog_byLeiden.pdf')


## Cluster composition of the RTPD1 pool
cols = [f'RTPD1_total_{x}' for x in clust]
fig, axs = _stacked_cluster_bars(df, cols, 'RTPD1 expanded',
                                 f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/RTPD1_TCR_catalog_byLeiden.pdf')


## Cluster composition of the RTPD1 pool restricted to the PD1_exp state
# Sum the three Base_* columns of each cluster. min_count=1 keeps a patient with no values as NaN;
# a plain sum would turn it into 0 and drag the group median down.
df2 = pd.DataFrame({
    f'RTPD1_PD1_exp_{c}': df[[f'RTPD1_PD1_exp_Base_exp_{c}',
                              f'RTPD1_PD1_exp_Base_nonexp_{c}',
                              f'RTPD1_PD1_exp_Base_nodet_{c}']].astype(float).sum(axis= 1, min_count= 1)
    for c in clust
})
df2['response_group'] = df.loc[df2.index, 'response_group']

cols = [f'RTPD1_PD1_exp_{x}' for x in clust]
fig, axs = _stacked_cluster_bars(df2, cols, 'RTPD1 expanded from PD1 expanded',
                                 f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/RTPD1_PD1_exp_TCR_catalog_byLeiden.pdf')

In [None]:
# Mann-Whitney U test of one catalogue column between response groups R1 and R2,
# restricted to patients with no missing value in any column
tmp = df.dropna(how='any')
cat = "RTPD1_total_03"

values_by_group = {
    group: tmp.loc[tmp['response_group'] == group, cat].to_numpy(dtype= float)
    for group in ('R1', 'R2')
}
p1 = values_by_group['R1']
p0 = values_by_group['R2']

scipy.stats.mannwhitneyu(p1, p0)

# Same test on a column of the condensed table (disabled)
# tmp = df2.dropna(how='any')
# cat = "RTPD1_PD1_exp_06"
# p1 = tmp.query(" response_group == 'R1' ")[cat].to_numpy(dtype= float)
# p0 = tmp.query(" response_group == 'R2' ")[cat].to_numpy(dtype= float)

# scipy.stats.mannwhitneyu(p1, p0)

In [None]:
## Correlation of expression profiles with clone size
## This cell builds the per-clonotype table: mean expression of every gene in GOI,
## mean tcr_counts, and the cohort / treatment / response-group annotations.

## Keep only cells that carry a TRA CDR3 annotation.
adata = adata[adata.obs['TRA_cdr3'].notna()].copy()

## Just use top 2000 HVGs (gene names stored one per line in the scVI snapshot)
hvg_path = '/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/scvi_outputs/tcell_filtered/snapshot/var_names.csv'
with open(hvg_path) as f:
    GOI = f.read().splitlines()

## Clonotype key prefixed with patient_treatment, so that the same TCR observed in
## different samples is kept as separate rows.
adata.obs['combined_tcr_v3'] = list(map('{}_{}'.format,
                                        adata.obs['patient_treatment'],
                                        adata.obs['combined_tcr']))

## Dense cell x gene expression matrix restricted to GOI.
df_expr = pd.DataFrame(adata[:, GOI].X.toarray(),
                       index= adata.obs.index,
                       columns= GOI)

## Remove Treg TCRs: every clonotype key seen in leiden cluster '03'
in_treg_cluster = adata.obs['leiden'] == '03'
rm_tcrs = adata.obs.loc[in_treg_cluster, 'combined_tcr_v3'].tolist()

## Average per clonotype, drop the Treg clonotypes, then remove singletons.
df = (
    adata.obs[['combined_tcr_v3', 'tcr_counts']]
    .merge(df_expr, how='left', left_index=True, right_index=True)
    .groupby(by='combined_tcr_v3')
    .mean(numeric_only=True)
    .drop(index= rm_tcrs)
)
df = df[df['tcr_counts'] > 1].copy()

## The clonotype key starts with '<cohort>_<treatment>_'.
df['cohort'] = [key.split('_')[0] for key in df.index]
df['treatment'] = [key.split('_')[1] for key in df.index]

## Attach the response group through the short patient id ('Patient' -> 'h').
metadata = pd.read_csv('/Users/gouink/Documents/RTPD1Manuscript/Submission-Cell-Aug2022/Tables/Supplementary_Table_CODEX_response_groups.csv', index_col= 0, header= 0)
df['short_pt'] = df['cohort'].map(lambda pt: pt.replace('Patient', 'h'))
df = df.merge(metadata['response_group'], how='left', left_on='short_pt', right_index=True)

df.head()

In [None]:
## Spearman correlation between clone size (tcr_counts) and the mean expression of
## every gene in GOI, computed separately for each treatment / response-group
## subset of `df`. One CSV is written per subset; genes with an undefined
## correlation or p-value are left out of the file.
for t in ['Base', 'PD1', 'RTPD1']:
    for p in ['R1', 'R2']:

        tmp = df.query(" treatment == @t & response_group == @p ").copy()

        ## With fewer than two observations spearmanr returns scalar NaNs instead of
        ## matrices, so the row indexing below would raise. Skip such subsets; note
        ## that a CSV left over from an earlier run is NOT overwritten in that case.
        if len(tmp) < 2:
            print(f'Skipping {t} / {p}: {len(tmp)} clonotype(s), correlation undefined')
            continue

        ## Correlation
        ## a (1 column) and b (len(GOI) columns) are stacked, giving square matrices
        ## of size 1 + len(GOI). Tuple unpacking also works on SciPy versions that
        ## predate the `.statistic` attribute.
        rho, pvals = scipy.stats.spearmanr(a= tmp['tcr_counts'].to_numpy(dtype= float),
                                           b= tmp[GOI].to_numpy(dtype= float))

        ## Row 0 is tcr_counts against everything; entry 0 of that row is its
        ## self-correlation and is dropped.
        corr_df = pd.DataFrame({'corr': rho[0][1:],
                                'pval': pvals[0][1:]},
                               index= GOI)

        corr_df = corr_df.dropna(how='any')

        corr_df.to_csv(f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/corr_w_expansion_{t}_{p}.csv')

## Quick look at the last subset processed (DataFrame.sort_values needs `by`):
# corr_df.query(" pval <= 0.05 ").sort_values(by='corr', ascending=False).head(100)
# corr_df.query(" pval <= 0.05 ").sort_values(by='corr', ascending=False).tail(100)

In [None]:
### Had to run this with gseapy == 0.10.7
### https://github.com/zqfang/GSEApy/issues/193#issuecomment-1630859649

## Pre-ranked GSEA on the genes significantly correlated with clonal expansion.
## NOTE(review): the input is a scratch file on the Desktop -- presumably a copy of
## one of the corr_w_expansion_*.csv tables written by the correlation cell; confirm
## which one before relying on the output.
corr_df = pd.read_csv('/Users/gouink/Desktop/test.csv', index_col= 0, header= 0)

## GSEA
## Ranking: correlation coefficient of the genes with pval <= 0.05, highest first.
rnk = corr_df.query(" pval <= 0.05 ")['corr'].sort_values(ascending=False)
rnk = pd.DataFrame(rnk).reset_index()

gsea_results = pd.DataFrame(columns=['es','nes','pval','fdr','geneset_size','matched_size','genes','ledge_genes','group'])
genesets = ['/Users/gouink/.cache/gseapy/enrichr.GO_Biological_Process_2018.gmt']

## Collect the significant terms of every library and concatenate once after the
## loop: DataFrame.append was removed in pandas 2.0.
sig_results = []

for g in genesets:

    ## NOTE(review): outdir says 'kegg' although the library is GO Biological
    ## Process, and 100 permutations bounds the smallest empirical p-value at
    ## roughly 0.01 -- confirm both are intended.
    pre_res = gp.prerank(rnk=rnk,
                         gene_sets=g,
                         processes=4,
                         permutation_num=100,
                         ascending=False,
                         outdir='test/prerank_report_kegg',
                         format='png',
                         seed=6,
                         min_size=0,
                         max_size=500,
                         verbose=True)

    pre_res = pre_res.res2d
    pre_res = pre_res[(pre_res.pval<=0.05)]
    sig_results.append(pre_res)

## Same result as the former chained gsea_results.append(...) calls: the empty
## template frame comes first so its column set (including the never-filled
## 'group' column) is preserved.
gsea_results = pd.concat([gsea_results] + sig_results, sort= False)

In [None]:
## Compare the RTPD1 expansion correlations of response groups R1 and R2: list the
## genes positively correlated (pval <= 0.05) in only one group or in both, then
## draw both groups' correlation/p-value scatter on shared axes.
corr_1 = pd.read_csv(f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/corr_w_expansion_RTPD1_R1.csv', index_col= 0, header= 0)
corr_2 = pd.read_csv(f'/Users/gouink/Documents/RTPD1Manuscript/Human/manuscript_review_analysis/analysis/{celltype}/corr_w_expansion_RTPD1_R2.csv', index_col= 0, header= 0)

r1 = corr_1.query(" pval <= 0.05 & corr > 0 ").index.tolist()
r2 = corr_2.query(" pval <= 0.05 & corr > 0 ").index.tolist()

## Genes significant only in R2
r2_unique = list(set(r2) - set(r1))
corr_2.loc[r2_unique, :].sort_values(by= ['corr'], ascending=False)

## Genes significant only in R1
r1_unique = list(set(r1) - set(r2))
corr_1.loc[r1_unique, :].sort_values(by= ['corr'], ascending=False)

## Genes significant in both groups (shown with their R2 statistics)
c = list(set(r1) & set(r2))
corr_2.loc[c, :].sort_values(by= ['corr'], ascending=False).head(50)

fig, axs = plt.subplots(figsize=(2,2))

## The y axis is labelled -log10(pval), so plot exactly that. The previous
## -10 * log10 scaling made every point ten times higher than the label states.
_= axs.scatter(x= corr_2['corr'],
               y= -np.log10(corr_2['pval']),
               s= 1,
               c= 'r',
               label= 'R2')
_= axs.scatter(x= corr_1['corr'],
               y= -np.log10(corr_1['pval']),
               s= 1,
               c= 'b',
               label= 'R1')
_= axs.set_xlabel('correlation with expansion', fontsize= 8)
_= axs.set_ylabel('-log10(pval)', fontsize= 8)

## Key for the two colours; markerscale enlarges the s=1 dots so they are visible.
_= axs.legend(loc= 'best', fontsize= 4, markerscale= 4, frameon= False)