Analysis of inter- and intra-tumor heterogeneity using GeoMx data (including code for Fig. 2D)

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import random
import glob
import pyreadr
from operator import itemgetter
import itertools
from itertools import groupby
import seaborn as sns
import matplotlib.ticker as ticker
import sys
import scipy.stats  as stats
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import MinMaxScaler
# Module-level scaler instance, fitted further down on the ROI-by-protein matrix.
scaler = MinMaxScaler()
from statsmodels.stats import multitest

In [None]:
# Load the ROI annotation table, the normalized protein matrix and the
# nuclei counts (paths are relative to the working directory).
tumor_roi_dat = pd.read_csv('new_annotation_Feb2021_KW.txt', delimiter="\t")
protein_dat = pd.read_csv('ERCC_IgG_norm.csv')
nuc_count = pd.read_csv('nuc_count.csv')

# The first CSV column has no header on disk; name it 'Protein'.
protein_dat = protein_dat.rename({'Unnamed: 0': 'Protein'}, axis=1)

# Every column after 'Protein' is treated as one ROI.
all_rois = list(protein_dat.columns)[1:]
measured_rois = set(all_rois)  # O(1) membership tests inside the loop

# Tumors whose ROIs are removed from the analysis.
EXCLUDED_TUMOR_IDS = (8, 21)

# (Tumor_ID, Core_ID) -> list of ROI column names measured for that core.
tumor_core_roi_dict = {}
excluded_rois = []
for _, cur_row in tumor_roi_dat.iterrows():
    roi = cur_row['ROI']
    roi_num = roi.split('_')[1]
    # Single-digit ROI numbers are zero-padded to match the protein
    # matrix column names (e.g. 'ROI_7' -> 'ROI_07').
    if len(roi_num) == 1:
        roi = 'ROI_0' + roi_num
    if roi not in measured_rois:
        continue
    key = (cur_row['Tumor_ID'], cur_row['Core_ID'])
    if key[0] in EXCLUDED_TUMOR_IDS:
        excluded_rois.append(roi)
    else:
        tumor_core_roi_dict.setdefault(key, []).append(roi)

# `axis` must be given by keyword (or via `columns=`): pandas >= 2.0 rejects
# the positional form `drop(labels, 1)` with a TypeError.
protein_dat = protein_dat.drop(columns=excluded_rois)
# NOTE(review): ROI columns that have no row in the annotation table are
# neither grouped nor dropped here - confirm none exist, since later cells
# look every remaining ROI up by tumor.

# Tumor_ID -> all ROIs of that tumor, pooled over its cores. Fresh lists are
# built so the per-core lists above are never shared or mutated.
tumor_roi_dict = {}
for (tumor, _core), rois in tumor_core_roi_dict.items():
    tumor_roi_dict.setdefault(tumor, []).extend(rois)

# From here on, all_rois holds only the ROIs retained for analysis.
all_rois = list(itertools.chain.from_iterable(tumor_core_roi_dict.values()))

In [None]:
# Transpose so that each row is an ROI and each column a protein, then
# min-max scale every protein column with the shared scaler.
protein_dat_by_ROI = protein_dat.set_index('Protein').T
protein_dat_by_ROI_scaled = pd.DataFrame(
    scaler.fit_transform(protein_dat_by_ROI),
    index=protein_dat_by_ROI.index,
    columns=protein_dat_by_ROI.columns,
)

In [None]:
# Normality test (scipy.stats.normaltest) on each scaled protein column;
# only the p-value is kept, wrapped in a one-element list.
norm_dict = {
    col: [stats.normaltest(list(protein_dat_by_ROI_scaled[col])).pvalue]
    for col in protein_dat_by_ROI_scaled.columns
}

In [None]:
# One row per protein, single column 'p' holding the normality-test p-value.
norm_test_res = pd.DataFrame(norm_dict).transpose().rename(columns={0: 'p'})

In [None]:
# Proteins whose normality-test p-value exceeds 0.001, largest p first.
norm_test_res.loc[norm_test_res['p'] > 0.001].sort_values('p', ascending=False)

In [None]:
# Invert tumor -> [ROIs] into a flat ROI -> tumor lookup.
roi_tumor_dict = {
    roi: tumor
    for tumor, rois in tumor_roi_dict.items()
    for roi in rois
}

In [None]:
# Tumor of every ROI, in the row order of the scaled matrix.
tumor_grouping = [roi_tumor_dict[roi] for roi in protein_dat_by_ROI_scaled.index]

In [None]:
# Attach the tumor label of each ROI as an extra column.
protein_dat_by_ROI_scaled["Tumor"] = tumor_grouping

In [None]:
# Protein names, taken from the unscaled matrix (which has no 'Tumor' column).
prot_list = protein_dat_by_ROI.columns.tolist()

In [None]:
# Kruskal-Wallis H-test per protein: the ROIs of each tumor form one group.
# The per-tumor row subsets do not depend on the protein, so they are built
# once, in the key order of tumor_roi_dict.
rows_by_tumor = [
    protein_dat_by_ROI_scaled[protein_dat_by_ROI_scaled['Tumor'] == tumor]
    for tumor in tumor_roi_dict
]
res_dict = {}
for prot in prot_list:
    kw = stats.kruskal(*(list(rows[prot]) for rows in rows_by_tumor))
    res_dict[prot] = [kw.statistic, kw.pvalue]

In [None]:
# One row per protein with its Kruskal-Wallis H statistic and p-value.
res_df = (
    pd.DataFrame(res_dict)
    .transpose()
    .rename(columns={0: 'H-statistic', 1: 'P-value'})
)

In [None]:
# Rank proteins from the largest H statistic to the smallest.
res_df = res_df.sort_values('H-statistic', ascending=False)

In [None]:
# Expose the protein name (currently the index) as a regular column.
res_df["Protein"] = res_df.index

In [None]:
# Proteins whose unadjusted Kruskal-Wallis p-value is above 0.01.
res_df.loc[res_df['P-value'] > 0.01]

In [None]:
# Export the unadjusted p-values (indexed by protein) as a single column 'p'.
df_p = res_df['P-value'].to_frame(name='p')
df_p.to_csv('inter_intra_KW_pvals.csv')

In [None]:
# Unadjusted p-values in the current row order of res_df.
res_df['P-value'].tolist()

In [None]:
# FDR-correct the p-values; fdrcorrection returns (rejected, corrected) and
# keeps the input order, so the corrected values line up with res_df's rows.
adjusted_p = multitest.fdrcorrection(res_df['P-value'].tolist())[1]
res_df['Adjusted p'] = adjusted_p

In [None]:
# Proteins whose FDR-adjusted p-value is above 0.01.
res_df.loc[res_df['Adjusted p'] > 0.01]

In [None]:
def arr_to_1d_lis(arr):
    """Flatten one level of nesting: concatenate the rows of *arr* into a list."""
    flat = []
    for row in list(arr):
        flat.extend(row)
    return flat
def list_to_arr(lst,length):
    """Split *lst* into consecutive slices of *length* items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    *length*.
    """
    chunks = []
    for start in range(0, len(lst), length):
        chunks.append(lst[start:start + length])
    return chunks
def p_value_adjustment(p_results):
    """FDR-correct every p-value in a DataFrame jointly.

    All cells of *p_results* are flattened row by row, corrected together
    with ``multitest.fdrcorrection``, and reshaped into a DataFrame that
    carries the index and column labels of *p_results*.
    """
    n_cols = len(p_results.columns)
    flat_pvals = arr_to_1d_lis(p_results.to_numpy())
    # fdrcorrection returns (rejected, corrected p-values); keep the latter.
    _, corrected = multitest.fdrcorrection(flat_pvals)
    # Cut the flat result back into rows of n_cols values each.
    adjusted = pd.DataFrame(list_to_arr(corrected, n_cols))
    adjusted = adjusted.set_index(p_results.index)
    # Map the positional column labels back onto the original names.
    relabel = {adjusted.columns[i]: p_results.columns[i] for i in range(n_cols)}
    return adjusted.rename(columns=relabel)

In [None]:

# Bar plot of the Kruskal-Wallis H statistic per protein, in the current
# row order of res_df.
fig = plt.figure(figsize=(26, 8))
plt.rcParams.update({"font.size": 22, "font.family": 'Arial'})
# Swap the underscore for a space wherever a cell equals 'SNCA_filament'
# (DataFrame.replace acts on cell values, not on the index).
res_df = res_df.replace('SNCA_filament', 'SNCA filament')
sns.barplot(data=res_df, x="Protein", y="H-statistic", palette='rocket')
plt.xticks(rotation=90)
plt.xlabel('')
plt.ylabel(
    'H statistic, inter-tumor \n variability relative to \nintra-tumor variability',
    fontsize=34,
)
sns.despine()
plt.tight_layout()
#plt.savefig('H_statistic_all_proteins.pdf')
plt.show()

In [None]:
# Protein names in the order used for the plot.
res_df['Protein'].tolist()