Analysis of correlations in geomx data (including code for Fig. 2c)

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import random
import glob
import pyreadr
from operator import itemgetter
import itertools
from itertools import groupby
import seaborn as sns
import matplotlib.ticker as ticker
import sys
import scipy.stats  as stats
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# Preprocess GeoMx data: load the ROI annotation table, the protein matrix and the nuclei counts.

# Tab-separated annotation table; its ROI, Tumor_ID, Core_ID and FISH_core columns are used below.
tumor_roi_dat = pd.read_csv('new_annotation_Feb2021_KW.txt', sep="\t")

# Protein matrix: the unnamed first column is relabelled 'Protein'.
protein_dat = pd.read_csv('ERCC_IgG_norm.csv').rename(columns={'Unnamed: 0': 'Protein'})
nuc_count = pd.read_csv('nuc_count.csv')

# Every column after the first one is treated as an ROI name.
all_rois = protein_dat.columns[1:].tolist()

# Group the ROIs present in the protein matrix by (Tumor_ID, Core_ID).
# ROIs belonging to tumors 21 and 8 are collected in excluded_rois instead.
tumor_core_roi_dict={}
excluded_rois=[]
for loc, cur_row in tumor_roi_dat.iterrows():
    roi = cur_row['ROI']
    roi_num = roi.split('_')[1]
    # A single-character ROI number is rewritten as 'ROI_0<digit>' before matching against all_rois
    if len(roi_num)==1:
        roi = 'ROI_0'+roi_num
    # Annotated ROIs without a column in the protein matrix are ignored
    if roi not in all_rois:
        continue
    key = (cur_row['Tumor_ID'],cur_row['Core_ID'])
    if key[0] in (21, 8):
        excluded_rois.append(roi)
        continue
    tumor_core_roi_dict.setdefault(key, []).append(roi)


# Column-wise copies of the annotation table, one entry per annotated row.
tumor_ids=tumor_roi_dat['Tumor_ID'].tolist()
core_ids=tumor_roi_dat['Core_ID'].tolist()
fish_cores=tumor_roi_dat['FISH_core'].tolist()

# (Tumor_ID, Core_ID) -> FISH_core for every annotated row (later rows overwrite earlier ones).
tumor_core_to_shreya_base=dict(zip(zip(tumor_ids,core_ids),fish_cores))
# Same mapping, restricted to the (tumor, core) pairs that kept at least one ROI.
tumor_core_to_shreya={
    pair:fish_core
    for pair,fish_core in tumor_core_to_shreya_base.items()
    if pair in tumor_core_roi_dict
}
all_proteins_cores_shreya=list(tumor_core_to_shreya.values())

# Pool ROIs across cores: each tumor maps to the list of all its retained ROIs.
tumor_roi_dict={}
for (tumor, _core_id), rois in tumor_core_roi_dict.items():
    if tumor not in tumor_roi_dict:
        tumor_roi_dict[tumor]=rois
    else:
        # Concatenation builds a new list, so the per-core lists are never modified
        tumor_roi_dict[tumor]=tumor_roi_dict[tumor]+rois
            
# The unnamed first column of the nuclei-count table holds the ROI name.
nuc_count=nuc_count.rename(columns={'Unnamed: 0': 'ROI'})

# Collapse ROI-level protein values into one nuclei-count-weighted average per tumor.
tumor_protein_dfs=[]
for tumor,rois in tumor_roi_dict.items():
    # Explicit copy: the columns are rescaled below, and assigning into the result of
    # protein_dat[rois] directly triggers pandas' SettingWithCopyWarning.
    cur_df=protein_dat[rois].copy()
    cur_df.index=protein_dat['Protein']
    # Total nuclei over this tumor's ROIs; match on the ROI names themselves
    # rather than on the protein sub-frame.
    tot_nuc_count=nuc_count.loc[nuc_count['ROI'].isin(rois),'nuclei_count'].sum()
    for roi in rois:
        # Weight = this ROI's share of the tumor's nuclei (first matching row is used)
        cur_weight=list(nuc_count[nuc_count['ROI']==roi]['nuclei_count'])[0]/tot_nuc_count
        cur_df[roi]=cur_df[roi]*cur_weight
    # Sum of weighted ROI columns -> single column named after the tumor
    cur_df=cur_df.sum(axis=1).to_frame().rename(columns={0:tumor})
    tumor_protein_dfs.append(cur_df)
# axis is passed by keyword: the positional form was deprecated and then removed from pd.concat.
protein_dat_by_tumor=pd.concat(tumor_protein_dfs,axis=1)

# Re-orient so that rows are tumors and columns are proteins.
protein_dat_by_tumor=protein_dat_by_tumor.transpose()

# Rescaled copy produced by the module-level scaler, keeping the same row and column labels.
protein_dat_by_tumor_rescaled = pd.DataFrame(
    scaler.fit_transform(protein_dat_by_tumor),
    index=protein_dat_by_tumor.index,
    columns=protein_dat_by_tumor.columns,
)

In [None]:
# Save the tumor-by-protein table (the unscaled one, not the rescaled copy) as CSV
# for the subsequent correlation analysis in R.
protein_dat_by_tumor.to_csv('proteins_by_tumor_for_corr_clustering.csv')

In [None]:
"""
R code for generating correlation matrix plot:

> dat<-read.csv('proteins_by_tumor_for_corr_clustering.csv',header=TRUE, row.names="X")
> M<-cor(dat,method="spearman")
> library(corrplot)
> pdf('corplot_all_proteins_by_tumor.pdf')
> corrplot(M, type = "upper",order = "hclust",cl.lim=c(-1,1), col=colorRampPalette(c(rev(brewer.pal(n=11,name = "RdBu"))))(200),tl.cex = 0.3,tl.col="black")
> dev.off()

R code for obtaining spearman correlations and performing p-value adjustment:

> cormat=rcorr(as.matrix(dat),type="spearman")
> spearman_mat=cormat$r
> write.csv(spearman_mat,'cormat_from_proteins_by_tumor_spearman_coefficients.csv')
> pval_mat=cormat$P
> write.csv(pval_mat,'cormat_from_proteins_by_tumor_pvals.csv')
> pval_mat2<-pval_mat
> pval_mat2[]<-p.adjust(pval_mat2, method = "fdr")
> write.csv(pval_mat2,'cormat_from_proteins_by_tumor_pvals_adj_fdr.csv')
"""

In [None]:
# Tables exported from R: Spearman coefficients, raw p-values, and the table saved as
# FDR-adjusted p-values. The 'Unnamed: 0' column holds the row labels; it is kept as a
# column and also used as the index so values can be looked up with .loc[row, col].
# NOTE(review): confirm the "adj_fdr" file really differs from the raw p-value file.
prots_spearman=pd.read_csv('cormat_from_proteins_by_tumor_spearman_coefficients.csv').set_index('Unnamed: 0', drop=False)
prots_pvals=pd.read_csv('cormat_from_proteins_by_tumor_pvals.csv').set_index('Unnamed: 0', drop=False)
prots_pvals_adj=pd.read_csv('cormat_from_proteins_by_tumor_pvals_adj_fdr.csv').set_index('Unnamed: 0', drop=False)

In [None]:
def get_stats(prot1,prot2):
    """Print the Spearman coefficient and adjusted p-value for a protein pair.

    Both values are looked up at [prot1, prot2] in the module-level tables
    ``prots_spearman`` and ``prots_pvals_adj``, so the labels must match the
    row/column names of those tables. Nothing is returned.
    """
    coeff = prots_spearman.loc[prot1,prot2]
    print('Coeff = ',coeff)
    adj_pval = prots_pvals_adj.loc[prot1,prot2]
    print('Adj. p-val',adj_pval)

In [None]:
# Spearman coefficient and adjusted p-value: EGFR vs CD163
get_stats('EGFR','CD163')

In [None]:
# Raw p-value (from the unadjusted table): CD68 vs Park5
prots_pvals.loc['CD68','Park5']

In [None]:
# Spearman coefficient and adjusted p-value: CD68 vs SYP
get_stats('CD68','SYP')

In [None]:
# Spearman coefficient and adjusted p-value: CD68 vs SNCA_filament
get_stats('CD68','SNCA_filament')

In [None]:
# Spearman coefficient and adjusted p-value: CD68 vs MBP
get_stats('CD68','MBP')

In [None]:
# Spearman coefficient and adjusted p-value: CD11b vs SYP
get_stats('CD11b','SYP')

In [None]:
# Spearman coefficient and adjusted p-value: CD11b vs MBP
get_stats('CD11b','MBP')

In [None]:
# Spearman coefficient and adjusted p-value: CD11b vs SNCA_filament
get_stats('CD11b','SNCA_filament')

In [None]:
# Spearman coefficient and adjusted p-value: HLA.DR vs SNCA_filament
get_stats('HLA.DR','SNCA_filament')

In [None]:
# Spearman coefficient and adjusted p-value: HLA.DR vs MBP
get_stats('HLA.DR','MBP')

In [None]:
# Spearman coefficient and adjusted p-value: HLA.DR vs SYP
get_stats('HLA.DR','SYP')

In [None]:
# Spearman coefficient and adjusted p-value: Olig2 vs Sox2
get_stats('Olig2','Sox2')

In [None]:
# Ten-protein neuronal list; immediately superseded by the three-protein subset assigned next.
clus_neuronal = [
    'Park5', 'SNCA_filament', 'SYP', 'NRGN', 'P-TAU S404',
    'TAU', 'P2RX7', 'NEFL', 'MBP', 'SIRT2',
]
clus_neuronal = ['SNCA_filament', 'SYP', 'NRGN']
# Immune markers with the hyphenated 'HLA-DR' spelling (the R-exported tables are
# indexed with 'HLA.DR' elsewhere in this notebook).
clus_immune = [
    'CD68', 'HLA-DR', 'CD11b', 'CD45', 'IBA1', 'CD163', 'CD14',
]

In [None]:
# Immune markers spelled as in the R-exported tables ('HLA.DR').
clus_immune_R=['CD68','HLA.DR','CD11b','CD45','IBA1','CD163','CD14']
# Spearman coefficient of each immune marker against SNCA_filament, then their mean.
spears=[prots_spearman.loc[prot,'SNCA_filament'] for prot in clus_immune_R]
np.mean(spears)

In [None]:
# Protein labels spelled as in the R-exported tables.
clus_immune_R=['CD68','HLA.DR','CD11b','CD45','IBA1','CD163','CD14']
clus_neuronal_R= ['Park5','SNCA_filament','SYP','NRGN','P.TAU.S404','TAU','P2RX7','NEFL','MBP','SIRT2']
# neuronal protein -> mean Spearman coefficient against the immune markers
spears_dict={}
for prot1 in clus_neuronal_R:
    spears=[prots_spearman.loc[prot1,prot2] for prot2 in clus_immune_R]
    spears_dict[prot1]=np.mean(spears)

In [None]:
# Display, per neuronal protein, its mean Spearman coefficient against the immune markers
spears_dict

In [None]:
# For each immune marker (labels as in the R-exported tables), print its Spearman
# coefficient and adjusted p-value against SNCA_filament.
clus_immune_R=['CD68','HLA.DR','CD11b','CD45','IBA1','CD163','CD14']
for prot in clus_immune_R:
    # Blank line plus the marker name as a header for the two lines get_stats prints
    print('\n',prot)
    get_stats(prot,'SNCA_filament')

In [None]:
# Pairwise Spearman coefficients among the immune markers.
# prots_spearman is the R export, which this notebook indexes with 'HLA.DR'; the
# hyphenated 'HLA-DR' in clus_immune raises KeyError here, so use the R-style list.
prots_spearman.loc[clus_immune_R, clus_immune_R]

In [None]:
# Each of clus1 and clus2 is assigned twice; only the second assignment is in effect afterwards.
clus1 = [
    'Olig2', 'Sox2', 'MAP2', 'ADAM10', 'BACE1', 'Neprilysin',
    'P-TAU S396', 'P-TAU T231', 'P-tau S199', 'P-tau S214',
    'IQGAP2', '14-3-3', 'IDE', 'Beta-Catenin', 'PSEN1',
]
clus1 = ['P-TAU S396', 'P-TAU T231', 'P-tau S199', 'P-tau S214', 'P-TAU S404']
clus2 = ['Lef1', 'Tuj1', 'Rspo2', 'Satb2']
clus2 = ['Lef1', 'Rspo2', 'Satb2']
# Immune markers, with the hyphenated 'HLA-DR' spelling
clus3 = ['CD68', 'HLA-DR', 'CD11b', 'CD45', 'IBA1', 'CD163', 'CD14']

In [None]:
# Spearman correlation between the columns of protein_dat_by_tumor, computed in pandas.
# The cells below index cor_mat with the original protein labels (e.g. 'P-tau S214').
cor_mat=protein_dat_by_tumor.corr(method='spearman')

In [None]:
# Phospho-tau selection; overridden by the EGFR/Olig2/Sox2 selection on the next line.
clus_R=['P.tau.S214', 'P.TAU.S396', 'P.TAU.T231','P.tau.S199']
clus_R=['EGFR','Olig2','Sox2']
# Spearman coefficients (R export) restricted to the selected proteins
prots_spearman.loc[clus_R, clus_R]

In [None]:
# Phospho-tau sites, labelled as in cor_mat
clus=['P-tau S214','P-TAU S396','P-TAU T231','P-tau S199']
cor=cor_mat.loc[clus, clus]
# Smallest coefficient in each column of the sub-matrix
cor.min()

In [None]:
# Pairwise Spearman coefficients (pandas-computed) among EGFR, Olig2 and Sox2
clus=['EGFR','Olig2','Sox2']
cor=cor_mat.loc[clus, clus]
cor

In [None]:
# Adjusted p-values (R export) for the proteins currently in clus
prots_pvals_adj[clus].loc[clus]

In [None]:
# Immune markers, labelled as in the R-exported tables
clus_R=['CD163', 'IBA1', 'CD14','CD45','CD11b','HLA.DR','CD68']
# Pairwise Spearman coefficients among them
prots_spearman.loc[clus_R, clus_R]

In [None]:
clus_R=['CD163', 'IBA1', 'CD14','CD45','CD11b','HLA.DR','CD68']

# Collect the Spearman coefficient of every unordered pair of distinct immune markers once.
# .items() is used because DataFrame.iteritems / Series.iteritems were removed in pandas 2.0.
this_dict={}
for k1,k2v in prots_spearman[clus_R].loc[clus_R].items():
    for k2,v in k2v.items():
        # Skip the diagonal (a marker paired with itself)
        if k1==k2:
            continue
        # A sorted tuple gives one canonical key per unordered pair, independent of
        # set iteration order
        key = tuple(sorted((k1,k2)))
        # Keep the first value seen for the pair; the mirrored entry is ignored
        if key not in this_dict:
            this_dict[key]=v

In [None]:
# Mean Spearman coefficient over the distinct marker pairs collected in this_dict
np.mean(list(this_dict.values()))

In [None]:
# Smallest Spearman coefficient among the distinct marker pairs
min(list(this_dict.values()))

In [None]:
# Largest Spearman coefficient among the distinct marker pairs
max(list(this_dict.values()))

In [None]:
# Adjusted p-values (R export) for every pair of proteins in clus_R
prots_pvals_adj[clus_R].loc[clus_R]

In [None]:
# Raw p-values of EGFR against each protein in clus_R
prots_pvals[clus_R].loc['EGFR']

In [None]:
# Adjusted p-values of EGFR against each protein in clus_R
prots_pvals_adj[clus_R].loc['EGFR']

In [None]:
# cor3 was used here without ever being defined (NameError on a fresh kernel);
# build it from clus3 the same way cor1 and cor2 are built from clus1 and clus2.
cor3=cor_mat[clus3].loc[clus3]
# Mean of the column means of the sub-matrix; the self-correlation (diagonal) entries are included.
np.mean(cor3.mean())

In [None]:
# Pairwise Spearman coefficients (pandas-computed) among the clus2 proteins
cor2=cor_mat.loc[clus2, clus2]
cor2

In [None]:
# Mean of the column means of cor2; the self-correlation (diagonal) entries are included
np.mean(cor2.mean())

In [None]:
# Pairwise Spearman coefficients (pandas-computed) among the clus1 proteins
cor1=cor_mat.loc[clus1, clus1]
cor1

In [None]:
# Mean of the column means of cor1; the self-correlation (diagonal) entries are included
np.mean(cor1.mean())