Protein expression analysis by OR (including code for Figures 5A and 5B)

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import random
import glob
from operator import itemgetter
import itertools
from itertools import groupby
import seaborn as sns
import matplotlib.ticker as ticker
import sys
import scipy.stats  as stats
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
import statsmodels.api as sm

In [None]:
# Load the ROI annotation table, the normalized protein matrix and the
# per-ROI nuclei counts.
tumor_roi_dat = pd.read_csv('new_annotation_Feb2021_KW.txt', delimiter="\t")
protein_dat = pd.read_csv('ERCC_IgG_norm.csv')
nuc_count = pd.read_csv('nuc_count.csv')

# The unnamed first column of the protein matrix holds the protein names.
protein_dat = protein_dat.rename({'Unnamed: 0': 'Protein'}, axis=1)

# Every column after 'Protein' is treated as an ROI.
all_rois = protein_dat.columns.tolist()[1:]

# Map (Tumor_ID, Core_ID) -> list of ROI column names present in the protein
# matrix. ROIs belonging to tumors 21 and 8 are collected separately and
# left out of the mapping.
tumor_core_roi_dict = {}
excluded_rois = []
for _, annotation in tumor_roi_dat.iterrows():
    roi = annotation['ROI']
    roi_suffix = roi.split('_')[1]
    # Single-character ROI numbers are zero-padded (e.g. 'ROI_7' -> 'ROI_07')
    # before being matched against the protein matrix columns.
    if len(roi_suffix) == 1:
        roi = 'ROI_0' + roi_suffix
    if roi not in all_rois:
        continue
    key = (annotation['Tumor_ID'], annotation['Core_ID'])
    if key[0] in (21, 8):
        excluded_rois.append(roi)
        continue
    tumor_core_roi_dict.setdefault(key, []).append(roi)

# Column-wise views of the annotation table.
tumor_ids = tumor_roi_dat['Tumor_ID'].tolist()
core_ids = tumor_roi_dat['Core_ID'].tolist()
fish_cores = tumor_roi_dat['FISH_core'].tolist()

# (Tumor_ID, Core_ID) -> FISH_core for every annotation row; when a pair
# occurs more than once the last row wins.
tumor_core_to_shreya_base = dict(zip(zip(tumor_ids, core_ids), fish_cores))

# Keep only the pairs that were retained in tumor_core_roi_dict.
tumor_core_to_shreya = {
    tumor_core: fish_core
    for tumor_core, fish_core in tumor_core_to_shreya_base.items()
    if tumor_core in tumor_core_roi_dict
}
all_proteins_cores_shreya = list(tumor_core_to_shreya.values())

# Collapse the (tumor, core) -> ROIs mapping to tumor -> ROIs by concatenating
# the ROI lists of all cores that belong to the same tumor.
tumor_roi_dict = {}
for (tumor_id, _core_id), core_rois in tumor_core_roi_dict.items():
    seen_rois = tumor_roi_dict.get(tumor_id)
    if seen_rois is None:
        # First core of this tumor: store its ROI list as-is.
        tumor_roi_dict[tumor_id] = core_rois
    else:
        tumor_roi_dict[tumor_id] = seen_rois + core_rois
            
# The unnamed first column of the nuclei-count table holds the ROI names.
nuc_count = nuc_count.rename({'Unnamed: 0': 'ROI'}, axis=1)

# For every tumor, combine its ROIs into one expression profile: each ROI is
# weighted by its share of the tumor's total nuclei count and the weighted
# ROI columns are summed per protein.
tumor_protein_dfs = []
for tumor, rois in tumor_roi_dict.items():
    # Work on an explicit copy so the in-place weighting below neither
    # touches protein_dat nor triggers pandas' SettingWithCopyWarning.
    cur_df = protein_dat[rois].copy()
    cur_df.index = protein_dat['Protein']

    # Nuclei counts of this tumor's ROIs (select by ROI name, not by frame).
    roi_counts = nuc_count[nuc_count['ROI'].isin(rois)]
    tot_nuc_count = roi_counts['nuclei_count'].sum()

    for roi in rois:
        # First matching row wins; raises IndexError if the ROI has no
        # nuclei count, as before.
        roi_nuclei = roi_counts.loc[roi_counts['ROI'] == roi, 'nuclei_count'].iloc[0]
        cur_weight = roi_nuclei / tot_nuc_count
        cur_df[roi] = cur_df[roi] * cur_weight

    tumor_protein_dfs.append(cur_df.sum(axis=1).to_frame(name=tumor))

# `axis` must be passed by keyword: pandas >= 2.0 no longer accepts it
# positionally in pd.concat.
protein_dat_by_tumor = pd.concat(tumor_protein_dfs, axis=1)

# Rows = tumors, columns = proteins.
protein_dat_by_tumor = protein_dat_by_tumor.T

In [None]:
# Column labels of the tumor-by-protein table, i.e. the protein names.
protein_list = protein_dat_by_tumor.columns.tolist()

In [None]:
# Min-max rescale every protein column with the module-level `scaler`,
# keeping the original row index and column labels.
protein_dat_by_tumor_rescaled = pd.DataFrame(
    scaler.fit_transform(protein_dat_by_tumor),
    index=protein_dat_by_tumor.index,
    columns=protein_dat_by_tumor.columns,
)

In [None]:
# Load the per-tumor OR table and index it by tumor. The unnamed first CSV
# column holds the tumor identifier; set_index moves it into the index and
# removes it from the columns in one step (the previous
# `drop(['Tumor'], 1)` relied on a positional `axis`, which pandas >= 2.0
# rejects).
tumor_OR_df = (
    pd.read_csv('tumor_OR_df.csv')
    .rename(columns={'Unnamed: 0': 'Tumor'})
    .set_index('Tumor')
)

In [None]:
# Label the row index 'Tumor' (renamed in place on the existing Index object).
protein_dat_by_tumor_rescaled.index.rename('Tumor', inplace=True)

In [None]:
# Join the rescaled protein table and the OR table side by side on their
# shared tumor index. `axis` is passed by keyword because pandas >= 2.0 no
# longer accepts it positionally in pd.concat.
proteins_with_OR_df = pd.concat([protein_dat_by_tumor_rescaled, tumor_OR_df], axis=1)

In [None]:
def get_MW_stats(dist1, dist2):
    """Run a Mann-Whitney U test on two samples.

    Calls ``scipy.stats.mannwhitneyu`` with its default settings.

    Args:
        dist1: First sample of observations.
        dist2: Second sample of observations.

    Returns:
        A two-element list ``[statistic, pvalue]``.
    """
    statistic, pvalue = stats.mannwhitneyu(dist1, dist2)
    return [statistic, pvalue]

In [None]:
# Tumors in the two OR groups that are compared.
is_top_third = proteins_with_OR_df['OR_char'] == 'OR, top 1/3'
is_bottom_third = proteins_with_OR_df['OR_char'] == 'OR, bottom 1/3'
tumor_group1 = proteins_with_OR_df.loc[is_top_third].index.tolist()
tumor_group2 = proteins_with_OR_df.loc[is_bottom_third].index.tolist()

# Mann-Whitney U test per protein between the two groups.
results_MW_dict = {}
for protein in protein_list:
    dist1 = proteins_with_OR_df.loc[tumor_group1, protein].tolist()
    dist2 = proteins_with_OR_df.loc[tumor_group2, protein].tolist()
    results_MW_dict[protein] = get_MW_stats(dist1, dist2)

# One row per protein with the test statistic and P-value.
df_MW = pd.DataFrame.from_dict(
    results_MW_dict,
    orient='index',
    columns=['MW_statistic', 'P-value'],
)

# Proteins with P-value < 0.05, most significant first.
df_MW_sig_min = df_MW.loc[df_MW['P-value'] < 0.05].sort_values(by='P-value')

In [None]:

# Absolute difference between the two groups' mean expression, per protein.
results_mean_diff_dict = {}
for protein in protein_list:
    dist1 = proteins_with_OR_df.loc[tumor_group1, protein].tolist()
    dist2 = proteins_with_OR_df.loc[tumor_group2, protein].tolist()
    mean_gap = np.mean(dist1) - np.mean(dist2)
    results_mean_diff_dict[protein] = abs(mean_gap)

# Single-column table indexed by protein, largest difference first.
df_diff_means = pd.Series(results_mean_diff_dict).to_frame(name='Abs. diff. in means')
df_diff_means = df_diff_means.sort_values(by='Abs. diff. in means', ascending=False)

In [None]:
# Mean differences restricted to the proteins that passed the Mann-Whitney
# threshold, largest difference first.
significant_proteins = df_MW_sig_min.index
df_diff_means_MW_set = df_diff_means.loc[significant_proteins].sort_values(
    by='Abs. diff. in means',
    ascending=False,
)

In [None]:
# Reshape to long format: one row per (tumor, protein) with the expression
# value, the tumor's OR_char group and the protein name.
new_dfs = []
sample_prot_list = list(df_MW_sig_min.index)
for prot in sample_prot_list:
    # rename/assign return new frames, so no column is ever written into a
    # slice of proteins_with_OR_df (avoids SettingWithCopyWarning).
    new_df = (
        proteins_with_OR_df[[prot, 'OR_char']]
        .rename(columns={prot: 'Normalized expression'})
        .assign(Protein=prot)
    )
    new_dfs.append(new_df)
full_df = pd.concat(new_dfs)

# Grouped boxplot: one group of boxes per significant protein, split by
# OR_char.
plt.rcParams["font.family"] = "Arial"
plt.rcParams["font.size"] = "28"
fig = plt.figure(figsize=(22, 8))
ax = sns.boxplot(
    x="Protein",
    y="Normalized expression",
    hue="OR_char",
    palette=["lightsteelblue", 'lightgray', "salmon"],
    data=full_df,
    width=0.5,
)
ax.legend_.remove()
plt.xlabel('')
plt.xticks(rotation=45)

# Black outlines for boxes, whiskers, caps and medians. Matplotlib >= 3.5
# stores the box patches in ax.patches instead of ax.artists, so both
# containers are styled to stay correct across versions.
box_artists = list(ax.artists) + list(ax.patches)
plt.setp(box_artists, edgecolor='black')
plt.setp(ax.lines, color='black')
for axis in ['top', 'bottom', 'left', 'right']:
    ax.spines[axis].set_linewidth(1.2)
plt.tight_layout()
#plt.savefig('all_sig_proteins_from_MW.pdf')
plt.show()

In [None]:
# Spearman correlation between the significant proteins across tumors.
sig_protein_names = df_MW_sig_min.index.tolist()
df_for_corr = proteins_with_OR_df[sig_protein_names]
cor = df_for_corr.corr(method='spearman')

plt.rcParams.update({"font.family": "Arial", "font.size": 22})
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Hierarchically clustered heatmap of the correlation matrix; every row and
# column is labelled and the colour scale is fixed to [-1, 1].
sns.clustermap(
    cor,
    cmap=cmap,
    metric='correlation',
    linewidths=0,
    yticklabels=1,
    xticklabels=1,
    vmin=-1,
    vmax=1,
    cbar_kws={"ticks": [-1, -0.5, 0., 0.5, 1]},
)
#plt.tight_layout()
#plt.savefig('MW_sig_proteins_clustered_v2.pdf')
plt.show()

In [None]:
# Display the proteins with Mann-Whitney P-value < 0.05, sorted by ascending
# P-value.
df_MW_sig_min