Analysis for genotype correlations (including code for Figure 3E and Supp. Fig. S5E)

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import random
import glob
import pyreadr
from operator import itemgetter
import itertools
from itertools import groupby
import seaborn as sns
import matplotlib.ticker as ticker
import sys
import scipy.stats  as stats
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from statsmodels.stats import multitest
import scipy.optimize as opt

In [None]:
# Input tables: ROI annotations (tab-separated), the protein matrix, and per-ROI nuclei counts.
tumor_roi_dat = pd.read_csv('new_annotation_Feb2021_KW.txt', sep="\t")
protein_dat = pd.read_csv('ERCC_IgG_norm.csv').rename(columns={'Unnamed: 0': 'Protein'})
nuc_count = pd.read_csv('nuc_count.csv')

# The first column holds the protein names; every remaining column is treated as an ROI.
all_rois = protein_dat.columns[1:].tolist()

# Group the ROI names present in the protein matrix by (Tumor_ID, Core_ID).
# ROIs belonging to tumors 21 and 8 are collected in excluded_rois instead.
tumor_core_roi_dict = {}
excluded_rois = []
for _, cur_row in tumor_roi_dat.iterrows():
    roi = cur_row['ROI']
    roi_num = roi.split('_')[1]
    # Single-digit ROI numbers are zero-padded ('ROI_0<n>') before the lookup in all_rois.
    if len(roi_num) == 1:
        roi = 'ROI_0' + roi_num
    if roi not in all_rois:
        continue
    key = (cur_row['Tumor_ID'], cur_row['Core_ID'])
    if key[0] in (21, 8):
        excluded_rois.append(roi)
        continue
    tumor_core_roi_dict.setdefault(key, []).append(roi)
            

tumor_ids = list(tumor_roi_dat['Tumor_ID'])
core_ids = list(tumor_roi_dat['Core_ID'])
fish_cores = list(tumor_roi_dat['FISH_core'])

# (Tumor_ID, Core_ID) -> FISH_core for every annotation row (later rows win on duplicate keys).
tumor_core_to_shreya_base = dict(zip(zip(tumor_ids, core_ids), fish_cores))
# Keep only the (tumor, core) pairs that have at least one retained ROI.
tumor_core_to_shreya = {
    tumor_core: fish_core
    for tumor_core, fish_core in tumor_core_to_shreya_base.items()
    if tumor_core in tumor_core_roi_dict
}
all_proteins_cores_shreya = list(tumor_core_to_shreya.values())

# Merge the per-core ROI lists into one ROI list per tumor (cores are concatenated in dict order).
tumor_roi_dict = {}
for (tumor, _core), rois in tumor_core_roi_dict.items():
    tumor_roi_dict[tumor] = tumor_roi_dict.get(tumor, []) + rois
            
# Name the unnamed first column of the nuclei-count table so ROIs can be looked up by name.
nuc_count = nuc_count.rename(columns={'Unnamed: 0': 'ROI'})

# Collapse ROI-level protein values to one profile per tumor: a weighted sum of the tumor's
# ROI columns, each ROI weighted by its share of the tumor's total nuclei count.
tumor_protein_dfs = []
for tumor, rois in tumor_roi_dict.items():
    # Work on an explicit copy so the column writes below never target a slice of protein_dat
    # (avoids pandas' SettingWithCopy path).
    cur_df = protein_dat[rois].copy()
    cur_df.index = protein_dat['Protein']
    # Total nuclei over this tumor's ROIs, matched on the ROI names.
    tot_nuc_count = nuc_count.loc[nuc_count['ROI'].isin(rois), 'nuclei_count'].sum()
    for roi in rois:
        # First nuclei-count row for this ROI; raises IndexError if the ROI is absent from nuc_count.
        roi_nuclei = nuc_count.loc[nuc_count['ROI'] == roi, 'nuclei_count'].iloc[0]
        cur_weight = roi_nuclei / tot_nuc_count
        cur_df[roi] = cur_df[roi] * cur_weight
    tumor_protein_dfs.append(cur_df.sum(axis=1).to_frame(name=tumor))

# axis is passed by keyword: pd.concat no longer accepts it positionally (pandas >= 2.0).
# The transpose leaves one row per tumor and one column per protein.
protein_dat_by_tumor = pd.concat(tumor_protein_dfs, axis=1).T

In [None]:
# Number of (tumor, core) pairs with retained ROIs, i.e. entries in the FISH-core mapping.
len(all_proteins_cores_shreya)

In [None]:
# Read every FISH table and key it by (core number, image number).
# Both numbers are parsed from the '_'-separated file path: fields 2 and 3, the latter
# with its extension stripped.
filenames = glob.glob("FISH_data/table*")
dfs_o = {}
for filename in filenames:
    df_o = pd.read_csv(filename)
    name_fields = filename.split('_')
    shreya_core_num = int(name_fields[2])
    image_num = int(name_fields[3].split('.')[0])
    dfs_o[(shreya_core_num, image_num)] = df_o

    
# Count thresholds used to call a gene amplified: at least HI and at most MAX_HI.
HI = 6
MAX_HI = 50


def geno_hi(geno, df):
    """Boolean mask of rows whose `geno` count lies in [HI, MAX_HI] (both ends inclusive)."""
    return df[geno].between(HI, MAX_HI)


def geno_lo(geno, df):
    """Boolean mask of rows whose `geno` count lies in [1, HI) (HI excluded)."""
    counts = df[geno]
    at_least_one = counts >= 1
    below_hi = counts < HI
    return at_least_one & below_hi

# Add a 0/1 '<gene>_amp' column to every FISH table (1 when the count is within [HI, MAX_HI]).
for df_fish in dfs_o.values():
    for gene in ('EGFR', 'PDGFRA', 'CDK4'):
        df_fish[gene + '_amp'] = 1 * geno_hi(gene, df_fish)
    
# For FISH images whose core maps to a retained tumor core, add 0/1 flags for the combined
# EGFR/CDK4 genotype and keep only rows where all three genes are at most MAX_HI.
dfs = {}
for key, df in dfs_o.items():
    if key[0] in tumor_core_to_shreya.values():
        egfr_hi = df['EGFR'] >= HI
        cdk4_hi = df['CDK4'] >= HI
        egfr_lo = df['EGFR'] < HI
        cdk4_lo = df['CDK4'] < HI

        df['EGFR_CDK4_amp'] = 1 * (egfr_hi & cdk4_hi)
        df['EGFR_NOT_CDK4_amp'] = 1 * (egfr_hi & cdk4_lo)
        df['CDK4_NOT_EGFR_amp'] = 1 * (cdk4_hi & egfr_lo)
        df['Non_amp'] = 1 * (egfr_lo & cdk4_lo)

        # Rows with any gene above MAX_HI are dropped from the analysis tables.
        within_range = (df['EGFR'] <= MAX_HI) & (df['PDGFRA'] <= MAX_HI) & (df['CDK4'] <= MAX_HI)
        dfs[key] = df[within_range]
  

In [None]:
# FISH core number -> tumor ID, and the cores / tumors that actually have FISH tables.
shreya_to_tumor = {fish_core: tumor_core[0] for tumor_core, fish_core in tumor_core_to_shreya.items()}
core_list = list(set(image_key[0] for image_key in dfs))
tumor_list = list(set(shreya_to_tumor[core] for core in core_list))

# Tumor ID -> list of (core, image) keys belonging to that tumor.
tumor_image_dict = {}
for image_key in dfs:
    tumor_image_dict.setdefault(shreya_to_tumor[image_key[0]], []).append(image_key)

# Tumor ID -> one table with the rows of all its images stacked.
dfs_in_tumor = {
    tumor: pd.concat([dfs[image] for image in images])
    for tumor, images in tumor_image_dict.items()
}

In [None]:
# TERT: pair each '<...>_tert' count file with the '<...>_xyarea' file of the same
# (core, image) and build one table per image.
filenames = glob.glob("hTERTmut_count_results/Log_MAX_TMAtert_TMA_*")
tert_files = {}
xy_files = {}

for filename in filenames:
    # Everything after the last 'TMA' in the path carries the core/image numbers and the file type.
    name_tail = filename.split('TMA')[-1]
    file_type = name_tail.split('_')[-1].split('.')[0]
    core_image = name_tail.split('__')[0].split('_')
    core = int(core_image[1])
    image = int(core_image[2])
    if file_type == 'tert':
        tert_files[(core, image)] = filename
    elif file_type == 'xyarea':
        xy_files[(core, image)] = filename

dfs_TERT = {}
for key, tert_path in tert_files.items():
    # Need both files for the image, and the core must map to a retained tumor core.
    if key not in xy_files or key[0] not in tumor_core_to_shreya.values():
        continue

    df_tert = pd.read_csv(tert_path, header=None)
    df_xy = pd.read_csv(xy_files[key], header=None)

    # Column 0 becomes the index in both tables; column 3 of the xy table is discarded.
    # drop() gets the axis via `columns=`: the positional axis argument was removed in pandas 2.0.
    df_xy = df_xy.set_index(0).rename(columns={1: 'X', 2: 'Y'}).drop(columns=3)
    df_tert = df_tert.set_index(0).rename(columns={1: 'TERT_count'})

    df = pd.concat([df_xy, df_tert], axis=1, sort=False)
    df['TERT_bool'] = 1 * (df['TERT_count'] > 0)
    # Images with 10 or fewer rows are left out.
    if len(df) > 10:
        dfs_TERT[key] = df

In [None]:
# Tumor ID -> list of (core, image) keys with TERT data.
tumor_image_dict_TERT = {}
for image_key in dfs_TERT:
    tumor_image_dict_TERT.setdefault(shreya_to_tumor[image_key[0]], []).append(image_key)

# Tumor ID -> all TERT rows of that tumor stacked into one table.
dfs_in_tumor_TERT = {
    tumor: pd.concat([dfs_TERT[image] for image in images])
    for tumor, images in tumor_image_dict_TERT.items()
}

# Per tumor: fraction of rows with TERT_bool == 1.
TERT_dict = {}
cats = ['TERT']
for tumor, this_df in dfs_in_tumor_TERT.items():
    n_positive = int((this_df['TERT_bool'] == 1).sum())
    TERT_dict[tumor] = n_positive / len(this_df)

TERT_prop_df = pd.DataFrame.from_dict(TERT_dict, orient='index').rename(columns={0: 'TERT'})

In [None]:
# Stack the per-tumor TERT tables into a single table.
all_cells_df = pd.concat(list(dfs_in_tumor_TERT.values()))

In [None]:
# Total number of rows across all per-tumor TERT tables.
len(all_cells_df)

In [None]:
# Number of those rows with TERT_bool == 1 (TERT_count > 0).
len(all_cells_df[all_cells_df['TERT_bool']==1])

In [None]:
# Per-tumor proportion of FISH rows carrying each genotype flag.
EGFR_dict = {}
CDK4_dict = {}
EGFR_NOT_CDK4_dict = {}
CDK4_NOT_EGFR_dict = {}
EGFR_AND_CDK4_dict = {}
Non_amp_dict = {}
cats = ['EGFR_amp', 'CDK4_amp', 'EGFR_CDK4_amp']

# Flag column -> dict that receives {tumor: proportion of rows with flag == 1}.
proportion_dicts = {
    'EGFR_amp': EGFR_dict,
    'CDK4_amp': CDK4_dict,
    'EGFR_NOT_CDK4_amp': EGFR_NOT_CDK4_dict,
    'CDK4_NOT_EGFR_amp': CDK4_NOT_EGFR_dict,
    'EGFR_CDK4_amp': EGFR_AND_CDK4_dict,
    'Non_amp': Non_amp_dict,
}
for tumor in tumor_list:
    this_df = dfs_in_tumor[tumor]
    n_rows = len(this_df)
    for flag, prop_dict in proportion_dicts.items():
        prop_dict[tumor] = len(this_df[this_df[flag] == 1]) / n_rows


def _proportion_frame(prop_dict, column):
    """Turn a {tumor: proportion} dict into a one-column DataFrame indexed by tumor."""
    return pd.DataFrame.from_dict(prop_dict, orient='index').rename(columns={0: column})


EGFR_df = _proportion_frame(EGFR_dict, 'EGFR_prop')
CDK4_df = _proportion_frame(CDK4_dict, 'CDK4_prop')
EGFR_only_df = _proportion_frame(EGFR_NOT_CDK4_dict, 'EGFR_only_prop')
CDK4_only_df = _proportion_frame(CDK4_NOT_EGFR_dict, 'CDK4_only_prop')
EGFR_AND_CDK4_df = _proportion_frame(EGFR_AND_CDK4_dict, 'EGFR_AND_CDK4_prop')
Non_amp_df = _proportion_frame(Non_amp_dict, 'Non_amp_prop')

# Only these four proportions go into the combined table; the *_only frames are not included.
# axis is passed by keyword: pd.concat no longer accepts it positionally (pandas >= 2.0).
df_combined = pd.concat([EGFR_df, CDK4_df, EGFR_AND_CDK4_df, Non_amp_df], axis=1)

In [None]:
# Per-tumor genotype proportions (EGFR, CDK4, EGFR-and-CDK4, non-amplified).
df_combined

In [None]:
# Append the per-tumor TERT proportion and keep only tumors that have every proportion.
# Any existing 'TERT' column is dropped first so re-running this cell does not add a duplicate.
# axis is passed by keyword: pd.concat no longer accepts it positionally (pandas >= 2.0).
df_combined = pd.concat(
    [df_combined.drop(columns='TERT', errors='ignore'), TERT_prop_df],
    axis=1,
).dropna()

In [None]:
# Shorten the column names and export the table that the R correlation step consumes.
short_names = {
    'EGFR_prop': 'E',
    'CDK4_prop': 'C',
    'EGFR_AND_CDK4_prop': 'EC',
    'Non_amp_prop': 'NO',
}
df_combined = df_combined.rename(columns=short_names)
df_combined.to_csv('Genotype_TERT_proportions.csv')
# Next step (R, outside this notebook): Spearman coefficients and adjusted p-values
# --> Genotype_TERT_proportins_Spearman_coeff.csv

In [None]:
def inverse_fit(x, a, b):
    """Inverse model y = a / x + b."""
    return b + a / x


def linear_fit(x, a, b):
    """Linear model y = a * x + b."""
    return b + a * x

In [None]:
# Number of tumors left after dropping rows with a missing proportion.
len(df_combined)

In [None]:
# Column names of the combined proportion table.
df_combined.columns

In [None]:
def _load_spearman_table(path):
    """Read one table written by the R step and index it by category name.

    'NO' is relabelled 'N/O' in both the cell values and the column names, the
    'Unnamed: 0' column becomes the index, the 'X' column is dropped, and the first
    row is discarded (presumably the row belonging to 'X' -- confirm against the R output).
    """
    table = pd.read_csv(path).replace('NO', 'N/O').rename(columns={'NO': 'N/O'})
    table.index = table['Unnamed: 0']
    # Columns are dropped by keyword: the positional axis argument of drop() was removed in pandas 2.0.
    table = table.drop(columns=['X', 'Unnamed: 0'])
    return table.iloc[1:, :]


spearman_coeffs = _load_spearman_table('Genotype_TERT_proportins_Spearman_coeff.csv')
spearman_adj_p = _load_spearman_table('Genotype_TERT_proportins_pvals_adj.csv')

In [None]:
# Relabel 'NO' as 'N/O' (cell values and column name) so df_combined matches the Spearman tables.
df_combined = df_combined.replace('NO', 'N/O').rename(columns={'NO': 'N/O'})

In [None]:
# Spot check: coefficient stored for row 'N/O', column 'N/O'.
spearman_coeffs.loc['N/O']['N/O']

In [None]:
# Spot check: coefficient for E vs N/O. The column was renamed from 'NO' to 'N/O' when the
# table was loaded, so it has to be looked up under its new label.
spearman_coeffs.loc['E', 'N/O']

In [None]:
plt.rcParams["font.size"] = 24
plt.rcParams["font.family"] = 'Arial'
title_size = 22
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# One scatter panel per genotype proportion (E, C, EC, N/O), each against the TERT proportion.
# The title shows the Spearman coefficient and adjusted p-value read from the R output tables.
cat_y = 'TERT'
for ax, cat_x in zip(axes, ['E', 'C', 'EC', 'N/O']):
    X = list(df_combined[cat_x])
    Y = list(df_combined[cat_y])
    coeff = spearman_coeffs.loc[cat_x, cat_y]
    pval = spearman_adj_p.loc[cat_x, cat_y]
    ax.scatter(X, Y, color='seagreen', s=40)
    ax.set_xlabel(cat_x)
    ax.set_ylabel(cat_y)
    ax.set_title('Spearman {:.2f}, p = {:.2f}'.format(coeff, pval), fontsize=title_size)

plt.tight_layout()
plt.savefig('SuppFig4TERTcorrelations.pdf')
plt.show()

In [None]:
plt.rcParams["font.size"] = 25
plt.rcParams["font.family"] = 'Arial'
title_size = 23
fig, axes = plt.subplots(1, 2, figsize=(11, 5))

# Two panels sharing the x variable (proportion of N/O): left vs E, right vs EC.
# Each panel shows the scatter, the Spearman coefficient / adjusted p-value from the R output,
# and a least-squares fit of y = a / x + b.
cat_x = 'N/O'
panels = [
    ('E', [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]),
    ('EC', [0, 0.05, 0.1, 0.15]),
]
for ax, (cat_y, y_ticks) in zip(axes, panels):
    X = list(df_combined[cat_x])
    Y = list(df_combined[cat_y])
    coeff = spearman_coeffs.loc[cat_x, cat_y]
    pval = spearman_adj_p.loc[cat_x, cat_y]
    ax.scatter(X, Y, color='blue', s=40)
    ax.set_xlabel('Proportion of ' + cat_x)
    ax.set_ylabel('Proportion of ' + cat_y)
    ax.set_yticks(y_ticks)
    ax.set_title('Spearman {:.2f}, p = {:.1e}'.format(coeff, pval), fontsize=title_size)

    coeffs_inv, cov_inv = opt.curve_fit(inverse_fit, X, Y)  # coeffs_inv[0] is a, coeffs_inv[1] is b
    # NOTE(review): the grid starts at min(X) - 0.01; if that reaches 0 or below, a / x blows up.
    x_fit = np.arange(min(X) - 0.01, max(X) + 0.01, 0.01)
    y_fit = inverse_fit(x_fit, *coeffs_inv)
    # Drawn with matplotlib directly: seaborn >= 0.12 rejects positional x/y in lineplot
    # and has deprecated its `ci` argument.
    ax.plot(x_fit, y_fit, color='darkgray')

plt.tight_layout()
plt.savefig('Figure2/Fig2correlations_top.pdf', bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
plt.rcParams["font.size"] = 25
plt.rcParams["font.family"] = 'Arial'
title_size = 23
fig, axes = plt.subplots(1, 2, figsize=(11, 5))

# Two panels sharing the y variable (proportion of EC): left vs E, right vs C.
# Each panel shows the scatter, the Spearman coefficient / adjusted p-value from the R output,
# and a degree-1 least-squares line.
cat_y = 'EC'
for ax, cat_x in zip(axes, ['E', 'C']):
    X = list(df_combined[cat_x])
    Y = list(df_combined[cat_y])
    coeff = spearman_coeffs.loc[cat_x, cat_y]
    pval = spearman_adj_p.loc[cat_x, cat_y]
    ax.scatter(X, Y, color='tomato', s=40)
    ax.set_xlabel('Proportion of ' + cat_x)
    ax.set_ylabel('Proportion of ' + cat_y)
    ax.set_yticks([0, 0.05, 0.1, 0.15])
    ax.set_xticks([0, 0.1, 0.2, 0.3, 0.4, 0.5])
    ax.set_title('Spearman {:.2f}, p = {:.1e}'.format(coeff, pval), fontsize=title_size)
    m, b = np.polyfit(X, Y, 1)
    ax.plot(X, m * np.array(X) + b, color='darkgray')

plt.tight_layout()
plt.savefig('Figure2/Fig2correlations_bottom.pdf', bbox_inches='tight', pad_inches=0)
plt.show()