Analysis of distances and clustering (including code for Figures 4 D,E,G,H)

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import random
import glob
from operator import itemgetter
import itertools
from itertools import groupby
import seaborn as sns
import matplotlib.ticker as ticker
import sys
import scipy.stats  as stats
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
import statsmodels.api as sm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Read the three input tables: ROI annotations (tab-separated), the protein
# matrix, and per-ROI nuclei counts.
tumor_roi_dat = pd.read_csv('new_annotation_Feb2021_KW.txt', sep="\t")
protein_dat = pd.read_csv('ERCC_IgG_norm.csv')
nuc_count = pd.read_csv('nuc_count.csv')

# The unnamed leading column of the protein matrix is relabelled 'Protein'.
protein_dat = protein_dat.rename(columns={'Unnamed: 0': 'Protein'})

# Every column after the first one is treated as an ROI name.
all_rois = list(protein_dat.columns[1:])

# Group ROI names by (Tumor_ID, Core_ID); only ROIs that appear as columns of
# the protein matrix (all_rois) are considered.
tumor_core_roi_dict = {}
excluded_rois = []
for loc in range(len(tumor_roi_dat)):
    cur_row = tumor_roi_dat.iloc[loc]
    roi = cur_row['ROI']
    roi_num = roi.split('_')[1]
    # Single-character ROI numbers are zero-padded (e.g. ROI_7 -> ROI_07)
    # before being matched against the protein-matrix column names.
    if len(roi_num) == 1:
        roi = 'ROI_0' + roi_num
    if roi not in all_rois:
        continue
    key = (cur_row['Tumor_ID'], cur_row['Core_ID'])
    # ROIs of tumors 21 and 8 are set aside in excluded_rois instead of being
    # grouped (the reason for excluding them is not recorded here).
    if key[0] in (21, 8):
        excluded_rois.append(roi)
        continue
    tumor_core_roi_dict.setdefault(key, []).append(roi)


tumor_ids = list(tumor_roi_dat['Tumor_ID'])
core_ids = list(tumor_roi_dat['Core_ID'])
fish_cores = list(tumor_roi_dat['FISH_core'])

# (Tumor_ID, Core_ID) -> FISH_core for every annotation row; when a pair
# repeats, the last row wins.
tumor_core_to_shreya_base = dict(zip(zip(tumor_ids, core_ids), fish_cores))
# Keep only the pairs that ended up with at least one usable ROI.
tumor_core_to_shreya = {
    pair: fish_core
    for pair, fish_core in tumor_core_to_shreya_base.items()
    if pair in tumor_core_roi_dict
}
all_proteins_cores_shreya = list(tumor_core_to_shreya.values())

# Pool the ROI lists of all cores belonging to the same tumor.
tumor_roi_dict = {}
for k, rois in tumor_core_roi_dict.items():
    tumor = k[0]
    if tumor not in tumor_roi_dict:
        tumor_roi_dict[tumor] = rois
    else:
        tumor_roi_dict[tumor] = tumor_roi_dict[tumor] + rois
            
nuc_count = nuc_count.rename(columns={'Unnamed: 0': 'ROI'})

# Collapse ROI-level protein values to one column per tumor, weighting each ROI
# by its share of the tumor's total nuclei count.
tumor_protein_dfs = []
for tumor, rois in tumor_roi_dict.items():
    # Explicit copy: the per-ROI columns are rescaled below, and writing into a
    # bare column slice triggers pandas' chained-assignment warning.
    cur_df = protein_dat[rois].copy()
    cur_df.index = protein_dat['Protein']
    # Match on the ROI names themselves; the previous code passed the sliced
    # DataFrame to isin(), which only matched because it fell back to the
    # frame's column labels.
    tot_nuc_count = nuc_count[nuc_count['ROI'].isin(rois)]['nuclei_count'].sum()
    for roi in rois:
        # First nuclei_count entry for this ROI, as a fraction of the tumor total.
        cur_weight = list(nuc_count[nuc_count['ROI'] == roi]['nuclei_count'])[0] / tot_nuc_count
        cur_df[roi] = cur_df[roi] * cur_weight
    cur_df = cur_df.sum(axis=1).to_frame().rename(columns={0: tumor})
    tumor_protein_dfs.append(cur_df)

# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
protein_dat_by_tumor = pd.concat(tumor_protein_dfs, axis=1)

# Rows become tumors, columns become proteins.
protein_dat_by_tumor = protein_dat_by_tumor.T

In [None]:
# Load every FISH table; tables are keyed by the two integers embedded in the
# file name (third and fourth '_'-separated fields of the matched path).
filenames = glob.glob("FISH_data/table*")
dfs_o = {}

for filename in filenames:
    df_o = pd.read_csv(filename)
    name_fields = filename.split('_')
    shreya_core_num = int(name_fields[2])
    # The last field still carries the file extension; keep the part before the first '.'.
    image_num = int(name_fields[3].split('.')[0])
    dfs_o[(shreya_core_num, image_num)] = df_o

    
# Thresholds applied to the per-cell gene columns of the FISH tables:
# values in [HI, MAX_HI] count as "high", values in [1, HI) as "low".
HI = 6
MAX_HI = 50


def geno_hi(geno, df):
    """Boolean Series: True where df[geno] lies in [HI, MAX_HI] (both ends inclusive)."""
    values = df[geno]
    return values.ge(HI) & values.le(MAX_HI)


def geno_lo(geno, df):
    """Boolean Series: True where df[geno] is at least 1 and strictly below HI."""
    values = df[geno]
    return values.ge(1) & values.lt(HI)

# Add 0/1 single-gene amplification flags to every loaded FISH table (in place).
for df_fish in list(dfs_o.values()):
    for gene in ('EGFR', 'PDGFRA', 'CDK4'):
        df_fish[gene + '_amp'] = 1 * (geno_hi(gene, df_fish))

# Keep only images whose core number is one of the FISH cores linked to the
# protein data, and label every cell with its EGFR/CDK4 genotype class.
dfs = {}
for key, df in dfs_o.items():
    if key[0] in tumor_core_to_shreya.values():
        egfr_hi = df['EGFR'] >= HI
        egfr_lo = df['EGFR'] < HI
        cdk4_hi = df['CDK4'] >= HI
        cdk4_lo = df['CDK4'] < HI

        df['EGFR_CDK4_amp'] = 1 * (egfr_hi & cdk4_hi)
        df['EGFR_NOT_CDK4_amp'] = 1 * (egfr_hi & cdk4_lo)
        df['CDK4_NOT_EGFR_amp'] = 1 * (cdk4_hi & egfr_lo)
        df['Non_amp'] = 1 * (egfr_lo & cdk4_lo)

        # Rows where any of the three genes exceeds MAX_HI are dropped.
        within_cap = (df['EGFR'] <= MAX_HI) & (df['PDGFRA'] <= MAX_HI) & (df['CDK4'] <= MAX_HI)
        dfs[key] = df[within_cap]
 

In [None]:
# FISH core number -> tumor id (inverse of the (tumor, core) -> FISH core map).
shreya_to_tumor = {fish_core: pair[0] for pair, fish_core in tumor_core_to_shreya.items()}
core_list = list(set([image_key[0] for image_key in dfs.keys()]))
tumor_list = list(set([shreya_to_tumor[core] for core in core_list]))

# tumor id -> list of (core, image) keys into `dfs`.
tumor_image_dict = {}
for k in dfs.keys():
    this_tumor = shreya_to_tumor[k[0]]
    if this_tumor not in tumor_image_dict:
        tumor_image_dict[this_tumor] = [k]
    else:
        tumor_image_dict[this_tumor] = tumor_image_dict[this_tumor] + [k]

# tumor id -> all cells of that tumor, stacked across its images.
dfs_in_tumor = {}
for tumor, images in tumor_image_dict.items():
    image_tables = [dfs[image] for image in images]
    dfs_in_tumor[tumor] = pd.concat(image_tables)

In [None]:
# Per-tumor OR table, indexed by tumor id. The unnamed leading CSV column holds
# the tumor id; set_index moves it into the index (index name 'Tumor') and
# removes the column in one step. The previous drop(['Tumor'], 1) passed `axis`
# positionally, which pandas 2.x rejects.
tumor_OR_df = (
    pd.read_csv('tumor_OR_df.csv')
    .rename(columns={'Unnamed: 0': 'Tumor'})
    .set_index('Tumor')
)

In [None]:
# Tumor ids per tertile, selected by the label stored in the 'OR_char' column.
ORhigh_tumors = list(tumor_OR_df.index[tumor_OR_df['OR_char'].eq('OR, top 1/3')])
ORmiddle_tumors = list(tumor_OR_df.index[tumor_OR_df['OR_char'].eq('OR, middle 1/3')])
ORlow_tumors = list(tumor_OR_df.index[tumor_OR_df['OR_char'].eq('OR, bottom 1/3')])

In [None]:
# Per-tumor mean EGFR value among cells flagged EGFR_NOT_CDK4_amp, collected
# separately for the top / middle / bottom OR tertiles.
HI_list, MID_list, LO_list = [], [], []
for tertile_tumors, tertile_means in (
    (ORhigh_tumors, HI_list),
    (ORmiddle_tumors, MID_list),
    (ORlow_tumors, LO_list),
):
    for tumor in tertile_tumors:
        tumor_cells = dfs_in_tumor[tumor]
        e_cells = tumor_cells[tumor_cells['EGFR_NOT_CDK4_amp'] == 1]
        tertile_means.append((e_cells['EGFR'].sum()) / len(e_cells))

# One single-column frame per tertile, tagged with a 'hue' label for plotting.
df_high = pd.DataFrame.from_dict({'EGFR copy number': HI_list}, orient='index').T.assign(hue='high')
df_mid = pd.DataFrame.from_dict({'EGFR copy number': MID_list}, orient='index').T.assign(hue='mid')
df_low = pd.DataFrame.from_dict({'EGFR copy number': LO_list}, orient='index').T.assign(hue='low')
dfs_combined_E = pd.concat([df_high, df_mid, df_low]).assign(Genotype='E')

# Boxplot of per-tumor EGFR copy number in E cells, one box per OR tertile.
# Note: the rcParams changes are global and persist for later figures.
plt.rcParams["font.family"] = "Arial"
plt.rcParams["font.size"] = "28"
fig = plt.figure(figsize=(6,6))
# All rows share Genotype == 'E', so the three 'hue' groups sit side by side at one x position.
ax=sns.boxplot(x="Genotype", y="EGFR copy number",hue="hue",palette=["salmon",'lightgray',"lightsteelblue"], data=dfs_combined_E,width=0.7)
plt.xlabel('')
plt.ylabel('EGFR copy number\nin E cells')
ax.legend_.remove() #comment out to include legend in plot
# Fixed y-range; values outside [7, 16] are not shown.
plt.ylim([7,16])
ax.set(xticklabels=[])
ax.set(xlabel=None)
# Black outlines for boxes and whisker/median lines.
# NOTE(review): whether the boxes are exposed via ax.artists depends on the seaborn/matplotlib version — confirm.
plt.setp(ax.artists, edgecolor = 'black')
plt.setp(ax.lines, color='black')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(1.2)
plt.tight_layout()
# Save before show() so the written PDF contains the drawn figure.
plt.savefig('EGFR_copy_number_among_EGFR_amplified_cells.pdf')
plt.show()

In [None]:
# Per-tumor mean CDK4 value among cells flagged CDK4_NOT_EGFR_amp, collected
# separately for the top / middle / bottom OR tertiles.
HI_list, MID_list, LO_list = [], [], []
for tertile_tumors, tertile_means in (
    (ORhigh_tumors, HI_list),
    (ORmiddle_tumors, MID_list),
    (ORlow_tumors, LO_list),
):
    for tumor in tertile_tumors:
        tumor_cells = dfs_in_tumor[tumor]
        c_cells = tumor_cells[tumor_cells['CDK4_NOT_EGFR_amp'] == 1]
        tertile_means.append((c_cells['CDK4'].sum()) / len(c_cells))

# One single-column frame per tertile, tagged with a 'hue' label for plotting.
df_high = pd.DataFrame.from_dict({'CDK4 copy number': HI_list}, orient='index').T.assign(hue='high')
df_mid = pd.DataFrame.from_dict({'CDK4 copy number': MID_list}, orient='index').T.assign(hue='mid')
df_low = pd.DataFrame.from_dict({'CDK4 copy number': LO_list}, orient='index').T.assign(hue='low')
dfs_combined_C = pd.concat([df_high, df_mid, df_low]).assign(Genotype='C')

# Boxplot of per-tumor CDK4 copy number in C cells, one box per OR tertile.
# Note: the rcParams changes are global and persist for later figures.
plt.rcParams["font.family"] = "Arial"
plt.rcParams["font.size"] = "28"
fig = plt.figure(figsize=(6,6))
# All rows share Genotype == 'C', so the three 'hue' groups sit side by side at one x position.
ax=sns.boxplot(x="Genotype", y="CDK4 copy number",hue="hue",palette=["salmon",'lightgray',"lightsteelblue"], data=dfs_combined_C,width=0.7)

plt.xlabel('')
plt.ylabel('CDK4 copy number\nin C cells')
ax.legend_.remove() #comment out to include legend in plot
# Fixed y-range; values outside [7, 16] are not shown.
plt.ylim([7,16])
ax.set(xticklabels=[])
ax.set(xlabel=None)
# Black outlines for boxes and whisker/median lines.
# NOTE(review): whether the boxes are exposed via ax.artists depends on the seaborn/matplotlib version — confirm.
plt.setp(ax.artists, edgecolor = 'black')
plt.setp(ax.lines, color='black')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(1.2)
plt.tight_layout()
# Save before show() so the written PDF contains the drawn figure.
plt.savefig('CDK4_copy_number_among_CDK4_amplified_cells.pdf')
plt.show()

In [None]:
def get_MW_stats(dist1,dist2):
    """Run scipy's Mann-Whitney U test on two samples.

    Returns a two-element list ``[statistic, p-value]`` taken from the result of
    ``stats.mannwhitneyu(dist1, dist2)`` called with its default options.
    """
    outcome = stats.mannwhitneyu(dist1, dist2)
    return [outcome.statistic, outcome.pvalue]

In [None]:
# Top vs bottom OR tertile: per-tumor EGFR copy number in E cells.
dist1 = list(dfs_combined_E.loc[dfs_combined_E['hue'] == 'high', 'EGFR copy number'])
dist2 = list(dfs_combined_E.loc[dfs_combined_E['hue'] == 'low', 'EGFR copy number'])
get_MW_stats(dist1,dist2)

In [None]:
# Mean of dist1 as last assigned (the 'high'-hue EGFR values when cells are run in order).
np.mean(dist1)

In [None]:
# Mean of dist2 as last assigned (the 'low'-hue EGFR values when cells are run in order).
np.mean(dist2)

In [None]:
# Display the per-tumor CDK4 copy-number table (one row per tumor, with 'hue' and 'Genotype' labels).
dfs_combined_C

In [None]:
# Top vs bottom OR tertile: per-tumor CDK4 copy number in C cells.
dist1 = list(dfs_combined_C.loc[dfs_combined_C['hue'] == 'high', 'CDK4 copy number'])
dist2 = list(dfs_combined_C.loc[dfs_combined_C['hue'] == 'low', 'CDK4 copy number'])
get_MW_stats(dist1,dist2)

In [None]:
def xy_geno(key,geno):
    """Return the (X, Y) coordinates, as a list of tuples, of the rows of
    ``dfs[key]`` whose ``geno`` column equals 1."""
    image_cells = dfs[key]
    flagged = image_cells[image_cells[geno] == 1]
    return list(zip(flagged['X'], flagged['Y']))

def dists_same_geno(key,geno):
    """Euclidean distances between every unordered pair of cells of genotype
    ``geno`` in image ``key`` (order follows ``itertools.combinations``)."""
    points = xy_geno(key,geno)
    return [
        np.linalg.norm(np.array(first) - np.array(second))
        for first, second in itertools.combinations(points, 2)
    ]

def get_dist(pt1,pt2):
    """Euclidean distance between two coordinate sequences of equal length."""
    offset = np.array(pt1) - np.array(pt2)
    return np.linalg.norm(offset)

def get_all_pts(key):
    """Return the (X, Y) coordinates of every row of ``dfs[key]`` as a list of tuples."""
    image_cells = dfs[key]
    return list(zip(image_cells['X'], image_cells['Y']))

def min_dists_diff_geno(key,geno1,geno2):
    """Mean, over cells of ``geno1`` in image ``key``, of the distance to the
    nearest ``geno2`` cell.

    Returns ``np.nan`` when either genotype has fewer than 5 cells in the image.
    """
    sources = xy_geno(key,geno1)
    targets = xy_geno(key,geno2)
    if len(sources) < 5 or len(targets) < 5:
        return np.nan
    nearest = [
        min([get_dist(src, dst) for dst in targets])
        for src in sources
    ]
    return np.mean(nearest)

def min_dists_same_geno(key,geno1):
    """Mean nearest-neighbour distance among cells of ``geno1`` in image ``key``.

    Returns ``np.nan`` when the image has fewer than 5 such cells. Neighbours
    are all same-genotype points whose coordinates differ from the current
    point, so exact coordinate duplicates are not counted as neighbours.
    """
    points = xy_geno(key,geno1)
    if len(points) < 5:
        return np.nan
    nearest = []
    for current in points:
        neighbours = [other for other in points if other != current]
        nearest.append(min([get_dist(current, other) for other in neighbours]))
    return np.mean(nearest)

def min_rel_dists_diff_geno(key,geno1,geno2):
    """Mean relative nearest distance from ``geno1`` cells to ``geno2`` cells.

    For every ``geno1`` cell in image ``key`` the distance to the nearest
    ``geno2`` cell is divided by the distance to the nearest cell of any kind
    (points sharing the cell's exact coordinates excluded); the ratios are
    averaged. Returns ``np.nan`` when either genotype has fewer than 5 cells.
    """
    sources = xy_geno(key,geno1)
    targets = xy_geno(key,geno2)
    if len(sources) < 5 or len(targets) < 5:
        return np.nan
    everyone = get_all_pts(key)
    ratios = []
    for current in sources:
        others = [pt for pt in everyone if pt != current]
        to_targets = [get_dist(current, pt) for pt in targets]
        to_anyone = [get_dist(current, pt) for pt in others]
        ratios.append(min(to_targets) / min(to_anyone))
    return np.mean(ratios)

def min_rel_dists_same_geno(key,geno1):
    """Mean relative nearest-neighbour distance among ``geno1`` cells.

    For every ``geno1`` cell in image ``key`` the distance to the nearest other
    ``geno1`` cell is divided by the distance to the nearest cell of any kind
    (points sharing the cell's exact coordinates excluded in both cases); the
    ratios are averaged. Returns ``np.nan`` when fewer than 5 such cells exist.
    """
    points = xy_geno(key,geno1)
    if len(points) < 5:
        return np.nan
    everyone = get_all_pts(key)
    ratios = []
    for current in points:
        others = [pt for pt in everyone if pt != current]
        same_kind = [pt for pt in points if pt != current]
        to_same = [get_dist(current, pt) for pt in same_kind]
        to_anyone = [get_dist(current, pt) for pt in others]
        ratios.append(min(to_same) / min(to_anyone))
    return np.mean(ratios)

def _cell_weighted_tumor_stats(tumor_group, image_stat):
    """Aggregate a per-image statistic to one value per tumor.

    ``image_stat`` is called with each image key of each tumor in
    ``tumor_group`` (keys come from ``tumor_image_dict``). Images for which it
    returns NaN are ignored. The remaining values are averaged with weights
    equal to the number of rows in ``dfs[image]`` (all cells of the image, not
    only those of the genotype of interest). Tumors without any usable image
    contribute nothing, so the result may be shorter than ``tumor_group``.
    """
    tumor_stats=[]
    for tumor in tumor_group:
        indi_points=[]
        cell_nums=[]
        for im in tumor_image_dict[tumor]:
            this_dist=image_stat(im)
            if not np.isnan(this_dist):
                indi_points.append(this_dist)
                cell_nums.append(len(dfs[im]))
        if indi_points:
            weighted_list=list(np.array([indi_points[i]*cell_nums[i] for i in range(len(indi_points))])/sum(cell_nums))
            tumor_stats.append(sum(weighted_list))
    return tumor_stats

def min_rel_dists_same_geno_tumors(tumor_group,geno1):
    """Per-tumor, cell-count-weighted mean of ``min_rel_dists_same_geno`` over images."""
    return _cell_weighted_tumor_stats(
        tumor_group, lambda im: min_rel_dists_same_geno(im,geno1))

def min_dists_same_geno_tumors(tumor_group,geno1):
    """Per-tumor, cell-count-weighted mean of ``min_dists_same_geno`` over images."""
    return _cell_weighted_tumor_stats(
        tumor_group, lambda im: min_dists_same_geno(im,geno1))

def min_rel_dists_diff_geno_tumors(tumor_group,geno1,geno2):
    """Per-tumor, cell-count-weighted mean of ``min_rel_dists_diff_geno`` over images."""
    return _cell_weighted_tumor_stats(
        tumor_group, lambda im: min_rel_dists_diff_geno(im,geno1,geno2))

def min_dists_diff_geno_tumors(tumor_group,geno1,geno2):
    """Per-tumor, cell-count-weighted mean of ``min_dists_diff_geno`` over images."""
    return _cell_weighted_tumor_stats(
        tumor_group, lambda im: min_dists_diff_geno(im,geno1,geno2))



In [None]:
tumor_groups={'Top OR tertile':ORhigh_tumors,'Middle OR tertile':ORmiddle_tumors,'Bottom OR tertile':ORlow_tumors}

# One entry per comparison: (key in dist_dfs, label for the 'Type' column,
# per-tumor aggregator, genotype column(s) passed to it). Driving a single loop
# from this table replaces four copy-pasted loops and no longer reads
# `dist_name` from a leaked loop variable.
_relative_distance_specs=[
    ('EGFR-CDK4 distance','E to C',min_rel_dists_diff_geno_tumors,('EGFR_NOT_CDK4_amp','CDK4_NOT_EGFR_amp')),
    ('EGFR-EGFR distance','E to E',min_rel_dists_same_geno_tumors,('EGFR_NOT_CDK4_amp',)),
    ('CDK4-CDK4 distance','C to C',min_rel_dists_same_geno_tumors,('CDK4_NOT_EGFR_amp',)),
    ('dual-dual distance','EC to EC',min_rel_dists_same_geno_tumors,('EGFR_CDK4_amp',)),
]

dist_dfs={}
for dist_name,type_label,tumor_stat,geno_cols in _relative_distance_specs:
    group_dfs=[]
    for group_name,tumor_group in tumor_groups.items():
        # One value per tumor of this tertile that has at least one usable image.
        vals=tumor_stat(tumor_group,*geno_cols)
        this_dict={
            'Distance':vals,
            'Tertile':[group_name for i in range(len(vals))],
            'Type':[type_label for i in range(len(vals))],
        }
        group_dfs.append(pd.DataFrame.from_dict(this_dict,orient='index').T)
    dist_dfs[dist_name]=pd.concat(group_dfs)
group_dfs=[]
group_dfs=[]


In [None]:
# Stack the per-comparison frames and draw one boxplot per (Type, Tertile) pair.
full_df=pd.concat(list(dist_dfs.values()))
# Only the plotting copy is renamed; the frames in dist_dfs keep the 'Distance' column.
full_df=full_df.rename(columns={'Distance':'Relative Distance'})
# Note: the rcParams changes are global and persist for later figures.
plt.rcParams["font.family"] = "Arial"
plt.rcParams["font.size"] = "28"
fig = plt.figure(figsize=(12,6))
ax=sns.boxplot(x="Type", y="Relative Distance",hue="Tertile", palette=["salmon", 'lightgray',"lightsteelblue"], data=full_df,width=0.6)
#plt.ylim([0.5,1.1])
#plt.xlabel('Cell type to cell type')
plt.xlabel('')
plt.ylabel('\nRelative Distance')
ax.legend_.remove() #comment out to include legend in plot
# Black outlines for boxes and whisker/median lines.
# NOTE(review): whether the boxes are exposed via ax.artists depends on the seaborn/matplotlib version — confirm.
plt.setp(ax.artists, edgecolor = 'black')
plt.setp(ax.lines, color='black')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(1.2)
plt.tight_layout()
# Save before show() so the written PDF contains the drawn figure.
plt.savefig('ave_min_relative_distance_distributions_boxplots.pdf')
plt.show()

In [None]:
# Mann-Whitney U test, top vs bottom OR tertile, for every distance type.
results_MW_dict = {}
for dist_type, this_df in dist_dfs.items():
    dist1 = list(this_df.loc[this_df['Tertile'] == 'Top OR tertile', 'Distance'])
    dist2 = list(this_df.loc[this_df['Tertile'] == 'Bottom OR tertile', 'Distance'])
    results_MW_dict[dist_type] = get_MW_stats(dist1, dist2)
# One row per distance type, columns = test statistic and p-value.
df_MW = pd.DataFrame.from_dict(results_MW_dict, orient='index').rename(columns={0: 'MW_statistic', 1: 'P-value'})

In [None]:
# Display the Mann-Whitney results (one row per distance type).
df_MW

In [None]:
def sil_max(X, kmax=5, random_state=None):
    """Pick the number of k-means clusters with the highest silhouette score.

    Parameters
    ----------
    X : array-like of points, one row per point.
    kmax : largest cluster count to try (default 5); it is additionally capped
        at ``len(X) - 1``.
    random_state : forwarded to ``KMeans``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    int
        The k in ``2..kmax`` with the highest silhouette score. The search
        stops at the first k whose score reaches 1.0, and that k is not
        selected. When no k could be scored below 1.0 (fewer than 3 points, or
        a score of 1.0 already at k=2) the fallback value 2 is returned.

    Notes
    -----
    The previous version only accepted scores above 0 and left ``k_best``
    unassigned otherwise, raising ``UnboundLocalError`` whenever every score
    was <= 0 or the first score was >= 1.0. Whenever the old code returned a
    value, this version returns the same one.
    """
    kmax = min(kmax, len(X)-1)
    best_score = -np.inf
    k_best = 2  # fallback when nothing below could be scored
    for k in range(2, kmax+1):
        kmeans = KMeans(n_clusters = k, random_state = random_state).fit(X)
        sil_score = silhouette_score(X, kmeans.labels_, metric = 'euclidean')
        if sil_score >= 1.0:
            break
        # Strict '>' keeps the smallest k among ties.
        if sil_score > best_score:
            best_score = sil_score
            k_best = k

    return k_best

def clusters_xy_bcss_to_tss(key,geno):
    """Between-cluster share of the total sum of squares (BCSS/TSS) for the
    cells of genotype ``geno`` in image ``key``.

    The points are clustered with k-means using the k chosen by ``sil_max``.
    TSS is the squared distance of all points to their overall mean, and BCSS
    is TSS minus the k-means inertia. Returns ``np.nan`` when there are 3 or
    fewer points.
    """
    points = np.asarray(xy_geno(key,geno))
    if len(points) <= 3:
        return np.nan
    n_clusters = sil_max(points)
    fitted = KMeans(n_clusters = n_clusters).fit(points)
    within = fitted.inertia_

    centroid = np.mean(points,axis=0)
    total = (np.linalg.norm(points-centroid))**2
    between = total - within
    return between/total

def ratio_clusters_xy_bcss_to_tss(key,geno1,geno2):
    """BCSS/TSS of ``geno1`` divided by BCSS/TSS of ``geno2`` in image ``key``;
    ``np.nan`` when either of the two values is missing."""
    numerator = clusters_xy_bcss_to_tss(key,geno1)
    denominator = clusters_xy_bcss_to_tss(key,geno2)

    if pd.isna(numerator) or pd.isna(denominator):
        return np.nan
    return numerator/denominator
    
def mean_clus_all_pts(key):
    """BCSS/TSS for all cells of image ``key``, regardless of genotype.

    Same computation as ``clusters_xy_bcss_to_tss`` but on every (X, Y) row of
    ``dfs[key]``. Returns ``np.nan`` when the image has 3 or fewer cells.
    """
    image_cells = dfs[key]
    points = np.asarray(list(map(list, zip(image_cells['X'], image_cells['Y']))))
    if len(points) <= 3:
        return np.nan
    n_clusters = sil_max(points)
    fitted = KMeans(n_clusters = n_clusters).fit(points)
    within = fitted.inertia_

    centroid = np.mean(points,axis=0)
    total = (np.linalg.norm(points-centroid))**2
    between = total - within
    return between/total

def clusters_xy_bcss_to_tss_tumors(tumor_group,geno1):
    """Per-tumor BCSS/TSS of ``geno1`` cells, averaged over the tumor's images.

    Images whose BCSS/TSS is NaN are skipped; the rest are averaged with
    weights equal to the number of rows in ``dfs[image]``. Tumors with no
    usable image are omitted, so the result can be shorter than ``tumor_group``.
    """
    per_tumor = []
    for tumor in tumor_group:
        ratios = []
        weights = []
        for im in tumor_image_dict[tumor]:
            ratio = clusters_xy_bcss_to_tss(im,geno1)
            if np.isnan(ratio):
                continue
            ratios.append(ratio)
            weights.append(len(dfs[im]))
        if ratios:
            contributions = np.array([r*w for r, w in zip(ratios, weights)])/sum(weights)
            per_tumor.append(sum(list(contributions)))
    return per_tumor

In [None]:

# One entry per genotype class: (key in dist_dfs, label for the 'Type' column,
# genotype flag column). Driving a single loop from this table replaces four
# copy-pasted loops that differed only in these three values.
_bcss_tss_specs=[
    ('EGFR(only) BCSS/TSS','E','EGFR_NOT_CDK4_amp'),
    ('CDK4(only) BCSS/TSS','C','CDK4_NOT_EGFR_amp'),
    ('dual-amp BCSS/TSS','EC','EGFR_CDK4_amp'),
    ('Non-amp BCSS/TSS','NA','Non_amp'),
]

# Note: this rebinds dist_dfs, replacing the relative-distance frames built earlier.
dist_dfs={}
for dist_name,type_label,geno_col in _bcss_tss_specs:
    group_dfs=[]
    for group_name,tumor_group in tumor_groups.items():
        # One value per tumor of this tertile that has at least one usable image.
        vals=clusters_xy_bcss_to_tss_tumors(tumor_group,geno_col)
        this_dict={
            'BCSS/TSS':vals,
            'Tertile':[group_name for i in range(len(vals))],
            'Type':[type_label for i in range(len(vals))],
        }
        group_dfs.append(pd.DataFrame.from_dict(this_dict,orient='index').T)
    dist_dfs[dist_name]=pd.concat(group_dfs)


full_df_cluster=pd.concat(list(dist_dfs.values()))

In [None]:
# Boxplot of per-tumor BCSS/TSS, one box per (Type, Tertile) pair.
# Note: the rcParams changes are global and persist for later figures.
plt.rcParams["font.family"] = "Arial"
plt.rcParams["font.size"] = "28"
fig = plt.figure(figsize=(12,6))
ax=sns.boxplot(x="Type", y="BCSS/TSS",hue="Tertile", palette=["salmon", 'lightgray',"lightsteelblue"], data=full_df_cluster,width=0.6)
#plt.ylim([0.5,1])
#plt.xlabel('Cell type')
plt.xlabel('')
plt.ylabel('\nBCSS/TSS')
ax.legend_.remove() #comment out to include legend in plot
# Black outlines for boxes and whisker/median lines.
# NOTE(review): whether the boxes are exposed via ax.artists depends on the seaborn/matplotlib version — confirm.
plt.setp(ax.artists, edgecolor = 'black')
plt.setp(ax.lines, color='black')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(1.2)
plt.tight_layout()
# Save before show() so the written PDF contains the drawn figure.
plt.savefig('bcss_tss_distributions_boxplots.pdf')
plt.show()

In [None]:
# Mann-Whitney U test, top vs bottom OR tertile, for every BCSS/TSS series.
results_MW_dict = {}
for dist_type, this_df in dist_dfs.items():
    dist1 = list(this_df.loc[this_df['Tertile'] == 'Top OR tertile', 'BCSS/TSS'])
    dist2 = list(this_df.loc[this_df['Tertile'] == 'Bottom OR tertile', 'BCSS/TSS'])
    results_MW_dict[dist_type] = get_MW_stats(dist1, dist2)
# One row per series, columns = test statistic and p-value.
df_MW = pd.DataFrame.from_dict(results_MW_dict, orient='index').rename(columns={0: 'MW_statistic', 1: 'P-value'})

In [None]:
# Display the Mann-Whitney results (one row per BCSS/TSS series).
df_MW