Survival analysis for TCGA data (including code for Fig. 5E)

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats  as stats
pd.set_option('display.max_columns', None)
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines import CoxPHFitter


In [None]:
# Load the CD163 mRNA expression table (tab-separated) and discard rows that
# contain any missing value.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` as a
# positional argument to DataFrame.drop (the old `drop([...], 1)` raises
# TypeError there).
CD163_dat = (
    pd.read_csv('CD163__mRNA_expression_(U133_microarray_only).txt', sep='\t')
    .drop(columns=['Study ID'])
    .dropna()
)
# De-duplicated patient IDs that have a CD163 value; set() makes the order arbitrary.
included_patients = list(set(CD163_dat['Patient ID']))

In [None]:
# Read the clinical data table (tab-separated).
clinical_dat = pd.read_csv(
    'gbm_tcga_clinical_data.tsv',
    sep='\t',
)

In [None]:
# Keep only patients that have CD163 data, and only the columns listed here.
clinical_dat = clinical_dat.loc[
    clinical_dat['Patient ID'].isin(included_patients),
    [
        'Patient ID',
        'Diagnosis Age',
        'Sex',
        'Disease Free (Months)',
        'Disease Free Status',
        'Overall Survival (Months)',
        'Overall Survival Status',
    ],
]

In [None]:
# Read the clinical table that lists only the IDH-mutant cases (tab-separated).
IDH_mut_tumor_dat = pd.read_csv(
    'gbm_tcga_clinical_data_only_IDH_mut.tsv',
    sep='\t',
)

In [None]:
# Patient IDs of the IDH-mutant cases, as a plain Python list.
IDH_mut_tumors = IDH_mut_tumor_dat['Patient ID'].tolist()

In [None]:
# Sanity check: number of rows currently in the clinical table.
len(clinical_dat)

In [None]:
# Sanity check: number of CD163 expression rows left after dropna().
len(CD163_dat)

In [None]:
# Sanity check: number of distinct patient IDs that have CD163 data.
len(included_patients)

In [None]:
# Cross-check: distinct patient IDs counted directly from the expression table;
# included_patients was built from this same column via set().
len(CD163_dat['Patient ID'].unique())

In [None]:
# Build the expression table used for merging: shorten the expression column
# name to 'CD163', remove 'Sample ID', and renumber the rows from 0.
# `columns=` replaces the positional `axis` argument, which pandas >= 2.0
# rejects; the previous set_index/reset_index round trip is replaced by a
# direct equivalent.
mRNA_dat = (
    CD163_dat
    .rename(columns={'CD163: mRNA expression (U133 microarray only)': 'CD163'})
    .drop(columns=['Sample ID'])
    .reset_index(drop=True)
)
# Move 'Patient ID' to the last column so the column order matches what the
# earlier implementation produced.
mRNA_dat['Patient ID'] = mRNA_dat.pop('Patient ID')

In [None]:
# Load the overall-survival table (tab-separated) without its 'Study ID' column.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` as a
# positional argument to DataFrame.drop.
survival_dat = pd.read_csv(
    'KM_Plot__Overall_Survival__(months).txt', sep='\t'
).drop(columns=['Study ID'])

In [None]:
# Encode sex as a 0/1 indicator column ('Male' == 1) and drop the text column.
# NOTE(review): a missing 'Sex' value compares unequal to 'Male' and is therefore
# coded 0 rather than NaN, so the later dropna() will not remove such rows.
clinical_dat['Male'] = 1 * (clinical_dat['Sex'] == 'Male')
# `columns=` replaces the positional `axis` argument, which pandas >= 2.0 rejects.
clinical_dat = clinical_dat.drop(columns=['Sex'])

In [None]:
# Inner join of clinical rows and CD163 expression rows on the patient identifier.
combined_df = pd.merge(
    clinical_dat,
    mRNA_dat,
    how='inner',
    on='Patient ID',
)

In [None]:
# Sanity check: number of patient IDs listed in the IDH-mutant table.
len(IDH_mut_tumors)

In [None]:
# Patient IDs present in the merged table that are NOT in the IDH-mutant list.
included_tumors = list(
    set(combined_df['Patient ID']).difference(IDH_mut_tumors)
)

In [None]:
# Sanity check: number of distinct patients kept after excluding IDH-mutant IDs.
len(included_tumors)

In [None]:
# Sanity check: current row count of the merged clinical + expression table.
len(combined_df)

In [None]:
# Drop every row whose patient is not in the included (non-IDH-mutant) set.
combined_df = combined_df.loc[
    combined_df['Patient ID'].isin(included_tumors)
]

In [None]:
# Switch to the column names used by the survival helpers and turn the
# overall-survival status strings into 1 (deceased) / 0 (living).
combined_df = (
    combined_df
    .rename(columns={
        'Patient ID': 'case_id',
        'Overall Survival Status': 'status',
        'Overall Survival (Months)': 'last_observation',
    })
    .replace({'1:DECEASED': 1, '0:LIVING': 0})
)

In [None]:
# Keep only the analysis columns and shorten 'Diagnosis Age' to 'Age'.
combined_df = combined_df[
    ['case_id', 'Diagnosis Age', 'last_observation', 'status', 'Male', 'CD163']
].rename(columns={'Diagnosis Age': 'Age'})

In [None]:
# Remove every row that has a missing value in any of the remaining columns.
combined_df = combined_df.dropna(axis=0, how='any')

In [None]:
# Sanity check: number of complete rows available for the survival analysis.
len(combined_df)

In [None]:
# Independent copy of the cleaned table; the survival helpers read this global.
df_combined = combined_df.copy(deep=True)
# Shared Kaplan-Meier fitter, refitted for each curve that gets plotted.
kmf = KaplanMeierFitter()

In [None]:
# Shared Cox proportional-hazards fitter; get_survival_stats_sets() refits it on each call.
cph = CoxPHFitter()
def clean_cph_table(this_df):
    """Return only the hazard-ratio columns of a Cox summary table.

    Keeps, in this order: 'exp(coef)', its lower and upper 95% bounds, and 'p'.
    Rows and index are left untouched.
    """
    wanted_columns = [
        'exp(coef)',
        'exp(coef) lower 95%',
        'exp(coef) upper 95%',
        'p',
    ]
    return this_df[wanted_columns]
def get_survival_stats_sets(cur_set1,cur_set2):
    """Fit a Cox model comparing two sets of cases, adjusted for sex and age.

    Parameters
    ----------
    cur_set1 : iterable
        case_id values coded as ``in_group == 1``.
    cur_set2 : iterable
        case_id values coded as ``in_group == 0``. A case listed in both sets
        is coded 1.

    Returns
    -------
    DataFrame
        ``clean_cph_table(cph.summary)`` for the model
        ``in_group + Male + Age`` fitted on the selected rows of ``df_combined``.

    Notes
    -----
    Reads the module-level ``df_combined`` and refits the shared ``cph`` fitter.
    """
    # list(...) lets callers pass tuples or sets, not only two lists.
    selected_ids = list(cur_set1) + list(cur_set2)
    # Work on an explicit copy: adding a column to a filtered slice of
    # df_combined triggers pandas' SettingWithCopyWarning.
    this_df = df_combined[df_combined['case_id'].isin(selected_ids)].copy()
    this_df['in_group'] = 1 * (this_df['case_id'].isin(cur_set1))
    cph.fit(this_df, duration_col='last_observation', event_col='status', formula="in_group+Male+Age")
    return clean_cph_table(cph.summary)
def format_stats(tab):
    """Format the 'in_group' row of a Cox summary table as a plot annotation.

    Produces two lines: the hazard ratio with its 95% interval (two decimals),
    then an indented p-value (three decimals).
    """
    hazard_ratio = tab.loc['in_group', 'exp(coef)']
    ci_low = tab.loc['in_group', 'exp(coef) lower 95%']
    ci_high = tab.loc['in_group', 'exp(coef) upper 95%']
    p_value = tab.loc['in_group', 'p']
    return 'Cox HR = %.2f [%.2f,%.2f],\n                 p = %.3f' % (
        hazard_ratio,
        ci_low,
        ci_high,
        p_value,
    )

def show_survival_curves(tumors1,name1,color1,tumors2,name2,color2,cox_stats,fname):
    """Plot Kaplan-Meier curves for two groups of cases and annotate the statistics.

    Parameters
    ----------
    tumors1, tumors2 : list
        case_id values of the first and second group; rows are taken from the
        module-level ``df_combined``.
    name1, name2 : str
        Legend labels for the two groups.
    color1, color2 : str
        Colors passed to the Kaplan-Meier plots of the two groups.
    cox_stats : str
        Pre-formatted Cox statistics, printed under the log-rank p-value.
    fname : str
        Output file name. Currently unused because saving is disabled below.

    Notes
    -----
    Uses the module-level ``kmf`` fitter, sets global matplotlib rcParams
    (font size and family), and shows the figure.
    """
    # Rows with missing values are removed before the log-rank test and KM fits.
    group1 = df_combined[df_combined['case_id'].isin(tumors1)].dropna()
    group2 = df_combined[df_combined['case_id'].isin(tumors2)].dropna()

    fig, ax = plt.subplots(figsize=(12, 10))

    logrank_result = logrank_test(
        group1['last_observation'],
        group2['last_observation'],
        event_observed_A=group1['status'],
        event_observed_B=group2['status'],
    )
    logrank_p = logrank_result.p_value

    plt.rcParams["font.size"] = 30
    plt.rcParams["font.family"] = 'Arial'
    kmf.fit(group1['last_observation'],group1['status'],label=name1).plot_survival_function(ax=ax,color=color1)
    kmf.fit(group2['last_observation'],group2['status'],label=name2).plot_survival_function(ax=ax,color=color2)
    # Durations are taken from the 'Overall Survival (Months)' column, so the
    # time axis is in months (it was previously mislabelled 'Days').
    plt.xlabel('Months',fontsize=30)
    plt.ylabel('Proportion Survived',fontsize=30)
    plt.legend()
    plt.title('Effect of CD163 mRNA expression on survival',fontsize=30)
    plt.text(48,0.6,'Log-rank p ='+'{:.3f}'.format(logrank_p)+'\n'+cox_stats,fontsize=28)
    plt.setp(ax.artists, edgecolor = 'black')
    # NOTE(review): this recolors every line on the axes black after the legend
    # was created, overriding color1/color2 for the lines — confirm this is intended.
    plt.setp(ax.lines, color='black')
    for axis in ['top','bottom','left','right']:
        ax.spines[axis].set_linewidth(1.2)
    plt.tight_layout()
    # Saving is intentionally disabled; re-enable to write the figure to `fname`.
    #plt.savefig(fname)
    plt.show()

In [None]:
# Lower / upper quantile cut-offs for the low- and high-expression groups;
# `ind` selects which pair is used. The last pair sits just below / just above
# 1/2, i.e. an (approximate) median split.
q_los=[1/3,1/4,1/5,1/2-0.001]
q_his=[2/3,3/4,4/5,1/2+0.001]
# Legend fraction and file-name tag for each split, index-aligned with
# q_los / q_his, so the labels stay correct when `ind` is changed (they were
# previously hard-coded to the tertile split).
split_fractions=['1/3','1/4','1/5','1/2']
split_tags=['tertiles','quartiles','quintiles','halves']
ind=0
q_lo=q_los[ind]
q_hi=q_his[ind]
prot='CD163'

quants=df_combined[prot].quantile([q_lo,q_hi])
lo_thresh=quants[q_lo]
hi_thresh=quants[q_hi]
# Low group: strictly below the lower cut-off; high group: at or above the upper one.
lo_CD163_tumors=df_combined.loc[df_combined[prot]<lo_thresh,'case_id'].tolist()
hi_CD163_tumors=df_combined.loc[df_combined[prot]>=hi_thresh,'case_id'].tolist()
# The high-expression group is passed first, so it is the one coded in_group = 1
# in the Cox model.
tab=get_survival_stats_sets(hi_CD163_tumors,lo_CD163_tumors)
c_stats=format_stats(tab)
t1=hi_CD163_tumors
n1='Expression in top '+split_fractions[ind]
c1='green'
t2=lo_CD163_tumors
n2='Expression in bottom '+split_fractions[ind]
c2='black'
show_survival_curves(t1,n1,c1,t2,n2,c2,c_stats,'TCGA_top_bot_'+split_tags[ind]+'_exclude_IDH_mut.pdf')