In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import PercentFormatter
import kaplanmeier as km
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
import seaborn as sns
from statannot import add_stat_annotation
from itertools import combinations 

ModuleNotFoundError: No module named 'kaplanmeier'

# Read in TARGET data files

In [10]:
# Load the per-sample mRNA z-score matrix (tab-separated) into a pandas dataframe.
# Source file: 'data_mRNA_median_all_sample_Zscores.txt'
z_scores = pd.read_table('../nbl_target_2018_pub/data_mRNA_median_all_sample_Zscores.txt', sep='\t')

In [11]:
#Read in patient and sample clinical data into pandas dataframes
#on_bad_lines='skip' (pandas >= 1.3) replaces error_bad_lines=False, which was
#deprecated in pandas 1.3 and removed in pandas 2.0. Malformed rows are still
#skipped instead of raising a parser error.
clinical_patient = pd.read_csv('../nbl_target_2018_pub/data_clinical_patient.txt',  sep='\t', on_bad_lines='skip')
clinical_sample = pd.read_csv('../nbl_target_2018_pub/data_clinical_sample.txt',  sep='\t', on_bad_lines='skip')



In [12]:
#Drop the first four data rows of each clinical table (extraneous header rows),
#keeping every column
clinical_patient_trimmed = clinical_patient.iloc[4:]
clinical_sample_trimmed = clinical_sample.iloc[4:]

# Functions for stratifying data

In [13]:
def stratifyNumericalPatientData(stratifyingColumn,threshold):
    """Stratify samples on a numerical column of the patient clinical table.

    Reads the module-level ``clinical_patient_trimmed`` and
    ``clinical_sample_trimmed`` dataframes.

    Inputs: stratifyingColumn - patient-table column to stratify on (its values
                are cast to float)
            threshold - numerical cut-off
    Output: two-element list of 'Sample Identifier' Series:
                [samples of patients with value < threshold,
                 samples of patients with value >= threshold]
            Patients whose value is NaN satisfy neither comparison and so
            contribute to neither group.
    """
    #Cast the stratifying column once and reuse it for both comparisons
    patient_values = clinical_patient_trimmed[stratifyingColumn].astype(float)
    patient_ids = clinical_patient_trimmed['#Patient Identifier']
    below_ids = patient_ids[patient_values < threshold]
    at_or_above_ids = patient_ids[patient_values >= threshold]

    #Map each patient group to the samples belonging to those patients
    sample_owner = clinical_sample_trimmed['#Patient Identifier']
    sample_ids = clinical_sample_trimmed['Sample Identifier']
    return [sample_ids[sample_owner.isin(below_ids)],
            sample_ids[sample_owner.isin(at_or_above_ids)]]




In [14]:
def stratifyCategoricalSampleData(stratifyingColumn,category1,category2):
    """Stratify samples on a categorical column of the sample clinical table.

    Reads the module-level ``clinical_sample_trimmed`` dataframe.

    Inputs: stratifyingColumn - sample-table column to stratify on
            category1, category2 - the two category values to select
    Output: two-element list of 'Sample Identifier' Series, one per category
            (samples whose value equals neither category are left out)
    """
    sample_ids = clinical_sample_trimmed['Sample Identifier']
    column_values = clinical_sample_trimmed[stratifyingColumn]
    #One boolean selection per requested category, in argument order
    return [sample_ids[column_values == wanted] for wanted in (category1, category2)]
    
    
    

In [15]:
def stratifyDataOnGeneExpression(gene,lowerThresh,upperThresh):
    """Stratify samples into low and high expressers of a gene of interest.

    Reads the module-level ``z_scores`` dataframe, whose first two columns are
    gene identifiers and whose remaining columns are samples.

    Inputs: gene - Hugo_Symbol of the gene of interest
            lowerThresh - samples with z-score < lowerThresh form the low group
            upperThresh - samples with z-score > upperThresh form the high group
    Outputs: two-element list [low-expression sample ids, high-expression sample ids]
             (each a pandas Index of z_scores column names)
    """
    #Rows for the gene, turned so that samples become the row index
    gene_rows = z_scores[z_scores['Hugo_Symbol'] == gene]
    per_sample = gene_rows.iloc[:, 2:].T
    #Only the first matching gene row is used for thresholding
    scores = per_sample.iloc[:, 0]

    low_samples = per_sample[scores < lowerThresh].index
    high_samples = per_sample[scores > upperThresh].index
    return [low_samples, high_samples]

In [16]:
def stratifyDataOnGeneExpression2(gene,lowerThresh,upperThresh):
    """Stratify samples into low, intermediate and high expressers of a gene.

    Three-group variant of stratifyDataOnGeneExpression. Reads the module-level
    ``z_scores`` dataframe, whose first two columns are gene identifiers and
    whose remaining columns are samples.

    Inputs: gene - Hugo_Symbol of the gene of interest
            lowerThresh - lower z-score threshold
            upperThresh - upper z-score threshold
    Outputs: three-element list of pandas Index objects (z_scores column names):
                [z < lowerThresh,
                 lowerThresh <= z <= upperThresh,
                 z > upperThresh]
             The intermediate group includes both thresholds, so a sample whose
             z-score equals a threshold is no longer dropped from every group.
             Samples with a NaN z-score still belong to no group.
    """
    #Obtain the z-scores for gene of interest, one row per sample
    gene_z_scores = z_scores.loc[z_scores['Hugo_Symbol']==gene].iloc[:,2:].transpose()
    #Only the first matching gene row is used for thresholding
    scores = gene_z_scores.iloc[:,0]

    lower_expression_samples = gene_z_scores.loc[scores < lowerThresh].index
    #Inclusive on both ends so the three groups cover every non-NaN score
    intermediate_expression_samples = gene_z_scores.loc[(scores >= lowerThresh) & (scores <= upperThresh)].index
    higher_expression_samples = gene_z_scores.loc[scores > upperThresh].index
    return [lower_expression_samples, intermediate_expression_samples, higher_expression_samples]

# Function for comparing expression of a single gene between two groups

In [17]:
def compareExpression(gene,groups,stratifyingLabel,label1,label2):
    """Compare the expression of a gene of interest between two groups.

    Reads the module-level ``z_scores`` dataframe.

    Inputs: gene - Hugo_Symbol of gene of interest
            groups - list whose first two entries are collections of sample
                identifiers (output of the stratifying functions); identifiers
                that are not columns of z_scores are ignored
            stratifyingLabel - name of the stratifying variable (graph titles)
            label1, label2 - names of the two categories (tick labels)
    Outputs: shows a boxplot and a violin plot of the gene's z-scores per group,
             then prints per-group median, mean and a normal-approximation
             95% CI of the mean, followed by an independent t-test result.
             Returns None.
    """
    id_columns = ['Hugo_Symbol','Entrez_Gene_Id']
    labels = (label1, label2)

    #Get the gene's z-scores for the samples of each group
    data = []
    for samples in (groups[0], groups[1]):
        group_columns = id_columns + list(set(samples).intersection(z_scores.columns))
        group_z_scores = z_scores[group_columns]
        gene_rows = group_z_scores.loc[group_z_scores['Hugo_Symbol'] == gene]
        #Only the first matching gene row is used
        data.append(gene_rows.iloc[0,2:].values.astype(float))

    tick_labels = [label + ' (n = ' + str(len(values)) + ')' for label, values in zip(labels, data)]
    plot_title = gene + ' Expression stratified by ' + stratifyingLabel

    #Boxplot comparing expression between groups
    plt.boxplot(data)
    plt.xticks([1,2],labels=tick_labels)
    plt.ylabel('z-scores')
    plt.title(plot_title)
    plt.show()

    #Violin plot comparing expression between groups
    ax = sns.violinplot(data=data)
    ax.set_ylabel('z-scores')
    ax.set_xticklabels(tick_labels)
    ax.set_title(plot_title)
    plt.show()

    #Print median, mean, and 95% CI of z-scores for each group
    #The confidence level is passed positionally: the keyword was 'alpha' in
    #older SciPy and is 'confidence' in newer releases, so a keyword would break
    #on one side or the other
    for label, values in zip(labels, data):
        mean_value = np.mean(values)
        print(label + ": median = " + str(np.median(values))
              + ", mean = " + str(mean_value)
              + ", 95% CI = " + str(st.norm.interval(0.95, loc=mean_value, scale=st.sem(values))))

    #Run independent t-test between group 1 and group 2
    print(st.ttest_ind(data[0], data[1]))
    
    

In [18]:
def compareExpression2(gene,groups,stratifyingLabel,categoryLabels, testType):
    """Compare the expression of a gene of interest between multiple groups.

    Reads the module-level ``z_scores`` dataframe.

    Inputs: gene - Hugo_Symbol of gene of interest
            groups - list of collections of sample identifiers, one per group
                (output of the stratifying functions); identifiers that are not
                columns of z_scores are ignored
            stratifyingLabel - name of the stratifying variable (graph title)
            categoryLabels - list of labels, one per group
            testType - passed as ``test`` to statannot's add_stat_annotation,
                which is applied to every pair of categories
    Outputs: shows an annotated violin plot of the gene's z-scores per group and
             prints per-group median, mean and a normal-approximation 95% CI of
             the mean. Returns None.
    """
    #Find z-scores for specified gene for each group, and store in list
    gene_expression_values = []
    for group in groups:
        group_columns = ['Hugo_Symbol','Entrez_Gene_Id'] + list(set(group).intersection(z_scores.columns))
        group_z_scores = z_scores[group_columns]
        gene_rows = group_z_scores.loc[group_z_scores['Hugo_Symbol']==gene]
        #Only the first matching gene row is used
        gene_expression_values.append(gene_rows.iloc[0,2:].values.astype(float))

    #Update category labels to include sample size
    updated_categoryLabels = [label + ' (n = ' + str(len(gene_expression_values[i])) + ')'
                              for i, label in enumerate(categoryLabels)]

    #Make dataframe of gene expression values: one column per category, padded
    #with NaN down to the length of the largest group
    gene_expression_values_df = pd.DataFrame(index=range(max([len(values) for values in gene_expression_values])))
    for i, label in enumerate(categoryLabels):
        gene_expression_values_df[label] = pd.Series(gene_expression_values[i])

    #Violin plot comparing expression between groups
    ax = sns.violinplot(data=gene_expression_values)
    ax.set_ylabel('z-scores')
    ax.set_xticklabels(updated_categoryLabels)
    ax.set_title(gene + ' Expression stratified by ' + stratifyingLabel)
    ax, test_results = add_stat_annotation(ax, data=gene_expression_values_df,
                    box_pairs=list(combinations(categoryLabels,2)),
                    test=testType, text_format='simple', loc='inside', verbose=2, comparisons_correction=None)
    plt.show()

    #Print median, mean, and 95% CI of z-scores for each group
    #The confidence level is passed positionally: the keyword was 'alpha' in
    #older SciPy and is 'confidence' in newer releases
    for i, values in enumerate(gene_expression_values):
        mean_value = np.mean(values)
        print(categoryLabels[i] + ": median = " + str(np.median(values))
         + ", mean = " + str(mean_value)
         + ", 95% CI = " + str(st.norm.interval(0.95, loc=mean_value, scale=st.sem(values))))
        

    
    
    

# Functions for comparing EFS and OS between two groups

In [19]:
def compareEFS(groups, title, label1, label2):
    """Compare the Event-Free Survival (EFS) time between two groups.

    Reads the module-level ``clinical_sample_trimmed`` and
    ``clinical_patient_trimmed`` dataframes. Only the 'EFS Time' column is
    compared; the event/censoring status is not read by this function.

    Inputs: groups - list whose first two entries are collections of sample
                identifiers (output of the stratifying functions)
            title - title of the graph
            label1, label2 - labels for each group
    Outputs: prints per-group median, mean and a normal-approximation 95% CI of
             the mean EFS time plus an independent t-test result, then shows a
             boxplot of EFS time per group. Returns None.
    """
    labels = (label1, label2)

    #Get EFS Times (in days) of the patients behind each sample group
    efs_by_group = []
    for samples in (groups[0], groups[1]):
        patients = clinical_sample_trimmed.loc[clinical_sample_trimmed['Sample Identifier'].isin(samples)]['#Patient Identifier']
        patient_rows = clinical_patient_trimmed.loc[clinical_patient_trimmed['#Patient Identifier'].isin(patients)]
        efs_by_group.append(patient_rows['EFS Time'].astype(float).dropna())

    #Print median, mean, and 95% CI of EFS for each group
    #The confidence level is passed positionally: the keyword was 'alpha' in
    #older SciPy and is 'confidence' in newer releases
    for label, efs in zip(labels, efs_by_group):
        mean_efs = np.mean(efs)
        print(label + ": median EFS = " + str(np.median(efs))
              + ", mean EFS = " + str(mean_efs) +
             ", 95% CI = " + str(st.norm.interval(0.95, loc=mean_efs, scale=st.sem(efs))))

    #Run independent t-test between group 1 and group 2
    print(st.ttest_ind(efs_by_group[0], efs_by_group[1]))

    #Boxplot comparing EFS Time of each group
    plt.boxplot(efs_by_group)
    plt.xticks([1,2],labels=[label + ' (n = ' + str(len(efs)) + ')' for label, efs in zip(labels, efs_by_group)])
    plt.ylabel('EFS (Days)')
    plt.title(title)
    plt.show()
    

In [67]:
def compareEFS2(groups, title, labels, testType):
    """Compare the Event-Free Survival (EFS) time between multiple groups.

    Multi-group variant of compareEFS. Reads the module-level
    ``clinical_sample_trimmed`` and ``clinical_patient_trimmed`` dataframes.
    Only the 'EFS Time' column is compared; the event/censoring status is not
    read by this function.

    Inputs: groups - list of collections of sample identifiers, one per group
                (output of the stratifying functions)
            title - title of the graph
            labels - list of labels, one per group
            testType - passed as ``test`` to statannot's add_stat_annotation,
                which is applied to every pair of labels
    Outputs: shows an annotated boxplot of EFS time per group and prints
             per-group median, mean and a normal-approximation 95% CI of the
             mean EFS time. Returns None.
    """
    #Get EFS times for each group
    EFS = []
    for group in groups:
        patients = clinical_sample_trimmed.loc[clinical_sample_trimmed['Sample Identifier'].isin(group)]['#Patient Identifier']
        group_EFS = clinical_patient_trimmed.loc[clinical_patient_trimmed['#Patient Identifier'].isin(patients)]['EFS Time'].astype(float).dropna()
        EFS.append(group_EFS)

    #Update labels to include sample size
    updated_labels = [label + ' (n = ' + str(len(EFS[i])) + ')' for i, label in enumerate(labels)]

    #Make dataframe of EFS times: one column per label, padded with NaN down to
    #the length of the largest group
    EFS_df = pd.DataFrame(index=range(max([len(times) for times in EFS])))
    for i, label in enumerate(labels):
        #.values discards the patient-table index so rows are filled from 0
        EFS_df[label] = pd.Series(EFS[i].values)

    #Boxplot comparing EFS time of each group
    ax = sns.boxplot(data=EFS)
    ax.set_ylabel('EFS (Days)')
    ax.set_xticklabels(updated_labels)
    ax.set_title(title)
    ax, test_results = add_stat_annotation(ax, data=EFS_df,
                    box_pairs=list(combinations(labels,2)),
                    test=testType, text_format='simple', loc='inside', verbose=2, comparisons_correction=None)
    plt.show()

    #Print median, mean, and 95% CI of EFS time for each group
    #The confidence level is passed positionally: the keyword was 'alpha' in
    #older SciPy and is 'confidence' in newer releases
    for i, times in enumerate(EFS):
        mean_time = np.mean(times)
        print(labels[i] + ": median = " + str(np.median(times))
         + ", mean = " + str(mean_time)
         + ", 95% CI = " + str(st.norm.interval(0.95, loc=mean_time, scale=st.sem(times))))
        
        

In [122]:
def compareOS(groups, title, label1, label2):
    """Compare the Overall Survival (OS) time between two groups.

    Reads the module-level ``clinical_sample_trimmed`` and
    ``clinical_patient_trimmed`` dataframes. Only the 'Overall Survival Days'
    column is compared; the survival status is not read by this function.

    Inputs: groups - list whose first two entries are collections of sample
                identifiers (output of the stratifying functions)
            title - title of the graph
            label1, label2 - labels for each group
    Outputs: prints per-group median, mean and a normal-approximation 95% CI of
             the mean OS time plus an independent t-test result, then shows a
             boxplot of OS time per group. Returns None.
    """
    labels = (label1, label2)

    #Get OS Times (in days) of the patients behind each sample group
    os_by_group = []
    for samples in (groups[0], groups[1]):
        patients = clinical_sample_trimmed.loc[clinical_sample_trimmed['Sample Identifier'].isin(samples)]['#Patient Identifier']
        patient_rows = clinical_patient_trimmed.loc[clinical_patient_trimmed['#Patient Identifier'].isin(patients)]
        os_by_group.append(patient_rows['Overall Survival Days'].astype(float).dropna())

    #Print median, mean, and 95% CI of OS for each group
    #The confidence level is passed positionally: the keyword was 'alpha' in
    #older SciPy and is 'confidence' in newer releases
    for label, os_times in zip(labels, os_by_group):
        mean_os = np.mean(os_times)
        print(label + ": median OS = " + str(np.median(os_times))
              + ", mean OS = " + str(mean_os) +
             ", 95% CI = " + str(st.norm.interval(0.95, loc=mean_os, scale=st.sem(os_times))))

    #Run independent t-test between group 1 and group 2
    print(st.ttest_ind(os_by_group[0], os_by_group[1]))

    #Boxplot comparing OS Time of each group
    plt.boxplot(os_by_group)
    plt.xticks([1,2],labels=[label + ' (n = ' + str(len(os_times)) + ')' for label, os_times in zip(labels, os_by_group)])
    plt.ylabel('OS (Days)')
    plt.title(title)
    plt.show()

    

# Functions for making Kaplan Meier curves comparing two groups

In [123]:
def kaplanmeierOS(groups, label1, label2):
    """Plot a Kaplan Meier curve based on overall survival of two groups.

    Reads the module-level ``clinical_sample_trimmed`` and
    ``clinical_patient_trimmed`` dataframes.
    This function utilizes the kaplanmeier python package (make sure to cite):
    https://pypi.org/project/kaplanmeier/

    Inputs: groups - list whose first two entries are collections of sample
                identifiers (output of the stratifying functions)
            label1, label2 - string labels for each group
    Output: Kaplan Meier curve (drawn by km.plot). Returns None.

    Survival status is encoded as the position of each status value in the
    sorted list of status values found in the whole patient table, so both
    groups share one coding even when a group contains a single status.
    NOTE(review): confirm that the value meaning "deceased" sorts last (for
    example '0:LIVING' < '1:DECEASED') so that it is the one coded as the event.
    Patients with a missing survival time or a missing status are excluded.
    """
    #One shared, ordered list of status values for the whole cohort
    status_categories = sorted(clinical_patient_trimmed['Overall Survival Status'].dropna().unique())

    def build_group_frame(samples, label):
        #Patients behind the samples of this group
        in_group = clinical_sample_trimmed['Sample Identifier'].isin(samples)
        patients = clinical_sample_trimmed.loc[in_group]['#Patient Identifier']
        patient_rows = clinical_patient_trimmed.loc[clinical_patient_trimmed['#Patient Identifier'].isin(patients)]

        #Categorical codes are -1 for values outside the categories (missing
        #status); turn those into NaN so the dropna() below removes them
        status_codes = pd.Categorical(patient_rows['Overall Survival Status'],
                                      categories=status_categories).codes.astype(float)
        status_codes[status_codes < 0] = np.nan

        #The frame is per patient, so it is not indexed by the sample list:
        #sample and patient counts need not match
        frame = pd.DataFrame()
        frame['OS_time'] = patient_rows['Overall Survival Days'].astype(float).values
        frame['Survival_Status'] = status_codes
        frame['Group'] = label
        return frame

    #Combine dataframes
    combined_df = pd.concat([build_group_frame(groups[0], label1),
                             build_group_frame(groups[1], label2)]).dropna()
    combined_df['Survival_Status'] = combined_df['Survival_Status'].astype(int)

    #Use kaplanmeier Python package to plot Kaplan-Meier survival curve
    out = km.fit(combined_df['OS_time'], combined_df['Survival_Status'], combined_df['Group'])
    km.plot(out)
    
    
    
    

In [124]:
def kaplanmeierEFS(groups, label1, label2):
    """Plot a Kaplan Meier curve based on event-free survival of two groups.

    Reads the module-level ``clinical_sample_trimmed`` and
    ``clinical_patient_trimmed`` dataframes.
    This function utilizes the kaplanmeier python package (make sure to cite):
    https://pypi.org/project/kaplanmeier/

    Inputs: groups - list whose first two entries are collections of sample
                identifiers (output of the stratifying functions)
            label1, label2 - string labels for each group
    Output: Kaplan Meier curve (drawn by km.plot). Returns None.

    A patient is coded 0 when 'First Event' equals 'Censored' and 1 for any
    other value. Rows with a missing EFS time are dropped before fitting.
    """
    def build_group_frame(samples, label):
        #Patients behind the samples of this group
        sample_rows = clinical_sample_trimmed[clinical_sample_trimmed['Sample Identifier'].isin(samples)]
        patient_rows = clinical_patient_trimmed[
            clinical_patient_trimmed['#Patient Identifier'].isin(sample_rows['#Patient Identifier'])]

        #EFS time (in days) and event indicator, one row per patient
        frame = pd.DataFrame()
        frame['EFS_time'] = patient_rows['EFS Time'].astype(float).values
        frame['Event_Status'] = [0 if first_event == 'Censored' else 1
                                 for first_event in patient_rows['First Event']]
        frame['Group'] = label
        return frame

    first_frame = build_group_frame(groups[0], label1)
    second_frame = build_group_frame(groups[1], label2)

    #Stack both groups and drop incomplete rows
    combined_df = pd.concat([first_frame, second_frame]).dropna()

    #Use kaplanmeier Python package to plot Kaplan-Meier survival curve
    out = km.fit(combined_df['EFS_time'], combined_df['Event_Status'], combined_df['Group'])
    km.plot(out)
    
    

In [172]:
def kaplanmeierEFS2(groups, title, labels):
    """Plot Kaplan Meier curves based on event-free survival of multiple groups.

    Multi-group variant of kaplanmeierEFS. Reads the module-level
    ``clinical_sample_trimmed`` and ``clinical_patient_trimmed`` dataframes.
    The curves are fitted with lifelines' KaplanMeierFitter and the pairwise
    comparisons use lifelines' logrank_test.

    Inputs: groups - list of collections of sample identifiers, one per group
                (output of the stratifying functions)
            title - title of the graph
            labels - list of string labels, one per group
    Output: shows one Kaplan Meier curve per group on shared axes and prints a
            log-rank test summary for every pair of labels. Returns None.

    A patient is coded 0 when 'First Event' equals 'Censored' and 1 for any
    other value. Patients with a missing EFS time are left out.
    """
    #Build one dataframe (EFS time, event status, group label) per group
    group_dfs = []
    for position, group in enumerate(groups):
        patients = clinical_sample_trimmed.loc[clinical_sample_trimmed['Sample Identifier'].isin(group)]['#Patient Identifier']
        patient_rows = clinical_patient_trimmed.loc[clinical_patient_trimmed['#Patient Identifier'].isin(patients)]

        #Keep only patients with a known EFS time, and the matching event rows
        times = patient_rows['EFS Time'].astype(float)
        has_time = times.notna()
        efs_times = times[has_time]
        first_events = patient_rows['First Event'][has_time]

        group_df = efs_times.to_frame(name='EFS_time')
        group_df['Event_Status'] = [0 if first_event == 'Censored' else 1 for first_event in first_events]
        group_df['Group'] = labels[position]
        group_dfs.append(group_df)

    #Combine dataframes
    combined_df = pd.concat(group_dfs).dropna()

    #Fit and draw one Kaplan-Meier curve per group
    for position, group_df in enumerate(group_dfs):
        #alpha=1 presumably collapses the confidence band - confirm against lifelines docs
        kmf = KaplanMeierFitter(alpha=1)
        kmf.fit(group_df['EFS_time'], group_df['Event_Status'], label=labels[position])
        ax = kmf.plot()
    ax.set_title(title)
    ax.set_ylabel('EFS proportion')
    ax.set_xlabel('Days')
    plt.show()

    #Log-rank test for every pair of groups
    for first_label, second_label in combinations(labels,2):
        first_rows = combined_df[combined_df['Group']==first_label]
        second_rows = combined_df[combined_df['Group']==second_label]
        results = logrank_test(first_rows['EFS_time'], second_rows['EFS_time'],
                               event_observed_A=first_rows['Event_Status'],
                               event_observed_B=second_rows['Event_Status'])
        print('Log Rank Test between ' + first_label + ' and ' + second_label)
        results.print_summary()
    
    

# Function for making a scatter plot of z-scores between two genes

In [184]:
def geneScatter(gene1, gene2):
    """Scatter the z-scores of two genes against each other across samples.

    Reads the module-level ``z_scores`` dataframe, whose first two columns are
    gene identifiers and whose remaining columns are samples.

    Inputs: gene1 - Hugo_Symbol plotted on the x axis
            gene2 - Hugo_Symbol plotted on the y axis
    Outputs: shows the scatter plot and prints the Pearson correlation
             coefficient between the two genes' z-scores. Returns None.
    """
    def z_by_sample(symbol):
        #Rows for the gene, turned so that samples become the row index
        matches = z_scores[z_scores['Hugo_Symbol'] == symbol]
        return matches.iloc[:, 2:].T

    x_frame = z_by_sample(gene1)
    y_frame = z_by_sample(gene2)

    plt.scatter(x_frame, y_frame)
    plt.xlabel(gene1 + ' z-score')
    plt.ylabel(gene2 + ' z-score')
    plt.show()

    #Correlation uses the first matching row of each gene
    r_value, _p_value = st.pearsonr(x_frame.iloc[:, 0].tolist(), y_frame.iloc[:, 0].tolist())
    print('Pearson correlation coefficient = ' + str(r_value))


# Function for making scatter plot of z-scores of a gene of interest against EFS Time

In [243]:
def geneVSefs_scatter(gene):
    """Scatter the z-scores of a gene of interest against EFS time.

    Reads the module-level ``z_scores``, ``clinical_sample_trimmed`` and
    ``clinical_patient_trimmed`` dataframes. Samples are matched to patients
    through 'Sample Identifier' and '#Patient Identifier'; rows with any missing
    value are dropped.

    Inputs: gene - Hugo_Symbol of the gene of interest
    Outputs: shows a scatter plot of z-score against EFS time, shows a histogram
             of both variables, and prints the Pearson correlation coefficient.
             Returns None.
    """
    #z-scores of the gene, one row per sample, with the sample id as a column
    gene_scores = z_scores.loc[z_scores['Hugo_Symbol']==gene].iloc[:,2:].transpose()
    gene_scores = gene_scores.reset_index()
    gene_scores = gene_scores.rename(columns={'index':'Sample Identifier', gene_scores.columns[1]: gene})

    #Attach the patient of each sample, then that patient's EFS time
    merged = pd.merge(gene_scores, clinical_sample_trimmed[['Sample Identifier','#Patient Identifier']], on='Sample Identifier')
    merged = pd.merge(merged, clinical_patient_trimmed[['#Patient Identifier','EFS Time']], on='#Patient Identifier')
    merged['EFS Time'] = merged['EFS Time'].astype(float)
    merged = merged.dropna()

    #Scatter plot, labelled so the figure can be read on its own
    plt.scatter(merged[gene], merged['EFS Time'])
    plt.xlabel(gene + ' z-score')
    plt.ylabel('EFS Time')
    plt.title(gene + ' z-score vs EFS Time')
    plt.show()

    #Histogram of both variables; shown explicitly so it does not leak into
    #whatever figure is drawn next
    plt.hist([merged[gene], merged['EFS Time']], label=[gene + ' z-score', 'EFS Time'])
    plt.legend()
    plt.show()

    corr,_ = st.pearsonr(merged[gene], merged['EFS Time'])
    print('Pearson correlation coefficient = ' + str(corr))
    