In [654]:
# Input files read by the loader functions below: the mutation table (MAF)
# and the tab-separated expression table.
maf_file_path = 'data/RESPOND_247_coding_final.maf'
expression_file_path = 'data/Expression_remove_BE.txt'
# Gene whose mutation status splits the samples into two groups; it is also
# used as the default output folder name by the functions below.
target_gene='MYO15A'

import pandas as pd
import numpy as np
from statistics import mean 
from scipy.stats import ranksums
import matplotlib.pylab as plt
from statsmodels.stats import multitest


In [655]:
def load_maf_data(file_path):
    """Read the gene and sample columns of a MAF file.

    Lines starting with ``#`` are skipped. ``Hugo_Symbol`` is renamed to
    ``gene`` and ``Tumor_Sample_Barcode`` to ``sample``; a constant
    ``mutation`` column set to 1 marks every row as a mutation.

    Returns a DataFrame with one row per line of the file (rows are not
    de-duplicated).
    """
    column_aliases = {'Hugo_Symbol': 'gene', 'Tumor_Sample_Barcode': 'sample'}
    mutations = pd.read_csv(file_path, sep='\t', comment="#", usecols=list(column_aliases))
    return mutations.rename(columns=column_aliases).assign(mutation=1)
    
# Load the mutation table from the configured MAF file.
maf_df = load_maf_data(maf_file_path)



In [656]:
def log2(value):
    """Return ``log2(value + 1)``.

    The +1 offset keeps zero values finite. Because it is built on a NumPy
    ufunc, it works on scalars, arrays and whole DataFrames alike.
    """
    return np.log2(value + 1)

def load_txt_file_into_dataframe(file_path):
    """Read a tab-separated expression table and return it log2(x + 1)-scaled.

    Parameters
    ----------
    file_path : path to the tab-separated text file.

    Returns
    -------
    DataFrame with the same index and columns as the file, every value
    replaced by ``log2(value + 1)``.
    """
    # Read the .txt file into a pandas DataFrame
    df = pd.read_csv(file_path, sep='\t')  # Adjust the separator if needed

    # Transform the whole frame in one vectorised call rather than mapping a
    # Python function over every cell: same values, much faster on a full
    # expression matrix, and no dependence on DataFrame.map (pandas >= 2.1).
    return log2(df)


# Load the expression table; the loader returns log2(x + 1)-transformed values.
expression_df = load_txt_file_into_dataframe(expression_file_path)

In [657]:
def reformat_expression_data(df):
    """Reshape a wide expression table into long format.

    Parameters
    ----------
    df : DataFrame whose index labels become the ``gene`` column and whose
        column labels become the ``sample`` column.

    Returns
    -------
    DataFrame with columns ``gene``, ``sample`` and ``gene_expression``, one
    row per (index label, column label) pair of the input.
    """
    # Name the index explicitly before resetting it. Relying on the default
    # 'index' column only works while the index is unnamed; a named index
    # would otherwise make the melt fail with a KeyError.
    return (
        df.rename_axis(index='gene')
        .reset_index()
        .melt(id_vars='gene', var_name='sample', value_name='gene_expression')
    )


# Long-format copy of the expression table: one row per (gene, sample) pair.
expression_df_melted = reformat_expression_data(expression_df)

In [658]:
# Note: individuals with mutation data may be missing from the expression data, and vice versa.
# Every sample should have at least one mutation.

In [659]:
def preprocess_and_combine_mutation_expression(maf_df, expression_df):
    ''' Keep the expression rows of samples that appear in the mutation table
     (maf) and attach a 0/1 mutation flag to each of them.

     maf_df: long table with columns 'gene', 'sample' and 'mutation'.
     expression_df: long table with columns 'gene', 'sample' and
         'gene_expression'.

     Prints the fraction of expression rows dropped by the sample filter and
     returns one row per retained expression row, with 'mutation' set to 1
     where the (gene, sample) pair occurs in maf_df and 0 otherwise.
     '''
    merge_keys = ['gene', 'sample']

    exon_seq_samples = maf_df['sample'].unique()
    filtered_expression_df = expression_df[expression_df['sample'].isin(exon_seq_samples)]

    all_rows = expression_df.shape[0]
    filt_rows = filtered_expression_df.shape[0]

    # An empty expression table would otherwise raise ZeroDivisionError.
    fraction_filtered = (all_rows - filt_rows) / all_rows if all_rows else 0.0

    print('fraction of rows filtered is', fraction_filtered)

    # The mutation table can hold several rows for the same (gene, sample)
    # pair. Merging them as-is would repeat the matching expression row once
    # per mutation and skew every downstream statistic.
    unique_mutations = maf_df.drop_duplicates(subset=merge_keys)

    express_mut_genes_df = pd.merge(unique_mutations, filtered_expression_df, on=merge_keys, how='right')

    # Assign the result back: `df[col].fillna(..., inplace=True)` is chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    express_mut_genes_df['mutation'] = express_mut_genes_df['mutation'].fillna(0)

    return express_mut_genes_df


# Join mutation flags onto the long-format expression table.
mutation_expression_df_melted =  preprocess_and_combine_mutation_expression(maf_df, expression_df_melted)
    

fraction of rows filtered is 0.2865853658536585


In [660]:
def calculate_log_fold(list_mutated, list_non_mutated, smoothing_factor=1e-8):
    """Return log2 of the ratio between the two group means.

    ``smoothing_factor`` is added to each mean before dividing so that a
    zero mean in the denominator does not cause a division by zero.

    NOTE(review): in this notebook the inputs are already log2(x + 1)
    values, so this is the log of a ratio of log-scale means, not a
    difference of log means. Confirm that this is the intended definition.
    """
    smoothed_means = [
        mean(values) + smoothing_factor
        for values in (list_mutated, list_non_mutated)
    ]
    ratio = smoothed_means[0] / smoothed_means[1]
    return np.log2(ratio)

In [661]:
def calculate_z_score(list_mutated, list_non_mutated):
    """Return the difference of the group means in units of the pooled std.

    Parameters
    ----------
    list_mutated, list_non_mutated : sequences of expression values (lists,
        NumPy arrays or pandas Series).

    Returns
    -------
    ``(mean(mutated) - mean(non_mutated)) / std(all values)``, where the
    standard deviation is ``np.std`` (population form) over both groups
    pooled together.
    """
    mean_mutated = mean(list_mutated)
    mean_non_mutated = mean(list_non_mutated)
    # Pool with np.concatenate: `a + b` only concatenates plain lists, while
    # for arrays or Series it adds element-wise and yields a wrong std.
    all_expression_values = np.concatenate(
        [np.asarray(list_non_mutated), np.asarray(list_mutated)]
    )
    z_score = (mean_mutated - mean_non_mutated) / np.std(all_expression_values)
    return z_score


In [662]:
# this isn't used for gene expression 
# from scipy.stats import ttest_ind
# def calculate_pvalue_and_effect_size_ttest(list_mutated, list_non_mutated):
#     t_stat, p_value = ttest_ind(list_mutated, list_non_mutated, equal_var=False)
#     return p_value, t_stat


In [663]:
def calculate_pvalue_and_effect_size_wilcox_ranksum(list_mutated, list_non_mutated):
    """Wilcoxon rank-sum test between the two groups, plus a U-based effect size.

    Parameters
    ----------
    list_mutated, list_non_mutated : sequences of expression values.

    Returns
    -------
    (pvalue, effect_size)
        ``pvalue`` is the p-value reported by ``scipy.stats.ranksums``.
        ``effect_size`` is U / (n_mutated * n_non_mutated), where U is the
        Mann-Whitney U statistic of the mutated group. It lies in [0, 1];
        0.5 means no shift, and values above 0.5 mean the mutated group
        tends to rank higher. It is NaN when either group is empty.
    """
    n_mutated = len(list_mutated)
    n_non_mutated = len(list_non_mutated)
    n_pairs = n_mutated * n_non_mutated

    test = ranksums(list_mutated, list_non_mutated)
    pvalue = test.pvalue

    if n_pairs == 0:
        return pvalue, float('nan')

    # ranksums reports the normal-approximation z statistic, not U:
    #   z = (U - n1*n2/2) / sqrt(n1*n2*(n1+n2+1)/12)
    # Invert that relation to recover U before normalising by n1*n2.
    z_scale = np.sqrt(n_pairs * (n_mutated + n_non_mutated + 1) / 12.0)
    u_statistic = test.statistic * z_scale + n_pairs / 2.0
    effect_size = u_statistic / n_pairs
    return pvalue, effect_size

In [664]:
def calculate_adjusted_pvalue(pvalues, method='fdr_bh'):
    """Return multiple-testing corrected p-values.

    Delegates to ``statsmodels.stats.multitest.multipletests`` with the
    given ``method`` and returns only the corrected p-values, i.e. the
    second element of its result.
    """
    correction = multitest.multipletests(pvalues, method=method)
    return correction[1]


In [665]:
import os
def generate_stats_per_gene(express_mut_genes_df, target_gene, output_folder=target_gene):
    """
    Compare expression of every gene between samples with and without a
    mutation in `target_gene`, and write the per-gene statistics to a CSV.

    Parameters
    ----------
    express_mut_genes_df : long table with columns 'gene', 'sample',
        'gene_expression' and 'mutation' (1 = mutated, 0 = not mutated).
    target_gene : gene whose mutation flag splits the samples in two groups.
    output_folder : folder the CSV is written to (created if missing).

    Returns
    -------
    (combined_data, mutated_samples, output_filename)
        combined_data : one row per gene with logFC, pvalue, effect_size,
            the two group means and adjusted_pvalue.
        mutated_samples : Series of the samples mutated in `target_gene`.
        output_filename : path of the CSV that was written.

    Raises
    ------
    ValueError if either group has no samples for `target_gene`.
    """
    gene_df = express_mut_genes_df[express_mut_genes_df['gene'] == target_gene]

    # Keep each sample once so the group sizes used in the output file name
    # are real sample counts even if the input has repeated rows.
    mutated_samples = gene_df.loc[gene_df['mutation'] == 1, 'sample'].drop_duplicates()
    non_mutated_samples = gene_df.loc[gene_df['mutation'] == 0, 'sample'].drop_duplicates()

    # Fail early with a clear message: with an empty group the merge below is
    # empty and the row-wise apply fails with an obscure pandas error.
    if mutated_samples.empty or non_mutated_samples.empty:
        raise ValueError(
            f"cannot compare groups for {target_gene!r}: "
            f"{len(mutated_samples)} mutated and "
            f"{len(non_mutated_samples)} non-mutated samples found"
        )

    def _expression_lists_per_gene(samples, list_column):
        # For every gene, collect the expression values of `samples` in a list.
        subset = express_mut_genes_df[express_mut_genes_df['sample'].isin(samples)]
        return (
            subset.groupby('gene')['gene_expression']
            .apply(list)
            .rename(list_column)
            .reset_index()
        )

    mutated_individuals_data = _expression_lists_per_gene(
        mutated_samples, 'gene_expression_mutated')
    non_mutated_individuals_data = _expression_lists_per_gene(
        non_mutated_samples, 'gene_expression_non_mutated')

    # combine mutated and unmutated data into one df (genes present in both groups)
    combined_data = pd.merge(mutated_individuals_data, non_mutated_individuals_data, on='gene', how='inner')

    # calculate log fold change
    combined_data['logFC'] = combined_data.apply(
        lambda row: calculate_log_fold(row.gene_expression_mutated, row.gene_expression_non_mutated),
        axis=1)
    # calculate wilcox pvalue and effect size
    combined_data[['pvalue', 'effect_size']] = combined_data.apply(
        lambda row: pd.Series(calculate_pvalue_and_effect_size_wilcox_ranksum(
            row.gene_expression_mutated, row.gene_expression_non_mutated)),
        axis=1
    )
    combined_data['expression_mutated_mean'] = combined_data['gene_expression_mutated'].apply(mean)
    combined_data['expression_nonmutated_mean'] = combined_data['gene_expression_non_mutated'].apply(mean)

    # calculate adjusted p-value
    combined_data['adjusted_pvalue'] = calculate_adjusted_pvalue(combined_data['pvalue'].values)

    # The raw value lists are only needed for the statistics above; drop them
    # before writing the CSV.
    combined_data = combined_data.drop(
        columns=['gene_expression_mutated', 'gene_expression_non_mutated'])

    os.makedirs(output_folder, exist_ok=True)
    output_filename = f'{output_folder}/{mutated_samples.count()}_{non_mutated_samples.count()}_logfc_pvalue.csv'

    print(f"outputting data to {output_filename}")
    combined_data.to_csv(output_filename, index=False)

    return combined_data, mutated_samples, output_filename


In [666]:
# Compute the per-gene statistics for the configured target gene; this also
# writes them to a CSV whose path is returned as volcano_input_filename.
volcano_plot_df, individuals_mutated_target_gene, volcano_input_filename = generate_stats_per_gene(
    mutation_expression_df_melted, 
    target_gene)

outputting data to MYO15A/19_101_logfc_pvalue.csv


## generate heatmap of top 100 differentially expressed genes

In [667]:
def generate_expression_heatmap(expression_df, volcano_plot_df, n=100, exclude_value=10):
    """Select the expression rows of the n genes with the smallest p-values.

    Parameters
    ----------
    expression_df : expression table indexed by gene.
    volcano_plot_df : per-gene statistics with 'gene' and 'pvalue' columns.
    n : number of genes to keep.
    exclude_value : not used by the current implementation; kept so that
        existing calls keep working.

    Returns
    -------
    The rows of `expression_df` for the selected genes, ordered by
    ascending p-value, with the index named 'gene'.
    """
    # Genes are ranked by raw p-value, not by absolute logFC.
    best_rows = volcano_plot_df['pvalue'].nsmallest(n).index
    selected_genes = pd.Index(volcano_plot_df.loc[best_rows, 'gene'], name='gene')

    # go back to the expression df and pull the rows of those genes
    return expression_df.loc[selected_genes]

# Expression rows of the genes with the smallest p-values (default n=100).
heatmap_data = generate_expression_heatmap(expression_df, volcano_plot_df)


In [668]:
import pandas as pd

def get_mutated_status(expression_df_heatmap, individuals_mutated_target_gene, output_folder=target_gene):
    """Flag each heatmap column as mutated (1) or not (0) and save the table.

    A column of `expression_df_heatmap` gets status 1 when its label is in
    `individuals_mutated_target_gene`, otherwise 0. The resulting table
    (columns 'Sample' and 'Mutation Status') is written to
    ``<output_folder>/sample_mutation_status.csv`` and returned.
    """
    mutated_ids = list(individuals_mutated_target_gene)
    sample_ids = expression_df_heatmap.columns

    status_flags = []
    for sample_id in sample_ids:
        status_flags.append(1 if sample_id in mutated_ids else 0)

    sample_categories_df = pd.DataFrame({
        'Sample': sample_ids,
        'Mutation Status': status_flags,
    })

    output_filename = f'{output_folder}/sample_mutation_status.csv'
    print(output_filename)
    sample_categories_df.to_csv(output_filename, index=False)
    return sample_categories_df

# Per-sample 0/1 mutation status for the heatmap columns (also saved as CSV).
mutated_status_df = get_mutated_status(heatmap_data, individuals_mutated_target_gene)


MYO15A/sample_mutation_status.csv


In [669]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

def hierarchical_clustering(expression_df_heatmap, row_threshold=7, col_threshold=7, output_folder=target_gene):
    """Ward-cluster the rows and columns and write the cluster memberships.

    `row_threshold` and `col_threshold` are passed to ``fcluster`` as ``t``
    with ``criterion='maxclust'``, i.e. they are the maximum number of flat
    clusters, not a distance cut-off.

    Writes ``row_clusters.txt`` and ``col_clusters.txt`` into
    `output_folder` and returns
    ``(row_linkage, col_linkage, row_cluster_info, col_cluster_info)``,
    where each ``*_cluster_info`` maps ``"Cluster <k>"`` to the labels
    assigned to that cluster.
    """
    def _labels_by_cluster(labels, assignments):
        # Map "Cluster <k>" -> labels whose assignment equals k.
        return {
            f"Cluster {cluster}": labels[assignments == cluster]
            for cluster in np.unique(assignments)
        }

    def _write_cluster_file(filename, cluster_info):
        # One block per cluster: header line, comma-separated members, blank line.
        target_path = os.path.join(output_folder, filename)
        with open(target_path, 'w') as file:
            for cluster, items in cluster_info.items():
                file.write(f"{cluster}:\n")
                file.write(f"{', '.join(items)}\n\n")

    # Linkage over the rows, then over the columns (transposed matrix).
    row_linkage = linkage(expression_df_heatmap, method='ward')
    col_linkage = linkage(expression_df_heatmap.T, method='ward')

    row_assignments = fcluster(row_linkage, t=row_threshold, criterion='maxclust')
    col_assignments = fcluster(col_linkage, t=col_threshold, criterion='maxclust')

    row_cluster_info = _labels_by_cluster(expression_df_heatmap.index, row_assignments)
    col_cluster_info = _labels_by_cluster(expression_df_heatmap.columns, col_assignments)

    _write_cluster_file('row_clusters.txt', row_cluster_info)
    _write_cluster_file('col_clusters.txt', col_cluster_info)

    return row_linkage, col_linkage, row_cluster_info, col_cluster_info


# Cluster the heatmap rows and columns; memberships are also written to text files.
row_linkage, col_linkage, row_clusters, col_clusters = hierarchical_clustering(heatmap_data)


In [670]:
import seaborn as sns
import matplotlib.pyplot as plt

def create_clustered_heatmap_and_save(expression_df_heatmap, 
                                      row_linkage, 
                                      col_linkage, 
                                      sample_mutation_df, 
                                      output_folder=target_gene):
    """Draw a clustered heatmap with a mutation-status colour bar and save it.

    The heatmap uses the precomputed `row_linkage` and `col_linkage`, is
    z-scored along axis 0 (``z_score=0``) and is annotated with one colour
    per column: red for 'Mutation Status' 1, yellow for 0. The figure is
    saved to ``<output_folder>/heatmap.png``.

    Returns the seaborn cluster grid object.
    """
    # Colour for each sample, looked up from its mutation status
    status_to_color = {1: 'red', 0: 'yellow'}
    status_by_sample = sample_mutation_df.set_index('Sample')['Mutation Status']
    column_colors = status_by_sample.map(status_to_color)

    sns.set(font_scale=1.0)

    clustermap_options = dict(
        row_linkage=row_linkage,
        col_linkage=col_linkage,
        cmap='viridis',
        annot=False,
        fmt=".1f",
        linewidths=.5,
        col_colors=column_colors,
        z_score=0,
        cbar_kws={"shrink": 0.7, "aspect": 30},  # Adjust color bar size
    )
    clustered_df = sns.clustermap(expression_df_heatmap, **clustermap_options)

    # Figure height scales with the number of rows (0.2 inch per row)
    figure_height = len(expression_df_heatmap) * 0.2
    clustered_df.fig.set_size_inches(15, figure_height)

    output_filename = f'{output_folder}/heatmap.png'

    # Keep the row labels horizontal
    heatmap_axis = clustered_df.ax_heatmap
    heatmap_axis.set_yticklabels(heatmap_axis.get_yticklabels(), rotation=0)

    clustered_df.savefig(output_filename, bbox_inches='tight')

    # Close the plot to prevent displaying it in the notebook (optional)
    plt.close()

    return clustered_df

# Render and save the clustered heatmap, reusing the linkages computed above.
clustered_heatmap = create_clustered_heatmap_and_save(heatmap_data, 
                                                      row_linkage, 
                                                      col_linkage,
                                                      mutated_status_df)


In [671]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

## add more args to the function 

def plot_and_save_dendrograms(row_linkage, col_linkage, expression_df_heatmap, output_folder=target_gene):
    """Plot truncated row and column dendrograms and save them as PNG files.

    Parameters
    ----------
    row_linkage, col_linkage : linkage matrices for the rows and columns of
        `expression_df_heatmap`.
    expression_df_heatmap : table whose index and column labels are used as
        the leaf labels.
    output_folder : folder that receives ``row_dendrogram.png`` and
        ``col_dendrogram.png``.

    Returns
    -------
    (gene_plot, sample_plot) : the values returned by
    ``scipy.cluster.hierarchy.dendrogram`` for the row and column plots.
    """
    def _plot_truncated_dendrogram(linkage_matrix, labels, title, filename, **dendrogram_kwargs):
        # Draw one dendrogram on a fresh figure, save it and close the figure.
        plt.figure(figsize=(12, 10))
        plot = dendrogram(linkage_matrix,
                          labels=labels,
                          truncate_mode='lastp',
                          # Truncation level is derived from the linkage being
                          # drawn, so each dendrogram is sized by its own tree.
                          p=int(0.75 * len(linkage_matrix)),
                          show_leaf_counts=True,
                          **dendrogram_kwargs)
        plt.title(title)
        plt.savefig(f'{output_folder}/{filename}')
        plt.close()
        return plot

    gene_plot = _plot_truncated_dendrogram(
        row_linkage, expression_df_heatmap.index,
        'Row Dendrogram', 'row_dendrogram.png')

    # The original computed p for this plot from len(row_linkage); it is now
    # computed from col_linkage, the tree actually being drawn.
    sample_plot = _plot_truncated_dendrogram(
        col_linkage, expression_df_heatmap.columns,
        'Column Dendrogram', 'col_dendrogram.png',
        orientation='top')

    return gene_plot, sample_plot

# Save stand-alone dendrograms for the heatmap rows and columns.
row_dendrogram, col_dendrogram = plot_and_save_dendrograms(row_linkage, 
                                                           col_linkage, 
                                                           heatmap_data)


In [672]:
def histogram_of_column_and_save(df, column, output_folder=target_gene):
    """Save a 30-bin histogram of ``df[column]``.

    The figure is written to ``<output_folder>/histogram_<column>.png``.
    The x-axis label is always 'P-values', whatever column is plotted.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df[column], bins=30, color='blue', edgecolor='black')
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel('P-values')
    ax.set_ylabel('Frequency')

    # Save the histogram plot to a file, then release the figure
    output_filename = f'{output_folder}/histogram_{column}.png'
    fig.savefig(output_filename)
    plt.close(fig)

# Distribution of the raw (unadjusted) p-values across genes.
histogram_of_column_and_save(volcano_plot_df, 'pvalue')


In [673]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def volcano_plot(input_file_path, yaxis, 
                 xaxis, 
                 significance_threshold=0.05, 
                 logfold_positive_threshold=2, 
                 logfold_negative_threshold=-2,
                 output_folder=target_gene):
    """Draw a volcano plot from a per-gene statistics CSV and save the hits.

    Parameters
    ----------
    input_file_path : CSV with at least a 'logFC' column and the `yaxis`
        and `xaxis` columns.
    yaxis : name of the p-value column; plotted as -log10.
    xaxis : name of the column plotted on the x-axis.
    significance_threshold : genes with ``df[yaxis]`` below this are
        candidates for highlighting.
    logfold_positive_threshold, logfold_negative_threshold : cut-offs
        applied to the 'logFC' column (whatever `xaxis` is) to select the
        highlighted up and down genes.
    output_folder : folder that receives ``volcano_plot.png``,
        ``signif_genes_positive.csv`` and ``signif_genes_negative.csv``.

    Returns
    -------
    (significant_genes_positive, significant_genes_negative) : the selected
    rows, with their original (unclipped) logFC values.
    """
    df = pd.read_csv(input_file_path)

    # Apply -log10 transformation to the p-value
    df['-log10_pvalue'] = -np.log10(df[yaxis])

    # Select significant genes with large log-fold changes
    is_significant = df[yaxis] < significance_threshold
    positive_mask = is_significant & (df['logFC'] > logfold_positive_threshold)
    negative_mask = is_significant & (df['logFC'] < logfold_negative_threshold)
    significant_genes_positive = df[positive_mask]
    significant_genes_negative = df[negative_mask]

    # Clip 'logFC' outliers on a plotting copy only. All three scatter layers
    # are drawn from this copy so a highlighted marker always sits on top of
    # its own base point, while the returned and saved tables keep raw values.
    plot_df = df.assign(logFC=df['logFC'].clip(lower=-10, upper=10))

    # Use an explicit new figure instead of whatever figure is current.
    fig, ax = plt.subplots()

    ax.scatter(x=plot_df[xaxis], y=plot_df['-log10_pvalue'], s=1, label='All Genes', alpha=0.5)
    ax.scatter(x=plot_df.loc[positive_mask, xaxis], y=plot_df.loc[positive_mask, '-log10_pvalue'],
               s=10, c='red', marker='^', label='Significant Genes (Positive LogFC)')
    ax.scatter(x=plot_df.loc[negative_mask, xaxis], y=plot_df.loc[negative_mask, '-log10_pvalue'],
               s=10, c='blue', marker='v', label='Significant Genes (Negative LogFC)')

    ax.set_xlabel(xaxis)
    ax.set_ylabel(f'-log10({yaxis})')
    ax.set_title('Volcano Plot')
    ax.axhline(-np.log10(significance_threshold), color='gray', linestyle='--', label=f'Significance Threshold ({significance_threshold})')
    ax.axvline(logfold_positive_threshold, color='gray', linestyle='--', label=f'Positive Log-Fold Change Threshold ({logfold_positive_threshold})')
    ax.axvline(logfold_negative_threshold, color='gray', linestyle='--', label=f'Negative Log-Fold Change Threshold ({logfold_negative_threshold})')

    # Create a separate legend outside the plot
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))

    output_filename = f'{output_folder}/volcano_plot.png'
    fig.savefig(output_filename, bbox_inches='tight')
    plt.close(fig)
    significant_genes_positive.to_csv(f'{output_folder}/signif_genes_positive.csv')
    significant_genes_negative.to_csv(f'{output_folder}/signif_genes_negative.csv')

    return significant_genes_positive, significant_genes_negative

# Volcano plot of raw p-value against logFC, read back from the CSV written above.
significant_genes_positive, significant_genes_negative  = volcano_plot(volcano_input_filename, 'pvalue', 'logFC')


In [674]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def create_gene_expression_boxplot(expression_df, significant_genes_df, mutated_status_df, output_folder=target_gene, positive=1):
    """Save box plots of expression per gene, split by mutation status.

    Rows of `expression_df` are selected with the ``gene`` column of
    `significant_genes_df`, reshaped to one row per (sample, gene) and
    inner-joined with `mutated_status_df` on 'Sample'. The figure is saved
    as ``positive_genes_expression_boxplot.png`` when ``positive == 1`` and
    as ``negative_genes_expression_boxplot.png`` otherwise, inside
    `output_folder`.
    """
    # Long table of the selected genes: one row per (sample, gene)
    long_expression = (
        expression_df.loc[significant_genes_df.gene]
        .unstack()
        .reset_index()
        .set_axis(['Sample', 'Gene', 'Expression'], axis=1)
    )

    # Attach the mutation status; columns are relabelled by position
    boxplot_df = pd.merge(
        long_expression, mutated_status_df, how='inner', on='Sample'
    ).set_axis(['Sample', 'Gene', 'Expression', 'Target Gene Mutation Status'], axis=1)

    plt.figure(figsize=(12, 8))

    # One box per gene and mutation status, drawn side by side
    sns.boxplot(x='Gene', y='Expression', data=boxplot_df, hue='Target Gene Mutation Status', dodge=True)

    plt.xlabel('Gene')
    plt.ylabel('Gene Expression')
    plt.title('Box Plots of Gene Expression for Mutated and Non-Mutated Individuals')

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    output_filename = (
        f'{output_folder}/positive_genes_expression_boxplot.png'
        if positive == 1
        else f'{output_folder}/negative_genes_expression_boxplot.png'
    )
    plt.savefig(output_filename)
    plt.close()

# One box-plot figure for the positive-logFC hits and one for the negative-logFC hits.
create_gene_expression_boxplot(expression_df, significant_genes_positive, mutated_status_df, positive=1)
create_gene_expression_boxplot(expression_df, significant_genes_negative, mutated_status_df, positive=0)


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
