In [82]:
maf_file_path = 'data/RESPOND_247_coding_final.maf'
expression_file_path = 'data/Expression_remove_BE.txt'
target_gene='SPOP'

import pandas as pd
import numpy as np
from statistics import mean 
from scipy.stats import ranksums
import matplotlib.pylab as plt
from statsmodels.stats import multitest


In [83]:
def load_maf_data(file_path):
    """Read the gene and sample columns of a MAF file.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated MAF file; lines starting with ``#`` are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per record in the file with columns ``gene`` (from
        ``Hugo_Symbol``), ``sample`` (from ``Tumor_Sample_Barcode``) and a
        constant ``mutation`` flag set to 1.
    """
    # MAF column name -> name used throughout the notebook
    column_aliases = {'Hugo_Symbol': 'gene', 'Tumor_Sample_Barcode': 'sample'}
    mutations = pd.read_csv(
        file_path,
        sep='\t',
        comment="#",
        usecols=list(column_aliases),
    )
    # Every record in the file is a mutation, hence the constant flag.
    return mutations.rename(columns=column_aliases).assign(mutation=1)
    
maf_df = load_maf_data(maf_file_path)



In [84]:
def log2(value):
    """Return ``log2(value + 1)``; the +1 pseudo-count maps 0 to 0."""
    shifted = value + 1
    return np.log2(shifted)

def load_txt_file_into_dataframe(file_path):
    """Load a tab-separated expression table and log2-scale it.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated text file of numeric expression values.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as the file, every value replaced by
        ``log2(value + 1)``.

    Raises
    ------
    TypeError
        If the table contains non-numeric cells (``+ 1`` is not defined for
        them).
    """
    # Read the .txt file into a pandas DataFrame
    raw_expression = pd.read_csv(file_path, sep='\t')  # Adjust the separator if needed

    # Apply log2(x + 1) to the whole table in one vectorised call instead of
    # one Python-level function call per cell; this also avoids depending on
    # DataFrame.map, which only exists in recent pandas releases.
    return np.log2(raw_expression + 1)


# Call the load_txt_file_into_dataframe function
expression_df = load_txt_file_into_dataframe(expression_file_path)

In [85]:
def reformat_expression_data(df):
    """Convert a wide gene-by-sample table into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Expression values with genes on the index and one column per sample.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, sample) pair with columns ``gene``, ``sample`` and
        ``gene_expression``. The input frame is not modified.
    """
    # Name the index explicitly before resetting it. reset_index() only
    # produces a column called 'index' when the index is unnamed, so relying
    # on that name raises KeyError as soon as the index already has a name.
    wide = df.rename_axis('gene').reset_index()

    return pd.melt(
        wide,
        id_vars=['gene'],
        var_name='sample',
        value_name='gene_expression',
        col_level=0,
    )


expression_df_melted = reformat_expression_data(expression_df)

In [86]:
# Note: individuals with mutation data may be missing from the expression data, and vice versa.
# Every sample should have at least one mutation.

In [87]:
def preprocess_and_combine_mutation_expression(maf_df, expression_df):
    """Join long-format expression data with mutation calls.

    Expression rows are first restricted to samples that appear in the
    mutation table, then right-joined with it on ``gene`` and ``sample`` so
    that every retained expression row is kept. Rows without a matching
    mutation get ``mutation == 0``.

    Parameters
    ----------
    maf_df : pandas.DataFrame
        Mutation records with at least ``gene``, ``sample`` and ``mutation``
        columns.
    expression_df : pandas.DataFrame
        Long-format expression with at least ``gene`` and ``sample`` columns.

    Returns
    -------
    pandas.DataFrame
        The merged table. A (gene, sample) pair that has several records in
        ``maf_df`` yields one output row per record.

    Side effects
    ------------
    Prints the fraction of expression rows removed by the sample filter.
    """
    sequenced_samples = maf_df['sample'].unique()
    filtered_expression_df = expression_df[expression_df['sample'].isin(sequenced_samples)]

    all_rows = expression_df.shape[0]
    filt_rows = filtered_expression_df.shape[0]

    # An empty expression table has nothing to filter; avoid dividing by zero.
    fraction_filtered = (all_rows - filt_rows) / all_rows if all_rows else 0.0

    print('fraction of rows filtered is', fraction_filtered)

    express_mut_genes_df = pd.merge(maf_df, filtered_expression_df, on=['gene', 'sample'], how='right')

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: that chained in-place form does not update the
    # frame when pandas Copy-on-Write is active.
    express_mut_genes_df['mutation'] = express_mut_genes_df['mutation'].fillna(0)

    return express_mut_genes_df


mutation_expression_df_melted =  preprocess_and_combine_mutation_expression(maf_df, expression_df_melted)
    

fraction of rows filtered is 0.2865853658536585


In [88]:
def calculate_log_fold(list_mutated, list_non_mutated, smoothing_factor=1e-8):
    """Return log2 of the ratio of the two group means.

    ``smoothing_factor`` is added to each mean before dividing so that a zero
    mean in the denominator does not cause a division by zero.

    NOTE(review): the notebook log2-transforms expression when loading it, so
    the values passed here are presumably already on a log scale; confirm
    that a log of the ratio of log-scale means is the intended statistic.
    """
    smoothed_means = [
        mean(values) + smoothing_factor
        for values in (list_mutated, list_non_mutated)
    ]
    ratio = smoothed_means[0] / smoothed_means[1]
    return np.log2(ratio)

In [89]:
def calculate_z_score(list_mutated, list_non_mutated):
    """Standardised difference between the two group means.

    Computes ``(mean(mutated) - mean(non_mutated)) / std(all values)``, where
    the standard deviation is the population standard deviation (numpy's
    default) over both groups pooled together.

    Parameters
    ----------
    list_mutated, list_non_mutated : sequence of numbers
        Lists, numpy arrays or pandas Series are all accepted.

    Returns
    -------
    float
        The z-score, or NaN when all pooled values are identical.
    """
    mean_mutated = mean(list_mutated)
    mean_non_mutated = mean(list_non_mutated)

    # Pool with np.concatenate rather than `+`: for arrays or Series `+`
    # means element-wise addition, not concatenation.
    all_expression_values = np.concatenate([
        np.asarray(list_non_mutated, dtype=float),
        np.asarray(list_mutated, dtype=float),
    ])
    pooled_std = np.std(all_expression_values)

    # Zero spread means every value is identical; the score is undefined.
    if pooled_std == 0:
        return float('nan')

    return (mean_mutated - mean_non_mutated) / pooled_std


In [90]:
# this isn't used for gene expression 
# from scipy.stats import ttest_ind
# def calculate_pvalue_and_effect_size_ttest(list_mutated, list_non_mutated):
#     t_stat, p_value = ttest_ind(list_mutated, list_non_mutated, equal_var=False)
#     return p_value, t_stat


In [91]:
def calculate_pvalue_and_effect_size_wilcox_ranksum(list_mutated, list_non_mutated):
    """Wilcoxon rank-sum test between mutated and non-mutated expression.

    Parameters
    ----------
    list_mutated, list_non_mutated : sequence of numbers
        Expression values of the two groups.

    Returns
    -------
    (pvalue, effect_size) : tuple of float
        ``pvalue`` is the two-sided p-value from ``scipy.stats.ranksums``.
        ``effect_size`` is ``r = z / sqrt(n_mutated + n_non_mutated)``; it is
        negative when the mutated group ranks lower.
    """
    test = ranksums(list_mutated, list_non_mutated)

    # ranksums reports the normal-approximation z statistic, not a U
    # statistic, so the matching effect size is z / sqrt(N); dividing by
    # n1 * n2 is only meaningful for U.
    z_statistic = test.statistic
    n_total = len(list_mutated) + len(list_non_mutated)
    effect_size = z_statistic / np.sqrt(n_total) if n_total else float('nan')

    return test.pvalue, effect_size

In [92]:
def calculate_adjusted_pvalue(pvalues, method='fdr_bh'):
    """Return p-values corrected for multiple testing.

    ``pvalues`` and ``method`` are passed straight to statsmodels'
    ``multipletests``; only the corrected p-values are returned.
    """
    correction = multitest.multipletests(pvalues, method=method)
    # The corrected p-values are the second element of the returned tuple.
    return correction[1]


In [93]:
target_gene

'SPOP'

In [94]:
import os
def generate_stats_per_gene(express_mut_genes_df, target_gene, output_folder=None):
    """
    Compare expression of every gene between samples with and without a
    mutation in ``target_gene`` and write the per-gene statistics to CSV.

    Parameters
    ----------
    express_mut_genes_df : pandas.DataFrame
        Combined table with ``gene``, ``sample``, ``mutation`` and
        ``gene_expression`` columns.
    target_gene : str
        Gene whose mutation status splits the samples into two groups.
    output_folder : str, optional
        Folder for the CSV (created if missing). Defaults to ``target_gene``.

    Returns
    -------
    (combined_data, mutated_samples, output_filename)
        ``combined_data`` has one row per gene with ``logFC``, ``pvalue``,
        ``effect_size``, ``expression_mutated_mean``,
        ``expression_nonmutated_mean`` and ``adjusted_pvalue``;
        ``mutated_samples`` is a Series of the unique samples carrying a
        mutation in ``target_gene``; ``output_filename`` is the CSV path.

    Raises
    ------
    ValueError
        If ``target_gene`` is absent from the table, or if either the mutated
        or the non-mutated group is empty.
    """
    # Resolve the default per call. A default of `output_folder=target_gene`
    # in the signature binds the notebook-level variable at definition time
    # and ignores the gene actually passed in.
    if output_folder is None:
        output_folder = target_gene

    gene_df = express_mut_genes_df[express_mut_genes_df['gene'] == target_gene]

    if len(gene_df) == 0:
        raise ValueError(
            f"This gene is not valid! {target_gene!r} does not appear in the combined "
            "expression/mutation table"
        )

    # A sample with several mutations in the target gene has one row per
    # mutation; count each sample once.
    mutated_samples = gene_df.loc[gene_df['mutation'] == 1, 'sample'].drop_duplicates()
    non_mutated_samples = gene_df.loc[gene_df['mutation'] == 0, 'sample'].drop_duplicates()

    if mutated_samples.empty or non_mutated_samples.empty:
        raise ValueError(
            f"Cannot compare groups for {target_gene!r}: "
            f"{len(mutated_samples)} mutated and {len(non_mutated_samples)} non-mutated samples"
        )

    # Likewise keep a single expression value per (gene, sample); otherwise a
    # gene mutated more than once in a sample contributes the same expression
    # value several times to the test.
    expression_per_sample = express_mut_genes_df.drop_duplicates(subset=['gene', 'sample'])

    mutated_individuals_expression = expression_per_sample[
        expression_per_sample['sample'].isin(mutated_samples)]
    non_mutated_individuals_expression = expression_per_sample[
        expression_per_sample['sample'].isin(non_mutated_samples)]

    # gather the expression values of each group into one list per gene
    mutated_individuals_data = (
        mutated_individuals_expression.groupby(['gene'])['gene_expression']
        .apply(list).to_frame().reset_index()
        .rename(columns={'gene_expression': 'gene_expression_mutated'})
    )
    non_mutated_individuals_data = (
        non_mutated_individuals_expression.groupby(['gene'])['gene_expression']
        .apply(list).to_frame().reset_index()
        .rename(columns={'gene_expression': 'gene_expression_non_mutated'})
    )

    # keep only genes that have values in both groups
    combined_data = pd.merge(mutated_individuals_data, non_mutated_individuals_data, on='gene', how='inner')

    # log fold change
    combined_data['logFC'] = combined_data.apply(
        lambda x: calculate_log_fold(x.gene_expression_mutated, x.gene_expression_non_mutated), axis=1)
    # rank-sum p-value and effect size
    combined_data[['pvalue', 'effect_size']] = combined_data.apply(
        lambda x: pd.Series(calculate_pvalue_and_effect_size_wilcox_ranksum(
            x.gene_expression_mutated, x.gene_expression_non_mutated)),
        axis=1
    )
    combined_data['expression_mutated_mean'] = combined_data['gene_expression_mutated'].apply(mean)
    combined_data['expression_nonmutated_mean'] = combined_data['gene_expression_non_mutated'].apply(mean)

    # multiple-testing correction across all genes
    combined_data['adjusted_pvalue'] = calculate_adjusted_pvalue(combined_data['pvalue'].values)

    # the raw value lists are not part of the output
    combined_data = combined_data.drop(columns=['gene_expression_mutated', 'gene_expression_non_mutated'])

    os.makedirs(output_folder, exist_ok=True)
    # the file name records the number of mutated and non-mutated samples
    output_filename = f'{output_folder}/{len(mutated_samples)}_{len(non_mutated_samples)}_logfc_pvalue.csv'

    print(f"outputting data to {output_filename}")
    combined_data.to_csv(output_filename, index=False)

    return combined_data, mutated_samples, output_filename


In [96]:
volcano_plot_df, individuals_mutated_target_gene, volcano_input_filename = generate_stats_per_gene(
    mutation_expression_df_melted, 
    target_gene)

outputting data to SPOP/15_102_logfc_pvalue.csv


## Generate heatmap of the top 100 differentially expressed genes (ranked by smallest p-value)

In [97]:
def generate_expression_heatmap(expression_df, volcano_plot_df, n=100, exclude_value=10):
    """Select the expression rows of the ``n`` genes with the smallest p-values.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        Wide expression table indexed by gene.
    volcano_plot_df : pandas.DataFrame
        Per-gene statistics with ``gene`` and ``pvalue`` columns.
    n : int
        Number of genes to keep.
    exclude_value : number
        Currently unused; left over from an earlier ranking by absolute
        ``logFC`` that skipped values equal to +/- ``exclude_value``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``expression_df`` for the selected genes, ordered by
        ascending p-value.
    """
    # Row labels of the n most significant genes in the statistics table.
    best_rows = volcano_plot_df['pvalue'].nsmallest(n).index

    # Their gene names, as an Index named 'gene'.
    gene_order = volcano_plot_df.loc[best_rows].set_index('gene').index

    # Pull the matching rows out of the wide expression table.
    return expression_df.loc[gene_order]

heatmap_data = generate_expression_heatmap(expression_df, volcano_plot_df)


In [98]:
import pandas as pd

def get_mutated_status(expression_df_heatmap, individuals_mutated_target_gene, output_folder=target_gene):
    """Flag each heatmap sample as mutated (1) or not (0) and save the table.

    Parameters
    ----------
    expression_df_heatmap : pandas.DataFrame
        Expression table whose columns are sample names.
    individuals_mutated_target_gene : iterable of str
        Samples carrying a mutation in the target gene (duplicates allowed).
    output_folder : str
        Folder for ``sample_mutation_status.csv``; created if missing. The
        default is the notebook-level ``target_gene`` at definition time.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sample`` and ``Mutation Status``, one row per column of
        ``expression_df_heatmap``.
    """
    # A set gives constant-time membership checks and removes duplicates.
    mutated_lookup = set(individuals_mutated_target_gene)

    mutated_status = [1 if sample in mutated_lookup else 0 for sample in expression_df_heatmap.columns]

    sample_categories_df = pd.DataFrame({
        'Sample': expression_df_heatmap.columns,
        'Mutation Status': mutated_status
    })

    # Do not rely on an earlier cell having created the folder.
    os.makedirs(output_folder, exist_ok=True)
    output_filename = f'{output_folder}/sample_mutation_status.csv'
    print(output_filename)
    sample_categories_df.to_csv(output_filename,  index=False)
    return sample_categories_df

mutated_status_df = get_mutated_status(heatmap_data, individuals_mutated_target_gene)


SPOP/sample_mutation_status.csv


In [99]:
heatmap_data

Unnamed: 0_level_0,RESPOND_10100218,RESPOND_10100291,RESPOND_10100412,RESPOND_10100478,RESPOND_10100596,RESPOND_10100615,RESPOND_10100801,RESPOND_10100884,RESPOND_10100899,RESPOND_10100952,...,RESPOND_80100242,RESPOND_80100313,RESPOND_80100345,RESPOND_80100411,RESPOND_80100526,RESPOND_80100556,RESPOND_80100590,RESPOND_81100031,RESPOND_40100842,RESPOND_80100259
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARHGEF16,5.554589,5.426265,5.754888,4.321928,6.672425,4.087463,5.977280,6.614710,4.807355,4.700440,...,6.768184,5.754888,3.459432,7.179909,6.794416,3.700440,5.169925,6.285402,5.807355,6.303781
HMG20B,10.707359,8.781360,9.667112,8.426265,9.915879,9.308339,10.314017,8.044394,8.361944,8.299208,...,10.392317,10.890264,8.499846,11.106563,9.874981,9.154818,9.491853,9.292322,9.296916,10.213104
PLXNB2,10.000000,9.103288,9.623881,9.388017,9.977280,8.918863,9.262095,9.074141,8.686501,9.169925,...,9.611025,10.292322,8.810572,10.650154,10.079485,8.625709,9.337622,9.612868,9.317413,9.569856
WFIKKN1,2.321928,3.459432,3.169925,2.321928,3.584963,1.584963,3.000000,3.000000,2.000000,0.857649,...,3.906891,3.321928,1.000000,5.426265,3.459432,0.000000,2.000000,3.584963,3.459432,3.459432
MC1R,4.459432,7.330917,6.228819,5.000000,6.266787,4.523562,4.906891,6.228819,5.754888,4.392317,...,6.629357,6.523562,4.584963,7.965784,7.483816,2.321928,7.523562,6.392317,6.209453,7.475733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MBD3,6.599913,6.189825,7.339850,6.108524,8.022368,6.321928,6.870365,7.651052,5.357552,6.247928,...,7.033423,7.434628,5.169925,8.370687,8.118941,4.954196,7.011227,7.977280,6.700440,7.033423
CYB561D1,3.459432,5.087463,4.000000,3.459432,4.087463,3.700440,3.459432,3.807355,3.169925,2.584963,...,2.584963,4.392317,2.807355,4.392317,4.321928,4.087463,3.807355,3.459432,3.906891,3.906891
GUCY1B1,3.169925,3.321928,3.000000,5.426265,2.807355,3.459432,3.906891,4.247928,4.169925,3.584963,...,2.584963,1.584963,4.392317,2.000000,2.584963,3.584963,3.169925,3.000000,3.807355,2.807355
KLF1,2.321928,0.371710,0.000000,0.000000,0.931392,0.000000,0.044200,0.449118,0.203308,0.000000,...,0.102749,1.000000,0.000000,0.852043,0.879822,0.000000,0.042794,1.000000,0.000000,0.078879


In [100]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import silhouette_score

def hierarchical_clustering(expression_df_heatmap, row_threshold=7, col_threshold=7, output_folder=target_gene):
    """Ward-cluster the rows and columns of an expression table.

    ``row_threshold`` and ``col_threshold`` are passed to ``fcluster`` with
    ``criterion='maxclust'``, i.e. they cap the number of flat clusters.
    Silhouette scores are printed, and the cluster memberships are written to
    ``row_clusters.txt`` and ``col_clusters.txt`` in ``output_folder``.

    Returns
    -------
    (row_linkage, col_linkage, row_cluster_info, col_cluster_info)
        The two linkage matrices and two dicts mapping ``"Cluster <id>"`` to
        the row / column labels assigned to that cluster.
    """
    by_gene = expression_df_heatmap
    by_sample = expression_df_heatmap.T

    def _cluster(matrix, max_clusters):
        # Ward linkage, flat cluster labels and silhouette score for one axis.
        tree = linkage(matrix, method='ward')
        assignments = fcluster(tree, t=max_clusters, criterion='maxclust')
        return tree, assignments, silhouette_score(matrix, assignments)

    def _members(axis_labels, assignments):
        # Map each cluster id to the labels assigned to it.
        return {
            f"Cluster {cluster}": axis_labels[assignments == cluster]
            for cluster in np.unique(assignments)
        }

    def _write_report(filename, score_line, members):
        # Score header followed by one comma-separated member list per cluster.
        report_path = os.path.join(output_folder, filename)
        with open(report_path, 'w') as file:
            file.write(score_line + "\n\n")
            for cluster, items in members.items():
                file.write(f"{cluster}:\n")
                file.write(f"{', '.join(items)}\n\n")

    row_linkage, row_clusters, row_silhouette_score = _cluster(by_gene, row_threshold)
    col_linkage, col_clusters, col_silhouette_score = _cluster(by_sample, col_threshold)

    row_score_line = f"Row Silhouette Score: {row_silhouette_score}"
    col_score_line = f"Column Silhouette Score: {col_silhouette_score}"
    print(row_score_line)
    print(col_score_line)

    row_cluster_info = _members(expression_df_heatmap.index, row_clusters)
    col_cluster_info = _members(expression_df_heatmap.columns, col_clusters)

    _write_report('row_clusters.txt', row_score_line, row_cluster_info)
    _write_report('col_clusters.txt', col_score_line, col_cluster_info)

    return row_linkage, col_linkage, row_cluster_info, col_cluster_info


row_linkage, col_linkage, row_clusters, col_clusters = hierarchical_clustering(heatmap_data)


Row Silhouette Score: 0.24124544217953459
Column Silhouette Score: 0.06585128389732592


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [101]:
import seaborn as sns
import matplotlib.pyplot as plt

def create_clustered_heatmap_and_save(expression_df_heatmap,
                                      row_linkage,
                                      col_linkage,
                                      sample_mutation_df,
                                      output_folder=target_gene):
    """Draw a clustered heatmap with a mutation-status colour bar and save it.

    The precomputed ``row_linkage`` / ``col_linkage`` set the ordering, the
    data are passed to seaborn with ``z_score=0``, and each column is
    annotated red (status 1) or yellow (status 0) using
    ``sample_mutation_df`` (columns ``Sample`` and ``Mutation Status``).
    The figure is written to ``<output_folder>/heatmap.png`` and closed.

    Returns
    -------
    The seaborn grid object returned by ``sns.clustermap``.
    """
    # Colour used in the column annotation bar for each mutation status.
    status_palette = {1: 'red', 0: 'yellow'}

    sns.set(font_scale=1.0)

    status_by_sample = sample_mutation_df.set_index('Sample')['Mutation Status']
    column_colours = status_by_sample.map(status_palette)

    heatmap_options = dict(
        row_linkage=row_linkage,
        col_linkage=col_linkage,
        cmap='viridis',
        annot=False,
        fmt=".1f",
        linewidths=.5,
        col_colors=column_colours,
        z_score=0,
        cbar_kws={"shrink": 0.7, "aspect": 30},  # colour bar size
    )
    cluster_grid = sns.clustermap(expression_df_heatmap, **heatmap_options)

    # Height grows with the number of rows so the row labels stay legible.
    figure_height = len(expression_df_heatmap) * 0.2
    cluster_grid.fig.set_size_inches(15, figure_height)

    output_filename = f'{output_folder}/heatmap.png'

    # Keep the row labels horizontal.
    row_tick_labels = cluster_grid.ax_heatmap.get_yticklabels()
    cluster_grid.ax_heatmap.set_yticklabels(row_tick_labels, rotation=0)

    cluster_grid.savefig(output_filename, bbox_inches='tight')

    # Close the figure so it is not also rendered inline in the notebook.
    plt.close()

    return cluster_grid

clustered_heatmap = create_clustered_heatmap_and_save(heatmap_data, 
                                                      row_linkage, 
                                                      col_linkage,
                                                      mutated_status_df)


In [102]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

## add more args to the function 

def plot_and_save_dendrograms(row_linkage, col_linkage, expression_df_heatmap, output_folder=target_gene):
    """Plot truncated row and column dendrograms and save them as PNG files.

    Each tree is truncated (``truncate_mode='lastp'``) to 75% of the number of
    merges in *its own* linkage matrix. Files are written to
    ``<output_folder>/row_dendrogram.png`` and
    ``<output_folder>/col_dendrogram.png``.

    Parameters
    ----------
    row_linkage, col_linkage : ndarray
        Linkage matrices for the rows and columns of ``expression_df_heatmap``.
    expression_df_heatmap : pandas.DataFrame
        Supplies the leaf labels (index for rows, columns for columns).
    output_folder : str
        Existing folder to write into.

    Returns
    -------
    (gene_plot, sample_plot)
        The dicts returned by ``scipy.cluster.hierarchy.dendrogram``.
    """
    def _draw(linkage_matrix, labels, title, filename):
        # One figure per dendrogram; closed after saving to free memory.
        plt.figure(figsize=(12, 10))
        tree = dendrogram(linkage_matrix,
                          labels=list(labels),
                          orientation='top',
                          truncate_mode='lastp',
                          # truncation depth derived from this tree's own size
                          p=int(0.75 * len(linkage_matrix)),
                          show_leaf_counts=True)
        plt.title(title)
        plt.savefig(f'{output_folder}/{filename}')
        plt.close()
        return tree

    gene_plot = _draw(row_linkage, expression_df_heatmap.index,
                      'Row Dendrogram', 'row_dendrogram.png')
    # The column tree uses col_linkage for its truncation depth; taking it
    # from row_linkage would truncate the sample tree by the gene count.
    sample_plot = _draw(col_linkage, expression_df_heatmap.columns,
                        'Column Dendrogram', 'col_dendrogram.png')

    return gene_plot, sample_plot

row_dendrogram, col_dendrogram = plot_and_save_dendrograms(row_linkage, 
                                                           col_linkage, 
                                                           heatmap_data)


In [103]:
def histogram_of_column_and_save(df, column, output_folder=target_gene, xlabel='P-values', bins=30):
    """Save a histogram of ``df[column]`` to ``<output_folder>/histogram_<column>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``column``.
    column : str
        Column to plot; also used in the title and the file name.
    output_folder : str
        Existing folder to write into.
    xlabel : str
        X-axis label. Defaults to 'P-values'; pass another label when
        plotting a column that is not a p-value.
    bins : int
        Number of histogram bins.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(df[column], bins=bins, color='blue', edgecolor='black')
    plt.title(f'Distribution of {column}')
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')

    output_filename = f'{output_folder}/histogram_{column}.png'
    # Save the histogram, then close the figure so it is not rendered inline
    plt.savefig(output_filename)
    plt.close()

histogram_of_column_and_save(volcano_plot_df, 'pvalue')


In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def volcano_plot(input_file_path, yaxis, 
                 xaxis, 
                 significance_threshold=0.05, 
                 logfold_positive_threshold=2, 
                 logfold_negative_threshold=-2,
                 output_folder=target_gene):
    """Draw a volcano plot from a per-gene statistics CSV and save it.

    Parameters
    ----------
    input_file_path : str
        CSV containing the ``xaxis`` and ``yaxis`` columns.
    yaxis : str
        Column holding the p-values; plotted as ``-log10``.
    xaxis : str
        Column holding the effect measure (e.g. ``'logFC'``). Thresholds,
        clipping and plotting all use this column.
    significance_threshold : float
        Genes with ``yaxis`` below this value count as significant.
    logfold_positive_threshold, logfold_negative_threshold : float
        Cut-offs on ``xaxis`` for up- and down-regulated genes.
    output_folder : str
        Existing folder receiving ``volcano_plot.png``,
        ``signif_genes_positive.csv`` and ``signif_genes_negative.csv``.

    Returns
    -------
    (significant_genes_positive, significant_genes_negative)
        Rows passing the significance threshold and the respective cut-off,
        with unclipped ``xaxis`` values and an added ``-log10_pvalue`` column.
    """
    # x values beyond +/- this limit are drawn at the limit (display only)
    plot_clip = 10

    df = pd.read_csv(input_file_path)

    # Apply -log10 transformation to the p-value
    df['-log10_pvalue'] = -np.log10(df[yaxis])

    # Select on the same column that is plotted, so the threshold lines and
    # the highlighted points always refer to one axis.
    is_significant = df[yaxis] < significance_threshold
    significant_genes_positive = df[is_significant & (df[xaxis] > logfold_positive_threshold)]
    significant_genes_negative = df[is_significant & (df[xaxis] < logfold_negative_threshold)]

    # Clip for display only, and use the clipped values for every layer so
    # highlighted markers sit on top of their base points.
    x_plot = df[xaxis].clip(lower=-plot_clip, upper=plot_clip)

    # Start from a fresh figure rather than drawing onto whichever one is current.
    fig = plt.figure()

    plt.scatter(x=x_plot, y=df['-log10_pvalue'], s=1, label='All Genes', alpha=0.5)
    plt.scatter(x=x_plot.loc[significant_genes_positive.index], y=significant_genes_positive['-log10_pvalue'], s=10, c='red', marker='^', label='Significant Genes (Positive LogFC)')
    plt.scatter(x=x_plot.loc[significant_genes_negative.index], y=significant_genes_negative['-log10_pvalue'], s=10, c='blue', marker='v', label='Significant Genes (Negative LogFC)')

    plt.xlabel(xaxis)
    plt.ylabel(f'-log10({yaxis})')
    plt.title('Volcano Plot')
    plt.axhline(-np.log10(significance_threshold), color='gray', linestyle='--', label=f'Significance Threshold ({significance_threshold})')
    plt.axvline(logfold_positive_threshold, color='gray', linestyle='--', label=f'Positive Log-Fold Change Threshold ({logfold_positive_threshold})')
    plt.axvline(logfold_negative_threshold, color='gray', linestyle='--', label=f'Negative Log-Fold Change Threshold ({logfold_negative_threshold})')

    # Put the legend outside the axes so it does not cover the points
    handles, labels = plt.gca().get_legend_handles_labels()
    fig.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))

    output_filename = f'{output_folder}/volcano_plot.png'
    plt.savefig(output_filename, bbox_inches='tight')
    plt.close()
    significant_genes_positive.to_csv(f'{output_folder}/signif_genes_positive.csv')
    significant_genes_negative.to_csv(f'{output_folder}/signif_genes_negative.csv')

    return significant_genes_positive, significant_genes_negative

significant_genes_positive, significant_genes_negative  = volcano_plot(volcano_input_filename, 'pvalue', 'logFC')


In [105]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def create_gene_expression_boxplot(expression_df, significant_genes_df, mutated_status_df, output_folder=target_gene, positive=1):
    """Box-plot the expression of selected genes, split by mutation status.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        Wide expression table indexed by gene, one column per sample.
    significant_genes_df : pandas.DataFrame
        Must have a ``gene`` column naming the genes to plot.
    mutated_status_df : pandas.DataFrame
        ``Sample`` plus one status column; the status column is relabelled
        ``Target Gene Mutation Status`` for the legend.
    output_folder : str
        Existing folder to write into.
    positive : int
        1 writes ``positive_genes_expression_boxplot.png``; any other value
        writes ``negative_genes_expression_boxplot.png``.
    """
    # Expression rows of the genes of interest.
    genes_of_interest = significant_genes_df.gene
    selected_expression = expression_df.loc[genes_of_interest]

    # Wide -> long: one row per (sample, gene) pair.
    long_expression = selected_expression.unstack().reset_index()
    long_expression.columns = ['Sample', 'Gene', 'Expression']

    # Attach each sample's mutation status; samples without a status are dropped.
    boxplot_df = pd.merge(long_expression, mutated_status_df, how='inner', on='Sample')
    boxplot_df.columns = ['Sample', 'Gene', 'Expression', 'Target Gene Mutation Status']

    plt.figure(figsize=(12, 8))

    # One pair of side-by-side boxes per gene, coloured by mutation status.
    sns.boxplot(
        data=boxplot_df,
        x='Gene',
        y='Expression',
        hue='Target Gene Mutation Status',
        dodge=True,
    )

    plt.xlabel('Gene')
    plt.ylabel('Gene Expression')
    plt.title('Box Plots of Gene Expression for Mutated and Non-Mutated Individuals')

    # Slanted tick labels keep long gene lists readable.
    plt.xticks(rotation=45, ha='right')

    output_filename = (
        f'{output_folder}/positive_genes_expression_boxplot.png'
        if positive == 1
        else f'{output_folder}/negative_genes_expression_boxplot.png'
    )
    plt.savefig(output_filename)
    plt.close()

create_gene_expression_boxplot(expression_df, significant_genes_positive, mutated_status_df, positive=1)
create_gene_expression_boxplot(expression_df, significant_genes_negative, mutated_status_df, positive=0)


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


# exploring mutation data

In [1]:


from importlib import reload
import plot, utils, data_load, clustering, constants, gsea 
import warnings

# Suppress FutureWarnings from multiple modules
modules_to_ignore = ("seaborn", "sklearn")

for module in modules_to_ignore:
    warnings.filterwarnings("ignore", category=FutureWarning, module=module)

reload(clustering)
reload(utils)
reload(data_load)
reload(plot)
reload(constants)
reload(gsea)


<module 'gsea' from '/Users/meltemtutar/Documents/Huang/Respond/gsea.py'>

In [4]:
# Hugo_Symbol is the gene symbol; Tumor_Sample_Barcode is the sample ID
columns = ["Hugo_Symbol", "Tumor_Sample_Barcode", "Variant_Classification", "Variant_Type", "Chromosome", "Start_Position", "End_Position",
           "Description", "Protein_Change"]
maf_df = data_load.load_maf_data(file_path=constants.maf_file_path, columns=columns)


In [7]:
maf_df[maf_df['gene']=='KMT2D']

Unnamed: 0,gene,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,sample,Protein_Change,Description,mutation
6264,KMT2D,chr12,49038778,49038778,Missense_Mutation,SNP,RESPOND_20100780,p.R2860C,lysine methyltransferase 2D,1
6265,KMT2D,chr12,49041176,49041176,Frame_Shift_Del,DEL,RESPOND_60100060,p.Y2199fs,lysine methyltransferase 2D,1
6266,KMT2D,chr12,49031631,49031668,Frame_Shift_Del,DEL,RESPOND_60100201,p.P4346fs,lysine methyltransferase 2D,1
6267,KMT2D,chr12,49043078,49043079,Frame_Shift_Del,DEL,RESPOND_60100201,p.E1881fs,lysine methyltransferase 2D,1
6268,KMT2D,chr12,49041009,49041009,Missense_Mutation,SNP,RESPOND_60100883,p.N2254S,lysine methyltransferase 2D,1
6269,KMT2D,chr12,49051204,49051204,Nonsense_Mutation,SNP,RESPOND_61100223,p.Q827*,lysine methyltransferase 2D,1
6270,KMT2D,chr12,49048052,49048052,Missense_Mutation,SNP,RESPOND_61100364,p.C1383W,lysine methyltransferase 2D,1
13972,KMT2D,chr12,49040614,49040614,Missense_Mutation,SNP,RESPOND_70100563,p.R2386W,lysine methyltransferase 2D,1
14394,KMT2D,chr12,49049713,49049713,Missense_Mutation,SNP,RESPOND_20107556,p.R1292H,lysine methyltransferase 2D,1
15721,KMT2D,chr12,49039268,49039268,Missense_Mutation,SNP,RESPOND_60100690,p.R2774W,lysine methyltransferase 2D,1


In [8]:
## load data 
maf_df = data_load.load_maf_data(file_path=constants.maf_file_path, columns=["Hugo_Symbol", "Tumor_Sample_Barcode", "Variant_Classification", "Variant_Type", "Chromosome", "Start_Position", "End_Position",
           "Description", "Protein_Change"])
expression_df = data_load.load_txt_file_into_dataframe(file_path=constants.expression_file_path)
expression_df_melted = data_load.reformat_expression_data(df=expression_df)
mutation_expression_df_melted = data_load.preprocess_and_combine_mutation_expression(maf_df= maf_df, expression_df = expression_df_melted)

fraction of rows filtered is 0.2865853658536585


In [22]:
# calculate logfc, pvalue
volcano_plot_df, individuals_mutated_target_gene, output_stats_path = utils.generate_stats_per_gene(
        express_mut_genes_df=mutation_expression_df_melted, 
        target_gene='KMT2D',
        output_folder=constants.output_folder)


outputting data to output_data_v9/KMT2D/17_104_logfc_pvalue.csv


In [23]:
individuals_mutated_target_gene


348394     RESPOND_20100780
488442     RESPOND_20101031
608461     RESPOND_20101598
608462     RESPOND_20101598
608463     RESPOND_20101598
988501     RESPOND_40100784
1188518    RESPOND_60100201
1188519    RESPOND_60100201
1408539    RESPOND_60100883
1688567    RESPOND_61100037
1788580    RESPOND_61100364
1868587    RESPOND_70100042
1948596    RESPOND_70100229
1968599    RESPOND_70100286
2048606    RESPOND_70100563
2128616    RESPOND_70100812
2128617    RESPOND_70100812
Name: sample, dtype: object

In [25]:
# Samples of interest (named for cluster 6; presumably copied from the
# column-cluster output -- verify against col_clusters.txt).
samples_cluster_6 = ['RESPOND_20100780', 'RESPOND_20101031', 'RESPOND_20101598',
       'RESPOND_40100176', 'RESPOND_40100292', 'RESPOND_40100361',
       'RESPOND_40100523', 'RESPOND_50100227', 'RESPOND_60100201',
       'RESPOND_60100943', 'RESPOND_60101197', 'RESPOND_61100037',
       'RESPOND_70100042', 'RESPOND_70100229', 'RESPOND_70100286',
       'RESPOND_70100317', 'RESPOND_70100812', 'RESPOND_70102184',
       'RESPOND_80100590']
mutated_samples=set(individuals_mutated_target_gene)

# KMT2D mutation rows restricted to the samples listed above. The filter uses
# `samples_cluster_6`: the bare name `samples` is not defined anywhere in the
# visible notebook and raises NameError on a fresh kernel.
is_kmt2d_mutation = (
    (mutation_expression_df_melted['mutation'] == 1.0)
    & (mutation_expression_df_melted['gene'] == 'KMT2D')
)
samples_same_cluster = mutation_expression_df_melted[
    is_kmt2d_mutation
    & mutation_expression_df_melted['sample'].isin(samples_cluster_6)
]



In [31]:
# Some samples have multiple mutations
set(samples_same_cluster['Protein_Change'])

{'p.A3146V',
 'p.E1186*',
 'p.E1881fs',
 'p.G1933D',
 'p.L1947F',
 'p.L4796P',
 'p.P4346fs',
 'p.P5026L',
 'p.R1704Q',
 'p.R2095C',
 'p.R2830Q',
 'p.R2860C',
 'p.R4484*'}

In [12]:
import pandas as pd
pd.set_option('display.max_columns', None)
def load_maf_data_2(file_path):
    """Load every column of a MAF file and add a constant ``mutation`` flag.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated MAF file; lines starting with ``#`` are skipped.

    Returns
    -------
    pandas.DataFrame
        All columns of the file plus ``mutation`` set to 1 on every row.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # column rather than chunk by chunk, which avoids mixed-type columns (and
    # the accompanying warning) on this very wide table.
    df = pd.read_csv(file_path, sep='\t', comment="#", low_memory=False)
    df['mutation'] = 1
    return df

In [13]:
all_df = load_maf_data_2(file_path=constants.maf_file_path)

  df = pd.read_csv(file_path, sep='\t', comment="#")


In [14]:
all_df.head(1)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID,Genome_Change,Annotation_Transcript,Transcript_Strand,Transcript_Exon,Transcript_Position,cDNA_Change,Codon_Change,Protein_Change,Other_Transcripts,Refseq_mRNA_Id,Refseq_prot_Id,SwissProt_acc_Id,SwissProt_entry_Id,Description,UniProt_AApos,UniProt_Region,UniProt_Site,UniProt_Natural_Variations,UniProt_Experimental_Info,GO_Biological_Process,GO_Cellular_Component,GO_Molecular_Function,COSMIC_overlapping_mutations,COSMIC_fusion_genes,COSMIC_tissue_types_affected,COSMIC_total_alterations_in_gene,Tumorscape_Amplification_Peaks,Tumorscape_Deletion_Peaks,TCGAscape_Amplification_Peaks,TCGAscape_Deletion_Peaks,DrugBank,ref_context,gc_content,CCLE_ONCOMAP_overlapping_mutations,CCLE_ONCOMAP_total_mutations_in_gene,CGC_Mutation_Type,CGC_Translocation_Partner,CGC_Tumor_Types_Somatic,CGC_Tumor_Types_Germline,CGC_Other_Diseases,DNARepairGenes_Activity_linked_to_OMIM,FamilialCancerDatabase_Syndromes,MUTSIG_Published_Results,OREGANNO_ID,OREGANNO_Values,tumor_f,t_alt_count,t_ref_count,n_alt_count,n_ref_count,Gencode_34_secondaryVariantClassification,Achilles_Top_Genes,ClinVar_VCF_AF_ESP,ClinVar_VCF_AF_EXAC,ClinVar_VCF_AF_TGP,ClinVar_VCF_ALLELEID,ClinVar_VCF_CLNDISDB,ClinVar_VCF_CLNDISDBINCL,ClinVar_VCF_CLNDN,ClinVar_VCF_CLNDNINCL,ClinVar_VCF_CLNHGVS,ClinVar_VCF_CLNREVSTAT,ClinVar_VCF_CLNSIG,ClinVar_VCF_CLNSIGCONF,ClinVar_VCF_CLNSIGINCL,ClinVar_VCF_CLNVC,ClinVar_VCF_CLNVCSO,ClinVar_VCF_CLNVI,ClinVar_
VCF_DBVARID,ClinVar_VCF_GENEINFO,ClinVar_VCF_MC,ClinVar_VCF_ORIGIN,ClinVar_VCF_RS,ClinVar_VCF_SSR,ClinVar_VCF_ID,ClinVar_VCF_FILTER,CosmicFusion_fusion_id,Familial_Cancer_Genes_Synonym,Familial_Cancer_Genes_Reference,Gencode_XHGNC_hgnc_id,HGNC_HGNC_ID,HGNC_Status,HGNC_Locus_Type,HGNC_Locus_Group,HGNC_Previous_Symbols,HGNC_Previous_Name,HGNC_Synonyms,HGNC_Name_Synonyms,HGNC_Chromosome,HGNC_Date_Modified,HGNC_Date_Symbol_Changed,HGNC_Date_Name_Changed,HGNC_Accession_Numbers,HGNC_Enzyme_IDs,HGNC_Ensembl_Gene_ID,HGNC_Pubmed_IDs,HGNC_RefSeq_IDs,HGNC_Gene_Family_ID,HGNC_Gene_Family_Name,HGNC_CCDS_IDs,HGNC_Vega_ID,HGNC_OMIM_ID(supplied_by_OMIM),HGNC_RefSeq(supplied_by_NCBI),HGNC_UniProt_ID(supplied_by_UniProt),HGNC_Ensembl_ID(supplied_by_Ensembl),HGNC_UCSC_ID(supplied_by_UCSC),Oreganno_Build,Simple_Uniprot_alt_uniprot_accessions,dbSNP_ASP,dbSNP_ASS,dbSNP_CAF,dbSNP_CDA,dbSNP_CFL,dbSNP_COMMON,dbSNP_DSS,dbSNP_G5,dbSNP_G5A,dbSNP_GENEINFO,dbSNP_GNO,dbSNP_HD,dbSNP_INT,dbSNP_KGPhase1,dbSNP_KGPhase3,dbSNP_LSD,dbSNP_MTP,dbSNP_MUT,dbSNP_NOC,dbSNP_NOV,dbSNP_NSF,dbSNP_NSM,dbSNP_NSN,dbSNP_OM,dbSNP_OTH,dbSNP_PM,dbSNP_PMC,dbSNP_R3,dbSNP_R5,dbSNP_REF,dbSNP_RV,dbSNP_S3D,dbSNP_SAO,dbSNP_SLO,dbSNP_SSR,dbSNP_SYN,dbSNP_TOPMED,dbSNP_TPA,dbSNP_U3,dbSNP_U5,dbSNP_VC,dbSNP_VP,dbSNP_WGT,dbSNP_WTD,dbSNP_dbSNPBuildID,dbSNP_ID,dbSNP_FILTER,HGNC_Entrez_Gene_ID(supplied_by_NCBI),dbSNP_RSPOS,dbSNP_VLD,AS_FilterStatus,AS_SB_TABLE,AS_UNIQ_ALT_READ_COUNT,CONTQ,DP,ECNT,GERMQ,MBQ,MFRL,MMQ,MPOS,NALOD,NCount,NLOD,OCM,PON,POPAF,ROQ,RPA,RU,SEQQ,STR,STRANDQ,STRQ,TLOD,Normal_Sample_Barcode,MutationID,QSS,TQSS,NT,QSS_NT,TQSS_NT,SGT,SOMATIC,MQ,MQ0,ReadPosRankSum,SNVSB,PNOISE,PNOISE2,SomaticEVS,cosmic,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,AF,AF_raw,AF_male,AF_female,AF_afr,AF_ami,AF_amr,AF_asj,AF_eas,AF_fin,AF_nfe,AF_oth,AF_sas,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,mutation
0,A1BG,1.0,__UNKNOWN__,hg38,chr19,58350553,58350573,+,In_Frame_Del,DEL,CCAGGGCGAAGCGCGCGCCCT,CCAGGGCGAAGCGCGCGCCCT,-,,,RESPOND_60100650,__UNKNOWN__,-,-,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,,,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,g.chr19:58350553_58350573delCCAGGGCGAAGCGCGCGCCCT,ENST00000263100.8,-,6.0,1065_1085,c.989_1009delAGGGCGCGCGCTTCGCCCTGG,c.(988-1011)gagggcgcgcgcttcgccctggtg>gtg,p.EGARFAL330del,AC012313.3_ENST00000599109.5_RNA|AC012313.3_EN...,NM_130786.3,NP_570602,P04217,A1BG_HUMAN,alpha-1-B glycoprotein,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,,blood microparticle (GO:0072562)|extracellular...,,,,biliary_tract(2)|breast(11)|central_nervous_sy...,91.0,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,,TCCTCGCGCACCAGGGCGAAGCGCGCGCCCTCCAGGGGCGC,0.71734,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,__UNKNOWN__,,__UNKNOWN__,,,0.028,4,179,0,189,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,EU794585,HGNC:5,Approved,gene with protein product,protein-coding gene,,,,,19q13.43,7/13/15,,,,,ENSG00000121410,2591067,NM_130786,594,Immunoglobulin like domain containing,CCDS12976,OTTHUMG00000183507,138670,NM_130786,P04217,ENSG00000121410,uc002qsd.5,,A8K052|Q68CK0|Q8IYJ6|Q96P39,False,False,,False,False,,False,False,False,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,False,,False,,False,False,False,,,,False,,,,1.0,,False,SITE,"[131, 237|2, 2]",,,453.0,2.0,93.0,"[20, 20]","[193, 176]","[60, 60]",58.0,2.21,,42.06,,,6.0,93.0,,,,,,,12.2,RESPOND_60100650_C,RESPOND_60100650_chr19_A1BG_58350553,,,,,,,,,,,,,,,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,exonic,A1BG,.,.,.,1


## 