In [2]:

from importlib import reload
import plot, utils, data_load, clustering, constants, gsea 
import warnings

In [10]:
# Load the expression matrix and the MAF mutation table from the configured paths.
expression_df = data_load.load_txt_file_into_dataframe(file_path=constants.expression_file_path)
mutation_df = data_load.load_maf_data(file_path=constants.maf_file_path)

# Keep only the expression columns whose label appears in the MAF 'sample'
# column, i.e. individuals with at least one mutation (has sequencing data).
# Columns are selected directly instead of transposing, filtering rows and
# transposing back: the result is the same set of columns, but the matrix is
# not copied twice and column dtypes are not disturbed by the round trip.
# NOTE(review): the analysis cell further down reloads `expression_df` from
# disk, so this filter does not carry over to it - confirm that is intended.
sequenced_samples = mutation_df['sample'].unique()
expression_df = expression_df.loc[:, expression_df.columns.isin(sequenced_samples)]

In [18]:
# Third-party modules whose FutureWarnings are silenced while this cell runs.
modules_to_ignore = ("seaborn", "sklearn")

# Analysis parameters, named here so they can be tuned in one place.
N_TOP_GENES = 100               # `n` passed to generate_expression_heatmap
ROW_THRESHOLD = 7               # `row_threshold` passed to hierarchical_clustering
COL_THRESHOLD = 7               # `col_threshold` passed to hierarchical_clustering
SIGNIFICANCE_THRESHOLD = 0.05   # p-value cutoff for the volcano plot
LOGFC_POSITIVE_THRESHOLD = 2    # logFC cutoff for upregulated genes
LOGFC_NEGATIVE_THRESHOLD = -2   # logFC cutoff for downregulated genes

# catch_warnings() snapshots the current warning filters and restores them when
# the block exits, including when the analysis raises. The previous trailing
# warnings.resetwarnings() instead discarded *every* filter (defaults and user
# settings alike) and was skipped altogether if an exception occurred.
with warnings.catch_warnings():
    for module in modules_to_ignore:
        warnings.filterwarnings("ignore", category=FutureWarning, module=module)

    # Reload the local helper modules so on-disk edits are picked up without
    # restarting the kernel.
    reload(clustering)
    reload(utils)
    reload(data_load)
    reload(plot)
    reload(constants)
    reload(gsea)

    ## load data
    maf_df = data_load.load_maf_data(file_path=constants.maf_file_path)
    expression_df = data_load.load_txt_file_into_dataframe(file_path=constants.expression_file_path)
    expression_df_melted = data_load.reformat_expression_data(df=expression_df)
    mutation_expression_df_melted = data_load.preprocess_and_combine_mutation_expression(
        maf_df=maf_df,
        expression_df=expression_df_melted)

    # create data for GSEA analysis
    # this data is the same for all target genes, so it is built once outside the loop
    gsea_expression_path = gsea.create_gsea_expression_input(output_folder=constants.output_folder)

    # run analysis for set of genes; maps target gene -> its three silhouette scores
    scores_genes = {}
    for target_gene in constants.genes:

        # calculate logfc, pvalue
        volcano_plot_df, individuals_mutated_target_gene, output_stats_path = utils.generate_stats_per_gene(
            express_mut_genes_df=mutation_expression_df_melted,
            target_gene=target_gene,
            output_folder=constants.output_folder)

        # get the top N_TOP_GENES genes with the lowest p-values
        heatmap_data = data_load.generate_expression_heatmap(
            expression_df=expression_df,
            volcano_plot_df=volcano_plot_df,
            n=N_TOP_GENES)

        # cluster the top genes and the samples
        # evaluate clustering with silhouette scores
        # filter to only get the silhouette score for mutated samples
        # TODO: am I supposed to normalize expression data before clustering?
        # TODO: ask about how the expression data is made. is it counts? but I see non integers
        (row_linkage, col_linkage,
         row_clusters, col_clusters,
         row_score, col_score, mutated_col_score) = clustering.hierarchical_clustering(
            expression_df_heatmap=heatmap_data,
            output_folder=constants.output_folder,
            target_gene=target_gene,
            row_threshold=ROW_THRESHOLD,
            col_threshold=COL_THRESHOLD,
            mutated_samples=individuals_mutated_target_gene)

        print(f"for target gene {target_gene}  -- row score: {row_score}, col_score: {col_score}, mutated_col_score: {mutated_col_score}")

        scores_genes[target_gene] = {
            'gene_sil_score': row_score,
            'sample_sil_score': col_score,
            'mutated_sample_sil_score': mutated_col_score,
        }

        ## additional analysis
        mutated_status_df = utils.get_mutated_status(
            expression_df_heatmap=heatmap_data,
            individuals_mutated_target_gene=individuals_mutated_target_gene,
            target_gene=target_gene,
            output_folder=constants.output_folder)

        # plot clustered heatmap
        clustered_heatmap = plot.create_clustered_heatmap_and_save(
            heatmap_data,
            row_linkage,
            col_linkage,
            mutated_status_df,
            target_gene=target_gene,
            output_folder=constants.output_folder)

        # plot truncated dendrograms
        row_dendrogram, col_dendrogram = clustering.plot_and_save_dendrograms(
            row_linkage,
            col_linkage,
            heatmap_data,
            output_folder=constants.output_folder,
            target_gene=target_gene)

        # plot histograms of adjusted and regular pvalues
        plot.histogram_of_column_and_save(volcano_plot_df, 'pvalue', output_folder=constants.output_folder, target_gene=target_gene)
        plot.histogram_of_column_and_save(volcano_plot_df, 'adjusted_pvalue', output_folder=constants.output_folder, target_gene=target_gene)

        # calculate significant genes determined by cutoffs
        significant_genes_positive, significant_genes_negative = plot.volcano_plot(
            input_file_path=output_stats_path,
            yaxis='pvalue',
            xaxis='logFC',
            output_folder=constants.output_folder,
            target_gene=target_gene,
            significance_threshold=SIGNIFICANCE_THRESHOLD,
            logfold_positive_threshold=LOGFC_POSITIVE_THRESHOLD,
            logfold_negative_threshold=LOGFC_NEGATIVE_THRESHOLD)

        # create box plots of significant genes (upregulated and downregulated genes)
        plot.create_gene_expression_boxplot(expression_df, significant_genes_positive, mutated_status_df, positive=1, output_folder=constants.output_folder, target_gene=target_gene)
        plot.create_gene_expression_boxplot(expression_df, significant_genes_negative, mutated_status_df, positive=0, output_folder=constants.output_folder, target_gene=target_gene)

        # write the per-gene GSEA inputs (mutation labels and the pre-ranked list)
        gsea.create_mutation_label_gsea(
            gsea_expression_path=gsea_expression_path,
            output_folder=constants.output_folder,
            target_gene=target_gene)
        gsea.create_gsea_expression_input_preranked(
            output_folder=constants.output_folder,
            target_gene=target_gene,
            stats_path=output_stats_path)

fraction of rows filtered is 0.2865853658536585
outputting data to output_data_v2/FOXA1/15_104_logfc_pvalue.csv
for target gene FOXA1  -- row score: 0.25602165548062367, col_score: 0.04708802972647411, mutated_col_score: 0.06865564802385617
outputting data to output_data_v2/GYPB/1_116_logfc_pvalue.csv
for target gene GYPB  -- row score: 0.23421231675385734, col_score: 0.06410185672152817, mutated_col_score: 0.0
for gene GYPB no significant positive  genes found, so no box plot created
for gene GYPB no significant negative  genes found, so no box plot created
outputting data to output_data_v2/ID1/1_116_logfc_pvalue.csv
for target gene ID1  -- row score: 0.23923581352611958, col_score: 0.03606137333522512, mutated_col_score: 0.0
for gene ID1 no significant positive  genes found, so no box plot created
outputting data to output_data_v2/RPL10/5_112_logfc_pvalue.csv
for target gene RPL10  -- row score: 0.32735862367584057, col_score: 0.03903956075177235, mutated_col_score: 0.101919793225034

In [None]:
reload(utils)
import joblib

# Persist the silhouette scores collected for every target gene.
joblib.dump(scores_genes, 'all_genes_data_v2.joblib')

# Ask utils for the n best-scoring genes and persist that selection as well.
top_genes = utils.output_genes_with_highest_score(scores_genes, n = 10)
joblib.dump(top_genes, 'top_genes_data.joblib')

# Report each selected gene with its three silhouette scores.
print("Genes with the Highest Mutated Sample Silhouette Score:")
for rank, (gene, scores) in enumerate(top_genes, start=1):
    print(
        f"{rank}. Gene: {gene}",
        f"   Mutated Sample Silhouette Score: {scores['mutated_sample_sil_score']}",
        f"   Gene Silhouette Score: {scores['gene_sil_score']}",
        f"   Sample Silhouette Score: {scores['sample_sil_score']}",
        sep="\n",
    )
    print("\n")


# To restore the saved results in a later session:
# loaded_top_genes = joblib.load('top_genes_data.joblib')
# loaded_all_genes = joblib.load('all_genes_data_v2.joblib')





Genes with the Highest Mutated Sample Silhouette Score:
1. Gene: ERF
   Mutated Sample Silhouette Score: 0.39251641278388594
   Gene Silhouette Score: 0.20313066882688563
   Sample Silhouette Score: 0.05566518718633649


2. Gene: TRPM1
   Mutated Sample Silhouette Score: 0.3348453681669478
   Gene Silhouette Score: 0.1645550843863594
   Sample Silhouette Score: 0.02277090086582008


3. Gene: NDST2
   Mutated Sample Silhouette Score: 0.3220168796501848
   Gene Silhouette Score: 0.30216158086509837
   Sample Silhouette Score: 0.1017360369448713


4. Gene: TSPAN9
   Mutated Sample Silhouette Score: 0.31293881694406067
   Gene Silhouette Score: 0.15608305271917275
   Sample Silhouette Score: 0.022227998832706056


5. Gene: MED12
   Mutated Sample Silhouette Score: 0.22894002303535443
   Gene Silhouette Score: 0.181637859072286
   Sample Silhouette Score: 0.06157907971023549


6. Gene: SPOP
   Mutated Sample Silhouette Score: 0.1963834401696408
   Gene Silhouette Score: 0.24124544217953459


In [None]:

# NOTE(review): this whole cell is disabled (commented-out) code.
# - `gene_data` is not defined in any cell visible in this notebook; presumably
#   it is the pre-ranked gene list - confirm before re-enabling.
# - `gene_sets` is an absolute, machine-specific path; it would need to be made
#   configurable for the cell to run elsewhere.
# - The output recorded for this cell reports "No gene sets passed through
#   filtering condition" - presumably a gene-name / GMT-format / size-filter
#   mismatch; verify.
# import gseapy as gp

# enr = gp.prerank(rnk=gene_data,
#                  gene_sets= "/Users/meltemtutar/Documents/Huang/Respond/genesets.v2023.2.Hs.cleaned.gmt",
#                  outdir='gsea_result',
#                  permutation_num=1000,
#                  min_size=5,
#                  max_size=2000,
#                  format='png')

2023-12-26 16:29:29,860 No gene sets passed through filtering condition!!!, try new parameters again!
Note: check gene name, gmt file format, or filtering size.


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
