In [1]:
from SigProfilerExtractor import estimate_best_solution as ebs
from SigProfilerMatrixGenerator import install as genInstall
from SigProfilerExtractor import sigpro as sig
from SigProfilerExtractor import estimate_best_solution as ebs
import pandas as pd


# Install reference genome if necessary.
# NOTE: this call sits at module top level (outside the `__main__` guard), so it
# executes every time this file/cell is run or imported.
genInstall.install('GRCh37')


def convert_csv_to_tsv(input_path, output_path):
    """
    Rewrite a comma-separated file as a tab-separated file.

    The file at ``input_path`` is parsed with pandas' default CSV settings and
    written to ``output_path`` using a tab delimiter. The DataFrame index is
    not written out.
    """
    pd.read_csv(input_path).to_csv(output_path, index=False, sep="\t")


def run_sigprofiler(input_csv, output_dir, ref_genome, min_sig, max_sig, nmf_replicates, cpu_cores, stability_threshold, min_stability_threshold, combined_stability_threshold):
    """
    Run SigProfilerExtractor on a CSV matrix, then re-estimate the best solution.

    Steps:
      1. Convert ``input_csv`` to a tab-separated file next to it.
      2. Call ``sig.sigProfilerExtractor`` with ``input_type="matrix"`` on that file.
      3. Call ``ebs.estimate_solution`` on the files written under
         ``<output_dir>/SBS96`` using the supplied stability thresholds.

    Args:
        input_csv: Path to the comma-separated input matrix.
        output_dir: Directory passed to the extractor as ``output``; the
            solution-estimation step reads from and writes to it as well.
        ref_genome: Forwarded as ``reference_genome``.
        min_sig: Forwarded as ``minimum_signatures``.
        max_sig: Forwarded as ``maximum_signatures``.
        nmf_replicates: Forwarded as ``nmf_replicates``.
        cpu_cores: Forwarded as ``cpu``.
        stability_threshold: Forwarded as ``stability``.
        min_stability_threshold: Forwarded as ``min_stability``.
        combined_stability_threshold: Forwarded as ``combined_stability``.

    Returns:
        Whatever ``ebs.estimate_solution`` returns.
    """
    import os  # local import so this function does not depend on file-level edits

    # Swap only the final extension. A plain str.replace(".csv", ".tsv") would
    # also rewrite ".csv" inside directory names and would leave the path
    # unchanged (overwriting the input) when the suffix is not exactly ".csv".
    tsv_path = os.path.splitext(input_csv)[0] + ".tsv"
    convert_csv_to_tsv(input_csv, tsv_path)

    # Run SigProfilerExtractor
    sig.sigProfilerExtractor(
        input_type="matrix",
        output=output_dir,
        input_data=tsv_path,
        reference_genome=ref_genome,
        minimum_signatures=min_sig,
        maximum_signatures=max_sig,
        nmf_replicates=nmf_replicates,
        cpu=cpu_cores,
    )

    # Locate the extractor's results relative to output_dir instead of a
    # hard-coded "results" folder, so any output directory works.
    # The "SBS96" sub-folder is assumed to be the context written for this input.
    context_dir = os.path.join(output_dir, "SBS96")
    base_csvfile = os.path.join(context_dir, "All_solutions_stat.csv")
    # NOTE(review): the extractor's README example names this folder
    # "All_Solutions" (capital S); confirm the spelling on case-sensitive
    # file systems.
    all_solutions_folder = os.path.join(context_dir, "All_solutions")
    genomes = os.path.join(context_dir, "Samples.txt")
    title = "Selection_Plot"  # Title of the output plot

    optimal_solution = ebs.estimate_solution(
        base_csvfile=base_csvfile,
        All_solution=all_solutions_folder,
        genomes=genomes,
        output=output_dir,
        title=title,
        stability=stability_threshold,
        min_stability=min_stability_threshold,
        combined_stability=combined_stability_threshold,
    )

    # Previously computed and discarded; hand it back so callers can use it.
    return optimal_solution



if __name__ == "__main__":
    # --- Extraction settings -------------------------------------------------
    input_csv = "simulated_data/s_25_n_0.06_GRCh37_17b_86_98_39_22a_43_17a_13_54_33_21_59_60_87_37_96_28_55_99_26_3_1_12_93_22b.csv"
    output_dir = "results"
    ref_genome = "GRCh37"
    min_sig, max_sig = 4, 20  # range of signature counts to try
    nmf_replicates = 30
    cpu_cores = -1  # presumably "use all available cores" -- confirm in the SigProfilerExtractor docs

    # --- Thresholds for choosing the number of signatures ---------------------
    stability_threshold = 0.8           # average stability
    min_stability_threshold = 0.2       # minimum stability
    combined_stability_threshold = 1.0  # combined stability

    # Kick off conversion, extraction and solution estimation.
    run_sigprofiler(
        input_csv=input_csv,
        output_dir=output_dir,
        ref_genome=ref_genome,
        min_sig=min_sig,
        max_sig=max_sig,
        nmf_replicates=nmf_replicates,
        cpu_cores=cpu_cores,
        stability_threshold=stability_threshold,
        min_stability_threshold=min_stability_threshold,
        combined_stability_threshold=combined_stability_threshold,
    )



Tool       | Installed 
-----------------------
curl       | True      
wget       | False     
rsync      | True      


INFO - GRCh37 is already installed.


All reference files have been created.
To proceed with matrix_generation, please provide the path to your vcf files and an appropriate output path.
Installation complete.

************** Reported Current Memory Use: 0.27 GB *****************

Extracting signature 4 for mutation type 96
The matrix normalizing cutoff is 27328




KeyboardInterrupt: 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 4 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 continues please wait... 
execution time: 7 seconds 

process 4 continues please wait... 
execution time: 4 seconds 

process 4 continues please wait... 
execution time: 4 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 8 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 continues please wait... 
execution time: 4 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
exec


************** Reported Current Memory Use: 0.17 GB *****************

Extracting signature 4 for mutation type 96
The matrix normalizing cutoff is 27328


process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 4 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 4 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 continues please wait... 
execution time: 6 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 continues please wait... 
execution time: 6 seconds 

process 4 continues please wait... 
execution time: 7 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 3 seconds 

process 4 continues please wait... 
execution time: 5 seconds 

process 4 c