In [2]:
import pandas as pd
import random
import re
import numpy as np


# Load the COSMIC SBS reference table (tab-separated) and index its rows by
# the 'Type' column.
signatures = pd.read_csv('cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt', sep='\t').set_index('Type')

# Drop every column whose name ends in "SBS5", or in "SBS40" followed by at
# most one further character.
excluded_name = re.compile(r'SBS5$|SBS40.?$')
signatures = signatures[[name for name in signatures.columns if excluded_name.search(name) is None]]


def _mean_dot_with_others(name, profile):
    """Return the mean dot product of `profile` with every column except `name`."""
    remaining = signatures.drop(columns=name)
    return remaining.apply(lambda column: np.dot(profile, column)).mean()


# One score per signature column: its average dot product against all the
# other remaining columns, ranked from the largest score to the smallest.
# NOTE(review): the descending sort puts the signatures with the LARGEST mean
# dot product first, and the next cell takes the first N entries of this
# ranking. Confirm that is the intended direction given the name `orthogonal`.
orthogonal = pd.Series(
    {name: _mean_dot_with_others(name, profile) for name, profile in signatures.items()}
).sort_values(ascending=False)

In [3]:
for signature_count in [8, 15, 25]:
    signature_list = " ".join(orthogonal[:signature_count].index)
    for noise_level in [0.2, 0.4, 0.6]:
        print(signature_list)
        identifier = f"s_{signature_count}_n_{noise_level}"
        !python SimulateData.py --identifier {identifier} --config "simulated_data/config_v1.json" --signatures_to_extract $signature_list --noise_distribution_avg_perc $noise_level


SBS10a SBS56 SBS10d SBS52 SBS36 SBS91 SBS45 SBS38
signatures_file_path
signatures_to_extract
False
0.2
{'counts_distribution': {'distribution': 'logscale', 'max': 50000, 'min': 1000},
 'identifier': 's_8_n_0.2',
 'n_samples': 100,
 'noise_distribution': {'avg_perc': 0.2, 'distribution': 'poisson'},
 'sample_signature_distribution': {'distribution': 'uniform',
                                   'max': 2,
                                   'min': 0.5,
                                   'n_sign_active': 2,
                                   'sign_active_prob': 0.4,
                                   'use_sign_active_prob': False},
 'save_dir': 'simulated_data',
 'signatures_file_path': 'cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt',
 'signatures_to_extract': ['SBS10a',
                           'SBS56',
                           'SBS10d',
                           'SBS52',
                           'SBS36',
                           'SBS91',
                           'SBS45',
      

Sucessfully saved simulated data in simulated_data/s_15_n_0.4_GRCh37_10a_56_10d_52_36_91_45_38_10c_14_18_7b_7a_23_19.csv
Sucessfully saved meta-data in simulated_data/config_s_15_n_0.4_GRCh37_10a_56_10d_52_36_91_45_38_10c_14_18_7b_7a_23_19.json

            0    1     2     3     4     5   ...    94    95    96    97   98    99
Type                                         ...                                   
A[C>A]A  10235  557  8940  2252  2738  2193  ...  3298  1062  1822  7071  525  3457
A[C>A]C  10228  591  8973  2206  2808  2133  ...  3414  1074  1690  7209  553  3539
A[C>A]G   9869  577  8872  2137  2692  2208  ...  3128  1080  1641  7064  545  3243
A[C>A]T  10204  574  9500  2171  3058  2228  ...  3219  1086  1686  7707  518  3362
A[C>G]A   9924  569  8799  2133  2718  2199  ...  3174  1039  1748  7133  537  3386

[5 rows x 100 columns]
SBS10a SBS56 SBS10d SBS52 SBS36 SBS91 SBS45 SBS38 SBS10c SBS14 SBS18 SBS7b SBS7a SBS23 SBS19
signatures_file_path
signatures_to_extract
False


SBS10a SBS56 SBS10d SBS52 SBS36 SBS91 SBS45 SBS38 SBS10c SBS14 SBS18 SBS7b SBS7a SBS23 SBS19 SBS30 SBS11 SBS95 SBS20 SBS10b SBS58 SBS2 SBS15 SBS42 SBS4
signatures_file_path
signatures_to_extract
False
0.6
{'counts_distribution': {'distribution': 'logscale', 'max': 50000, 'min': 1000},
 'identifier': 's_25_n_0.6',
 'n_samples': 100,
 'noise_distribution': {'avg_perc': 0.6, 'distribution': 'poisson'},
 'sample_signature_distribution': {'distribution': 'uniform',
                                   'max': 2,
                                   'min': 0.5,
                                   'n_sign_active': 2,
                                   'sign_active_prob': 0.4,
                                   'use_sign_active_prob': False},
 'save_dir': 'simulated_data',
 'signatures_file_path': 'cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt',
 'signatures_to_extract': ['SBS10a',
                           'SBS56',
                           'SBS10d',
                           'SBS52',
           