In [5]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import SimulateData

In [2]:
# Settings for one simulation run; the whole dict is handed to
# SimulateData.simulate_data below.
config = dict(
    signatures_file_path='cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt',
    signatures_to_extract=['SBS4', 'SBS6', 'SBS7a', 'SBS9', 'SBS18'],
    n_samples=10,
    save_dir='simulated_data',
    sample_signature_distribution=dict(
        distribution='uniform',
        min=0.5,
        max=2,
        use_sign_active_prob=True,
        sign_active_prob=0.4,   # Only used if use_sign_active_prob is True
        n_sign_active=2,        # Only used if use_sign_active_prob is False
    ),
    noise_distribution=dict(
        distribution='poisson',
        avg_perc=0.05,
    ),
    counts_distribution=dict(
        distribution='logscale',
        min=1000,
        max=50000,
    ),
)

# simulate_data returns three values. The first exposes .head() and is
# previewed here; the other two are presumably the locations of the written
# data and config files -- TODO confirm against SimulateData.
simulated_data, data_file, config_file = SimulateData.simulate_data(config)
print()
print(simulated_data.head())

Sucessfully saved simulated data in simulated_data/data_v2_GRCh37_4_6_7a_9_18.csv
Sucessfully saved meta-data in simulated_data/config_v2.json

           0    1    2     3    4   5     6     7    8    9
Type                                                       
A[C>A]A  161  213  114  3122  167  67  3057  2027  899  124
A[C>A]C  181  152   96  2338  162  58  2400  1888  719   90
A[C>A]G  139  147   88  2028  143  47  2150  1505  579   92
A[C>A]T  173  171  106  2476  158  58  2496  1839  727  103
A[C>G]A  140  157   99  2031  163  59  2109  1508  627   90


In [4]:
# Shell escape: list the contents of the simulated_data directory.
!ls simulated_data

config_v1.json	data_v1_GRCh37_4_6_7a_9_18.csv
config_v2.json	data_v2_GRCh37_4_6_7a_9_18.csv


In [70]:
import pandas as pd
import random
import re
# Read the tab-separated COSMIC signature table, index it by its 'Type'
# column, and keep only the columns whose name does NOT match the exclusion
# regex (a name ending in "SBS5", or ending in "SBS40" plus at most one
# trailing character).
signatures = (
    pd.read_csv('cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt', sep='\t')
    .set_index('Type')
    .pipe(lambda table: table[[name for name in table.columns
                               if re.search(r'SBS5$|SBS40.?$', name) is None]])
)

import numpy as np


# For every signature column, take its dot product with each of the other
# columns and average those products. The result is a Series keyed by
# signature name, sorted from the largest mean dot product to the smallest.
# NOTE(review): descending order puts the signatures with the LARGEST mean dot
# product (i.e. the least orthogonal to the rest) first -- confirm this is the
# intended ranking given the variable name.
orthogonal = pd.Series({
    name: signatures.drop(columns=name)
                    .apply(lambda other_profile: np.dot(profile, other_profile))
                    .mean()
    for name, profile in signatures.items()
}).sort_values(ascending=False)

In [71]:
# Nested sweep over signature-set sizes and noise levels; each executed
# iteration shells out to the SimulateData.py command-line script.
for signature_count in [8, 15, 25]:
    # Space-separated names of the first `signature_count` entries of `orthogonal`.
    signature_list = " ".join(orthogonal[:signature_count].index)
    for noise_level in [0.02, 0.04, 0.06]:
        print(signature_list)
        # Run label built from the signature count and noise level; passed via --identifier.
        identifier = f"s_{signature_count}_n_{noise_level}"
        # IPython expands {identifier}, $signature_list and $noise_level before the
        # shell runs. $signature_list is unquoted, so the shell splits it into one
        # argument per signature name.
        !python SimulateData.py --identifier {identifier} --config "simulated_data/config_v1.json" --signatures_to_extract $signature_list --noise_distribution_avg_perc $noise_level
        # NOTE(review): this break exits the inner loop after its first iteration,
        # so only noise_level 0.02 is ever run for each signature count. It looks
        # like a leftover from testing -- confirm before relying on the full sweep.
        break

SBS10a SBS56 SBS10d SBS52 SBS36 SBS91 SBS45 SBS38
signatures_file_path
signatures_to_extract
False
0.02
{'counts_distribution': {'distribution': 'logscale', 'max': 50000, 'min': 1000},
 'identifier': 's_8_n_0.02',
 'n_samples': 100,
 'noise_distribution': {'avg_perc': 0.02, 'distribution': 'poisson'},
 'sample_signature_distribution': {'distribution': 'uniform',
                                   'max': 2,
                                   'min': 0.5,
                                   'n_sign_active': 2,
                                   'sign_active_prob': 0.4,
                                   'use_sign_active_prob': False},
 'save_dir': 'simulated_data',
 'signatures_file_path': 'cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt',
 'signatures_to_extract': ['SBS10a',
                           'SBS56',
                           'SBS10d',
                           'SBS52',
                           'SBS36',
                           'SBS91',
                           'SBS45',
   

'SBS10a SBS56 SBS10d SBS52 SBS36 SBS91 SBS45 SBS38'