In [6]:
import pandas as pd
import random
import re
import numpy as np


# Read the tab-separated signature table and use its 'Type' column as the row
# index, so that every remaining column is one signature's vector of values.
signatures = pd.read_csv(
    'cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt', sep='\t'
).set_index('Type')

# Drop every column whose name ends in "SBS5", or in "SBS40" optionally
# followed by one more character; those columns take no part in the ranking.
excluded_name = re.compile(r'SBS5$|SBS40.?$')
signatures = signatures[
    [name for name in signatures.columns if excluded_name.search(name) is None]
]

# For each remaining column, take its dot product with every *other* column and
# average the results. Sorting ascending puts the columns with the lowest mean
# dot product (the most dissimilar to the rest) first.
# NOTE(review): a raw dot product also scales with each vector's magnitude, so
# this is not a pure angle measure - confirm cosine similarity is not intended.
orthogonal = pd.Series({
    name: (
        signatures
        .drop(columns=name)
        .apply(lambda column: np.dot(profile, column))
        .mean()
    )
    for name, profile in signatures.items()
}).sort_values()

In [7]:
for signature_count in [8, 15, 25]:
    signature_list = " ".join(orthogonal[:signature_count].index)
    print(signature_list)

    for noise_level in [0.02, 0.04, 0.06]:
        identifier = f"s_{signature_count}_n_{noise_level}"
        !python SimulateData.py --identifier {identifier} --config "simulated_data/config_v1.json" --signatures_to_extract $signature_list --noise_distribution_avg_perc $noise_level


SBS17b SBS86 SBS98 SBS39 SBS22a SBS43 SBS17a SBS13
signatures_file_path
signatures_to_extract
False
0.02
{'counts_distribution': {'distribution': 'logscale', 'max': 50000, 'min': 1000},
 'identifier': 's_8_n_0.02',
 'n_samples': 100,
 'noise_distribution': {'avg_perc': 0.02, 'distribution': 'poisson'},
 'sample_signature_distribution': {'distribution': 'uniform',
                                   'max': 2,
                                   'min': 0.5,
                                   'n_sign_active': 2,
                                   'sign_active_prob': 0.4,
                                   'use_sign_active_prob': False},
 'save_dir': 'simulated_data',
 'signatures_file_path': 'cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt',
 'signatures_to_extract': ['SBS17b',
                           'SBS86',
                           'SBS98',
                           'SBS39',
                           'SBS22a',
                           'SBS43',
                           'SBS17a',
 

Sucessfully saved simulated data in simulated_data/s_15_n_0.04_GRCh37_17b_86_98_39_22a_43_17a_13_54_33_21_59_60_87_37.csv
Sucessfully saved meta-data in simulated_data/config_s_15_n_0.04_GRCh37_17b_86_98_39_22a_43_17a_13_54_33_21_59_60_87_37.json

         0    1    2    3   4     5     6   ...   93   94    95  96   97    98   99
Type                                        ...                                    
A[C>A]A  91  107  101  290  83   672  1470  ...  556   90  1723  51  103  1882  475
A[C>A]C  70   94  105  261  63   685  1473  ...  490   95  1687  51  100  1935  468
A[C>A]G  72  125  106  266  67   616  1351  ...  533  109  1604  49   92  1766  421
A[C>A]T  86   83  113  287  60   635  1457  ...  507   83  1736  47   92  2208  432
A[C>G]A  99  101  116  271  87  1024  2021  ...  732   92  2867  70  210  2919  645

[5 rows x 100 columns]
signatures_file_path
signatures_to_extract
False
0.06
{'counts_distribution': {'distribution': 'logscale', 'max': 50000, 'min': 1000},
 'ide

Sucessfully saved simulated data in simulated_data/s_25_n_0.06_GRCh37_17b_86_98_39_22a_43_17a_13_54_33_21_59_60_87_37_96_28_55_99_26_3_1_12_93_22b.csv
Sucessfully saved meta-data in simulated_data/config_s_25_n_0.06_GRCh37_17b_86_98_39_22a_43_17a_13_54_33_21_59_60_87_37_96_28_55_99_26_3_1_12_93_22b.json

           0     1     2     3    4    5   ...    94    95    96  97   98    99
Type                                       ...                                 
A[C>A]A  1209  2913  2922  1583  172  321  ...  3517  1132  2303  69  185  1511
A[C>A]C  1064  2901  2887  1618  202  282  ...  3348  1082  2290  70  177  1368
A[C>A]G  1144  3437  2683  1545  170  273  ...  2895  1088  2271  67  200  1394
A[C>A]T  1158  2931  2702  1556  193  275  ...  3354  1119  2302  74  179  1389
A[C>G]A  1166  2910  4135  1973  240  367  ...  3382  1454  3082  73  154  1714

[5 rows x 100 columns]
