In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up on the next cell run without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import SimulateData

In [4]:
# Simulation settings dictionary.
config = {
    "signatures_file_path": "cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt",
    "signatures_to_extract": ["SBS4", "SBS6", "SBS7a", "SBS9", "SBS18"],
    "n_samples": 100,
    "save_dir": "simulated_data_01_14",
    "sample_signature_distribution": {
        "distribution": "uniform",
        "min": 0.5,
        "max": 2,
        "use_sign_active_prob": True,
        # Only used if use_sign_active_prob is True.
        "sign_active_prob": 0.4,
        # Only used if use_sign_active_prob is False.
        "n_sign_active": 2,
    },
    "noise_distribution": {
        "distribution": "poisson",
        "avg_perc": 0.05,
    },
    "counts_distribution": {
        # Specify the cancer type, or use "random" to let each sample be a random
        # one from the file 'mutation_counts/TCGA/WES_TCGA.96_min_max.csv'.
        "cancer_type": "random",
        # To fill in min and max by hand instead of deriving them from the
        # cancer type, use "NA" and uncomment "min"/"max" below:
        # "cancer_type": "NA",
        "distribution": "logscale",
        # "min": 1000,    # Only used if cancer_type = NA
        # "max": 50000,   # Only used if cancer_type = NA
    },
}

In [None]:
# List the contents of the simulated_data directory (shell command run through IPython's `!`).
!ls simulated_data

In [6]:
import random
import re

import numpy as np
import pandas as pd

# Read the tab-separated COSMIC signature table and index it by its 'Type' column.
signatures = pd.read_csv(
    'cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt', sep='\t'
).set_index('Type')

# Keep every column except those whose name ends in "SBS5", or in "SBS40"
# followed by at most one further character.
signatures = signatures[
    [name for name in signatures.columns if re.search(r'SBS5$|SBS40.?$', name) is None]
]

# For each remaining column: take its dot product with every other column and
# average those products. The result is one score per column, largest first.
# NOTE(review): because the sort is descending, the head of `orthogonal` holds
# the columns with the LARGEST mean dot product against the rest, despite the
# variable's name. Confirm this is the intended ordering for the
# `orthogonal[:signature_count]` selections made in later cells.
orthogonal = pd.Series({
    name: signatures.drop(columns=name)
                    .apply(lambda other_column: np.dot(column, other_column))
                    .mean()
    for name, column in signatures.items()
}).sort_values(ascending=False)

In [7]:
# for signature_count in [8, 15, 25]:
#     signature_list = " ".join(orthogonal[:signature_count].index)
#     for noise_level in [0.02, 0.04, 0.06]:
#         for mutation_count in [(500, 2000), (2000, 5000), (5000, 8000)]:
#             print(signature_list)
#             identifier = f"s_{signature_count}_n_{noise_level}"
#             !python SimulateData.py --identifier {identifier} --config "simulated_data/config_v1.json" --signatures_to_extract $signature_list --noise_distribution_avg_perc $noise_level --counts_distribution_min $mutation_count[0] --counts_distribution_max $mutation_count[1]
#             break

In [None]:
for signature_count in [8, 15, 25]:
    signature_list = " ".join(orthogonal[:signature_count].index)
    for noise_level in [0.02, 0.04, 0.06]:
        print(signature_list)
        identifier = f"s_{signature_count}_n_{noise_level}"
        !python SimulateData.py --identifier {identifier} --config "simulated_data/config_v2.json" --signatures_to_extract $signature_list --noise_distribution_avg_perc $noise_level
        break

In [9]:
# Quick test
# !python SimulateData.py --identifier "352" --config "simulated_data/config_v2.json" --signatures_to_extract $signature_list --noise_distribution_avg_perc $noise_level

In [None]:
# Base settings dictionary for the in-process simulation runs.
config = {
    "signatures_file_path": "cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt",
    "signatures_to_extract": ["SBS4", "SBS6", "SBS7a", "SBS9", "SBS18"],
    "n_samples": 100,
    "save_dir": "simulated_data",
    "sample_signature_distribution": {
        "distribution": "uniform",
        "min": 0.5,
        "max": 2,
        "use_sign_active_prob": True,
        "sign_active_prob": 0.4,
        "n_sign_active": 2,
    },
    "noise_distribution": {
        "distribution": "poisson",
        "avg_perc": 0.05,
    },
    "counts_distribution": {
        "cancer_type": "NA",
        "distribution": "logscale",
    },
}

# Mutation-count settings shared by every run in this sweep.
# NOTE(review): cancer_type is a single space-separated string naming three
# cancer types; confirm SimulateData.simulate_data accepts this form.
config['counts_distribution'].update({
    'distribution': "logscale",
    'cancer_type': 'Thy-AdenoCa Biliary-AdenoCa Skin-Melanoma',
})

# Run one simulation per (signature_count, noise_level) pair. `config` is
# mutated in place between runs, so after the cell finishes it holds the
# settings of the last combination.
for signature_count in (8, 15, 25):
    # The first `signature_count` labels of `orthogonal`, as a plain list.
    signature_list = orthogonal.index[:signature_count].tolist()
    config['signatures_to_extract'] = signature_list
    for noise_level in (0.02, 0.04, 0.06):
        config['noise_distribution']['avg_perc'] = noise_level
        identifier = f"s_{signature_count}_n_{noise_level}_"
        config['identifier'] = identifier
        SimulateData.simulate_data(config, print_text=False)


# config['counts_distribution']['distribution'] = "logscale"
# for signature_count in [8, 15, 25]:
#     signature_list = list(orthogonal[:signature_count].index)
#     config['signatures_to_extract'] = signature_list
#     for noise_level in [0.02, 0.04, 0.06]:
#         config['noise_distribution']['avg_perc'] = noise_level
#         for cancer_type in ['Thy-AdenoCa', 'Biliary-AdenoCa', 'Skin-Melanoma']:
#             config['counts_distribution']['cancer_type'] = cancer_type
#             identifier = f"s_{signature_count}_n_{noise_level}_c_{cancer_type}"
#             config.update({'identifier': identifier})
#             SimulateData.simulate_data(config, print_text=False)

# config['counts_distribution']['cancer_type'] = "NA"
# config['counts_distribution']['distribution'] = "logscale"
# for signature_count in [8, 15, 25]:
#     signature_list = list(orthogonal[:signature_count].index)
#     config['signatures_to_extract'] = signature_list
#     for noise_level in [0.02, 0.04, 0.06]:
#         config['noise_distribution']['avg_perc'] = noise_level
#         for count_range in [(0, 3000), (3000, 6000), (6000, 9000)]:
#             config['counts_distribution']['min'] = count_range[0]
#             config['counts_distribution']['max'] = count_range[1]
#             identifier = f"s_{signature_count}_n_{noise_level}_c_{count_range[0]}_{count_range[1]}"
#             config.update({'identifier': identifier})
#             SimulateData.simulate_data(config, print_text=False)

# config['counts_distribution']['cancer_type'] = "random"
# config['counts_distribution']['distribution'] = "logscale"
# for signature_count in [8, 15, 25]:
#     signature_list = orthogonal[:signature_count].index
#     signature_list2 = list(orthogonal[:signature_count].index)
#     signature_list3 = orthogonal[:signature_count]
#     config['signatures_to_extract'] = signature_list2
#     for noise_level in [0.02, 0.04, 0.06]:
#         config['noise_distribution']['avg_perc'] = noise_level
#         identifier = f"s_{signature_count}_n_{noise_level}"
#         config.update({'identifier': identifier})
#         SimulateData.simulate_data(config, print_text=False)