In [2]:
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt

In [3]:
def get_signatures_from_files(folder_path):
    """Load every signature CSV found directly inside ``folder_path``.

    Only regular files whose name ends in ``.csv`` (case-insensitive) are
    read. Sub-directories and unrelated files (for example
    ``.ipynb_checkpoints`` or ``.DS_Store``) are skipped instead of being
    handed to ``pd.read_csv``.

    Files are processed in sorted name order: ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the order of the returned
    lists differ between machines and runs.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory containing one CSV file per signature.

    Returns
    -------
    signatures : list of pandas.DataFrame
        One frame per CSV file, as parsed by ``pd.read_csv``.
    signature_names : list of str
        The file names with their ``.csv`` extension removed, in the same
        order as ``signatures``.
    """
    extension = '.csv'
    csv_files = sorted(
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(extension)
        and os.path.isfile(os.path.join(folder_path, entry))
    )

    signatures = [pd.read_csv(os.path.join(folder_path, entry)) for entry in csv_files]
    # Slice rather than removesuffix so that an upper-case ".CSV" is stripped too.
    signature_names = [entry[:-len(extension)] for entry in csv_files]
    return signatures, signature_names


In [4]:
def get_specific_signature(signatures_list, signature_names, signature='_GRCh37'):
    """Collect one column per signature into a single DataFrame.

    For the i-th entry, the column named ``signature_names[i] + signature``
    is taken from ``signatures_list[i]``. The selected columns are placed
    side by side, in the order of ``signature_names``.

    Parameters
    ----------
    signatures_list : list of pandas.DataFrame
        One frame per signature, positionally matching ``signature_names``.
    signature_names : list of str
        Base name of each signature (the column-name prefix).
    signature : str, optional
        Column-name suffix selecting which variant to extract.

    Returns
    -------
    pandas.DataFrame
        One column per signature; an empty frame when ``signature_names``
        is empty.

    Raises
    ------
    KeyError
        If a frame has no column named ``name + signature``.
    IndexError
        If ``signatures_list`` is shorter than ``signature_names``.
    """
    selected = [
        signatures_list[position][name + signature]
        for position, name in enumerate(signature_names)
    ]

    # pd.concat refuses an empty list, so keep the historical empty-frame result.
    if not selected:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop, which
    # copies all previously collected columns on every iteration.
    return pd.concat(selected, axis=1)

In [8]:
def get_distribution_of_samples(signatures, n_samples):
    """Draw random, normalised signature weights for each simulated sample.

    For every sample, each signature is independently switched on when
    ``random.uniform(0, 1) > 0.6`` (a 40% chance) and, if on, receives a
    strength of ``random.random() * 1.5 + 0.5`` (between 0.5 and 2). A draw in
    which no signature is on is discarded and repeated. The strengths are then
    divided by their sum so that every sample's weights add up to 1.

    Parameters
    ----------
    signatures : pandas.DataFrame
        Frame with one column per signature; only its column labels and
        column count are used here.
    n_samples : int
        Number of samples (columns of the result) to generate.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by ``signatures.columns``; columns are labelled
        ``0 .. n_samples - 1``. Each column sums to 1.

    Raises
    ------
    ValueError
        If ``signatures`` has no columns.
    """
    n_signatures = signatures.shape[1]
    if n_signatures == 0:
        # With no signatures every draw sums to 0, so the resampling loop
        # below would never terminate.
        raise ValueError("signatures must contain at least one column")

    # TODO: find what distribution of signatures to use (Article uses 5 out of 10 for each sample)
    presence_threshold = 0.6  # signature is present when uniform(0, 1) exceeds this
    min_strength = 0.5
    strength_span = 1.5       # strengths fall between 0.5 and 0.5 + 1.5 = 2

    def draw_strengths():
        # The presence test is evaluated before the strength is drawn, so the
        # sequence of calls into ``random`` matches the earlier implementation.
        return [
            random.random() * strength_span + min_strength
            if random.uniform(0, 1) > presence_threshold
            else 0
            for _ in range(n_signatures)
        ]

    weights_by_sample = {}
    for sample in range(n_samples):
        strengths = draw_strengths()
        # Redraw until at least one signature is present.
        while sum(strengths) == 0:
            strengths = draw_strengths()

        # Normalise so the weights of this sample sum to 1.
        total = sum(strengths)
        weights_by_sample[sample] = [value / total for value in strengths]

    # Build the frame in one go (no column-by-column insertion) and label the
    # rows with the signature names.
    return pd.DataFrame(weights_by_sample, index=signatures.columns)

In [9]:
def calculate_counts(signatures, sample_distributions, average_noise):
    """Turn signature weights into noisy integer mutation counts per sample.

    The expected profile of every sample is ``signatures.dot(sample_distributions)``.
    Each sample (column) is then scaled by its own total ``10 ** u`` with
    ``u`` drawn uniformly from [3, 4.7] (roughly 1000 to 50119, uniform on a
    log scale), truncated to integers, and an independent
    ``Poisson(average_noise)`` draw is added to every count. The noise is
    therefore never negative and raises each count by ``average_noise`` on
    average.

    Columns are processed by position, so the sample labels of
    ``sample_distributions`` may be anything (not only ``0 .. n-1``).

    Parameters
    ----------
    signatures : pandas.DataFrame
        One column per signature; its columns must match the index of
        ``sample_distributions`` (required by ``DataFrame.dot``).
    sample_distributions : pandas.DataFrame
        One column of signature weights per sample.
    average_noise : float
        Mean (lambda) of the Poisson noise added to every count.

    Returns
    -------
    pandas.DataFrame
        Integer counts with the index of ``signatures`` and the columns of
        ``sample_distributions``.

    Raises
    ------
    ValueError
        If the expected profile contains NaN or infinite values.
    """
    expected = signatures.dot(sample_distributions)
    expected_values = expected.to_numpy(dtype=float)
    if not np.isfinite(expected_values).all():
        # Such values cannot be converted to integer counts.
        raise ValueError("expected counts contain NaN or infinite values")

    n_rows, n_samples = expected_values.shape
    noisy_counts = np.empty((n_rows, n_samples), dtype=np.int64)
    for sample in range(n_samples):
        # Get the number of counts between 10**3 and 10**4.7 in logscale
        n_counts = 10 ** (random.uniform(3, 4.7))
        # astype truncates toward zero, like int() on each value.
        counts = (expected_values[:, sample] * n_counts).astype(np.int64)

        # Add Poisson noise (one independent draw per count)
        noise = np.random.poisson(average_noise, size=n_rows)
        noisy_counts[:, sample] = counts + noise

    return pd.DataFrame(noisy_counts, index=expected.index, columns=expected.columns)

In [13]:
# Get paths to working directory and files folder: the signature files are
# expected in <parent of the current working directory>/Mutational_Signatures.
dir_path = os.path.dirname(os.path.abspath(os.curdir))
dir_path = os.path.join(dir_path, 'Mutational_Signatures')

signature_type = '_GRCh37'
n_samples = 30
average_noise = 10

# The simulation draws from both `random` and `np.random`; seed both so that
# re-running the notebook reproduces the same simulated data.
seed = 42
random.seed(seed)
np.random.seed(seed)

signatures_list, signature_names = get_signatures_from_files(os.path.join(dir_path, 'cosmic_signatures'))
signatures = get_specific_signature(signatures_list, signature_names, signature_type)
sample_distributions = get_distribution_of_samples(signatures, n_samples)
simulated_data = calculate_counts(signatures, sample_distributions, average_noise)
print(simulated_data)

# TODO: make the row names match the mutation types

# Make sure the output folder exists; to_csv does not create missing directories.
output_path = 'simulated_data/test1'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
simulated_data.to_csv(output_path, index=True)

    0    1     2   3    4    5   6    7    8   9   ...   20  21    22  23  24  \
0   69  138  1738  88  404  125  89  791  514  66  ...   10  16  1994  53   9   
1   34   52   539  54  128   72  71  300  209  32  ...   36  15   804  22   8   
2   21   15    90  22   27   29  37   48   35  15  ...   10  11   233   8   3   
3   40   70   725  52  176   73  62  372  245  36  ...   28  10   943  25   6   
4   20   18    68  31   27   37  25   99   76  16  ...   38  16   130  11   8   
..  ..  ...   ...  ..  ...  ...  ..  ...  ...  ..  ...  ...  ..   ...  ..  ..   
91  11   79   127  50   40   76  11  515  424  53  ...  228  37   117  11  12   
92  14   97    34  71   19  114   7  765  627  80  ...  378  13    35  15   5   
93   9   25    80  19   26   31  12  148  114  21  ...   57  15    82   8   7   
94  10   28    65  24   19   29  14  159  123  17  ...   75  12    77  10  11   
95  13  108   181  76   51  111   7  825  662  84  ...  362  22   180  15  12   

     25    26   27   28   2

Only simulate 30 samples, since real-life databases are also small (though possibly not in our case?).

1. Choose 5 random signatures per sample to get a representation in combined probability (not really anymore, since it does not sum to 100%).
2. Randomly select between 1000 and 50000 mutations per sample, on a log scale.
3. Calculate the counts of the samples.
4. Add Poisson noise.