In [56]:
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt

In [75]:
def get_signatures_from_files(folder_path):
    """Load every signature CSV file found directly inside ``folder_path``.

    Only regular files whose name ends in ``.csv`` are read. Other directory
    entries (sub-directories such as ``.ipynb_checkpoints``, stray text
    files, ...) are skipped so they can neither break ``pd.read_csv`` nor end
    up as bogus signature names. Files are processed in sorted name order
    because ``os.listdir`` returns entries in arbitrary order.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory that contains the signature CSV files.

    Returns
    -------
    signatures : list of pandas.DataFrame
        One frame per CSV file, parsed with ``pd.read_csv`` defaults.
    signature_names : list of str
        The file names with the ``.csv`` suffix removed, in the same order
        as ``signatures``.
    """
    csv_files = sorted(
        entry
        for entry in os.listdir(folder_path)
        if entry.endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, entry))
    )
    signatures = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
    signature_names = [name.removesuffix('.csv') for name in csv_files]
    return signatures, signature_names


In [79]:
def get_specific_signature(signatures_list, signature_names, signature='_GRCh37'):
    """Collect one column from each signature frame into a single DataFrame.

    For every position ``i`` the column named
    ``signature_names[i] + signature`` is taken from ``signatures_list[i]``.
    The selected columns are placed side by side in the order of
    ``signature_names``.

    Parameters
    ----------
    signatures_list : list of pandas.DataFrame
        Frames to pick a column from; must be at least as long as
        ``signature_names``.
    signature_names : list of str
        Column-name prefixes, one per frame.
    signature : str, default '_GRCh37'
        Suffix appended to each name to build the column label.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``signature_names``; an empty frame when
        ``signature_names`` is empty.

    Raises
    ------
    KeyError
        If a frame does not contain the requested column.
    IndexError
        If ``signatures_list`` is shorter than ``signature_names``.
    """
    selected = [
        signatures_list[i][name + signature]
        for i, name in enumerate(signature_names)
    ]
    if not selected:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(selected, axis=1)

In [86]:
def get_distribution_of_samples(signatures, n_samples):
    """Draw a random, normalised signature-weight vector for each sample.

    Each signature is included in a sample with a 40% chance; an included
    signature gets a raw strength in ``[0.5, 2.0)``. A draw in which no
    signature is present is rejected and repeated. The raw strengths of a
    sample are then divided by their sum so every column sums to 1.

    Parameters
    ----------
    signatures : pandas.DataFrame
        Frame with one column per signature; only its column labels and
        column count are used here.
    n_samples : int
        Number of samples (output columns) to generate.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by ``signatures.columns``; columns are labelled
        ``0 .. n_samples - 1``.

    Raises
    ------
    ValueError
        If ``signatures`` has no columns.
    """
    # Use the function argument, not a notebook-level global, so the function
    # works on a fresh kernel and for any signature matrix.
    n_signatures = signatures.shape[1]
    if n_signatures == 0:
        # Every draw would sum to zero and the rejection loop would never end.
        raise ValueError('signatures must contain at least one column')

    def draw_weights():
        # The presence test is evaluated before the strength is drawn.
        return [
            random.random() * 1.5 + 0.5 if random.uniform(0, 1) > 0.6 else 0
            for _ in range(n_signatures)
        ]

    sample_weights = {}
    for i in range(n_samples):
        # TODO: find what distribution of signatures to use
        # (Article uses 5 out of 10 for each sample)
        weights = draw_weights()
        while sum(weights) == 0:
            weights = draw_weights()

        # normalize
        total = sum(weights)
        sample_weights[i] = [w / total for w in weights]

    # Build the frame once instead of inserting one column at a time.
    return pd.DataFrame(sample_weights, index=signatures.columns)

In [88]:
def calculate_counts(signatures, sample_distributions, average_noise):
    """Simulate noisy mutation counts for every sample.

    The expected profile of each sample is ``signatures.dot(sample_distributions)``.
    For each sample column a total count is drawn log-uniformly between
    1000 (``10**3``) and about 50119 (``10**4.7``), the profile is scaled by
    it and truncated to integers, and an independent
    ``Poisson(average_noise)`` draw is added to every entry.

    Parameters
    ----------
    signatures : pandas.DataFrame
        Signature matrix; its columns must align with the index of
        ``sample_distributions`` (requirement of ``DataFrame.dot``).
    sample_distributions : pandas.DataFrame
        Signature weights, one column per sample. Column labels may be
        anything (integers, sample names, ...).
    average_noise : float
        Mean of the additive Poisson noise.

    Returns
    -------
    pandas.DataFrame
        Same index as ``signatures`` and same columns as
        ``sample_distributions``, holding the noisy counts.
    """
    simulated_data = signatures.dot(sample_distributions)
    # Iterate over the real column labels rather than range(n): labels are
    # not guaranteed to be 0..n-1.
    for sample in simulated_data.columns:
        # Total number of counts, uniform on a log10 scale over [3, 4.7].
        n_counts = 10 ** random.uniform(3, 4.7)
        counts = np.array(
            [int(x * n_counts) for x in simulated_data[sample]], dtype=np.int64
        )

        # Add Poisson noise: one draw per entry.
        noise = np.random.poisson(average_noise, size=len(counts))
        simulated_data[sample] = counts + noise

    return simulated_data

In [91]:
# Locate the signature files: <parent of the current working directory>/Mutational_Signatures
dir_path = os.path.dirname(os.path.abspath(os.curdir))
dir_path = os.path.join(dir_path, 'Mutational_Signatures')

# Simulation settings
signature_type = '_GRCh37'
n_samples = 30
average_noise = 10

# Seed both random generators used by the simulation (`random` for the
# signature weights and total counts, `np.random` for the Poisson noise) so a
# Restart & Run All reproduces the same data.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Load signatures -> pick the requested column of each -> draw per-sample
# signature weights -> turn them into noisy counts.
signatures_list, signature_names = get_signatures_from_files(os.path.join(dir_path, 'cosmic_signatures'))
signatures = get_specific_signature(signatures_list, signature_names, signature_type)
sample_distributions = get_distribution_of_samples(signatures, n_samples)
simulated_data = calculate_counts(signatures, sample_distributions, average_noise)
print(simulated_data)

     0   1    2    3    4   5    6    7    8   9   ...   20    21   22   23  \
0   504  95  118  170  108  25  126  207  232   5  ...  253   671  440  265   
1   156  56   99  145   42  24   80  172  184  13  ...  212   278  362   92   
2    39  26   47   76   20   9   37   75   95  10  ...  101    49  162   16   
3   211  56   86  128   51  20   77  152  165  12  ...  188   325  317  115   
4    24  14   25   46   14  29   24   46   50  10  ...   63   111  102   18   
..  ...  ..  ...  ...  ...  ..  ...  ...  ...  ..  ...  ...   ...  ...  ...   
91   40  33   16   95   23  60   74   79   15  16  ...  123   652  199   27   
92   14  45   11  161    7  14  105  116   13   8  ...  203   997  312   14   
93   34  11   11   26   17  15   25   31   13  13  ...   36   169   46   14   
94   18  16   14   39   14  16   29   38   16   5  ...   58   198   86   18   
95   61  42   14  155   24  40  113  117   12  14  ...  200  1038  304   40   

    24    25  26   27   28  29  
0    8  1764  14  

Only simulate 30 samples, since real-life datasets are also small (although that may not hold in our case?).

1. Choose 5 random signatures per sample to get a representation in combined probability (not really the case anymore, since they do not sum to 100%).
2. Randomly select between 1000 and 50000 mutations per sample, on a log scale.
3. Calculate the counts of the samples
4. Add Poisson noise