# Simulate synthetic bulk RNA-seq gene expression data

Nuha BinTayyash, 2020

This notebook shows how to simulate synthetic bulk RNA-seq time series with $S = 3$ samples and $D = 400$ genes at $T = 6$ time points, assuming three generative functions $f$: sine, cosine and cubic splines. The data generated from the three functions are exponentiated (an exponential link function) to set the mean of the count data. Count data are then sampled from a negative binomial distribution parametrized by the probability of success $p=\frac{r}{\exp(f)+r}$ and the number of failures $r =\frac{1}{\text{dispersion}}$.
 
`Bulk_simulator.py` is used to simulate four datasets with two levels of the mean of the count data (high-count and low-count datasets) and two levels of dispersion (high dispersion and low dispersion).


In [None]:
%run ./Bulk_simulator.py

 50%|█████     | 1/2 [02:27<02:27, 147.42s/it]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Filter each dataset to keep only the genes whose mean count is higher than 1 and at most 1000 for the low-count datasets, or at most 100000 for the high-count datasets.

In [None]:

def filter_counts(counts,max_value):
    """Keep the rows of ``counts`` whose mean lies in ``(1, max_value]``.

    As a side effect, a density-normalised histogram of the surviving row
    means is drawn and shown with matplotlib.

    Parameters
    ----------
    counts : DataFrame
        Table whose rows are averaged and filtered (presumably one row per
        gene -- confirm against the caller).
    max_value : float
        Inclusive upper bound on the row mean.

    Returns
    -------
    DataFrame
        The rows of ``counts`` that passed the filter, in the original
        orientation.
    """
    # Work on the transpose so that every original row becomes a column.
    per_column = counts.T
    column_means = per_column.mean(0)
    in_range = (column_means > 1.) & (column_means <= max_value)
    kept = per_column.loc[:, in_range]

    # Visual check of the mean-count distribution after filtering.
    plt.hist(kept.mean(0), density=True, facecolor='g')
    plt.grid(True)
    plt.show()

    # Restore the orientation the caller passed in.
    return kept.T

def get_common_genes_between_two_ts(first_ts,second_ts):
    """Restrict two tables to the index labels they have in common.

    Parameters
    ----------
    first_ts, second_ts : DataFrame
        Tables indexed by label (presumably gene identifiers -- confirm
        against the caller).

    Returns
    -------
    tuple of DataFrame
        ``(first_ts, second_ts)`` reduced to the labels present in both
        indexes, selected in the same order for both tables.
    """
    common_labels = first_ts.index.intersection(second_ts.index)
    return first_ts.loc[common_labels], second_ts.loc[common_labels]

In [None]:
# Base names (without the '.csv' extension) of the four simulated datasets.
filesname = [
    'low_counts_low_dispersion',
    'low_counts_high_dispersion',
    'high_counts_low_dispersion',
    'high_counts_high_dispersion',
]

# Upper bound on the mean count: [low counts datasets, high counts datasets].
max_mean_count = [1000., 100000.]

for filename in filesname:
    # Rows 0-239 are handled as the differentially expressed (DE) genes and
    # every row after them as the non-DE genes.
    simulated = pd.read_csv(filename+'.csv', index_col=[0])
    DE_samples = simulated.iloc[0:240, :]
    Non_DE_samples = simulated.iloc[240::, :]

    # The threshold is selected from the dataset name.
    max_num = max_mean_count[0] if 'low_counts' in filename else max_mean_count[1]

    DE_samples_filtered = filter_counts(DE_samples, max_num)
    Non_DE_samples_filtered = filter_counts(Non_DE_samples, max_num)

    # Keep only the index labels that survived the filter in both groups.
    DE_samples_filtered, Non_DE_samples_filtered = get_common_genes_between_two_ts(
        DE_samples_filtered, Non_DE_samples_filtered)

    # Interleave the two groups: on the transposed table each non-DE gene is
    # inserted as a column directly after the DE gene at the same position,
    # giving DE, non-DE, DE, non-DE, ... once transposed back. Both groups
    # share their labels, hence allow_duplicates=True.
    DE_non_DE = DE_samples_filtered.T
    for position, gene_name in enumerate(Non_DE_samples_filtered.index.values):
        non_DE_column = Non_DE_samples_filtered.iloc[[position]].values.T
        DE_non_DE.insert(2 * position + 1, gene_name, non_DE_column,
                         allow_duplicates=True)
    DE_non_DE = DE_non_DE.T

    # Relabel the rows sequentially: gene_1, gene_2, ...
    genes = ['gene_%s' % number for number in range(1, DE_non_DE.shape[0] + 1)]
    DE_non_DE.index = genes
    print('shape', DE_non_DE.shape)

    # NOTE: the first 400 rows are written back over the input CSV file.
    first_400 = DE_non_DE.iloc[0:400, :]
    first_400.to_csv(filename+'.csv')
    print('shape', first_400.shape)