### Running DESeq2 and edgeR and calculating Bayes factors

In [2]:
import os

# This Python file calls R scripts that have a list of dependencies. To make sure you have all of them ready, you can simply create a conda environment using our exported environment

# conda env create -f environment.yml

# If you are not using conda, the specific requirements here are
# R, statmod, tidyverse, bioconductor, genomeinfodbdata, limma and of course edgeR, DESeq2
# http://bioconductor.org/install/
# https://bioconductor.org/packages/release/bioc/html/edgeR.html
# https://bioconductor.org/packages/release/bioc/html/DESeq2.html
# https://bioconductor.org/packages/release/bioc/html/limma.html
# https://anaconda.org/bioconda/bioconductor-genomeinfodbdata



#### 100 bootstrap iterations for each #reps ∈ {3,6,12,20}

# Running edgeR

In [None]:
# 3 REPLICATES
# Runs Do_DGE_edgeR_R3.R once per bootstrap dataset (files 1..100) through the shell.
# The trailing "0.0" is passed through to the R script unchanged; its meaning is
# defined there.

for i in range(1, 101):

    DATASET_NAME = f"3R_100/R3_{i}.csv"

    RESULTS_NAME = f"DGE_results/R3_{i}_edgeR.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_edgeR_R3.R {DATASET_NAME} {RESULTS_NAME} 0.0")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# 6 REPLICATES
# Runs Do_DGE_edgeR_R6.R once per bootstrap dataset (files 1..100) through the shell.
# The trailing "0.0" is passed through to the R script unchanged; its meaning is
# defined there.

for i in range(1, 101):

    DATASET_NAME = f"6R_100/R6_{i}.csv"

    RESULTS_NAME = f"DGE_results/R6_{i}_edgeR.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_edgeR_R6.R {DATASET_NAME} {RESULTS_NAME} 0.0")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# 12 REPLICATES
# Runs Do_DGE_edgeR_R12.R once per bootstrap dataset (files 1..100) through the shell.
# The trailing "0.0" is passed through to the R script unchanged; its meaning is
# defined there.

for i in range(1, 101):

    DATASET_NAME = f"12R_100/R12_{i}.csv"

    RESULTS_NAME = f"DGE_results/R12_{i}_edgeR.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_edgeR_R12.R {DATASET_NAME} {RESULTS_NAME} 0.0")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# 20 REPLICATES
# Runs Do_DGE_edgeR_R20.R once per bootstrap dataset (files 1..100) through the shell.
# The trailing "0.0" is passed through to the R script unchanged; its meaning is
# defined there.

for i in range(1, 101):

    DATASET_NAME = f"20R_100/R20_{i}.csv"

    RESULTS_NAME = f"DGE_results/R20_{i}_edgeR.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_edgeR_R20.R {DATASET_NAME} {RESULTS_NAME} 0.0")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# ALL REPLICATES
# Runs Do_DGE_edgeR_RALL.R once on the full dataset through the shell.
# The trailing "0.0" is passed through to the R script unchanged; its meaning is
# defined there.

DATASET_NAME = "RALL.csv"

RESULTS_NAME = "DGE_results/RALL_edgeR.csv"

# os.system returns 0 on success; only report completion when the Rscript call
# actually succeeded.
status = os.system(f"Rscript Do_DGE_edgeR_RALL.R {DATASET_NAME} {RESULTS_NAME} 0.0")

if status != 0:
    print(f"FAILED ALL (exit status {status})")
else:
    print("Done ALL")

# Running DESeq2

In [None]:
# 3 REPLICATES
# Runs Do_DGE_DESeq2.R once per bootstrap dataset (files 1..100) through the shell,
# passing the count file, its metadata file and the output path.

for i in range(1, 101):

    DATASET_NAME = f"3R_100/R3_{i}.csv"

    METADATA_NAME = f"3R_100/R3_{i}_meta.txt"

    RESULTS_NAME = f"DGE_results/R3_{i}_DESeq2.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_DESeq2.R {DATASET_NAME} {METADATA_NAME} {RESULTS_NAME}")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")

In [None]:
# 6 REPLICATES
# Runs Do_DGE_DESeq2.R once per bootstrap dataset (files 1..100) through the shell,
# passing the count file, its metadata file and the output path.

for i in range(1, 101):

    DATASET_NAME = f"6R_100/R6_{i}.csv"

    METADATA_NAME = f"6R_100/R6_{i}_meta.txt"

    RESULTS_NAME = f"DGE_results/R6_{i}_DESeq2.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_DESeq2.R {DATASET_NAME} {METADATA_NAME} {RESULTS_NAME}")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# 12 REPLICATES
# Runs Do_DGE_DESeq2.R once per bootstrap dataset (files 1..100) through the shell,
# passing the count file, its metadata file and the output path.

for i in range(1, 101):

    DATASET_NAME = f"12R_100/R12_{i}.csv"

    METADATA_NAME = f"12R_100/R12_{i}_meta.txt"

    RESULTS_NAME = f"DGE_results/R12_{i}_DESeq2.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_DESeq2.R {DATASET_NAME} {METADATA_NAME} {RESULTS_NAME}")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# 20 REPLICATES
# Runs Do_DGE_DESeq2.R once per bootstrap dataset (files 1..100) through the shell,
# passing the count file, its metadata file and the output path.

for i in range(1, 101):

    DATASET_NAME = f"20R_100/R20_{i}.csv"

    METADATA_NAME = f"20R_100/R20_{i}_meta.txt"

    RESULTS_NAME = f"DGE_results/R20_{i}_DESeq2.csv"

    # os.system returns 0 on success; any other value means the Rscript call
    # did not complete cleanly, so that iteration must not be reported as done.
    status = os.system(f"Rscript Do_DGE_DESeq2.R {DATASET_NAME} {METADATA_NAME} {RESULTS_NAME}")

    if status != 0:
        print(f"FAILED {i} (exit status {status})")
        continue

    print(f"Done {i}")


In [None]:
# all reps
# Runs Do_DGE_DESeq2.R once on the full dataset through the shell, passing the
# count file, its metadata file and the output path.

DATASET_NAME = "RALL.csv"

METADATA_NAME = "RALL_meta.txt"

RESULTS_NAME = "DGE_results/RALL_DESeq2.csv"

# os.system returns 0 on success; only report completion when the Rscript call
# actually succeeded.
status = os.system(f"Rscript Do_DGE_DESeq2.R {DATASET_NAME} {METADATA_NAME} {RESULTS_NAME}")

if status != 0:
    print(f"FAILED ALL (exit status {status})")
else:
    print("Done ALL")


# bayexpress

In [3]:
import pandas as pd
import numpy as np
from numpy import random
import scipy.special as sc
import matplotlib as m
import os

In [4]:
# Flat priors: both prior parameters are 1. They are module-level globals that
# get_BF and get_FC read at call time.
u_1 = 1
u_2 = 1


# calculating Bayes factors
def get_BF(N_1, n_1, N_2, n_2):
    """Return the log10 Bayes factor comparing two groups.

    n_1 / n_2 are the counts in group 1 / group 2 and N_1 / N_2 the matching
    totals. The result is

        log10( B(u_1+n_1, u_2+N_1-n_1) * B(u_1+n_2, u_2+N_2-n_2)
               / B(u_1+n_1+n_2, u_2+N_1-n_1+N_2-n_2) )

    evaluated through log-Beta functions. Scalars and array-likes are both
    accepted, as far as scipy.special.betaln broadcasts them.
    """
    log_beta_1 = sc.betaln(u_1 + n_1, u_2 + N_1 - n_1)
    log_beta_2 = sc.betaln(u_1 + n_2, u_2 + N_2 - n_2)
    log_beta_pooled = sc.betaln(u_1 + n_1 + n_2, u_2 + N_1 - n_1 + N_2 - n_2)

    # natural log -> log10
    return (log_beta_1 + log_beta_2 - log_beta_pooled) / np.log(10)


# ratio of expression
# calculating log fold change
def get_FC(N_1, n_1, N_2, n_2):
    """Return the log2 fold change of group 2 relative to group 1.

    Each group's rate is (u_1 + n) / (u_2 + N - n), i.e. the prior-adjusted
    count divided by the prior-adjusted remainder of the total.
    """
    ratio_group_1 = (u_1 + n_1) / (u_2 + N_1 - n_1)
    ratio_group_2 = (u_1 + n_2) / (u_2 + N_2 - n_2)

    return np.log2(ratio_group_2 / ratio_group_1)

In [5]:
# all reps
# Columns at positions 1-42 form group 1, every column from position 43 on
# forms group 2; column 0 is not part of the counts.

in_data = pd.read_csv("RALL.csv")

# per-row totals over the replicates of each group, then grand totals over all rows
n_1 = in_data.iloc[:, 1:43].sum(axis=1)
N_1 = n_1.sum()

n_2 = in_data.iloc[:, 43:].sum(axis=1)
N_2 = n_2.sum()

out_data = pd.DataFrame({
    'genes': in_data.genes,
    'BF': get_BF(N_1, n_1, N_2, n_2),
    'FC': get_FC(N_1, n_1, N_2, n_2),
})

out_data.to_csv("DGE_results/RALL_bayexpress.csv")

In [None]:
# 3 REPLICATES
# For each bootstrap file 1..100: columns at positions 1-3 form group 1, every
# later column forms group 2; column 0 is not part of the counts.

for i in range(1, 101):

    in_data = pd.read_csv(f"3R_100/R3_{i}.csv")

    # summing up over replicates (per-row totals), then over rows (grand totals)
    n_1 = in_data.iloc[:, 1:4].sum(axis=1)
    N_1 = n_1.sum()

    n_2 = in_data.iloc[:, 4:].sum(axis=1)
    N_2 = n_2.sum()

    out_data = pd.DataFrame({
        'locus_name': in_data.locus_name,
        'BF': get_BF(N_1, n_1, N_2, n_2),
        'FC': get_FC(N_1, n_1, N_2, n_2),
    })

    out_data.to_csv(f"DGE_results/R3_{i}_bayexpress.csv")

    print(f"Done {i}")

In [None]:
# 6 REPLICATES
# For each bootstrap file 1..100: columns at positions 1-6 form group 1, every
# later column forms group 2; column 0 is not part of the counts.

for i in range(1, 101):

    in_data = pd.read_csv(f"6R_100/R6_{i}.csv")

    # summing up over replicates (per-row totals), then over rows (grand totals)
    n_1 = in_data.iloc[:, 1:7].sum(axis=1)
    N_1 = n_1.sum()

    n_2 = in_data.iloc[:, 7:].sum(axis=1)
    N_2 = n_2.sum()

    out_data = pd.DataFrame({
        'locus_name': in_data.locus_name,
        'BF': get_BF(N_1, n_1, N_2, n_2),
        'FC': get_FC(N_1, n_1, N_2, n_2),
    })

    out_data.to_csv(f"DGE_results/R6_{i}_bayexpress.csv")

    print(f"Done {i}")

In [None]:
# 12 REPLICATES
# For each bootstrap file 1..100: columns at positions 1-12 form group 1, every
# later column forms group 2; column 0 is not part of the counts.

for i in range(1, 101):

    in_data = pd.read_csv(f"12R_100/R12_{i}.csv")

    # summing up over replicates (per-row totals), then over rows (grand totals)
    n_1 = in_data.iloc[:, 1:13].sum(axis=1)
    N_1 = n_1.sum()

    n_2 = in_data.iloc[:, 13:].sum(axis=1)
    N_2 = n_2.sum()

    out_data = pd.DataFrame({
        'locus_name': in_data.locus_name,
        'BF': get_BF(N_1, n_1, N_2, n_2),
        'FC': get_FC(N_1, n_1, N_2, n_2),
    })

    out_data.to_csv(f"DGE_results/R12_{i}_bayexpress.csv")

    print(f"Done {i}")

In [None]:
# 20 REPLICATES
# For each bootstrap file 1..100: columns at positions 1-20 form group 1, every
# later column forms group 2; column 0 is not part of the counts.

for i in range(1, 101):

    in_data = pd.read_csv(f"20R_100/R20_{i}.csv")

    # summing up over replicates (per-row totals), then over rows (grand totals)
    n_1 = in_data.iloc[:, 1:21].sum(axis=1)
    N_1 = n_1.sum()

    n_2 = in_data.iloc[:, 21:].sum(axis=1)
    N_2 = n_2.sum()

    out_data = pd.DataFrame({
        'locus_name': in_data.locus_name,
        'BF': get_BF(N_1, n_1, N_2, n_2),
        'FC': get_FC(N_1, n_1, N_2, n_2),
    })

    out_data.to_csv(f"DGE_results/R20_{i}_bayexpress.csv")

    print(f"Done {i}")

### Control experiment ... what if we only use the WT in the bootstrapping?

In [4]:

# assuming a flat prior (for now?)
u_1 = 1
u_2 = 1


# calculating Bayes factors
def get_BF_k1(data):
    """Return a per-row log10 Bayes factor computed across the columns of one group.

    Parameters
    ----------
    data : pandas.DataFrame
        The column at position 0 is skipped (it is treated as the row
        identifier); every remaining column is used as a column of counts.
        Callers must therefore pass the identifier column along with the
        counts, otherwise the first count column is ignored.

    Returns
    -------
    One value per row of ``data``:
    ``(sum_j ln B(u_1 + n_j, u_2 + N_j - n_j) - ln B(u_1 + n_i, u_2 + N - n_i)) / ln(10)``
    where ``n_j`` is the row's count in column j, ``N_j`` that column's total,
    ``n_i`` the row's total over all count columns and ``N`` the grand total.
    """
    # everything except the identifier column; selected by position so that
    # duplicate column labels cannot turn a column lookup into a sub-frame
    counts = data.iloc[:, 1:]

    evidence2 = np.zeros(len(data))

    # iterating over the count columns j
    for _, n_j in counts.items():
        # vectorized column total (same NaN handling as the pooled total below)
        N_j = n_j.sum()
        evidence2 = evidence2 + sc.betaln(u_1 + n_j, u_2 + N_j - n_j)

    # pooled totals: per row over all count columns, and overall
    n_i = counts.sum(axis=1, numeric_only=True)
    N = counts.sum(axis=0, numeric_only=True).sum()

    evidence1 = sc.betaln(u_1 + n_i, u_2 + N - n_i)

    # natural log -> log10
    return (evidence2 - evidence1) / np.log(10)


In [None]:
# 3 REPLICATES
# CONTROL EXPERIMENT
#
# Reads the 6-replicate bootstrap files and compares columns 1-3 against
# columns 4-6 of the same file.
# NOTE(review): presumably both halves are WT replicates (see the section
# heading) -- confirm against the layout of the 6R_100 files.

for i in range(1,101):

    in_data = pd.read_csv(f"6R_100/R6_{i}.csv")

    out_data = pd.DataFrame({'locus_name': in_data.locus_name})

    # summing up over replicates (per-row totals), then over rows (grand totals)
    n_1 = in_data.iloc[:,1:4].sum(axis=1)
    n_2 = in_data.iloc[:,4:7].sum(axis=1)

    N_1 = n_1.sum()
    N_2 = n_2.sum()

    out_data['BF'] = get_BF(N_1, n_1, N_2, n_2)
    out_data['FC'] = get_FC(N_1, n_1, N_2, n_2)

    # get_BF_k1 skips the first column of the frame it receives (it treats it
    # as the identifier column). Pass column 0 together with the three count
    # columns of each group; handing over the counts alone silently dropped
    # the first replicate of each group.
    out_data['BF_k1_1'] = get_BF_k1(in_data.iloc[:, [0, 1, 2, 3]])
    out_data['BF_k1_2'] = get_BF_k1(in_data.iloc[:, [0, 4, 5, 6]])

    out_data.to_csv(f"DGE_results/CONTROL_R3_{i}_bayexpress.csv")

    print(f"Done {i}")

In [None]:
# 10 REPLICATES
# CONTROL EXPERIMENT
#
# Reads the 20-replicate bootstrap files and compares columns 1-10 against
# columns 11-20 of the same file.
# NOTE(review): presumably both halves are WT replicates (see the section
# heading) -- confirm against the layout of the 20R_100 files.

for i in range(1,101):

    in_data = pd.read_csv(f"20R_100/R20_{i}.csv")

    out_data = pd.DataFrame({'locus_name': in_data.locus_name})

    # summing up over replicates (per-row totals), then over rows (grand totals)
    n_1 = in_data.iloc[:,1:11].sum(axis=1)
    n_2 = in_data.iloc[:,11:21].sum(axis=1)

    N_1 = n_1.sum()
    N_2 = n_2.sum()

    out_data['BF'] = get_BF(N_1, n_1, N_2, n_2)
    out_data['FC'] = get_FC(N_1, n_1, N_2, n_2)

    # Use the same two 10-column groups as BF/FC above (the 3-replicate ranges
    # 1:4 and 4:7 had been left in here). get_BF_k1 skips the first column of
    # the frame it receives (it treats it as the identifier column), so
    # column 0 is passed together with each group's count columns.
    out_data['BF_k1_1'] = get_BF_k1(in_data.iloc[:, [0, *range(1, 11)]])
    out_data['BF_k1_2'] = get_BF_k1(in_data.iloc[:, [0, *range(11, 21)]])

    out_data.to_csv(f"DGE_results/CONTROL_R10_{i}_bayexpress.csv")

    print(f"Done {i}")