In [None]:
import pandas as pd
import numpy as np
import scipy.special as sc
import matplotlib as m
import matplotlib.pyplot as plt

#### What if we remove all consistently inconsistent genes from the data and then perform the WT-WT control? Do we still get DEG hits?

#### What if we keep only the consistently inconsistent genes in the data and then perform the WT-WT control? How many hits do we end up with?

In [None]:
# WT-WT control experiment

# 10 replicates, 100 iterations
# all samples are drawn from wild type (WT) ONLY!

In [None]:
# Load the WT-WT control experiment data (all bayexpress results, 10 replicates).
#
# One result file per iteration (1..100) is read and every gene is flagged as a
# putative DEG under three significance criteria:
#   FC0: (BF > 1) & (abs(FC) > 0)
#   FC1: (BF > 1) & (abs(FC) > 1)
#   FC2: (BF > 1) & (abs(FC) > 2)
# Each CONTROL_R10_bayexpress_FC* frame ends up with the gene column of the
# first file, one column per iteration ('1' ... '100') and 'Ntrue', the number
# of iterations in which the gene was flagged.

# Gene column (second CSV column) of the first iteration file. It is read once
# and reused as the left-most column of all three result frames.
gene_column = pd.read_csv('DGE_results/CONTROL_R10_1_bayexpress.csv').iloc[:, 1:2]

# FC cut-off -> {iteration label -> boolean Series of flagged genes}
flags_by_cutoff = {0: {}, 1: {}, 2: {}}

for i in range(1, 101):

    data = pd.read_csv(f'DGE_results/CONTROL_R10_{i}_bayexpress.csv').iloc[:, 1:]

    # both ingredients are shared by the three criteria, so compute them once
    bf_significant = data.BF > 1
    abs_fc = data.FC.abs()

    for fc_cutoff, flags in flags_by_cutoff.items():
        flags[f'{i}'] = bf_significant & (abs_fc > fc_cutoff)

control_frames = {}
for fc_cutoff, flags in flags_by_cutoff.items():
    # Build all iteration columns in one go: inserting 100 columns one by one
    # fragments the frame and makes pandas emit a PerformanceWarning.
    # index=gene_column.index aligns every iteration to the rows of the first
    # file, exactly as plain column assignment would.
    # NOTE(review): rows are matched by index position, not by gene name - this
    # assumes every result file lists the genes in the same order; confirm.
    hits = pd.DataFrame(flags, index=gene_column.index)

    # number of iterations in which each gene was flagged (count of True per row)
    n_true = hits.eq(True).sum(axis=1).rename('Ntrue')

    control_frames[fc_cutoff] = pd.concat([gene_column, hits, n_true], axis=1)

CONTROL_R10_bayexpress_FC0 = control_frames[0]
CONTROL_R10_bayexpress_FC1 = control_frames[1]
CONTROL_R10_bayexpress_FC2 = control_frames[2]

display(CONTROL_R10_bayexpress_FC0)
display(CONTROL_R10_bayexpress_FC1)
display(CONTROL_R10_bayexpress_FC2)

In [None]:
# Load the results of all replicate comparisons; this table contains the lists
# of consistently inconsistent genes (AOTP).
# The first CSV column is used as the row index (index_col=0).

RALL_bayexpress = pd.read_csv('RALL_bayexpress.csv', index_col=0)


# The column AOTP_WT indicates whether a gene is marked as consistently
# inconsistent (True) or not (False) in the wild-type yeast data.

RALL_bayexpress

In [None]:
# Regular plot, all genes included: number of putative DEGs called in each of
# the 100 WT-WT control iterations, one marker series per fold-change cut-off.

fig, axs = plt.subplots(1, 1, figsize=(6, 4), dpi=300)

iterations = np.arange(1, 101)

# (FC cut-off, per-gene flag table, marker colour)
criteria = [
    (0, CONTROL_R10_bayexpress_FC0, '#332288'),
    (1, CONTROL_R10_bayexpress_FC1, '#44AA99'),
    (2, CONTROL_R10_bayexpress_FC2, '#CC6677'),
]

# scatters
for fc_cutoff, flags, colour in criteria:
    # Drop the first column (gene name) and the last one ('Ntrue'); the column
    # sums of what is left are the hits per iteration. Computed once and reused
    # for both the markers and the legend total.
    hits_per_iteration = flags.iloc[:, 1:-1].sum(axis=0)

    axs.scatter(iterations,
                hits_per_iteration,
                c=colour, s=30,
                label=f'BF > 1, FC > {fc_cutoff}, total = {hits_per_iteration.sum()}',
                alpha=0.9, marker='x')

axs.legend(loc='upper left')

# one tick per iteration, but without tick labels (100 labels would be unreadable)
axs.set_xticks(iterations)
axs.set_xticklabels([])

# formatting y axis ticks: plain numbers, no offset / scientific notation
axs.ticklabel_format(axis='y', style='plain', useOffset=False)

axs.set_xlabel('100 Bootstrapping iterations')
axs.set_ylabel('# putative differentially expressed genes')

# fixed y range so this plot is directly comparable with the subset plots
axs.set_ylim(-100, 3500)

# the gene count in the title is derived from the plotted table so it cannot go stale
axs.set_title(f'All genes (n={len(CONTROL_R10_bayexpress_FC0)})')

plt.show()


In [None]:
# Only consistently inconsistent genes in the plot: number of putative DEGs per
# WT-WT control iteration, restricted to genes with AOTP_WT == True.

fig, axs = plt.subplots(1, 1, figsize=(6, 4), dpi=300)

iterations = np.arange(1, 101)

# Genes marked as consistently inconsistent in the wild type. The explicit
# comparison is kept so the mask is boolean even if the column is not strictly
# of bool dtype.
aotp_genes = RALL_bayexpress.loc[RALL_bayexpress.AOTP_WT == True]
aotp_loci = list(aotp_genes.locus_name)

# (FC cut-off, per-gene flag table, marker colour)
criteria = [
    (0, CONTROL_R10_bayexpress_FC0, '#332288'),
    (1, CONTROL_R10_bayexpress_FC1, '#44AA99'),
    (2, CONTROL_R10_bayexpress_FC2, '#CC6677'),
]

# scatters
for fc_cutoff, flags, colour in criteria:
    # Index by gene name, keep only the selected genes and drop the trailing
    # 'Ntrue' column; the column sums are the hits per iteration.
    # .loc raises KeyError if a selected gene is missing from the control table.
    hits_per_iteration = (flags.set_index('locus_name')
                               .loc[aotp_loci]
                               .iloc[:, :-1]
                               .sum(axis=0))

    # The total is computed outside the f-string expression: nesting
    # 'locus_name' inside a single-quoted f-string is a SyntaxError before
    # Python 3.12.
    total_hits = hits_per_iteration.sum()

    axs.scatter(iterations,
                hits_per_iteration,
                c=colour, s=30,
                label=f'BF > 1, FC > {fc_cutoff}, total = {total_hits}',
                alpha=0.9, marker='x')

axs.legend(loc='upper left')

# one tick per iteration, but without tick labels (100 labels would be unreadable)
axs.set_xticks(iterations)
axs.set_xticklabels([])

# formatting y axis ticks: plain numbers, no offset / scientific notation
axs.ticklabel_format(axis='y', style='plain', useOffset=False)

axs.set_xlabel('100 Bootstrapping iterations')
axs.set_ylabel('# putative differentially expressed genes')

# same fixed y range as the all-genes plot so the figures are comparable
axs.set_ylim(-100, 3500)

# the gene count in the title is derived from the selection so it cannot go stale
axs.set_title(f'Exclusively consistently inconsistent genes (n={len(aotp_loci)})')


plt.show()


display(aotp_genes)

In [None]:
# Excluding consistently inconsistent genes from the plot: number of putative
# DEGs per WT-WT control iteration, restricted to genes with AOTP_WT == False.

fig, axs = plt.subplots(1, 1, figsize=(6, 4), dpi=300)

iterations = np.arange(1, 101)

# Genes NOT marked as consistently inconsistent in the wild type. The explicit
# comparison is kept so rows with a missing AOTP_WT value are not selected.
consistent_genes = RALL_bayexpress.loc[RALL_bayexpress.AOTP_WT == False]
consistent_loci = list(consistent_genes.locus_name)

# (FC cut-off, per-gene flag table, marker colour)
criteria = [
    (0, CONTROL_R10_bayexpress_FC0, '#332288'),
    (1, CONTROL_R10_bayexpress_FC1, '#44AA99'),
    (2, CONTROL_R10_bayexpress_FC2, '#CC6677'),
]

# scatters
for fc_cutoff, flags, colour in criteria:
    # Index by gene name, keep only the selected genes and drop the trailing
    # 'Ntrue' column; the column sums are the hits per iteration.
    # .loc raises KeyError if a selected gene is missing from the control table.
    hits_per_iteration = (flags.set_index('locus_name')
                               .loc[consistent_loci]
                               .iloc[:, :-1]
                               .sum(axis=0))

    # The total is computed outside the f-string expression: nesting
    # 'locus_name' inside a single-quoted f-string is a SyntaxError before
    # Python 3.12.
    total_hits = hits_per_iteration.sum()

    axs.scatter(iterations,
                hits_per_iteration,
                c=colour, s=30,
                label=f'BF > 1, FC > {fc_cutoff}, total = {total_hits}',
                alpha=0.9, marker='x')

axs.legend(loc='upper left')

# one tick per iteration, but without tick labels (100 labels would be unreadable)
axs.set_xticks(iterations)
axs.set_xticklabels([])

# formatting y axis ticks: plain numbers, no offset / scientific notation
axs.ticklabel_format(axis='y', style='plain', useOffset=False)

axs.set_xlabel('100 Bootstrapping iterations')
axs.set_ylabel('# putative differentially expressed genes')

# same fixed y range as the all-genes plot so the figures are comparable
axs.set_ylim(-100, 3500)


# the gene count in the title is derived from the selection so it cannot go stale
axs.set_title(f'Excluding consistently inconsistent genes (n={len(consistent_loci)})')


plt.show()


display(consistent_genes)
