In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests

from statannotations.Annotator import Annotator

In [2]:
# DataFrame display: three-decimal floats and no truncation of wide tables.
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)

# Resolution of the figures rendered inline.
plt.rcParams.update({'figure.dpi': 170})

# Resolution handed to plt.savefig for the exported figures.
DPI = 250

In [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS

# Biological sample analysis (part 2)

In this notebook we are going to do the analysis on the *biological* samples (POOL samples + controls).

One thing we want to check is how S and mode affect the robustness of our results. To do that, we compare the results using a comparative plot. The comparative plot shows how many new variables appear compared to the previous case. So, we can measure the increase across modes by fixing S, or across S by fixing the mode.

This analysis can be performed with many variables, and we are going to choose the following:
1) The number of detected species, across all samples and per individual sample (after NORM+ and cutting off species with more than 65% NaNs).
2) The number of detected differentially abundant species across the 4 comparisons.

With this in mind, we can later select one S and one mode and do the following analyses.

1) The importance of including the biological control samples to ensure that false positives are not considered.
2) The importance of normalizing the reads considering the biological samples.
3) Plot the significantly differentially abundant species. 


## Observing the effect of $S$ on species retention for biological controls

In [4]:
# Pipeline sample IDs POOL1..POOL12 map to cohort labels in consecutive blocks of
# four replicates: POOL1-4 -> RR1-4, POOL5-8 -> SP1-4, POOL9-12 -> HC1-4.
rename_mapping = {
    f'POOL{4 * block + replicate}': f'{cohort}{replicate}'
    for block, cohort in enumerate(['RR', 'SP', 'HC'])
    for replicate in range(1, 5)
}

In [5]:
# For every (mode, S, sample) combination, read the NORM+ flags table and count the
# rows whose `mean_norm` flag is False ("retained") and True ("discarded").
dict_retained_discarded_species = {key: [] for key in ('mode', 'S', 'sample', 'retained', 'discarded')}

for mode in (3, 5, 7):
    for S in (0, 1, 2, 3, 4, 5, 6, 7, 10, 15):
        for sample in [f'POOL{i}' for i in range(1, 13)] + ['ACIDOLA', 'BLACTIS']:
            flags_file = f'{RESULTS_DIR}/summary/{sample}_pass2_mode{mode}_taxgenus_S{S}_NORM+.flags.tsv'
            df_flags = pd.read_csv(flags_file, sep='\t').set_index('taxonomy_id')[['name', 'lineage', 'mean_norm']]

            # POOL samples are reported under their cohort label; controls keep their own name.
            sample_label = sample if 'POOL' not in sample else rename_mapping[sample]
            n_retained = (df_flags['mean_norm'] == False).sum()
            n_discarded = (df_flags['mean_norm'] == True).sum()

            for key, value in (
                ('mode', mode),
                ('S', S),
                ('sample', sample_label),
                ('retained', n_retained),
                ('discarded', n_discarded),
            ):
                dict_retained_discarded_species[key].append(value)

# `total` is the number of flagged rows; `percentage` is the retained share of that total.
df_retained_discarded_species = pd.DataFrame(dict_retained_discarded_species).assign(
    total=lambda counts: counts['retained'] + counts['discarded'],
    percentage=lambda counts: 100 * counts['retained'] / counts['total'],
)


In [None]:
df_retained_discarded_species

In [None]:
# Largest `total` per sample for each mode (samples as rows, modes as columns), plus the
# ratio of the mode-5 and mode-7 totals to the mode-3 total.
df_total_counts = (
    df_retained_discarded_species
    .groupby(['mode', 'sample'])['total']
    .max()
    .unstack(level=0)
    .assign(
        increment_3_5=lambda totals: totals[5] / totals[3],
        increment_3_7=lambda totals: totals[7] / totals[3],
    )
)

df_total_counts

In [None]:
df_retained_discarded_species

In [None]:
# One figure per metric: a 2x7 grid with one panel per sample, showing the metric as a
# function of S with one line per mode.
n_cols, n_rows = 7, 2
legend_panel = n_cols - 1  # last panel of the first row carries the single shared legend

for column, name in zip(['retained', 'percentage'], ['Retained genera',  'Retained percentage']):
    samples = df_retained_discarded_species['sample'].unique()

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(2*n_cols, 2*n_rows), sharex=True)

    for i, sample in enumerate(samples):
        row, col = divmod(i, n_cols)
        ax = axes[row, col]

        sample_data = df_retained_discarded_species[df_retained_discarded_species['sample'] == sample]

        sns.lineplot(
            data=sample_data,
            x='S',
            y=column,
            hue='mode',
            marker='o',
            ax=ax,
            legend=(i == legend_panel)  # draw the legend once, on the legend panel only
        )
        ax.set_title(sample)
        ax.set_xlabel("S")
        # Only the left-most panel of each row carries the y-axis label.
        ax.set_ylabel(name if col == 0 else "")

        ax.set_xticks([0,  2,  4,  6,  10, 15])

        if i == legend_panel:
            # Place the legend just outside the right edge of the grid.
            ax.legend(frameon=False, bbox_to_anchor=(1.05, 1), title='Mode')

        # The last two samples get a fixed, small y-range in the 'retained' figure.
        # NOTE(review): presumably these are the ACIDOLA/BLACTIS controls (they are
        # appended last when the table is built) - confirm the 0-4 range is intended.
        if column == 'retained' and i >= len(samples) - 2:
            ax.set_ylim(0, 4)

    # Remove grid cells that have no sample. Uses len(samples) rather than the leaked
    # loop index so that an empty sample list does not raise a NameError.
    for j in range(len(samples), n_rows * n_cols):
        fig.delaxes(axes[j // n_cols, j % n_cols])

    plt.tight_layout()

    # `fmt` instead of `format`, which would shadow the builtin for the rest of the session.
    for fmt in ['png', 'tiff']:
        plt.savefig(f'{RESULTS_DIR}/figures/paper/s_effect_{column}.{fmt}', dpi=DPI)

    plt.show()

## Plotting sample detection and statistically differential species across modes and S

In [None]:
# One record per (mode, S, sample), plus an 'ALL' record per (mode, S) holding the row
# counts of the three merged-count tables.
data_rows = []

for mode in (3, 5, 7):
    for S in (0, 1, 2, 3, 4, 5, 6, 7, 10, 15):
        df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
        df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
        df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')

        # Output column -> table it is counted from.
        count_tables = {
            'count_retained': df_normpipe_retained_species,
            'count_discarded_norm': df_normpipe_discarded_normplus,
            'count_discarded_all': df_normpipe_discarded_common,
        }

        # Everything after the first three columns is treated as a sample column.
        samples = df_normpipe_retained_species.columns[3:]

        # Across all samples: number of rows in each table.
        data_rows.append({
            'mode': mode,
            'S': S,
            'sample': 'ALL',
            **{key: len(table) for key, table in count_tables.items()},
        })

        # Per sample: number of non-NaN entries in that sample's column.
        for sample in samples:
            data_rows.append({
                'mode': mode,
                'S': S,
                'sample': sample,
                **{key: table[sample].notna().sum() for key, table in count_tables.items()},
            })

df_stats_species_count = pd.DataFrame(data_rows)
df_stats_species_count['count_total'] = df_stats_species_count[
    ['count_retained', 'count_discarded_norm', 'count_discarded_all']
].sum(axis=1)
df_stats_species_count

In [None]:
df_stats_species_count[df_stats_species_count['sample'] == 'ALL']

In [None]:
# For every (mode, S) combination, count the entries with Mann-Whitney p < 0.05 in each of
# the four comparisons and record their names.
# NOTE(review): the threshold is applied to the raw `pval_MW` column; `multipletests` is
# imported at the top of the notebook but no correction is applied here - confirm intended.
data_rows = []

for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')
        df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')
        df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
        df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')

        # Filter each table once and reuse the result for both the count and the name list.
        significant = {
            label: table[table['pval_MW'] < 0.05]
            for label, table in (
                ('HCvsRR', df_pval_HCvsRR),
                ('HCvsSP', df_pval_HCvsSP),
                ('RRvsSP', df_pval_RRvsSP),
                ('sex', df_pval_sex),
            )
        }

        row = {'mode': mode, 'S': S}
        row.update({f'count_{label}': len(hits) for label, hits in significant.items()})
        row.update({f'species_{label}': hits['name'].values.tolist() for label, hits in significant.items()})
        data_rows.append(row)

# Convert the collected rows into a dataframe
df_stats_species_diffabundance = pd.DataFrame(data_rows)
df_stats_species_diffabundance['count_total'] = df_stats_species_diffabundance['count_HCvsRR'] + df_stats_species_diffabundance['count_HCvsSP'] + \
                                                df_stats_species_diffabundance['count_RRvsSP'] + df_stats_species_diffabundance['count_sex']
df_stats_species_diffabundance


## Why normalization with biological controls is relevant

In [13]:
# Parameter combination used by the cells below when building result-file paths.
mode = 3
S = 7

In [14]:
# Merged-count tables for the selected mode and S: retained entries, entries in the
# "discarded_norm+" file, and entries in the "discarded_common" file.
df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')

In [None]:
print(len(df_normpipe_retained_species), len(df_normpipe_discarded_normplus), len(df_normpipe_discarded_common))

In [None]:
species = 'Janibacter'

# Show the matching row (if any) from each of the three merged-count tables, in the order
# retained, discarded_norm+, discarded_common.
for df_counts in (df_normpipe_retained_species, df_normpipe_discarded_normplus, df_normpipe_discarded_common):
    display(df_counts[df_counts['name'] == species])

## Plotting discarded species

In [17]:
def annotation_pval(df, ax, groups, x_col, y_col):
    """Annotate ``ax`` with Mann-Whitney test results between groups.

    Parameters
    ----------
    df : DataFrame holding the plotted values.
    ax : Axes to annotate.
    groups : sequence of group labels found in ``df[x_col]``. Every unordered pair of
        labels is tested; with exactly two labels this is the single pair
        ``(groups[0], groups[1])``.
    x_col, y_col : names of the grouping column and the value column in ``df``.

    With fewer than two groups there is nothing to compare and the function returns
    without touching ``ax`` (previously this raised because ``pairs`` was never bound).
    """
    from itertools import combinations

    pairs = list(combinations(groups, 2))
    if not pairs:
        return

    annotator = Annotator(ax, pairs, data=df, x=x_col, y=y_col)
    annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', line_width=0.65, fontsize=8)
    annotator.apply_and_annotate()

def add_medians(df, ax, x_col, y_col, ysync=0.1, xsync=0.1):
    """Write each group's median next to its column and the log2 fold change between the first two groups.

    Parameters
    ----------
    df : DataFrame with a grouping column ``x_col`` and a value column ``y_col``.
    ax : Axes to write on. Group ``i`` (in order of first appearance in ``df[x_col]``)
        is assumed to be drawn at x position ``i`` on a log10(value + 1) y-axis.
    x_col, y_col : column names.
    ysync, xsync : offsets added to the label position.

    The fold change is ``log2(median of second group / median of first group)``. It is
    written at y = 5, on the left when positive and on the right otherwise. When fewer
    than two groups are present only the median labels are written.
    """
    groups = df[x_col].unique()
    medians = [np.nanmedian(df.loc[df[x_col] == group, y_col].values) for group in groups]

    for i, median in enumerate(medians):
        # The label shows the raw median; its height is on the log10(x + 1) axis.
        ax.text(i + xsync, np.log10(median + 1) + ysync, f"{median:.0f}", ha='left', va='bottom', fontsize=7)

    if len(medians) < 2:
        # No second group: a fold change cannot be computed.
        return

    # A zero median gives inf/nan here; that is still shown as-is, just without the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        log2fc = np.log2(medians[1] / medians[0])

    if log2fc > 0:
        ax.text(-0.4, 5, f"log$_2$FC: {log2fc:.2f}", ha='left', va='bottom', fontsize=7)
    else:
        ax.text(1.5, 5, f"log$_2$FC: {log2fc:.2f}", ha='right', va='bottom', fontsize=7)

In [18]:
def plot_ax_2(ax, y1, y2, color1, color2, group1, group2, species):
    """Draw a two-group strip plot of log10(count + 1) values on ``ax``.

    Parameters
    ----------
    ax : Axes to draw on.
    y1, y2 : NumPy arrays of log10(count + 1) values for the two groups; NaNs are allowed.
    color1, color2 : colours of the two groups.
    group1, group2 : group labels, used for the legend entries and the x tick labels.
    species : panel title.

    Each group gets a horizontal bar at its median. The median is taken on the count
    scale (values are mapped back with ``10 ** y - 1``) and then drawn on the log axis.
    The y-axis runs from 0 to at least 6 and is extended to the largest value + 0.5.
    """
    ax.scatter([0] * len(y1), y1, color=color1, label=group1, alpha=0.8)
    ax.scatter([1] * len(y2), y2, color=color2, label=group2, alpha=0.8)

    # Median bar per group (count-scale median, re-expressed as log10(x + 1)).
    for x_pos, values, color in ((0, y1, color1), (1, y2, color2)):
        median_log = np.log10(np.nanmedian(10 ** values - 1) + 1)
        ax.plot([x_pos - 0.2, x_pos + 0.2], [median_log, median_log], color=color, lw=2)

    # Upper y-limit. np.nanmax ignores NaNs; the builtin max() over an array that contains
    # NaN gives an order-dependent result and could leave the limit below the data.
    y_top = max(6, np.nanmax(y1) + 0.5, np.nanmax(y2) + 0.5)

    # Horizontal guide lines at every integer below the upper limit.
    for y in np.arange(0, y_top, 1):
        ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.5, zorder=0)

    # Customize the x-axis
    ax.set_xticks([0, 1])
    ax.set_xticklabels([group1, group2])
    ax.set_xlim(-0.5, 1.5)

    # Set axis limits and labels
    ax.set_ylim(0, y_top)
    ax.set_ylabel('log$_{10}$ counts')

    # Hide the top and right frame lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Remove ticks on the y-axis
    ax.yaxis.set_ticks_position('none')

    ax.set_title(species, fontsize=10, pad=20)

    # The guide lines above replace the default grid.
    ax.grid(False)


def plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species):
    """Draw a four-group strip plot (HC, SP, RR, CTRL) of log10(count + 1) values on ``ax``.

    Parameters
    ----------
    ax : Axes to draw on.
    yHC, ySP, yRR, yCTRL : arrays of log10(count + 1) values per group; NaNs are allowed.
    species : panel title.

    Each group gets a horizontal bar at the NaN-ignoring mean of its (log-scale) values.
    The y-axis runs from 0 to at least 5 and is extended to the largest value + 0.5.
    """
    # (x position, values, colour, label) for each group, in plotting order.
    groups = (
        (0, yHC, '#648FFF', 'HC'),
        (1, ySP, '#DC267F', 'SP'),
        (2, yRR, '#FE6100', 'RR'),
        (3, yCTRL, '#848484', 'CTRL'),
    )

    for x_pos, values, color, label in groups:
        ax.scatter([x_pos] * len(values), values, color=color, label=label, alpha=0.8)

    # Add the means as horizontal lines
    for x_pos, values, color, _ in groups:
        mean_log = np.nanmean(values)
        ax.plot([x_pos - 0.2, x_pos + 0.2], [mean_log, mean_log], color=color, lw=2)

    # Upper y-limit. np.nanmax ignores NaNs; the builtin max() over an array that contains
    # NaN gives an order-dependent result and could leave the limit below the data.
    y_top = max(5, *(np.nanmax(values) + 0.5 for _, values, _, _ in groups))

    # Horizontal guide lines at every integer below the upper limit.
    for y in np.arange(0, y_top, 1):
        ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.5, zorder=0)

    # Customize the x-axis
    ax.set_xticks([0, 1, 2, 3])
    ax.set_xticklabels(['HC', 'SP', 'RR', 'CTRL'])
    ax.set_xlim(-0.5, 3.5)

    # Set axis limits and labels
    ax.set_ylim(0, y_top)
    ax.set_ylabel('log$_{10}$ counts')

    # Hide the top and right frame lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Remove ticks on the y-axis
    ax.yaxis.set_ticks_position('none')

    ax.set_title(species, fontsize=10)

    # The guide lines above replace the default grid.
    ax.grid(False)

In [None]:
df_normpipe_discarded_common

In [20]:
def plot_species(df, ncols_max=6):
    """Draw one four-group panel (HC / SP / RR / CTRL) per row of ``df``.

    Parameters
    ----------
    df : DataFrame indexed by the panel title, with the count columns HC1-HC4, SP1-SP4,
        RR1-RR4, ACIDOLA and BLACTIS. Index labels are assumed to be unique.
    ncols_max : number of panel columns in the grid; rows are added as needed.

    Counts are shown as log10(count + 1). Nothing is drawn when ``df`` is empty.
    """
    nplots = len(df)
    if not nplots:
        return

    nrows = -(-nplots // ncols_max)  # ceiling division

    # squeeze=False always returns a 2-D array of axes, so a single species in a
    # multi-column grid (or a 1x1 grid) is handled the same way as any other case.
    fig, axs = plt.subplots(nrows, ncols_max, figsize=(2 * ncols_max, 2 * nrows), squeeze=False)
    axes = axs.ravel()

    for ax, species in zip(axes, df.index):
        counts = df.loc[species]
        yHC = np.log10(counts[['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
        ySP = np.log10(counts[['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)
        yRR = np.log10(counts[['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)
        yCTRL = np.log10(counts[['ACIDOLA', 'BLACTIS']].astype(float).values + 1)

        plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species)

    # Hide the grid cells left over in the last row.
    for ax in axes[nplots:]:
        ax.axis('off')
    plt.tight_layout()



In [None]:
# Each plt.savefig call below writes the figure created by the plot_species call just
# before it. `fmt` is used as the loop variable so the builtin `format` is not shadowed.
plot_species(df_normpipe_discarded_common.copy().set_index('name'), ncols_max=6)
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_NORM+x_discarded.{fmt}', dpi=DPI)

plot_species(df_normpipe_discarded_normplus.copy().set_index('name'), ncols_max=7)
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_NORMx_discarded.{fmt}', dpi=DPI)

# The retained entries are only displayed, not saved.
plot_species(df_normpipe_retained_species.copy().set_index('name'), ncols_max=8)



## Plotting differential species

In [23]:
def plot_diff_sopecies(df_pval, cols_A, cols_B, color1, color2, nrows=2, other_labels = None):
    """Draw one two-group panel per row of ``df_pval`` with p-value and median annotations.

    Parameters
    ----------
    df_pval : DataFrame with a ``name`` column (panel title) and the count columns listed
        in ``cols_A`` and ``cols_B``.
    cols_A, cols_B : lists of column names holding the counts of the first / second group.
    color1, color2 : colours of the first / second group.
    nrows : number of panel rows; columns are added as needed to fit every row of ``df_pval``.
    other_labels : optional ``(label_A, label_B)`` pair. When omitted, the labels are the
        first column name of each group with its last character removed (e.g. 'HC1' -> 'HC').

    Counts are shown as log10(count + 1). Nothing is drawn when ``df_pval`` is empty.
    """
    nplots = len(df_pval)
    if not nplots:
        return

    if other_labels is None:
        g1, g2 = cols_A[0][:-1], cols_B[0][:-1]
    else:
        # Previously g1/g2 were left undefined on this path.
        g1, g2 = other_labels

    cols_A, cols_B = list(cols_A), list(cols_B)
    # Group label of every value passed to the annotation helpers, in column order.
    group_labels = [g1] * len(cols_A) + [g2] * len(cols_B)

    # Ceiling division so that every row of df_pval gets a panel; squeeze=False keeps the
    # axes in a 2-D array whatever the grid shape is.
    ncols = -(-nplots // nrows)
    fig, axs = plt.subplots(nrows, ncols, figsize=(2 * ncols, nrows * 2), squeeze=False)
    axes = axs.ravel()

    for idx in range(nplots):
        df_i = df_pval.iloc[idx]
        ax = axes[idx]
        y1 = np.log10(df_i[cols_A].astype(float).values + 1)
        y2 = np.log10(df_i[cols_B].astype(float).values + 1)

        species = df_i['name']

        # Long-format table of the raw counts for the annotation helpers.
        df_annot = pd.DataFrame({'group': group_labels, 'exp': df_i[cols_A + cols_B].astype(float).values})

        plot_ax_2(ax, y1=y1, y2=y2, color1=color1, color2=color2, group1=g1, group2=g2, species=species)
        annotation_pval(df_annot, ax, [g1, g2], x_col='group', y_col='exp')
        add_medians(df_annot, ax, x_col='group', y_col='exp', xsync=0.15, ysync=0)

    # Hide the grid cells that have no panel.
    for ax in axes[nplots:]:
        ax.axis('off')

    plt.tight_layout()

In [None]:
# Plot HC vs RR
df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')

# Figure 1: entries with p < 0.15, split by the sign of log2FC.
# NOTE(review): after the ascending sort, head() keeps the smallest positive log2FC values
# and tail() the non-positive values closest to zero - confirm this is the intended selection.
df_pval_HCvsRR_up = df_pval_HCvsRR[(df_pval_HCvsRR['pval_MW'] < 0.15) & (df_pval_HCvsRR['log2FC'] > 0)].sort_values(by='log2FC').head()
df_pval_HCvsRR_down = df_pval_HCvsRR[(df_pval_HCvsRR['pval_MW'] < 0.15) & (df_pval_HCvsRR['log2FC'] <= 0)].sort_values(by='log2FC').tail()

df_pval_HCvsRR_updown = pd.concat([df_pval_HCvsRR_up, df_pval_HCvsRR_down])
# Keep an even number of panels.
df_pval_HCvsRR_updown = df_pval_HCvsRR_updown.iloc[:len(df_pval_HCvsRR_updown) - len(df_pval_HCvsRR_updown) % 2]

plot_diff_sopecies(df_pval_HCvsRR_updown,  ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'], "#648FFF", "#FE6100", nrows=1)

# `fmt` instead of `format`, which would shadow the builtin.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_HCvRR.{fmt}', dpi=DPI)


# Figure 2: entries with p < 0.05 - the five lowest and the five highest log2FC values.
df_pval_HCvsRR_up = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC').head()
df_pval_HCvsRR_down = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC').tail()

df_pval_HCvsRR_updown = pd.concat([df_pval_HCvsRR_up, df_pval_HCvsRR_down])
# head() and tail() overlap when fewer than ten entries pass the threshold; drop the
# repeated rows so that no entry is plotted twice.
df_pval_HCvsRR_updown = df_pval_HCvsRR_updown[~df_pval_HCvsRR_updown.index.duplicated()]

plot_diff_sopecies(df_pval_HCvsRR_updown,  ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'], "#648FFF", "#FE6100", nrows=1)

for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_HCvRR.{fmt}', dpi=DPI)

In [None]:
# Plot HC vs SP

df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')

# Figure 1: entries with p < 0.15, split by the sign of log2FC.
# NOTE(review): after the ascending sort, head() keeps the smallest positive log2FC values
# and tail() the non-positive values closest to zero - confirm this is the intended selection.
df_pval_HCvsSP_up = df_pval_HCvsSP[(df_pval_HCvsSP['pval_MW'] < 0.15) & (df_pval_HCvsSP['log2FC'] > 0)].sort_values(by='log2FC').head()
df_pval_HCvsSP_down = df_pval_HCvsSP[(df_pval_HCvsSP['pval_MW'] < 0.15) & (df_pval_HCvsSP['log2FC'] <= 0)].sort_values(by='log2FC').tail()

df_pval_HCvsSP_updown = pd.concat([df_pval_HCvsSP_up, df_pval_HCvsSP_down])
# Keep an even number of panels.
df_pval_HCvsSP_updown = df_pval_HCvsSP_updown.iloc[:len(df_pval_HCvsSP_updown) - len(df_pval_HCvsSP_updown) % 2]

plot_diff_sopecies(df_pval_HCvsSP_updown,  ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#648FFF", "#DC267F", nrows=1)

# `fmt` instead of `format`, which would shadow the builtin.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_HCvSP.{fmt}', dpi=DPI)


# Figure 2: entries with p < 0.05 - the five lowest log2FC values.
df_pval_HCvsSP_up = df_pval_HCvsSP[df_pval_HCvsSP['pval_MW'] < 0.05].sort_values(by='log2FC').head()

plot_diff_sopecies(df_pval_HCvsSP_up,  ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'], "#648FFF", "#DC267F", nrows=1)

for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_HCvsSP.{fmt}', dpi=DPI)

In [None]:
# Plot RR vs SP
# Load the table for the selected mode and S in this cell. Without this read,
# `df_pval_RRvsSP` is whatever an earlier cell left behind (the last mode/S combination of
# the sweep), or undefined on a fresh kernel.
df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
# df_pval_RRvsSP_up = df_pval_RRvsSP[(df_pval_RRvsSP['pval_MW'] < 0.15) & (df_pval_RRvsSP['log2FC'] > 0)].sort_values(by='log2FC').head(14)
# df_pval_RRvsSP_down = df_pval_RRvsSP[(df_pval_RRvsSP['pval_MW'] < 0.15) & (df_pval_RRvsSP['log2FC'] <= 0)].sort_values(by='log2FC').head(14)

# df_pval_RRvsSP_updown = pd.concat([df_pval_RRvsSP_up, df_pval_RRvsSP_down])
# df_pval_RRvsSP_updown = df_pval_RRvsSP_updown.iloc[:len(df_pval_RRvsSP_updown) - len(df_pval_RRvsSP_updown) % 2]

# ncols = len(df_pval_RRvsSP_updown) // 3

# plot_diff_sopecies(df_pval_RRvsSP_updown,  ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'],  "#FE6100", "#DC267F", nrows=3)

# for format in ['png', 'tiff']: 
#     plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_RRvsSP.{format}', dpi=DPI)


# Entries with p < 0.05 - up to sixteen, lowest log2FC first.
df_pval_RRvsSP_up = df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05].sort_values(by='log2FC').head(16)

plot_diff_sopecies(df_pval_RRvsSP_up,  ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'],  "#FE6100", "#DC267F", nrows=1)

# `fmt` instead of `format`, which would shadow the builtin.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_RRvsSP.{fmt}', dpi=DPI)

In [None]:
# Show the RR vs SP selection that was plotted (`df_pval_RRvsSP_updown` is only assigned
# in commented-out code, so displaying it raised a NameError).
df_pval_RRvsSP_up

In [None]:
# Plot sex

df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')
# Relabel the sample columns by sex so that the group label can be derived from the column name.
df_pval_sex = df_pval_sex.rename(columns={'HC1':'Male1', 'HC2':'Male2', 'RR1':'Male3', 'RR2':'Male4', 'SP1':'Male5', 'SP2':'Male6', 'HC3': 'Female1', 'HC4': 'Female2', 'RR3': 'Female3', 'RR4': 'Female4', 'SP3': 'Female5', 'SP4': 'Female6'})

# One group order and colour assignment for both figures. The two calls used to pass the
# groups in opposite order, which swapped the colour and the side of each sex between figures.
# NOTE(review): the order of the first figure (Female, Male) is used for both - confirm it
# matches the sign convention of the `log2FC` column.
cols_female = [f'Female{i}' for i in range(1,7)]
cols_male = [f'Male{i}' for i in range(1,7)]

# Figure 1: entries with p < 0.15, split by the sign of log2FC.
df_pval_sex_up = df_pval_sex[(df_pval_sex['pval_MW'] < 0.15) & (df_pval_sex['log2FC'] > 0)].sort_values(by='log2FC').head(10)
df_pval_sex_down = df_pval_sex[(df_pval_sex['pval_MW'] < 0.15) & (df_pval_sex['log2FC'] <= 0)].sort_values(by='log2FC').tail(10)

df_pval_sex_updown = pd.concat([df_pval_sex_up, df_pval_sex_down])
# Keep an even number of panels.
df_pval_sex_updown = df_pval_sex_updown.iloc[:len(df_pval_sex_updown) - len(df_pval_sex_updown) % 2]

plot_diff_sopecies(df_pval_sex_updown, cols_female, cols_male, "#785EF0", "#FFB000", nrows=1)

# `fmt` instead of `format`, which would shadow the builtin.
for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/l2fc_sex.{fmt}', dpi=DPI)


# Figure 2: entries with p < 0.05 - the five lowest log2FC values.
df_pval_sex_up = df_pval_sex[df_pval_sex['pval_MW'] < 0.05].sort_values(by='log2FC').head()

plot_diff_sopecies(df_pval_sex_up, cols_female, cols_male, "#785EF0", "#FFB000", nrows=1)


for fmt in ['png', 'tiff']:
    plt.savefig(f'{RESULTS_DIR}/figures/paper/pval_sex.{fmt}', dpi=DPI)

In [None]:
species = 'Sutterella'

# All three merged-count tables stacked into one, indexed by name.
df_all = pd.concat([df_normpipe_retained_species, df_normpipe_discarded_normplus, df_normpipe_discarded_common]).set_index('name')

# log10(count + 1) values of the selected entry, per group.
species_counts = df_all.loc[species]
yHC, ySP, yRR, yCTRL = (
    np.log10(species_counts[group_cols].astype(float).values + 1)
    for group_cols in (
        ['HC1', 'HC2', 'HC3', 'HC4'],
        ['SP1', 'SP2', 'SP3', 'SP4'],
        ['RR1', 'RR2', 'RR3', 'RR4'],
        ['ACIDOLA', 'BLACTIS'],
    )
)

fig, ax = plt.subplots(1, 1)
plot_ax_4(ax, yHC, ySP, yRR, yCTRL, species)

# Matching rows in the merged-count tables, then in the four differential-abundance tables.
for df_lookup in (
    df_normpipe_discarded_common,
    df_normpipe_discarded_normplus,
    df_normpipe_retained_species,
    df_pval_HCvsRR,
    df_pval_HCvsSP,
    df_pval_RRvsSP,
    df_pval_sex,
):
    display(df_lookup[df_lookup['name'] == species])