In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import json

In [None]:
# Notebook-wide display settings: three decimals for floats, no column truncation
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)

# Render figures at a higher resolution than the matplotlib default
plt.rcParams.update({'figure.dpi': 170})

In [None]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS

# Biological sample analysis (part 2)

In this notebook we are going to do the analysis on the *biological* samples (POOL samples + controls).

One thing we want to check is how S and mode affect the robustness of our results. To do that, we are going to compare the results using a comparative plot. The comparative plot shows how many new variables appear compared to the previous case. So, we can do a comparative increase on mode by fixing S, or on S by fixing the mode.

This analysis can be performed with many variables, and we are going to choose the following:
1) The number of detected species, across all samples and per individual sample (after NORM+ and removing species with more than 65% NaNs).
2) The number of detected differentially abundant species across the 4 comparisons.

With this in mind, we can later select one S and one mode and do the following analyses.

1) The importance of including the biological control samples to ensure that false positives are not considered.
2) The importance of normalizing the reads considering the biological samples.
3) Plot the significantly differentially abundant species. 


## Plotting sample detection and statistically differential species across modes and S

In [None]:
# One record per (mode, S, sample). The 'ALL' record stores the number of rows in each
# table; the per-sample records store how many rows have a non-NaN value for that sample.
data_rows = []

for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        # Load the three NORM+ tables for this (mode, S) combination
        df_normpipe_retained_species = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t')
        df_normpipe_discarded_normplus = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t')
        df_normpipe_discarded_common = pd.read_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t')

        # Output column -> table it is counted from (insertion order fixes the column order)
        tables_by_column = {
            'count_retained': df_normpipe_retained_species,
            'count_discarded_norm': df_normpipe_discarded_normplus,
            'count_discarded_all': df_normpipe_discarded_common,
        }

        # Everything after the first three columns is treated as a sample column
        samples = df_normpipe_retained_species.columns[3:]

        # Whole-table record
        row_all = {'mode': mode, 'S': S, 'sample': 'ALL'}
        row_all.update({column: len(table) for column, table in tables_by_column.items()})
        data_rows.append(row_all)

        # Per-sample records
        for sample in samples:
            row_sample = {'mode': mode, 'S': S, 'sample': sample}
            row_sample.update({column: table[sample].notna().sum() for column, table in tables_by_column.items()})
            data_rows.append(row_sample)

# Assemble the records and add the total over the three categories
df_stats_species_count = pd.DataFrame(data_rows)
df_stats_species_count['count_total'] = df_stats_species_count[
    ['count_retained', 'count_discarded_norm', 'count_discarded_all']
].sum(axis=1)
df_stats_species_count

In [None]:
# Keep only the whole-table ('ALL') records, one per (mode, S)
df_stats_species_count.loc[df_stats_species_count['sample'].eq('ALL')]

In [None]:
# Count, for every (mode, S), how many species have pval_MW < 0.05 in each comparison.
# This cell is self-contained: it no longer reads `df_normpipe_retained_species`, which
# was only defined as a side effect of an earlier cell and whose result was never used.
comparisons = ['HCvsRR', 'HCvsSP', 'RRvsSP', 'sex']
pval_threshold = 0.05

data_rows = []

for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        row = {'mode': mode, 'S': S}

        for comparison in comparisons:
            # One differential-abundance table per comparison
            df_pval = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_{comparison}.tsv', sep='\t')
            row[f'species_{comparison}'] = int((df_pval['pval_MW'] < pval_threshold).sum())

        data_rows.append(row)

# Convert the collected rows into a dataframe and add the total across comparisons
df_stats_species_diffabundance = pd.DataFrame(data_rows)
df_stats_species_diffabundance['count_total'] = df_stats_species_diffabundance[
    [f'species_{comparison}' for comparison in comparisons]
].sum(axis=1)
df_stats_species_diffabundance


## Why normalization with biological controls is relevant

In [None]:
# Parameter combination used by all the cells below (one of the mode/S values scanned above)
mode, S = 3, 0

In [None]:
def _read_tsv(path):
    """Read a tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Reload the three NORM+ tables for the selected mode and S
df_normpipe_retained_species = _read_tsv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv')
df_normpipe_discarded_normplus = _read_tsv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv')
df_normpipe_discarded_common = _read_tsv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv')

In [None]:
species = 'Candida'

# Show the rows named `species` in the retained table and in both discarded tables
for df_table in (df_normpipe_retained_species, df_normpipe_discarded_normplus, df_normpipe_discarded_common):
    display(df_table[df_table['name'] == species])

## Plotting differential species

In [None]:
def plot_ax(ax, y1, y2, color1, color2, group1, group2, species):
    """Draw a two-group strip plot with per-group mean bars on ``ax``.

    The first group is drawn as a column of points at x=1 and the second at x=2.
    A short horizontal bar marks each group's mean, computed ignoring NaNs.

    Parameters
    ----------
    ax : matplotlib Axes (or any object exposing the Axes methods used here)
        Target axes; modified in place.
    y1, y2 : array-like of float
        Values of the two groups. NaNs are allowed: they are ignored for the
        mean bars and for the y-axis limit.
    color1, color2 : matplotlib color
        Colors for the points and mean bar of each group.
    group1, group2 : str
        Group names, used as scatter labels and x tick labels.
    species : str
        Used as the panel title.

    Returns
    -------
    None
    """
    y1 = np.asarray(y1, dtype=float)
    y2 = np.asarray(y2, dtype=float)

    ax.scatter([1] * len(y1), y1, color=color1, label=group1, alpha=0.8)
    ax.scatter([2] * len(y2), y2, color=color2, label=group2, alpha=0.8)

    # Add the means as horizontal lines; a group with no non-NaN value gets no bar
    for x_pos, values, color in ((1, y1, color1), (2, y2, color2)):
        if values.size and not np.isnan(values).all():
            group_mean = np.nanmean(values)
            ax.plot([x_pos - 0.2, x_pos + 0.2], [group_mean, group_mean], color=color, lw=2)

    # Upper y limit: at least 5, otherwise the largest finite value plus a 0.5 margin.
    # Builtin max() is order-dependent when NaNs are present and could clip points,
    # so the limit is taken over finite values only.
    finite_values = np.concatenate([y1[np.isfinite(y1)], y2[np.isfinite(y2)]])
    y_top = max(5, finite_values.max() + 0.5) if finite_values.size else 5

    # Add horizontal gridlines at 0.5 intervals, covering the whole y range
    for y in np.arange(0, y_top + 1e-9, 0.5):
        ax.axhline(y, color='lightgray', linestyle='--', linewidth=0.8, zorder=0)

    # Customize the x-axis
    ax.set_xticks([1, 2])
    ax.set_xticklabels([group1, group2])
    ax.set_xlim(0.5, 2.5)

    # Set axis limits and labels
    ax.set_ylim(0, y_top)
    ax.set_ylabel('log$_{10}$ counts')

    # Remove the x and y axis lines
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # Remove ticks on the y-axis
    ax.yaxis.set_ticks_position('none')

    ax.set_title(species)

    # Disable the built-in grid; the gridlines are the axhline calls above
    ax.grid(False)

In [None]:
# Plot HC vs RR: one panel per species with pval_MW < 0.05, ordered by log2FC
df_pval_HCvsRR = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t')
df_pval_HCvsRR = df_pval_HCvsRR[df_pval_HCvsRR['pval_MW'] < 0.05].sort_values(by='log2FC')

ncols = len(df_pval_HCvsRR)

if ncols:
    # squeeze=False keeps `axs` a 2-D array even when there is a single panel;
    # without it plt.subplots(1, 1) returns a bare Axes and indexing it fails.
    fig, axs = plt.subplots(1, ncols, figsize=(3 * ncols, 3), squeeze=False)

    for i in range(ncols):
        ax = axs[0, i]
        # log10(count + 1) so that zero counts map to 0
        y1 = np.log10(df_pval_HCvsRR.iloc[i][['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
        y2 = np.log10(df_pval_HCvsRR.iloc[i][['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)

        species = df_pval_HCvsRR.iloc[i]['name']

        plot_ax(ax, y1=y1, y2=y2, color1='skyblue', color2='crimson', group1='HC', group2='RR', species=species)

    plt.tight_layout()
    plt.show()

In [None]:
# HC vs SP: one panel per species whose pval_MW is below 0.05, sorted by log2FC

df_pval_HCvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t')
significant_mask = df_pval_HCvsSP['pval_MW'] < 0.05
df_pval_HCvsSP = df_pval_HCvsSP[significant_mask].sort_values(by='log2FC')

ncols = len(df_pval_HCvsSP)

if ncols:
    # A single row of panels, 3 inches wide each
    fig, axs = plt.subplots(1, ncols, figsize=(3 * ncols, 3))

    for i in range(ncols):
        row = df_pval_HCvsSP.iloc[i]
        # With one panel plt.subplots returns the Axes itself instead of an array
        ax = axs if ncols == 1 else axs[i]

        # log10(count + 1) for the four samples of each group
        hc_values = np.log10(row[['HC1', 'HC2', 'HC3', 'HC4']].astype(float).values + 1)
        sp_values = np.log10(row[['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)

        species = row['name']

        plot_ax(ax, y1=hc_values, y2=sp_values, color1='skyblue', color2='purple',
                group1='HC', group2='SP', species=species)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot RR vs SP: one panel per species with pval_MW < 0.05, ordered by log2FC,
# laid out on a two-row grid

df_pval_RRvsSP = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t')
df_pval_RRvsSP = df_pval_RRvsSP[df_pval_RRvsSP['pval_MW'] < 0.05].sort_values(by='log2FC')

ncols = len(df_pval_RRvsSP)

# Only build the figure when there is something to plot: plt.subplots rejects a
# grid with zero columns.
if ncols:
    # Two rows; enough columns to hold all panels (one spare cell when ncols is odd).
    # squeeze=False guarantees an array of Axes whatever the grid shape.
    grid_cols = ncols // 2 + ncols % 2
    fig, axs = plt.subplots(2, grid_cols, figsize=(2 * ncols, 2 * 3), squeeze=False)
    panels = axs.ravel()

    for i in range(ncols):
        ax = panels[i]
        # log10(count + 1) so that zero counts map to 0
        y1 = np.log10(df_pval_RRvsSP.iloc[i][['RR1', 'RR2', 'RR3', 'RR4']].astype(float).values + 1)
        y2 = np.log10(df_pval_RRvsSP.iloc[i][['SP1', 'SP2', 'SP3', 'SP4']].astype(float).values + 1)

        species = df_pval_RRvsSP.iloc[i]['name']

        plot_ax(ax, y1=y1, y2=y2, color1='crimson', color2='purple', group1='RR', group2='SP', species=species)

    # Hide whatever grid cells were not used (none when ncols is even)
    for unused_ax in panels[ncols:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot sex: one panel per species with pval_MW < 0.05, ordered by log2FC

df_pval_sex = pd.read_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t')
df_pval_sex = df_pval_sex[df_pval_sex['pval_MW'] < 0.05].sort_values(by='log2FC')

ncols = len(df_pval_sex)

# Only build the figure when there is something to plot: plt.subplots rejects a
# grid with zero columns.
if ncols:
    # squeeze=False keeps `axs` a 2-D array even when there is a single panel
    fig, axs = plt.subplots(1, ncols, figsize=(3 * ncols, 3), squeeze=False)

    for i in range(ncols):
        ax = axs[0, i]
        # Samples 1-2 of every group are plotted as 'male', samples 3-4 as 'female';
        # values are log10(count + 1)
        y1 = np.log10(df_pval_sex.iloc[i][['HC1', 'HC2', 'RR1', 'RR2', 'SP1', 'SP2']].astype(float).values + 1)
        y2 = np.log10(df_pval_sex.iloc[i][['HC3', 'HC4', 'RR3', 'RR4', 'SP3', 'SP4']].astype(float).values + 1)

        species = df_pval_sex.iloc[i]['name']

        plot_ax(ax, y1=y1, y2=y2, color1='crimson', color2='purple', group1='male', group2='female', species=species)

    plt.tight_layout()
    plt.show()



In [None]:
species = 'Candida'

# Show the rows named `species` in each comparison table, in the state the
# preceding cells left them (filtered to pval_MW < 0.05)
for df_comparison in (df_pval_HCvsRR, df_pval_HCvsSP, df_pval_RRvsSP, df_pval_sex):
    display(df_comparison[df_comparison['name'] == species])