In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from matplotlib.colors import Normalize, ListedColormap
from scipy.stats import linregress, pearsonr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import json
from kneed import KneeLocator


In [2]:
# Pandas display: floats are shown with three decimals and wide tables are
# printed with every column (no column limit).
pd.set_option('display.float_format', '{:.3f}'.format)
pd.options.display.max_columns = None

# Matplotlib: render every figure at 170 DPI.
plt.rcParams.update({'figure.dpi': 170})

In [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR, POOLS, CONTROLS

# Biological sample analysis

In this notebook we are going to do an analysis on the *biological* samples (POOL samples + controls).

There are two main variables that we are going to consider:
- The importance of including the biological control samples to ensure that false positives are not considered.
- The importance of normalizing the reads considering the biological samples.

These two concepts are intertwined, so what we are going to do is the following:
- Load all pool + control tables.
- Get the species that are selected using the flags. With that we are going to generate a cut-off table with the species. The way to merge the table is "outer", that is, we are going to include any species that appears in any sample. We can later discard them.
- We are going to discard the species whose mean normalized abundance across the pools is not more than X times their maximum abundance in the control samples. X is determined dynamically by knee detection (the `kneed` package).
- Then we are going to compare which species have been discarded using the | normalization (normalizing controls and pools separately) and the + normalization (normalizing controls and pools jointly).

In [4]:
# Loading the tables
def process_samples(pass_num, mode, S, NORM, samples, verbose, min_sample_flag='dynamic'):
    """Merge the per-sample genus tables and select taxa by how many samples flag them.

    For every sample two files are read from ``{RESULTS_DIR}/summary``: the
    ``.diversity.tsv`` table (its ``mean_norm`` column becomes the sample's
    column in the merged table) and the ``.flags.tsv`` table. A taxon counts
    as supported by a sample when its ``mean_norm`` entry in that sample's
    flags table is ``False``.

    Parameters:
        pass_num, mode, S, NORM: Values interpolated into the file names.
        samples (list): Sample names to load, in the order their columns
            should appear. Must not be empty.
        verbose (bool): Print the support histogram and the selection summary.
        min_sample_flag (int or 'dynamic'): Minimum number of supporting
            samples for a taxon to be selected. With ``'dynamic'`` the value
            is the knee of the cumulative support histogram (``KneeLocator``).

    Returns:
        tuple: ``(joined_counts, taxid_list, cut_df)`` where
            - ``joined_counts`` is the outer-merged table of all taxa, with
              ``n_samples_flag`` and ``selected_flag`` columns added,
            - ``taxid_list`` holds one entry per (sample, supported taxon),
            - ``cut_df`` is the subset of ``joined_counts`` that was selected.
        Both tables have ``taxonomy_id`` as a regular column and are sorted in
        descending order by the renamed pool columns that are present.

    Raises:
        ValueError: If ``samples`` is empty, or if ``min_sample_flag`` is
            ``'dynamic'`` and no knee can be located.
    """
    if len(samples) == 0:
        raise ValueError('`samples` must contain at least one sample name.')

    wanted_cols = ['name', 'lineage', 'mean_norm']
    joined_counts = None
    taxid_list = []
    # Every join after the first one adds suffixed copies of `name`/`lineage`;
    # keep track of them so they can be consolidated and dropped afterwards.
    name_cols, lineage_cols = ['name'], ['lineage']

    for sample in samples:
        # Define file paths for counts and flags
        file_prefix = f'{RESULTS_DIR}/summary/{sample}_pass{pass_num}_mode{mode}_taxgenus_S{S}_{NORM}'
        counts_file = f'{file_prefix}.diversity.tsv'
        flags_file = f'{file_prefix}.flags.tsv'

        # Load the data
        df_counts = pd.read_csv(counts_file, sep='\t').set_index('taxonomy_id')[wanted_cols]
        df_flags = pd.read_csv(flags_file, sep='\t').set_index('taxonomy_id')[wanted_cols]

        df_counts = df_counts.rename(columns={'mean_norm': sample})
        # An explicit sentinel (instead of `DataFrame.empty`) keeps the column
        # of a sample whose table has no rows.
        if joined_counts is None:
            joined_counts = df_counts
        else:
            joined_counts = joined_counts.join(df_counts, how='outer', rsuffix=f'_{sample}')
            name_cols.append(f'name_{sample}')
            lineage_cols.append(f'lineage_{sample}')

        # Add tax_ids where mean_norm is False in flags dataframe
        taxid_list += df_flags[df_flags['mean_norm'] == False].index.tolist()

    # Consolidate name and lineage: first non-null value across the samples
    joined_counts['name'] = joined_counts[name_cols].bfill(axis=1).iloc[:, 0]
    joined_counts['lineage'] = joined_counts[lineage_cols].bfill(axis=1).iloc[:, 0]
    joined_counts = joined_counts.drop(columns=name_cols[1:] + lineage_cols[1:])

    # Rename pool columns based on their ranges
    rename_mapping = {
        'POOL1': 'RR1', 'POOL2': 'RR2', 'POOL3': 'RR3', 'POOL4': 'RR4',
        'POOL5': 'SP1', 'POOL6': 'SP2', 'POOL7': 'SP3', 'POOL8': 'SP4',
        'POOL9': 'HC1', 'POOL10': 'HC2', 'POOL11': 'HC3', 'POOL12': 'HC4'
    }
    joined_counts = joined_counts.rename(columns=rename_mapping)

    # Number of samples supporting each taxon
    taxidvalues, samplecounts = np.unique(taxid_list, return_counts=True)
    joined_counts['n_samples_flag'] = 0
    if len(taxidvalues) > 0:
        joined_counts.loc[taxidvalues, 'n_samples_flag'] = samplecounts

    # Histogram: how many taxa are supported by 0, 1, 2, ... samples
    n_samples, counts_ntaxids = np.unique(joined_counts['n_samples_flag'].values, return_counts=True)

    if isinstance(min_sample_flag, str) and min_sample_flag == 'dynamic':
        kneedle = KneeLocator(n_samples, np.cumsum(counts_ntaxids), curve='concave', direction='increasing', S=0)
        if kneedle.knee is None:
            # Without this check the `>=` comparison below would fail on None.
            raise ValueError(
                "Could not locate a knee in the flag-support curve; "
                "pass an explicit integer as `min_sample_flag`."
            )
        min_sample_flag = kneedle.knee

    joined_counts['selected_flag'] = joined_counts['n_samples_flag'] >= min_sample_flag
    cut_df = joined_counts[joined_counts['selected_flag']]

    if verbose:
        n_total = len(joined_counts)
        selected_pct = 100 * len(cut_df) / n_total if n_total else 0.0
        print('TaxIDs species count:', n_samples, counts_ntaxids)
        print(f'Flag threshold: {min_sample_flag} | Number of species: {n_total} | Species selected: {len(cut_df)} ({selected_pct:.2f}%)')

    # Reset index and sort (descending) by the pool columns that were loaded;
    # restricting to present columns lets the function run on a subset of pools.
    sort_cols = [col for col in rename_mapping.values() if col in joined_counts.columns]
    joined_counts = joined_counts.reset_index()
    cut_df = cut_df.reset_index()
    if sort_cols:
        joined_counts = joined_counts.sort_values(by=sort_cols, ascending=False)
        cut_df = cut_df.sort_values(by=sort_cols, ascending=False)

    return joined_counts, taxid_list, cut_df
# Samples to load: POOL1 ... POOL12 followed by the two control samples
samples = [f'POOL{pool_number}' for pool_number in range(1, 13)] + ['ACIDOLA', 'BLACTIS']

In [5]:
def filter_by_nan_percentage(df, per_cutoff=0.35):
    """Keep the rows that are missing from fewer than ``per_cutoff`` of the pool samples.

    Parameters:
        df (pd.DataFrame): Merged table with one column per sample plus
            metadata columns.
        per_cutoff (float): Rows whose fraction of NaN values across the
            sample columns is greater than or equal to this value are dropped.

    Returns:
        pd.DataFrame: The rows of ``df`` that pass the cut-off.
    """
    control_cols = ['ACIDOLA', 'BLACTIS']
    # Everything that is not a pool measurement. The bookkeeping columns
    # (`n_samples_flag`, `selected_flag`, and the derived `mean_across_samples`
    # / `max_control`) are never NaN, so counting them as samples would dilute
    # the missing fraction and let rows with too many gaps through.
    non_sample_cols = [
        'taxonomy_id', 'name', 'lineage',
        'n_samples_flag', 'selected_flag',
        'mean_across_samples', 'max_control',
    ] + control_cols
    sample_cols = [col for col in df.columns if col not in non_sample_cols]

    # Fraction of pool samples in which each row has no value
    nan_fraction = df[sample_cols].isna().mean(axis=1)

    return df[nan_fraction < per_cutoff]

In [6]:
def calculate_retained_discarded(df, threshold, verbose):
    """Split a merged table into rows retained and rows discarded by the control check.

    A row is retained when its mean over the pool columns is strictly greater
    than ``threshold`` times the larger of its two control values
    (``ACIDOLA``, ``BLACTIS``). A row without any control value is treated as
    having a control value of 0, so it is retained whenever its pool mean is
    positive. Every other row (including rows with no pool value at all) is
    discarded.

    Parameters:
        df (pd.DataFrame): Merged table containing the two control columns.
            It is not modified; the function works on a copy.
        threshold (number or 'dynamic'): Required fold difference between the
            pool mean and the control maximum. With ``'dynamic'`` the fraction
            of discarded rows is computed for thresholds 1..499 and the knee
            of that curve (``KneeLocator``) is used.
        verbose (bool): Print the threshold and the number of discarded rows.

    Returns:
        tuple: ``(retained, discarded)``, both carrying the added
        ``mean_across_samples`` and ``max_control`` columns.

    Raises:
        ValueError: If ``threshold`` is ``'dynamic'`` and no knee is found.
    """
    df = df.copy()

    control_cols = ['ACIDOLA', 'BLACTIS']
    # Only pool measurements may enter the mean. The bookkeeping columns
    # `n_samples_flag` (an integer count) and `selected_flag` (a boolean)
    # would otherwise be averaged together with the abundances and inflate
    # the value compared against the controls.
    non_sample_cols = [
        'taxonomy_id', 'name', 'lineage',
        'n_samples_flag', 'selected_flag',
        'mean_across_samples', 'max_control',
    ] + control_cols
    sample_cols = [col for col in df.columns if col not in non_sample_cols]

    # Mean across pools (NaN ignored) and the largest control value (0 if none)
    df['mean_across_samples'] = df[sample_cols].mean(axis=1, skipna=True)
    df['max_control'] = df[control_cols].max(axis=1, skipna=True).fillna(0)

    if df.empty:
        # Nothing to split; also avoids dividing by zero below.
        if verbose:
            print(f'Threshold: {threshold} | Number of species: 0 | Species discarded: 0 (0.00%)')
        return df, df.copy()

    if isinstance(threshold, str) and threshold == 'dynamic':
        # The upper bound of 500 is an arbitrary search range, unrelated to
        # the size of the table.
        candidate_thresholds = np.arange(1, 500)
        discarded_fractions = []
        for candidate in candidate_thresholds:
            passes = df['mean_across_samples'] > (df['max_control'] * candidate)
            discarded_fractions.append((~passes).sum() / len(df))

        kneedle = KneeLocator(candidate_thresholds, discarded_fractions, curve='concave', direction='increasing', S=-1)
        if kneedle.knee is None:
            raise ValueError(
                "Could not locate a knee in the discarded-fraction curve; "
                "pass an explicit numeric `threshold`."
            )
        threshold = kneedle.knee

    # `max_control` never contains NaN at this point, so one mask is enough.
    keep = df['mean_across_samples'] > (df['max_control'] * threshold)
    retained = df[keep]
    discarded = df[~keep]

    if verbose:
        print(f'Threshold: {threshold} | Number of species: {len(df)} | Species discarded: {len(discarded)} ({100 * len(discarded) / len(df):.2f}%)')

    return retained, discarded

def filter_species_ids(joined_counts_norm_plus, joined_counts_norm_pipe, threshold='dynamic', verbose=True):
    """Run the control check on both normalizations and compare the outcomes.

    Each table is split independently with ``calculate_retained_discarded``
    (same ``threshold`` and ``verbose`` for both), and the ``taxonomy_id``
    values of the resulting groups are compared.

    Returns:
        dict: Lists of taxonomy ids under the keys ``discarded_common``
        (discarded in both tables), ``discarded_exclusive_norm_plus``,
        ``discarded_exclusive_norm_pipe`` (discarded in only one table),
        ``retained_norm_plus`` and ``retained_norm_pipe``.
    """
    def split_ids(table):
        # taxonomy ids of the (retained, discarded) rows of one table
        kept, dropped = calculate_retained_discarded(table, threshold, verbose)
        return kept['taxonomy_id'].tolist(), dropped['taxonomy_id'].tolist()

    def only_in(first, second):
        # ids of `first` that do not occur in `second`, in their original order
        return [taxid for taxid in first if taxid not in second]

    kept_plus, dropped_plus = split_ids(joined_counts_norm_plus)
    kept_pipe, dropped_pipe = split_ids(joined_counts_norm_pipe)

    comparison = {}
    comparison['discarded_common'] = np.intersect1d(dropped_plus, dropped_pipe).tolist()
    comparison['discarded_exclusive_norm_plus'] = only_in(dropped_plus, dropped_pipe)
    comparison['discarded_exclusive_norm_pipe'] = only_in(dropped_pipe, dropped_plus)
    comparison['retained_norm_plus'] = kept_plus
    comparison['retained_norm_pipe'] = kept_pipe
    return comparison

In [7]:
def differential_abundance_analysis(df, condition_cols, reference_cols):
    """
    Perform differential abundance analysis between condition and reference groups.

    For every row of ``df`` the non-NaN values of the two groups are compared
    with a two-sided Mann-Whitney U test, and the log2 of the ratio of the
    group means (condition / reference) is computed. The p-values are then
    adjusted with the Benjamini-Hochberg procedure (``fdr_bh``).

    Rows in which either group has no observed value cannot be tested: they
    receive NaN for ``log2FC``, ``pval_MW`` and ``pval_MW_corrected`` and are
    left out of the multiple-testing correction, so they neither abort the
    analysis nor influence the adjusted p-values of the testable rows.

    Parameters:
        df (pd.DataFrame): Input dataframe containing species counts and metadata.
        condition_cols (list): Column names for the condition group.
        reference_cols (list): Column names for the reference group.

    Returns:
        pd.DataFrame: Copy of ``df`` with the columns ``log2FC``, ``pval_MW``
        and ``pval_MW_corrected`` added, sorted by ``pval_MW`` in ascending
        order (untestable rows last).
    """
    # Pull both groups out once instead of re-slicing the frame for every row
    condition_matrix = df[condition_cols].astype(float).to_numpy()
    reference_matrix = df[reference_cols].astype(float).to_numpy()

    n_rows = len(df)
    pvals_mannwhitney = np.full(n_rows, np.nan)
    log2_fold_changes = np.full(n_rows, np.nan)

    for row in range(n_rows):
        condition_vals = condition_matrix[row][~np.isnan(condition_matrix[row])]
        reference_vals = reference_matrix[row][~np.isnan(reference_matrix[row])]

        # A group with no observations cannot be tested; keep the NaN defaults
        if condition_vals.size == 0 or reference_vals.size == 0:
            continue

        # Mann-Whitney U test
        res_mw = mannwhitneyu(condition_vals, reference_vals, alternative='two-sided')
        pvals_mannwhitney[row] = res_mw.pvalue

        # Log2 fold change
        log2_fold_changes[row] = np.log2(condition_vals.mean() / reference_vals.mean())

    # Compile results
    df_pval = df.copy()
    df_pval['log2FC'] = log2_fold_changes
    df_pval['pval_MW'] = pvals_mannwhitney

    # Correct only the p-values that exist; untestable rows keep NaN
    pvals_corrected = np.full(n_rows, np.nan)
    testable = ~np.isnan(pvals_mannwhitney)
    if testable.any():
        _, corrected, _, _ = multipletests(pvals_mannwhitney[testable], alpha=0.05, method='fdr_bh')
        pvals_corrected[testable] = corrected
    df_pval['pval_MW_corrected'] = pvals_corrected

    # Sort by p-values
    df_pval = df_pval.sort_values(by=['pval_MW'])

    return df_pval

In [8]:
# Make sure both output folders exist before anything is written to them
for output_subdir in ('merged_counts', 'differential_abundance'):
    os.makedirs(f'{RESULTS_DIR}/{output_subdir}', exist_ok=True)

In [None]:
# Sweep over every (mode, S) combination. For each one: merge the sample tables
# for both normalizations, drop sparsely observed taxa, apply the control check,
# export the resulting tables and run the differential abundance comparisons.
for mode in [3, 5, 7]:
    for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
        print(f'MODE: {mode} | S {S}')
        # NORM+ tables. A fixed flag threshold of 2 is used here instead of the
        # function's 'dynamic' default.
        df_all_normplus, taxid_list, df_cut_normplus = process_samples(pass_num=2, mode=mode, S=S, NORM='NORM+', samples=samples, verbose=True, min_sample_flag=2)
        df_cut_nan_percentage_normplus = filter_by_nan_percentage(df_cut_normplus, per_cutoff=0.35)

        # NORMx tables, processed the same way. `taxid_list` is overwritten
        # here and is not used again in this cell.
        # NOTE(review): the markdown above refers to this as the "|"
        # normalization while the files are named NORMx - confirm they are the same.
        df_all_normpipe, taxid_list, df_cut_normpipe = process_samples(pass_num=2, mode=mode, S=S, NORM='NORMx', samples=samples, verbose=True, min_sample_flag=2)
        df_cut_nan_percentage_normpipe = filter_by_nan_percentage(df_cut_normpipe, per_cutoff=0.35)

        # Control check on both tables (default 'dynamic' threshold); the
        # resulting id lists are saved as JSON.
        # NOTE(review): the output path ends in `.tsv` although the content is JSON.
        dict_filternorm_cut = filter_species_ids(df_cut_nan_percentage_normplus, df_cut_nan_percentage_normpipe)
        with open(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_dict_norm+|_species.tsv', "w") as file:
            json.dump(dict_filternorm_cut, file)
        
        # Size of every id list
        print([(i, len(dict_filternorm_cut[i])) for i in dict_filternorm_cut.keys()])

        # --- NORM+ values of: taxa discarded under both normalizations,
        # taxa discarded only under NORM+, and taxa retained under NORM+ ---
        df_cut_nan_percentage_normplus_discarded_common = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['discarded_common'])]
        df_cut_nan_percentage_normplus_discarded_common.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_common.tsv', sep='\t', index=None)

        df_cut_nan_percentage_normplus_discarded_normplus = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['discarded_exclusive_norm_plus'])]
        df_cut_nan_percentage_normplus_discarded_normplus.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_discarded_norm+.tsv', sep='\t', index=None)
        
        df_cut_nan_percentage_normplus_retained = df_cut_nan_percentage_normplus[df_cut_nan_percentage_normplus['taxonomy_id'].isin(dict_filternorm_cut['retained_norm_plus'])]
        df_cut_nan_percentage_normplus_retained.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORM+_retained.tsv', sep='\t', index=None)

        # --- NORMx values of the same three groups ---
        df_cut_nan_percentage_normpipe_discarded_common = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['discarded_common'])]
        df_cut_nan_percentage_normpipe_discarded_common.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_discarded_common.tsv', sep='\t', index=None)

        # NOTE(review): this exports the NORMx values of the taxa discarded only
        # under NORM+. The 'discarded_exclusive_norm_pipe' list is never exported
        # as a table in this cell - confirm that this is intended.
        df_cut_nan_percentage_normpipe_discarded_normplus = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['discarded_exclusive_norm_plus'])]
        df_cut_nan_percentage_normpipe_discarded_normplus.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_discarded_norm+.tsv', sep='\t', index=None)

        df_cut_nan_percentage_normpipe_retained = df_cut_nan_percentage_normpipe[df_cut_nan_percentage_normpipe['taxonomy_id'].isin(dict_filternorm_cut['retained_norm_pipe'])]
        df_cut_nan_percentage_normpipe_retained.to_csv(f'{RESULTS_DIR}/merged_counts/mode{mode}_S{S}_NORMx_retained.tsv', sep='\t', index=None)



        # --- Differential abundance, computed on the NORM+ retained table only ---
        # Pairwise comparisons between the HC, RR and SP groups (first list is
        # the condition, second list the reference).
        df_pval_HCvsRR = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'HC3', 'HC4'], ['RR1', 'RR2', 'RR3', 'RR4'])
        df_pval_HCvsRR.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsRR.tsv', sep='\t', index=None)

        df_pval_HCvsSP = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'HC3', 'HC4'], ['SP1', 'SP2', 'SP3', 'SP4'])
        df_pval_HCvsSP.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_HCvsSP.tsv', sep='\t', index=None)

        df_pval_RRvsSP = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['RR1', 'RR2', 'RR3', 'RR4'], ['SP1', 'SP2', 'SP3', 'SP4'])
        df_pval_RRvsSP.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_RRvsSP.tsv', sep='\t', index=None)

        # Pools 1-2 of every group against pools 3-4 of every group.
        # NOTE(review): presumably pools 1-2 and 3-4 correspond to the two sexes -
        # confirm against the sample sheet.
        df_pval_sex = differential_abundance_analysis(df_cut_nan_percentage_normplus_retained, ['HC1', 'HC2', 'RR1', 'RR2', 'SP1', 'SP2'], ['HC3', 'HC4', 'RR3', 'RR4', 'SP3', 'SP4'])
        df_pval_sex.to_csv(f'{RESULTS_DIR}/differential_abundance/mode{mode}_S{S}_sex.tsv', sep='\t', index=None)

        print('\n\n')