In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import Normalize
from scipy.stats import pearsonr


In [2]:
# Show floats with three decimals and never truncate the column list when displaying DataFrames.
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

# Render figures at 170 dpi.
plt.rcParams['figure.dpi']=170

In [3]:
from list_vars import LIST_PROFILERS, DIR_FIGURES, RESULTS_DIR

# In silico sample analysis

In this notebook we are going to do an analysis on the *in silico* samples, where we are going to study several variables.

---

## How many reads are incorrectly mapped if we do not perform a host mapping step?

It has been reported that not mapping to human databases before profiling increases the number of reads assigned to other organisms. 

In this case, we are going to do 3 checks with the *in silico* dataset using pass2 (profiling after 2-time host mapping) and pass0 (direct profiling without host mapping), and we are going to check the influence of the sensitivity parameter:
-  We are going to see what is the total number of reads mapped to the human dataset, and what is the offset left unmapped which should have been mapped to human.
    - We are also going to do the same with the microbial reads, and see if more microbial reads have been assigned to the pass0 dataset.

Later in the analysis we are going to do two additional analyses:
-  We are going to see the number of species present in total between pass0 and pass2, and their jaccard index.
- We are going to calculate the ratio between the number of reads in pass0 and pass2.

In [4]:
# Tab-separated mapping counts, one row per sample, indexed by the SAMPLE column.
df_host_map_info = pd.read_csv(f'{RESULTS_DIR}/counts/mapping_counts.txt', sep='\t').set_index('SAMPLE')

In [None]:
# Composition table of the in silico sample: header-less, ';'-separated, columns species / taxid / reads.
# NOTE(review): this path is relative to the working directory, whereas a later cell reads
# '../../src/version_2/table_artificial_taxid.csv' - confirm both point to the same file.
artificial_taxid_counts = pd.read_csv('table_artificial_taxid.csv', sep=';', names=['species', 'taxid', 'reads'])
# Halve the listed counts (truncated to int); presumably the table counts both mates of a pair - TODO confirm.
artificial_taxid_counts['reads_true'] = (artificial_taxid_counts['reads'] / 2).astype(int)

# Assumes the first row of the table is the human entry - TODO confirm
# (a later cell identifies the human row by taxid 9606 instead of by position).
n_true_human_reads = int(artificial_taxid_counts['reads_true'].iloc[0])
n_true_human_reads

In [None]:
# Reads of the ARTIFICIAL sample mapped in the first plus the second mapping round.
n_mapped_reads_1and2_maps = df_host_map_info.loc['ARTIFICIAL', '1st_mapped'] + df_host_map_info.loc['ARTIFICIAL', '2nd_mapped']

# Report the mapped reads as a percentage of the true human reads, and how many true human reads remain.
print(f'There is a total of {n_mapped_reads_1and2_maps} reads mapped to human during the 1st and 2nd map, which represents around {100 * n_mapped_reads_1and2_maps/n_true_human_reads} % of the total number of reads ({n_true_human_reads}).')
print(f'There is a total of {n_true_human_reads - n_mapped_reads_1and2_maps} reads remaining to be mapped.')

In [None]:
# Per-profiler read counts (human / other / unmapped) for the ARTIFICIAL sample,
# completed with observed and expected proportions.
df_host_profile_info = pd.read_csv(f'{RESULTS_DIR}/counts/profiling_counts_ARTIFICIAL.txt', sep='\t')

# Expected composition according to the ground-truth table.
n_true_total_reads = artificial_taxid_counts['reads_true'].sum()
expected_human_prop = n_true_human_reads / n_true_total_reads

# `.copy()` makes the subset an independent frame, so the derived columns are
# written to it unambiguously (no SettingWithCopyWarning). Each `assign` entry
# may use the columns created by the entries above it.
df_host_profile_info_artificial = (
    df_host_profile_info[df_host_profile_info['SAMPLE'] == 'ARTIFICIAL']
    .copy()
    .assign(
        # Reads already removed by the two host-mapping rounds only exist for pass 2.
        mapped_human_1_2_maps=lambda d: np.where(d['pass'] == 2, n_mapped_reads_1and2_maps, 0),
        mapped_human_total=lambda d: d['mapped_human_1_2_maps'] + d['mapped_human'],
        total_reads=lambda d: d['mapped_human_total'] + d['mapped_others'] + d['unmapped'],
        observed_human_prop=lambda d: d['mapped_human_total'] / d['total_reads'],
        observed_others_prop=lambda d: d['mapped_others'] / d['total_reads'],
        observed_unmapped_prop=lambda d: d['unmapped'] / d['total_reads'],
        expected_human_prop=expected_human_prop,
        expected_others_prop=1 - expected_human_prop,  # complement of the human fraction
        # Expected minus observed: the share of each class that was not recovered.
        calculated_unmapped_human_prop=lambda d: d['expected_human_prop'] - d['observed_human_prop'],
        calculated_unmapped_others_prop=lambda d: d['expected_others_prop'] - d['observed_others_prop'],
        proportion_mapped_other_reads=lambda d: d['observed_others_prop'] / d['expected_others_prop'],
    )
)


for profiler in LIST_PROFILERS:
    display(profiler)
    display(df_host_profile_info_artificial[df_host_profile_info_artificial['profiler'] == profiler])


In [None]:
# 1A) Check if there are differences in human read assignment.

# Step 1: Calculate the differences between pass 2 and pass 0 for each profiler and mode

# One row per (profiler, mode) and one column per value of `pass`.
pass_diff = (
    df_host_profile_info_artificial.pivot_table(
        index=["profiler", "mode"], columns="pass", values="observed_human_prop"
    )
    .reset_index()
)

# Ensure column names are integers
pass_diff.columns.name = None  # Remove the columns' name from pivot_table
# Assumes the pivot produced exactly two pass columns, 0 then 2 - TODO confirm
pass_diff.columns = ['profiler', 'mode', 0, 2]  # Explicitly rename columns

# Difference in percentage points (pass 2 minus pass 0)
pass_diff["difference"] = 100 * (pass_diff[2] - pass_diff[0])



# Step 2: Plot the differences using a lineplot (one line per profiler, mode on the x axis)

plt.figure(figsize=(6, 4))
sns.lineplot(
    data=pass_diff,
    x="mode",
    y="difference",
    hue="profiler",
    marker="o",
    palette="tab10",
)

# Dashed reference line at zero (no difference between passes)
plt.axhline(0, color="gray", linestyle="--", linewidth=0.8)
plt.title("Difference in Observed Human Proportion (Pass 2 - Pass 0)", fontsize=14)
plt.xlabel("Mode", fontsize=12)
plt.ylabel("Diff (%)", fontsize=12)

plt.grid(alpha=0.3)

# Legend placed outside the axes, to the right
plt.legend(title="Profiler", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()

plt.show()

In [None]:
# 1B) Check if there are differences in non-human read assignment.

# Step 1: one row per (profiler, mode) and one column per value of `pass`.
# NOTE: this rebinds `pass_diff`, replacing the human-proportion table of the previous cell.
pass_diff = (
    df_host_profile_info_artificial.pivot_table(
        index=["profiler", "mode"], columns="pass", values="observed_others_prop"
    )
    .reset_index()
)

# Difference in percentage points (pass 2 minus pass 0).
# Assumes the `pass` column holds the integers 0 and 2, so the pivoted columns are labelled 0 and 2 - TODO confirm
pass_diff["difference"] = 100 * (pass_diff[2] - pass_diff[0])



# Step 2: Plot the differences using a lineplot (one line per profiler, mode on the x axis)

plt.figure(figsize=(6, 4))
sns.lineplot(
    data=pass_diff,
    x="mode",
    y="difference",
    hue="profiler",
    marker="o",
    palette="tab10",
)

# Dashed reference line at zero (no difference between passes)
plt.axhline(0, color="gray", linestyle="--", linewidth=0.8)
plt.title("Difference in Observed Non-Human Proportion (Pass 2 - Pass 0)", fontsize=14)
plt.xlabel("Mode", fontsize=12)
plt.ylabel("Diff (%)", fontsize=12)

plt.grid(alpha=0.3)

# Legend placed outside the axes, to the right
plt.legend(title="Profiler", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()

plt.show()

**What do we see here?**
- The number of reads assigned to humans without host mapping is very variable depending on the profiler. Centrifuge, krakenuniq and ganon map human reads correctly, whereas kaiju and kraken2 fail to map the reads to human. The differences tend to decrease with the sensitivity mode, that is, paradoxically, a stricter read assignment leads to an improved number of human-mapped reads. However, this makes sense because more reads are assigned in general, and thus both human and non-human reads are mapped.
- However, this difference does not occur in non-human species. In general, non-human species are assigned equally with or without host mapping. This is interesting because we would expect a higher amount of reads assigned to non-human species originating from a false positive assignment of human reads, but this seems not to be the case, even in profilers that have a high amount of unmapped human reads.
    - Still, we have to take into account that the profiler databases include a host mapping step.


In [None]:
# Ground truth used to score the profilers: the composition table without
# taxid 9606 (the human entry, going by the surrounding analysis), plus each
# remaining species' share of the remaining reads, in percent.
# Built as one chain so `abundance` is added to a fresh frame rather than to a
# filtered slice (which triggers SettingWithCopyWarning).
table_artificial_taxcounts = (
    pd.read_csv('../../src/version_2/table_artificial_taxid.csv', sep=';', names=['species', 'taxid', 'count'])
    .loc[lambda d: d['taxid'] != 9606]
    .assign(abundance=lambda d: 100 * d['count'] / d['count'].sum())
)
table_artificial_taxcounts

# Computing detection stats to answer the questions

One of the parameters used during profiling is the mode of the profilers. Each profiler has a different set of parameters to decide whether reads are included as valid or not. This may result in the detection of false positives and false negatives. 

Here, we are going to study this effect in *in silico* samples to see if there are major changes. We can measure the effectiveness using several variables: 
- Categorical values: we can use each of the columns in the flag system to check how well species were assigned. We can use the precision (TP / (TP + FP)), the recall (TP / (TP + FN)), the F1-score (2 x precision x recall / (precision + recall)) and Cohen's kappa:
$$\kappa = \frac{p_0-p_e}{1-p_e} \quad p_0 = \frac{TP + TN}{TP + FP + FN + TN} \quad p_e=\frac{TP + FP}{TP + FP + FN + TN}\cdot\frac{TP + FN }{TP + FP + FN + TN} + \frac{TN + FP}{TP + FP + FN + TN}\cdot\frac{TN + FN}{TP + FP + FN + TN}$$

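- Numerical values: we can use the normalized value and the abundance to see how well reads are classified. For that we can use the expected number of reads and the expected abundance. With that we will calculate (1) the relative difference between the observed and expected values and (2) the mean of those differences, referred to as the mean absolute error (MAE) throughout this notebook. Note that, as computed in this notebook, the differences enter the mean with their sign (no absolute value is taken):
$$(1) \qquad DIFF_i = 100\cdot\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ 
$$(2) \qquad MAE = \frac{100}{N}\sum_{i=1}^{N}\frac{x_{obs,i} - x_{exp,i}}{x_{exp,i}}$$ 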

- For numerical values we are also going to calculate the pearson correlation between the observed and expected values, using a log10(1+x) transform

## Categorical values

In [11]:
def calculate_nominal_metrics(df_tax_ground_truth, df_flags_observed, column):
    """Score species detection for one flag column against the ground truth.

    A taxon is treated as detected when its value in ``column`` equals ``False``
    and as discarded when it equals ``True``.

    Parameters
    ----------
    df_tax_ground_truth : pandas.DataFrame
        Needs a ``taxid`` column listing the expected taxa.
    df_flags_observed : pandas.DataFrame
        Needs a ``taxonomy_id`` column and the flag column ``column``.
    column : str
        Name of the flag column to evaluate.

    Returns
    -------
    tuple
        ``(precision, recall, f1, kappa, TP, FN, FP, TN)``. A ratio whose
        denominator is zero is returned as ``nan`` instead of raising
        ``ZeroDivisionError``; ``f1`` is 0 when ``precision + recall`` is not
        positive.

    Raises
    ------
    KeyError
        If ``column`` (or one of the id columns) is missing.
    AssertionError
        If the four counts do not partition the set of taxa seen (for example
        because of duplicated taxids).
    """
    list_expected_taxids = list(df_tax_ground_truth['taxid'].astype(int).values)
    flags = df_flags_observed[column]
    # `== False` / `== True` are kept on purpose: values that are neither
    # (e.g. NaN) must fall in neither group.
    list_observed_true_taxids = list(df_flags_observed.loc[flags == False, 'taxonomy_id'].astype(int).values)  # noqa: E712
    list_observed_false_taxids = list(df_flags_observed.loc[flags == True, 'taxonomy_id'].astype(int).values)  # noqa: E712

    # Sets give O(1) membership tests; the counts still iterate over the lists
    # so that duplicated ids keep tripping the sanity check below.
    expected_set = set(list_expected_taxids)
    observed_true_set = set(list_observed_true_taxids)

    TP = sum(1 for taxid in list_expected_taxids if taxid in observed_true_set)
    FN = len(list_expected_taxids) - TP
    FP = sum(1 for taxid in list_observed_true_taxids if taxid not in expected_set)
    TN = sum(1 for taxid in list_observed_false_taxids if taxid not in expected_set)

    assert len(expected_set | observed_true_set | set(list_observed_false_taxids)) == TP + FN + FP + TN

    # Undefined ratios (nothing detected / nothing expected) become nan.
    precision = TP / (TP + FP) if (TP + FP) else np.nan
    recall = TP / (TP + FN) if (TP + FN) else np.nan

    # The comparison is False for 0 and for nan, which both map to f1 = 0.
    if precision + recall > 0:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0

    # Cohen's kappa: observed agreement p0 corrected by chance agreement pe.
    ALL = TP + FN + FP + TN
    if ALL == 0:
        kappa = np.nan
    else:
        p0 = (TP + TN) / (ALL)
        pe = (TP + FP)/ALL * (TP + FN)/ALL + (TN + FP)/ALL * (TN + FN)/ALL
        # pe == 1 happens when every taxon falls in a single class; kappa is undefined there.
        kappa = (p0 - pe) / (1 - pe) if pe != 1 else np.nan

    return precision, recall, f1, kappa, TP, FN, FP, TN



In [12]:
# Flag columns to score: per-profiler and consensus columns, for normalized counts and relative abundances.
columns_selected = ['centrifuge_norm', 'ganon_norm', 'kaiju_norm', 'kmcp_norm', 'kraken2_norm', 'krakenuniq_norm',
                    'centrifuge_relab', 'ganon_relab', 'kaiju_relab', 'kmcp_relab', 'kraken2_relab', 'krakenuniq_relab',
                    'mean_norm', 'CV_norm', 'mean_relab', 'CV_relab']
df_nominal_stats = {'pass': [], 'mode': [], 'S': [], 'column': [], 'precision': [], 'recall': [], 'f1': [], 
                    'kappa': [], 'TP|FN|FP|TN': []}

# Sweep every (pass, mode, S) combination and score each flag column.
for passn in [0, 2]:
    for mode in range(1, 10):
        for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
            # The flags table only depends on (pass, mode, S): read it once, not once per column.
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            for column in columns_selected:
                try:
                    precision, recall, f1, kappa, TP, FN, FP, TN = calculate_nominal_metrics(table_artificial_taxcounts, summary_table_flags, column)
                except KeyError:
                    # Flag column absent from this table: skip it.
                    continue

                row = {'pass': passn, 'mode': mode, 'S': S, 'column': column,
                       'precision': precision, 'recall': recall, 'f1': f1, 'kappa': kappa,
                       'TP|FN|FP|TN': (TP, FN, FP, TN)}
                for key, value in row.items():
                    df_nominal_stats[key].append(value)

df_nominal_stats = pd.DataFrame(df_nominal_stats)

## Numerical values

In [13]:
def compute_mad(values):
    """Return the median absolute deviation of ``values`` around their median."""
    center = np.median(values)
    deviations = np.abs(values - center)
    return np.median(deviations)

def calculate_numerical_metrics(df_tax_ground_truth, df_counts_observed, df_flags_observed, profiler, suffix):
    """Compare observed and expected values for the expected taxa a profiler detected.

    Only taxa that are both in the ground truth and not flagged (flag column
    ``f'{profiler}_{suffix}'`` is falsy) are compared.

    Parameters
    ----------
    df_tax_ground_truth : pandas.DataFrame
        Needs ``taxid``, ``species``, ``count`` and ``abundance`` columns.
    df_counts_observed : pandas.DataFrame
        Needs ``taxonomy_id`` and the value column ``f'{profiler}_{suffix}'``.
    df_flags_observed : pandas.DataFrame
        Needs ``taxonomy_id`` and the flag column ``f'{profiler}_{suffix}'``.
    profiler : str
        Column prefix.
    suffix : str
        ``'norm'`` compares against ``count``; any other value compares
        against ``abundance``.

    Returns
    -------
    tuple
        ``(diff, mean_diff, std_diff, std_diff / mean_diff, corr, rmse,
        taxids, species, expected, observed)``. ``diff`` is the signed percent
        difference per taxon, so the second element is a signed mean (no
        absolute value is taken, despite the ``MAE`` variable name). ``corr``
        and ``rmse`` are ``nan`` when five or fewer taxa are compared; for
        ``'norm'`` they are computed on ``log10(1 + x)`` values.

    Raises
    ------
    KeyError
        If a required column or a compared taxid is missing from an input.
    """
    value_col = f'{profiler}_{suffix}'

    df_tax_ground_truth = df_tax_ground_truth.copy().set_index('taxid')
    df_counts_observed = df_counts_observed.copy().set_index('taxonomy_id')
    # Use the flags passed by the caller, not a notebook-level global.
    df_flags_observed = df_flags_observed.set_index('taxonomy_id').copy()

    list_expected_taxids = df_tax_ground_truth.index.astype(int).values
    list_observed_true_taxids = df_flags_observed.loc[df_flags_observed[value_col].astype(bool) == False].index.astype(int).values  # noqa: E712

    # Expected taxa that were detected (sorted, unique).
    combined_taxid = np.intersect1d(list_expected_taxids, list_observed_true_taxids)
    species = df_tax_ground_truth.loc[combined_taxid, 'species'].values
    observed_counts = df_counts_observed.loc[combined_taxid, value_col].values

    expected_col = 'count' if suffix == 'norm' else 'abundance'
    expected_counts = df_tax_ground_truth.loc[combined_taxid, expected_col].values

    # Signed percent difference relative to the expected value.
    diff_counts = 100 * (observed_counts - expected_counts) / expected_counts

    MAE_counts = np.mean(diff_counts)
    MAED_counts = np.std(diff_counts)
    MACV_counts = MAED_counts / MAE_counts

    # Correlation and RMSE are only reported when more than five taxa are compared.
    if len(combined_taxid) > 5:
        if suffix == 'norm':
            log_observed = np.log10(1 + observed_counts)
            log_expected = np.log10(1 + expected_counts)
            corr, _ = pearsonr(log_observed, log_expected)
            rmse = np.sqrt(np.mean((log_observed - log_expected) ** 2))
        else:
            corr, _ = pearsonr(observed_counts, expected_counts)
            rmse = np.sqrt(np.mean((observed_counts - expected_counts) ** 2))
    else:
        corr, rmse = np.nan, np.nan

    return diff_counts, MAE_counts, MAED_counts, MACV_counts, corr, rmse, combined_taxid, species, expected_counts, observed_counts


In [14]:
# Output column layout: the sweep keys, then the ten metrics returned by
# calculate_numerical_metrics for normalized counts ('*_counts') and for
# relative abundances ('*_abundance'), in the order the function returns them.
numerical_key_columns = ['pass', 'mode', 'S', 'profiler']
numerical_metric_names = ['diff', 'MAE', 'MAED', 'MACV', 'corr', 'RMSE', 'taxid', 'species', 'expected', 'observed']
numerical_columns = (
    numerical_key_columns
    + [f'{metric}_counts' for metric in numerical_metric_names]
    + [f'{metric}_abundance' for metric in numerical_metric_names]
)
df_numerical_stats = {name: [] for name in numerical_columns}

for passn in [0, 2]:
    for mode in range(1, 10):
        for S in [0, 1, 2, 3, 4, 5, 6, 7, 10, 15]:
            # Both tables only depend on (pass, mode, S): read them once, not once per profiler.
            summary_table_flags = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.flags.tsv', sep='\t')
            summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass{passn}_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
            for profiler in LIST_PROFILERS + ['mean']:
                try:
                    metrics_counts = calculate_numerical_metrics(
                        table_artificial_taxcounts, summary_table_counts, summary_table_flags, profiler, suffix='norm')
                    metrics_abundance = calculate_numerical_metrics(
                        table_artificial_taxcounts, summary_table_counts, summary_table_flags, profiler, suffix='relab')
                except KeyError:
                    # Missing column (or taxid) for this profiler: record nothing for this combination.
                    continue

                row_values = (passn, mode, S, profiler, *metrics_counts, *metrics_abundance)
                for name, value in zip(numerical_columns, row_values):
                    df_numerical_stats[name].append(value)

df_numerical_stats = pd.DataFrame(df_numerical_stats)

## Analysis of kappa/F1

F1-score and $\kappa$ are quite different measures but we observe that they are correlated in this data.

In [None]:
# Scatter plot of kappa against F1, one point per row of df_nominal_stats, with the identity line
plt.figure(figsize=(8, 6))
sns.scatterplot(x='f1', y='kappa', data=df_nominal_stats, label='Data Points')

# Add the identity line (y = x) as a visual reference
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Identity Line')

# Add title and labels
plt.title('F1-score vs Kappa')
plt.xlabel('F1-score')
plt.ylabel('Kappa')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# Compute and print correlation (Pearson by default)
corr = df_nominal_stats['f1'].corr(df_nominal_stats['kappa'])
print("Pearson correlation between F1 and Kappa:", corr)

In that sense, we can use one of the measures to explain the results and do not need the second one. We are going to select the F1-score because it is easier to interpret and it is related to precision and recall, which are already being used.

# How does the S parameter used during curve fitting affect the results?

The S parameter is useful to tweak the detection results, so that we can include more or fewer species during the flagging step. Since it is a structural parameter, we want to fit it first so that we can answer several other comparisons.

To do this we are going to use the nominal variables and their derived statistics.


## Checking recall/precision/F1-score for inclusion/exclusion of species

In [16]:
# Selection used in this section: normalized-count flag columns of every profiler plus the mean,
# modes 2 to 8, pass 2 only, and every S value present in the nominal stats.
cols = [f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm']
modes = range(2, 9)
# NOTE: `passn` is a list here, whereas the sweep loops above use the same name for a single pass number.
passn = [2]
S_values = df_nominal_stats['S'].unique()

In [17]:
# Rows of the nominal stats matching the selected passes, flag columns and modes.
is_selected = (
    df_nominal_stats['pass'].isin(passn)
    & df_nominal_stats['column'].isin(cols)
    & df_nominal_stats['mode'].isin(modes)
)
subset_df = df_nominal_stats[is_selected]

In [None]:
# Long format: one row per (mode, S, column, metric) with its score
melted_df = pd.melt(
    subset_df,
    id_vars=['mode', 'S', 'column'],
    value_vars=['recall', 'precision', 'f1'],
    var_name='metric',
    value_name='score'
)

# Create a colormap for 'mode'
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis  # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')

# Create a FacetGrid: one row per metric, one column per flag column (profiler)
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=3, 
    sharey=True, 
    sharex=True
)

# Draw, on the facet being filled, one line per mode (score against S), coloured by mode
def lineplot_with_cmap(data, **kwargs):
    for mode in sorted(data['mode'].unique()):
        subset = data[data['mode'] == mode]
        plt.plot(subset['S'], subset['score'], label=f"Mode {mode}",
                 color=cmap(norm(mode)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend for the discrete modes
handles = [
    plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"Mode {mode}")
    for mode in sorted(melted_df['mode'].unique())
]
# plt.legend attaches to the current axes; the anchor is given in that axes' coordinates
plt.legend(
    handles=handles, 
    title="", 
    bbox_to_anchor=(1.05, 3), 
    loc='center left', 
    frameon=False
)

# Set x-axis ticks (if you have specific S values)
g.set(xticks=subset_df['S'].unique())

# Clear the automatic facet titles
for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
# Assumes the facet column order matches the order of first appearance in melted_df['column'] - TODO confirm
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

# Assumes the facet row order is recall, precision, f1 (the value_vars order) - TODO confirm
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)

plt.show()

In [None]:
# Same grid as the line plots (one row per metric, one column per flag column),
# but summarising the scores of all modes at each S as a boxplot.
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=3, 
    sharey=True, 
    sharex=True
)

g.map(sns.boxplot, 'S', 'score')

# Clear the automatic facet titles
for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
# Assumes the facet column order matches the order of first appearance in melted_df['column'] - TODO confirm
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

# Assumes the facet row order is recall, precision, f1 - TODO confirm
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)

plt.show()

The aim of this part of the analysis was to select the "optimal" `S` to then make other comparisons and extract proper conclusions. 
If we look at individual profilers, the aim is not to select the `S` with the best F1 score, but to select the smallest `S` that provides a sufficiently high recall, ensuring that we don't lose TP species. This threshold depends on the profiler. For CEN it is 6-7, GAN is 7-10, KAI is 1-2, KR2 is 4-5, KRU is 5-6. We see that at these values the precision drops (as expected), but it remains stable afterwards for most profilers. Therefore, a value of `S=7` should be sufficient to ensure that the results are correct.

The advantage of using the mean value instead of the individual profilers is that it tends to retrieve a better stability on the precision throughout the S values and modes. 

Therefore, we are going to choose S=2 and S=7 for comparisons of robustness with biological samples. 

#  Does pass0/pass2 (no host pre-mapping vs host pre-mapping) affect the detection of the species?

For this part we are going to run two analyses:
- Retrieve the raw detection of species with the passes, and calculate their jaccard index.
- Calculate the Pearson correlation + RMSE for several mode values.


In [45]:
from matplotlib.colors import Normalize, ListedColormap

In [None]:
# Restrict the nominal stats to the normalized-count columns of each profiler plus the mean.
norm_flag_columns = [f'{i}_norm' for i in LIST_PROFILERS] + ['mean_norm']
df = df_nominal_stats[df_nominal_stats['column'].isin(norm_flag_columns)].copy()

# Index both passes by the same key so the subtraction aligns matching rows.
pairing_keys = ['mode', 'S', 'column']
df_pass_0 = df[df['pass'] == 0].set_index(pairing_keys)
df_pass_2 = df[df['pass'] == 2].set_index(pairing_keys)

# pass 2 minus pass 0 for each metric, with the columns renamed to '<metric>_diff'.
nominal_metrics = ['precision', 'recall', 'f1']
diff_nominal_df = (
    (df_pass_2[nominal_metrics] - df_pass_0[nominal_metrics])
    .reset_index()
    .rename(columns={metric: f'{metric}_diff' for metric in nominal_metrics})
)

# Display the result
print(diff_nominal_df)

In [None]:
# Long format: one row per (mode, S, column, metric) with the pass 2 - pass 0 difference as score
melted_df = pd.melt(
    diff_nominal_df,
    id_vars=['mode', 'S', 'column'],
    value_vars=['recall_diff', 'precision_diff', 'f1_diff'],
    var_name='metric',
    value_name='score'
)

# Create a colormap for 'mode'
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis  # Choose a colormap (e.g., 'viridis', 'plasma', 'cividis')

# Create a FacetGrid: one row per metric, one column per flag column (profiler)
g = sns.FacetGrid(
    melted_df, 
    col='column', 
    row='metric', 
    height=3, 
    sharey=True, 
    sharex=True
)

# Draw, on the facet being filled, one line per mode (score against S), coloured by mode
def lineplot_with_cmap(data, **kwargs):
    for mode in sorted(data['mode'].unique()):
        subset = data[data['mode'] == mode]
        plt.plot(subset['S'], subset['score'], label=f"Mode {mode}",
                 color=cmap(norm(mode)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend for the discrete modes
handles = [
    plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"Mode {mode}")
    for mode in sorted(melted_df['mode'].unique())
]
# plt.legend attaches to the current axes; the anchor is given in that axes' coordinates
plt.legend(
    handles=handles, 
    title="", 
    bbox_to_anchor=(1.05, 3), 
    loc='center left', 
    frameon=False
)

# Set x-axis ticks (if you have specific S values)
g.set(xticks=diff_nominal_df['S'].unique())

# Clear the automatic facet titles
for ax in g.axes.ravel():
    ax.set_title('')

# Add axis labels and titles
# Assumes the facet column order matches the order of first appearance in melted_df['column'] - TODO confirm
for ax, profiler in zip(g.axes[0, :], melted_df['column'].unique()):
    ax.set_title(profiler.replace('_norm', ''))

# NOTE(review): the rows hold pass 2 - pass 0 differences, although the labels and the
# figure title below name the plain metrics - consider relabelling.
for ax, score in zip(g.axes[:, 0], ['recall', 'precision', 'F1-score']):
    ax.set_ylabel(score)

plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)

plt.show()

In [None]:
# Inspect the centrifuge rows of the numerical stats at S = 10 and S = 15.
df_numerical_stats[(df_numerical_stats['S'].isin([10, 15])) & (df_numerical_stats['profiler'] == 'centrifuge')]

In [None]:
# Display the numerical stats table (several of its columns hold per-taxon arrays).
df_numerical_stats

In [None]:
df = df_numerical_stats.copy()

# Index both passes by the same key so the subtraction aligns matching rows.
pairing_keys = ['mode', 'S', 'profiler']
df_pass_0 = df[df['pass'] == 0].set_index(pairing_keys)
df_pass_2 = df[df['pass'] == 2].set_index(pairing_keys)

# pass 2 minus pass 0 for the mean and the standard deviation of the count differences.
diff_column_names = {'MAE_counts': 'MAE_diff', 'MAED_counts': 'MAED_diff'}
compared_columns = list(diff_column_names)
diff_numerical_df = (
    (df_pass_2[compared_columns] - df_pass_0[compared_columns])
    .reset_index()
    .rename(columns=diff_column_names)
)

# Display the result
print(diff_numerical_df)

In [None]:
# Long format: one row per (mode, S, profiler, metric) with the pass 2 - pass 0 difference as score
melted_df = pd.melt(
    diff_numerical_df,
    id_vars=['mode', 'S', 'profiler'],
    value_vars=['MAE_diff', 'MAED_diff'],
    var_name='metric',
    value_name='score'
)

# Map each mode to a colour of a sequential colormap
norm = Normalize(vmin=melted_df['mode'].min(), vmax=melted_df['mode'].max())
cmap = plt.cm.viridis

# One row per metric, one column per profiler; the y axes are independent
# because the two metrics live on different scales
g = sns.FacetGrid(
    melted_df,
    col='profiler',
    row='metric',
    height=3,
    sharey=False,
    sharex=True
)

def lineplot_with_cmap(data, **kwargs):
    """Draw, on the facet being filled, one line per mode (score against S), coloured by mode."""
    for mode in sorted(data['mode'].unique()):
        subset = data[data['mode'] == mode]
        plt.plot(subset['S'], subset['score'], label=f"Mode {mode}",
                 color=cmap(norm(mode)), marker='o')

g.map_dataframe(lineplot_with_cmap)

# Create a legend for the discrete modes
handles = [
    plt.Line2D([0], [0], color=cmap(norm(mode)), marker='o', linestyle='', label=f"Mode {mode}")
    for mode in sorted(melted_df['mode'].unique())
]
# plt.legend attaches to the current axes; the anchor is given in that axes' coordinates
plt.legend(
    handles=handles,
    title="",
    bbox_to_anchor=(1.05, 1.75),
    loc='center left',
    frameon=False
)

# Take the ticks from the data being plotted, not from a frame built in an
# earlier, unrelated section of the notebook
g.set(xticks=diff_numerical_df['S'].unique())

# Clear the automatic facet titles
for ax in g.axes.ravel():
    ax.set_title('')

# Column titles: profiler names
# Assumes the facet column order matches the order of first appearance in melted_df['profiler'] - TODO confirm
for ax, profiler in zip(g.axes[0, :], melted_df['profiler'].unique()):
    ax.set_title(profiler)

# Row labels: the metric names, in value_vars order
for ax, score in zip(g.axes[:, 0], ['MAE_diff', 'MAED_diff']):
    ax.set_ylabel(score)

plt.subplots_adjust(top=0.9)
g.fig.suptitle("Metrics by Profiler and Mode", fontsize=16)

plt.show()

### Checking the correlation between the read counts

In [32]:
# Looking at a concrete example
mode = 5
S = 10
sample = 'ARTIFICIAL'


pass0_df = summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass0_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')
pass2_df = summary_table_counts = pd.read_csv(f'{RESULTS_DIR}/summary/ARTIFICIAL_pass2_mode{mode}_taxspecies_S{S}.diversity.tsv', sep='\t')

In [None]:
# Pair pass 0 and pass 2 rows by taxon. The two tables are loaded from separate
# files, so row i of one is not guaranteed to describe the same taxon as row i
# of the other; `validate` fails loudly if a taxonomy_id is duplicated.
paired_counts = pass0_df.merge(
    pass2_df, on='taxonomy_id', suffixes=('_pass0', '_pass2'), validate='one_to_one'
)

# Keep the taxa whose mean normalized count exceeds 300 in both passes.
abundant_counts = paired_counts[
    (paired_counts['mean_norm_pass0'] > 300) & (paired_counts['mean_norm_pass2'] > 300)
]

# One panel per profiler plus the mean (3 inches wide each).
panel_names = LIST_PROFILERS + ['mean']
fig, axs = plt.subplots(1, len(panel_names), figsize=(3 * len(panel_names), 3), squeeze=False)
axs = axs.ravel()

# Iterate through profilers and calculate Pearson correlation and RMSE
for ax, profiler in zip(axs, panel_names):
    # log10(1 + x) of the normalized counts of each pass; negative values are clipped to 0 first
    p0counts = np.log10(1 + np.clip(abundant_counts[f'{profiler}_norm_pass0'].values, 0, None))
    p2counts = np.log10(1 + np.clip(abundant_counts[f'{profiler}_norm_pass2'].values, 0, None))

    # Filter out NaN values
    valid_mask = ~np.isnan(p0counts) & ~np.isnan(p2counts)
    p0counts = p0counts[valid_mask]
    p2counts = p2counts[valid_mask]

    # Identity line (pass 0 == pass 2)
    ax.plot([1, 6], [1, 6], c='#bc0000', linestyle='--', linewidth=1.5)

    # Scatterplot
    sns.scatterplot(x=p0counts, y=p2counts, ax=ax)

    # Calculate Pearson correlation
    corr, _ = pearsonr(p0counts, p2counts)

    # Calculate RMSE (Root Mean Squared Error)
    rmse = np.sqrt(np.mean((p0counts - p2counts) ** 2))

    print(profiler, corr, rmse)

    # Set axis titles
    ax.set_title(profiler)
    ax.set_xlabel('')
    ax.set_ylabel('')

plt.tight_layout()

In [None]:
# Restrict both passes to the taxa of the ground-truth table.
true_taxids = table_artificial_taxcounts['taxid'].values
pass0_df_truetaxids = pass0_df[pass0_df['taxonomy_id'].isin(true_taxids)]
pass2_df_truetaxids = pass2_df[pass2_df['taxonomy_id'].isin(true_taxids)]

# Pair the two passes by taxon. After independent filtering the two tables may
# differ in length or row order, so pairing their `.values` by position could
# compare different taxa (or fail on a length mismatch). Only taxa present in
# both passes are kept; `validate` fails loudly if a taxonomy_id is duplicated.
paired_truetaxids = pass0_df_truetaxids.merge(
    pass2_df_truetaxids, on='taxonomy_id', suffixes=('_pass0', '_pass2'), validate='one_to_one'
)

# One panel per profiler plus the mean (3 inches wide each).
panel_names = LIST_PROFILERS + ['mean']
fig, axs = plt.subplots(1, len(panel_names), figsize=(3 * len(panel_names), 3), squeeze=False)
axs = axs.ravel()

# Iterate through profilers and calculate Pearson correlation and RMSE
for ax, profiler in zip(axs, panel_names):
    # log10(1 + x) of the normalized counts of each pass
    p0counts = np.log10(1 + paired_truetaxids[f'{profiler}_norm_pass0'].values)
    p2counts = np.log10(1 + paired_truetaxids[f'{profiler}_norm_pass2'].values)

    # Identity line (pass 0 == pass 2)
    ax.plot([3, 6], [3, 6], c='#bc0000', linestyle='--', linewidth=1.5)

    # Scatterplot
    sns.scatterplot(x=p0counts, y=p2counts, ax=ax)

    # Calculate Pearson correlation
    corr, _ = pearsonr(p0counts, p2counts)

    # Calculate RMSE (Root Mean Squared Error)
    rmse = np.sqrt(np.mean((p0counts - p2counts) ** 2))

    print(profiler, corr, rmse)

    # Set axis titles
    ax.set_title(profiler)
    ax.set_xlabel('')
    ax.set_ylabel('')

plt.tight_layout()

**What do we see**


**TODO: argue that it needs … (unfinished note)**

In [35]:
# Pass 2, modes 3 / 5 / 7 and S = 10, for each of the two stats tables.
# The nominal subset is taken from the nominal table (it was previously built
# from the numerical table, which made both subsets identical).
df_nominal_stats_sub = df_nominal_stats[(df_nominal_stats['pass'] == 2) & \
                              (df_nominal_stats['mode'].isin([3, 5, 7])) & \
                              (df_nominal_stats['S'] == 10)]

df_numerical_stats_sub = df_numerical_stats[(df_numerical_stats['pass'] == 2) & \
                              (df_numerical_stats['mode'].isin([3, 5, 7])) & \
                              (df_numerical_stats['S'] == 10)]

In [None]:
# Display the numerical stats table.
df_numerical_stats

In [None]:
# Display the nominal stats table.
df_nominal_stats

## Are relative abundances better predictors than absolute ones?

## Is the mean a good approximation for the different profilers?

In [38]:
def plot_prof_couns(df_combo):
    """Strip plot of the per-taxon count differences of each profiler.

    ``df_combo`` needs the columns ``profiler``, ``diff_counts`` (one
    list-like of differences per row), ``MAE_counts`` and ``MAED_counts``.
    Every difference is drawn as a point on its profiler's row; on top, each
    row of ``df_combo`` gets a tick at ``MAE_counts`` and a segment spanning
    ``MAE_counts`` plus/minus ``MAED_counts``.
    """
    summary_color = '#aa00bc'

    # One row per individual difference value, converted to a numeric dtype.
    exploded = df_combo.explode('diff_counts')
    exploded['diff_counts'] = pd.to_numeric(exploded['diff_counts'])

    plt.figure(figsize=(10, 4))
    # Jitter and slight transparency keep overlapping points visible.
    sns.stripplot(data=exploded, x='diff_counts', y='profiler', jitter=True, size=5, alpha=0.7)

    # Overlay the summary (centre tick and spread segment) of every input row.
    row_summaries = zip(df_combo['profiler'], df_combo['MAE_counts'], df_combo['MAED_counts'])
    for name, center, spread in row_summaries:
        plt.scatter(center, name, color=summary_color, label='_nolegend_', s=100, zorder=3, marker = '|')
        plt.plot([center - spread, center + spread], [name, name], color=summary_color, label='_nolegend_',)

    # Reference line at zero difference.
    plt.axvline(0, color='red', linestyle='--', linewidth=1)

    plt.xlabel('Diff Counts', fontsize=12)
    plt.ylabel('Profiler', fontsize=12)
    plt.grid(True, axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

In [None]:
# Pass 2, modes 3 / 5 / 7, S = 10, shown sorted by profiler and mode.
selected_rows = (
    df_numerical_stats['pass'].eq(2)
    & df_numerical_stats['mode'].isin([3, 5, 7])
    & df_numerical_stats['S'].eq(10)
)
df_combo = df_numerical_stats[selected_rows]
df_combo.sort_values(by=['profiler', 'mode'])

In [None]:
# Pass 2, mode 3. A large S is chosen because S is not relevant here
# (with a small S we may select few datasets).
selected_rows = (
    df_numerical_stats['pass'].eq(2)
    & df_numerical_stats['mode'].eq(3)
    & df_numerical_stats['S'].eq(10)
)
df_combo = df_numerical_stats[selected_rows]

plot_prof_couns(df_combo)


In [None]:
# Pass 2, mode 5. A large S is chosen because S is not relevant here
# (with a small S we may select few datasets).
selected_rows = (
    df_numerical_stats['pass'].eq(2)
    & df_numerical_stats['mode'].eq(5)
    & df_numerical_stats['S'].eq(10)
)
df_combo = df_numerical_stats[selected_rows]

plot_prof_couns(df_combo)


In [None]:
# Pass 2, mode 7. A large S is chosen because S is not relevant here
# (with a small S we may select few datasets).
selected_rows = (
    df_numerical_stats['pass'].eq(2)
    & df_numerical_stats['mode'].eq(7)
    & df_numerical_stats['S'].eq(10)
)
df_combo = df_numerical_stats[selected_rows]

plot_prof_couns(df_combo)

In [None]:
# Pass 2, modes 3 / 5 / 7, S = 10
df_combo = df_numerical_stats[(df_numerical_stats['pass'] == 2) & \
                              (df_numerical_stats['mode'].isin([3, 5, 7])) & \
                              (df_numerical_stats['S'] == 10)]


# Create a FacetGrid with one subplot per mode
g = sns.FacetGrid(data=df_combo, col="mode", col_wrap=3, height=3, sharex=True, sharey=True)

# Map a scatterplot to each subplot, using hue for profilers
# NOTE(review): hue is resolved separately inside each facet; colours only agree across
# facets if every facet contains the same profilers - confirm.
g.map_dataframe(sns.scatterplot, "MAE_counts", "MAED_counts", hue="profiler")

# Add axis labels and a title
g.set_axis_labels("MAE Counts", "MAED Counts")
g.set_titles(col_template="Mode: {col_name}")
g.fig.suptitle("Scatter Plot of MAE_counts vs MAED_counts by Mode and Profiler", fontsize=16, y=1.05)

# Adjust legend
g.add_legend(title="Profiler", bbox_to_anchor=(1.15, 0.5))

# Show the plot
plt.tight_layout()
plt.show()


Therefore, choosing the mean value is a good option for several reasons:
- In cases where 