In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
# %pip install scipy
from scipy.stats import mannwhitneyu

# Boxplot function to plot number of reads per sample
def boxplot(input, title):
    df = pd.read_csv(input, sep='\t')

    # Add group column to the dataframe
    df['group'] = df['sample'].apply(lambda x: 'H' if 'H' in x else ('OMC' if 'OMC' in x else 'O'))

    # Define colors for the boxplot
    colors = ['blue', 'orange', 'red']

    # Boxplot with reads aligned per sample
    ax = sns.boxplot(data=df, x='group', y='reads', showfliers=False, palette=colors)
    sns.stripplot(data=df, x='group', y='reads', color='black', size=4)
    plt.title(title)
    plt.xlabel('')
    plt.ylabel('Reads aligned')

    # Perform mannwhitneyu analysis
    group_H = df[df['group'] == 'H']['reads']
    group_OMC = df[df['group'] == 'OMC']['reads']
    group_O = df[df['group'] == 'O']['reads']

    # Compare group H and OMC
    u, p_H_OMC = mannwhitneyu(group_H, group_OMC)

    # Compare group H and O
    u, p_H_O = mannwhitneyu(group_H, group_O)

    # Compare group OMC and O
    u, p_OMC_O = mannwhitneyu(group_OMC, group_O)

    # Get the y-axis limits
    bottom, top = ax.get_ylim()
    y_range = top - bottom

    # Significance bars
    significant_combinations = [((0, 2), p_H_OMC), ((0, 1), p_H_O), ((1, 2), p_OMC_O)]
    for i, significant_combination in enumerate(significant_combinations):
        # Columns corresponding to the datasets of interest
        x1 = significant_combination[0][0]
        x2 = significant_combination[0][1]
        # What level is this bar among the bars above the plot?
        level = len(significant_combinations) - i
        # Plot the bar
        bar_height = (y_range * 0.07 * level) + top
        bar_tips = bar_height - (y_range * 0.02)
        plt.plot(
            [x1, x1, x2, x2],
            [bar_tips, bar_height, bar_height, bar_tips], lw=1, c='k'
        )
        # Significance level
        p = significant_combination[1]
        if p < 0.001:
            sig_symbol = '***'
        elif p < 0.01:
            sig_symbol = '**'
        elif p < 0.05:
            sig_symbol = '*'
        else:
            sig_symbol = 'n.s.'
        text_height = bar_height + (y_range * 0.01)
        plt.text((x1 + x2) * 0.5, text_height, sig_symbol, ha='center', va='bottom', c='k')

    output = os.path.splitext(input)[0] + '.png'
    plt.savefig(output)
    output = os.path.splitext(input)[0] + '.svg'
    plt.savefig(output)
    plt.clf()

    # Boxplot with relative abundance per sample
    ax = sns.boxplot(data=df, x='group', y='relative_abundance', showfliers=False, palette=colors)
    sns.stripplot(data=df, x='group', y='relative_abundance', color='black', size=4)
    plt.title(title)
    plt.xlabel('')
    plt.ylabel('Relative abundance (%)')

    # Perform mannwhitneyu analysis
    group_H = df[df['group'] == 'H']['relative_abundance']
    group_OMC = df[df['group'] == 'OMC']['relative_abundance']
    group_O = df[df['group'] == 'O']['relative_abundance']

    # Compare group H and OMC
    u, p_H_OMC = mannwhitneyu(group_H, group_OMC)

    # Compare group H and O
    u, p_H_O = mannwhitneyu(group_H, group_O)

    # Compare group OMC and O
    u, p_OMC_O = mannwhitneyu(group_OMC, group_O)

    # Get the y-axis limits
    bottom, top = ax.get_ylim()
    y_range = top - bottom

    # Significance bars
    significant_combinations = [((0, 2), p_H_OMC), ((0, 1), p_H_O), ((1, 2), p_OMC_O)]
    for i, significant_combination in enumerate(significant_combinations):
        # Columns corresponding to the datasets of interest
        x1 = significant_combination[0][0]
        x2 = significant_combination[0][1]
        # What level is this bar among the bars above the plot?
        level = len(significant_combinations) - i
        # Plot the bar
        bar_height = (y_range * 0.07 * level) + top
        bar_tips = bar_height - (y_range * 0.02)
        plt.plot(
            [x1, x1, x2, x2],
            [bar_tips, bar_height, bar_height, bar_tips], lw=1, c='k'
        )
        # Significance level
        p = significant_combination[1]
        if p < 0.001:
            sig_symbol = '***'
        elif p < 0.01:
            sig_symbol = '**'
        elif p < 0.05:
            sig_symbol = '*'
        else:
            sig_symbol = 'n.s.'
        text_height = bar_height + (y_range * 0.01)
        plt.text((x1 + x2) * 0.5, text_height, sig_symbol, ha='center', va='bottom', c='k')

    output = os.path.splitext(input)[0] + '_relative_abundance.png'
    plt.savefig(output)
    output = os.path.splitext(input)[0] + '_relative_abundance.svg'
    plt.savefig(output)
    plt.clf()

    # Boxplot with log10 abundance per sample
    ax = sns.boxplot(data=df, x='group', y='log10', showfliers=False, palette=colors)
    sns.stripplot(data=df, x='group', y='log10', color='black', size=4)
    plt.title(title)
    plt.xlabel('')
    plt.ylabel('log10 abundance')

    # Perform mannwhitneyu analysis
    group_H = df[df['group'] == 'H']['log10']
    group_OMC = df[df['group'] == 'OMC']['log10']
    group_O = df[df['group'] == 'O']['log10']

    # Compare group H and OMC
    u, p_H_OMC = mannwhitneyu(group_H, group_OMC)

    # Compare group H and O
    u, p_H_O = mannwhitneyu(group_H, group_O)

    # Compare group OMC and O
    u, p_OMC_O = mannwhitneyu(group_OMC, group_O)

    # Get the y-axis limits
    bottom, top = ax.get_ylim()
    y_range = top - bottom

    # Significance bars
    significant_combinations = [((0, 2), p_H_OMC), ((0, 1), p_H_O), ((1, 2), p_OMC_O)]
    for i, significant_combination in enumerate(significant_combinations):
        # Columns corresponding to the datasets of interest
        x1 = significant_combination[0][0]
        x2 = significant_combination[0][1]
        # What level is this bar among the bars above the plot?
        level = len(significant_combinations) - i
        # Plot the bar
        bar_height = (y_range * 0.07 * level) + top
        bar_tips = bar_height - (y_range * 0.02)
        plt.plot(
            [x1, x1, x2, x2],
            [bar_tips, bar_height, bar_height, bar_tips], lw=1, c='k'
        )
        # Significance level
        p = significant_combination[1]
        if p < 0.001:
            sig_symbol = '***'
        elif p < 0.01:
            sig_symbol = '**'
        elif p < 0.05:
            sig_symbol = '*'
        else:
            sig_symbol = 'n.s.'
        text_height = bar_height + (y_range * 0.01)
        plt.text((x1 + x2) * 0.5, text_height, sig_symbol, ha='center', va='bottom', c='k')

    output = os.path.splitext(input)[0] + '_log10.png'
    plt.savefig(output)
    output = os.path.splitext(input)[0] + '_log10.svg'
    plt.savefig(output)
    plt.clf()

# Execution of the function with RNA reads
boxplot(input='./reads_aligned_per_sample_P._dorei_strain_2_plasmid_rna.tsv',
        title='RNA reads aligned per sample to P. dorei strain 2 plasmid')

# Execution of the function with DNA VLPs reads
boxplot(input='./reads_aligned_per_sample_P._dorei_strain_2_plasmid_dna.tsv',
        title='DNA reads aligned per sample to P. dorei strain 2 plasmid')




Collecting scipy
  Downloading scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl (30.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.3/30.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.13.1
Note: you may need to restart the kernel to use updated packages.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(data=df, x='group', y='reads', showfliers=False, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(data=df, x='group', y='relative_abundance', showfliers=False, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(data=df, x='group', y='log10', showfliers=False, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(data=df, x='group', y='reads', showfliers=False,

<Figure size 640x480 with 0 Axes>