---
title: Statistical analyses
author: John Sundh
date: last-modified
format: 
    confluence-html:
        code-fold: true
jupyter: python3
---

**Description**

Statistical analyses of sample groups using taxonomic and functional profiles.

## Functions

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib_inline
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
from skbio.stats.composition import clr
from skbio.stats.distance import permanova
from skbio.diversity.alpha import shannon, observed_otus
from scipy.spatial.distance import pdist, squareform
from scipy.cluster import hierarchy
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
# Select Altair's "html" renderer for chart output.
alt.renderers.enable("html")
# Request 'retina' format for inline matplotlib figures.
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
# Use seaborn's "whitegrid" style for subsequent matplotlib/seaborn plots.
sns.set_style("whitegrid")

In [2]:
from importlib import reload
import common
reload(common);

In [3]:
def filter_data(counts, sample_df, cond_col=None, cond_val=None, percent_occ=10, quantile_thresh = 0.1):
    """
    Subset a feature table to selected samples and drop rare features.

    Input is a dataframe with features as rows and samples as columns.

    Parameters
    ----------
    counts : pandas.DataFrame
        Feature x sample abundance table.
    sample_df : pandas.DataFrame
        Sample metadata indexed by sample id; its index selects the columns
        of ``counts``.
    cond_col, cond_val : optional
        When both are given, only samples where
        ``sample_df[cond_col] == cond_val`` are kept. Otherwise all samples
        in ``sample_df`` are used.
    percent_occ : float
        Minimum percentage of the selected samples in which a feature must
        be non-zero.
    quantile_thresh : float
        Quantile (0-1) of the per-feature share of the total abundance;
        features below that quantile are removed.

    Returns
    -------
    pandas.DataFrame
        The filtered table (same orientation as ``counts``). Empty if no
        feature has a non-zero sum in the selected samples.
    """
    if cond_col is None or cond_val is None:
        samples = sample_df.index
    else:
        samples = list(sample_df.loc[sample_df[cond_col] == cond_val].index)
    counts = counts.loc[:, samples]
    # Remove zero sum features
    nonzero = counts.loc[counts.sum(axis=1) > 0]
    if nonzero.empty:
        # Nothing left to rank; a quantile of an empty vector is undefined.
        return nonzero
    # Filter low abundance: each feature's share (%) of the total abundance
    feature_sums = nonzero.sum(axis=1)
    percent = feature_sums / feature_sums.sum() * 100
    low_threshold = np.quantile(percent, quantile_thresh)
    filtered = nonzero.loc[percent >= low_threshold]
    # Filter on occurrence: % of samples in which the feature is present
    occ = filtered.gt(0).sum(axis=1) / filtered.shape[1] * 100
    return filtered.loc[occ >= percent_occ]

def clr_transform(df):
    """
    CLR-transform a features x samples table.

    A pseudocount of 1 is added, every column is rescaled to percentages,
    and ``clr`` is applied to the transposed table. The result is returned
    with the same index and columns as ``df``.
    """
    shifted = df + 1
    as_percent = shifted.div(shifted.sum()) * 100
    transformed = pd.DataFrame(clr(as_percent.T)).T
    # Restore the labels of the input table
    transformed.index = df.index
    transformed.columns = df.columns
    return transformed

In [4]:
def hierarchy_clust_reorder(dataframe, metric="euclidean", method="complete", fts=None, features=False):
    """
    Order samples and features by hierarchical clustering.

    Input is a dataframe with samples as columns and features as rows

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features x samples table. Columns that are not of float dtype are
        ignored.
    metric : str
        Distance metric passed to ``scipy.spatial.distance.pdist``.
    method : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``.
    fts : list-like, optional
        If given and non-empty, restrict the rows to these features first.
    features : bool
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    (pandas.Index, pandas.Index)
        Sample labels and feature labels in dendrogram leaf order.
    """
    def _leaf_order(values):
        # Leaf order of the dendrogram built from the rows of ``values``
        condensed = pdist(values, metric=metric)
        tree = hierarchy.linkage(condensed, method=method)
        return hierarchy.dendrogram(tree, no_plot=True)["leaves"]

    if fts is not None and len(fts) > 0:
        dataframe = dataframe.loc[fts]
    # Drop non-float columns BEFORE reading the labels: the leaf indices refer
    # to positions in the filtered table, so the labels must come from it too.
    dataframe = dataframe.loc[:, dataframe.dtypes == float]
    sample_labels = dataframe.columns
    feature_labels = dataframe.index
    # Reorder samples
    sample_labels = sample_labels[_leaf_order(dataframe.T.values)]
    # Reorder features
    feature_labels = feature_labels[_leaf_order(dataframe.values)]
    return sample_labels, feature_labels

In [5]:
def ident_fts(mixomics, deseq, aldex, contrast, tooltip, mixomics_top, y="feature"):
    """
    Heatmap of ALDEx2 ``diff_btw`` for features supported by at least two tools.

    Parameters
    ----------
    mixomics : pandas.DataFrame
        Feature table with ``loading_comp1`` and ``loading_comp2`` columns,
        indexed by feature.
    deseq : pandas.DataFrame
        Table with ``feature``, ``contrast`` and ``isDE`` columns.
    aldex : pandas.DataFrame
        Table indexed by feature with ``contrast``, ``isDE`` and ``diff_btw``
        columns.
    contrast : str
        Contrast label used to subset ``deseq`` and ``aldex``.
    tooltip : list
        Fields shown in the chart tooltip.
    mixomics_top : int
        Number of top absolute loadings taken from each mixOmics component.
    y : str
        Field encoded on the y axis.

    Returns
    -------
    altair.Chart
        Rect chart with one row per selected feature, coloured by ``diff_btw``.
    """
    # mixOmics candidates: largest absolute loadings on each of the two components
    top_comp1 = abs(mixomics.loading_comp1).sort_values(ascending=False).head(mixomics_top).index
    top_comp2 = abs(mixomics.loading_comp2).sort_values(ascending=False).head(mixomics_top).index
    mixomics_fts = set(top_comp1).union(top_comp2)
    deseq_fts = set(deseq.loc[(deseq.contrast == contrast) & (deseq.isDE == True)].feature)
    aldex_fts = set(aldex.loc[(aldex.contrast == contrast) & (aldex.isDE == True)].index)
    # Find features identified by at least 2 of the 3 tools
    shared = (mixomics_fts & deseq_fts) | (mixomics_fts & aldex_fts) | (aldex_fts & deseq_fts)
    # Select with a boolean mask rather than ``aldex.loc[list]``: a feature shared
    # by mixOmics and DESeq2 but absent from the ALDEx2 table would otherwise
    # raise a KeyError. Such features have no ``diff_btw`` and cannot be drawn.
    source = aldex.loc[aldex.index.isin(shared) & (aldex.contrast == contrast)]
    chart = alt.Chart(source.reset_index()).mark_rect().encode(
        x="contrast",
        y=alt.Y(y, sort="color"),
        color=alt.Color("diff_btw").scale(scheme='redblue', reverse=True),
        tooltip=tooltip,
    )
    return chart

## Sample info

In [6]:
# Read sample data frame
sample_df = pd.read_csv("../data/sample_list_atlas.tsv", header=0, sep="\t", index_col=0)
sample_df.sort_values("BinGroup", inplace=True)
# Number of samples per BinGroup
group_df = pd.DataFrame(sample_df.groupby("BinGroup").size(), columns=["samples"]).sort_index()
sample_dict = sample_df.to_dict(orient="index")
# NOTE: this expression is not the last one in the cell and its value is not
# assigned, so it has no effect here.
group_df.sort_index().T

# Drop reads columns and assign treatment and generation
sample_info = sample_df.drop(["Reads_raw_R1","Reads_raw_R2"], axis=1)
sample_info.rename(columns = {'BinGroup': 'Group'}, inplace=True)
# Treatment = text after the last "_" in Group
sample_info = sample_info.assign(Treatment=pd.Series([x.split("_")[-1] for x in sample_info.Group], index=sample_info.index))
# Generation = first two characters of Group, except for the "mock" group which is kept as-is
sample_info = sample_info.assign(Generation=pd.Series([x[0:2] if x!="mock" else x for x in sample_info.Group], index=sample_info.index))
sample_info.index.name = "sample_id"
# Sample = sample id up to the first "-" (replicate ids such as "C11-t.r" appear to use this suffix)
sample_info = sample_info.assign(Sample=pd.Series([x.split("-")[0] for x in sample_info.index], index=sample_info.index))

# Read QC info and store total QC:d reads
sample_qc = pd.read_csv("../atlas/stats/read_counts.tsv", sep="\t", usecols=[0,1,4], index_col=0, dtype={"Reads_pe": int})
# Mapping sample id -> Reads_pe for rows where Step == "QC"
sample_read_dict = sample_qc.loc[sample_qc.Step=="QC"].drop("Step", axis=1).loc[:, "Reads_pe"].to_dict()

# Read assembly stats
asm_stats = pd.read_csv("../atlas/stats/combined_contig_stats.tsv", header=0, sep="\t", index_col=0)
# Assembly size in Mbp (contig_bp / 1e6, rounded to 2 decimals)
asm_stats = asm_stats.assign(AssemblySize=pd.Series([round(x/1000000, 2) for x in asm_stats.contig_bp], index=asm_stats.index))
asm_stats.index.name="sample_id"
asm_stats.rename(columns={'n_contigs': "Contigs", "AssemblySize": "Assembly size (Mbp)"}, inplace=True)

# Combine assembly stats, sample info and QC read counts on the sample id index
# (default inner joins: only samples present in every table are kept)
sample_stats = pd.merge(asm_stats, sample_info, left_index=True, right_index=True)
sample_stats = pd.merge(sample_stats, sample_qc.loc[sample_qc.Step=="QC"].drop("Step", axis=1), left_index=True, right_index=True)

Technical replicates are chosen based on assembly metrics, _i.e._ the technical replicate with the highest L50 value is chosen. If there is a tie, then the one with highest total QC:d reads is chosen.

In [7]:
# Sample ids of the technical replicates to discard
# (selection logic lives in common.drop_replicates, not shown here)
drops = common.drop_replicates(sample_stats)
# From here on sample_df holds the merged stats table without the dropped replicates
sample_df = sample_stats.drop(drops)
print(f"Dropped replicates {', '.join(sorted(drops))}")

Dropped replicates C11-t.r, C12, C19-t.r, H10, H13-t.r, H32, L11-t.r, L29, L6


In [8]:
# Generate color scales
# Lookup tables: one "Set1" colour per generation, one "Set2" colour per treatment
gen_lut = dict(zip(sample_df.Generation.unique(), sns.color_palette("Set1", len(sample_df.Generation.unique()))))
treat_lut = dict(zip(sample_df.Treatment.unique(), sns.color_palette("Set2", len(sample_df.Treatment.unique()))))
# Per-sample colour table with a "Generation" and a "Treatment" colour column
sample_colors = pd.merge(
    pd.DataFrame(sample_df.loc[:, "Generation"].map(gen_lut)),
    pd.DataFrame(sample_df.loc[:, "Treatment"].map(treat_lut)),
    left_index=True, right_index=True
)

## Data

#### MAG coverage

In [9]:
# MAG taxonomy table, indexed by its first column
mag_tax = pd.read_csv("../atlas/genomes/taxonomy/gtdb_taxonomy.tsv", sep="\t", index_col=0)
mag_tax = common.add_unassigned(mag_tax)
# Taxonomic rank names = columns of the taxonomy table
ranks = list(mag_tax.columns)
# Median coverage table; after set_index the rows are indexed by sample id
mag_cov = pd.read_parquet("../atlas/genomes/counts/median_coverage_genomes.parquet")
mag_cov.set_index("index", inplace=True)
mag_cov.index.name = "sample_id"

# Keep only the samples retained in sample_df (in that order)
mag_cov = mag_cov.loc[sample_df.index]

# Add offset for later CLR transformation
mag_cov_offset = mag_cov+1
mag_clr = pd.DataFrame(clr(mag_cov_offset), index=mag_cov_offset.index, columns=mag_cov_offset.columns)

# Filter MAGs
# filter_data expects features as rows, hence the transpose
mag_cov_filter = filter_data(mag_cov.T, sample_df)

# Calculate relative abundance
_ = mag_cov.T
# Percent of each sample's (column's) total coverage
mag_relab = (_.div(_.sum())*100)
mag_relab_tax = pd.merge(mag_relab, mag_tax, left_index=True, right_index=True)

In [10]:
# Per generation: filter the MAG coverage table, CLR-transform it and attach taxonomy
mag_cov_F0 = filter_data(mag_cov.T, sample_df, cond_col="Generation", cond_val="F0")
mag_clr_F0 = clr_transform(mag_cov_F0).merge(mag_tax, left_index=True, right_index=True)

mag_cov_F1 = filter_data(mag_cov.T, sample_df, cond_col="Generation", cond_val="F1")
mag_clr_F1 = clr_transform(mag_cov_F1).merge(mag_tax, left_index=True, right_index=True)

mag_cov_F2 = filter_data(mag_cov.T, sample_df, cond_col="Generation", cond_val="F2")
mag_clr_F2 = clr_transform(mag_cov_F2).merge(mag_tax, left_index=True, right_index=True)

In [11]:
# Generate colors
# One "colorblind" palette colour per unique phylum / class
# NOTE(review): if there are more unique values than the palette has distinct
# colours, colours are presumably reused — confirm this is acceptable.
phylum_lut = dict(zip(mag_tax.phylum.unique(), sns.color_palette("colorblind", len(mag_tax.phylum.unique()))))
class_lut = dict(zip(mag_tax["class"].unique(), sns.color_palette("colorblind", len(mag_tax["class"].unique()))))
# Per-MAG colour table with a "phylum" and a "class" colour column
mag_colors = pd.merge(
    pd.DataFrame(mag_tax.loc[:, "phylum"].map(phylum_lut)),
    pd.DataFrame(mag_tax.loc[:, "class"].map(class_lut)),
    left_index=True, right_index=True
)

In [12]:
# Group and sum MAGs by rank
mag_taxcov = mag_cov.T
# Taxonomy columns followed by one coverage column per sample
mag_taxcov = pd.merge(mag_tax, mag_taxcov, left_index=True, right_index=True)
mag_taxcov_grouped = {}
for rank in ranks:
    # Sum the numeric (coverage) columns within each taxon of this rank
    _ = mag_taxcov.groupby(rank).sum(numeric_only=True)
    # Side effect: writes one TSV per rank next to the coverage data
    _.to_csv(f"../atlas/genomes/counts/{rank}.coverage.tsv", sep="\t")
    mag_taxcov_grouped[rank] = _

#### UniRef taxonomy

In [13]:
uniref_cov = pd.read_csv("../atlas/taxonomy/UniRef100.median_fold.tsv", sep="\t", index_col=0)
uniref_ranks = ["superkingdom","phylum","class","order","family","genus","species"]
# Keep the taxonomy columns plus the coverage columns of the retained samples
uniref_cov = uniref_cov.loc[:, uniref_ranks+list(sample_df.index)]
uniref_tax = uniref_cov.loc[:, uniref_ranks]

# Sum to species
# From here on uniref_tax is indexed by species name.
# NOTE(review): if several input rows share a species, this index contains
# duplicates and the index-based merges with uniref_tax will repeat rows — confirm.
uniref_tax.index = uniref_tax.species
uniref_sum = uniref_cov.groupby("species").sum(numeric_only=True)
# Remove unclassified, unknown and uncultured
uc = list(uniref_sum.loc[uniref_sum.index.str.startswith("uc_")].index)
unc = list(uniref_sum.loc[uniref_sum.index.str.startswith("uncultured")].index)
unk = list(set(uniref_tax.loc[uniref_tax.phylum=="unknown"].index))
uniref_sum = uniref_sum.loc[list(set(uniref_sum.index).difference(unc+unk+uc))]
# Keep only labels that start with a capitalised word followed by a space
uniref_sum = uniref_sum.filter(regex="^[A-Z][a-z]+ ", axis=0)
# Percent of each sample's (column's) total
uniref_relab = uniref_sum.div(uniref_sum.sum()) * 100
uniref_relab_tax = pd.merge(uniref_relab, uniref_tax, left_index=True, right_index=True)

In [14]:
# Per generation: filter the species-level table, CLR-transform it and attach taxonomy
uniref_cov_F0 = filter_data(uniref_sum, sample_df, cond_col="Generation", cond_val="F0")
uniref_clr_F0 = clr_transform(uniref_cov_F0).merge(uniref_tax, left_index=True, right_index=True)

uniref_cov_F1 = filter_data(uniref_sum, sample_df, cond_col="Generation", cond_val="F1")
uniref_clr_F1 = clr_transform(uniref_cov_F1).merge(uniref_tax, left_index=True, right_index=True)

uniref_cov_F2 = filter_data(uniref_sum, sample_df, cond_col="Generation", cond_val="F2")
uniref_clr_F2 = clr_transform(uniref_cov_F2).merge(uniref_tax, left_index=True, right_index=True)

In [15]:
# Sum to genus
uniref_genus_sum = uniref_cov.groupby("genus").sum(numeric_only=True)
# Remove unclassified, unknown and uncultured genera
uc = list(uniref_genus_sum.loc[uniref_genus_sum.index.str.startswith("uc_")].index)
unc = list(uniref_genus_sum.loc[uniref_genus_sum.index.str.startswith("uncultured")].index)
# uniref_tax is indexed by species, so the labels to exclude from this
# genus-indexed table must come from its "genus" column, not from its index.
unk = list(set(uniref_tax.loc[uniref_tax.phylum=="unknown", "genus"]))
uniref_genus_sum = uniref_genus_sum.loc[list(set(uniref_genus_sum.index).difference(unc+unk+uc))]
# Percent of each sample's (column's) total
uniref_genus_relab = uniref_genus_sum.div(uniref_genus_sum.sum())*100
# One taxonomy row per genus (first non-null value of every higher rank)
uniref_genus_tax = uniref_tax.set_index("genus").drop("species", axis=1).groupby(level=0).first()
uniref_genus_ranks = ["superkingdom","phylum","class","order","family"]
uniref_genus_relab_tax = pd.merge(uniref_genus_relab, uniref_genus_tax, left_index=True, right_index=True)

In [16]:
# Per generation: filter the genus-level table, CLR-transform it and attach taxonomy
uniref_genus_cov_F0 = filter_data(uniref_genus_sum, sample_df, cond_col="Generation", cond_val="F0")
uniref_genus_clr_F0 = clr_transform(uniref_genus_cov_F0).merge(uniref_genus_tax, left_index=True, right_index=True)

uniref_genus_cov_F1 = filter_data(uniref_genus_sum, sample_df, cond_col="Generation", cond_val="F1")
uniref_genus_clr_F1 = clr_transform(uniref_genus_cov_F1).merge(uniref_genus_tax, left_index=True, right_index=True)

uniref_genus_cov_F2 = filter_data(uniref_genus_sum, sample_df, cond_col="Generation", cond_val="F2")
uniref_genus_clr_F2 = clr_transform(uniref_genus_cov_F2).merge(uniref_genus_tax, left_index=True, right_index=True)

## Results

### Taxonomy

#### Overview of MAGs generational differences

Below we plot the normalized sum of percentages for the MAGs in the total dataset and for each generation. Each circle is a MAG and its summed % abundance is on the y-axis. The median sum of % for the dataset is indicated by a red line.

In [17]:
# Rank passed to common.plot_percent_sum as plotrank
plotrank = "phylum"
# One panel for all samples, then one per generation (rows of mag_relab.T are samples).
# The same ranks list is passed to every call.
a = common.plot_percent_sum(mag_relab.T, mag_tax, plotrank=plotrank, title="All samples",
                        ranks=["phylum","class","order","family","genus","species", "Genome"])
b = common.plot_percent_sum(mag_relab.T.loc[sample_df.loc[sample_df.Generation=="F0"].index], mag_tax, 
                            plotrank=plotrank, title="F0",
                        ranks=["phylum","class","order","family","genus","species", "Genome"])
c = common.plot_percent_sum(mag_relab.T.loc[sample_df.loc[sample_df.Generation=="F1"].index], mag_tax, 
                            plotrank=plotrank, title="F1",
                        ranks=["phylum","class","order","family","genus","species", "Genome"])
d = common.plot_percent_sum(mag_relab.T.loc[sample_df.loc[sample_df.Generation=="F2"].index], mag_tax, 
                            plotrank=plotrank, title="F2",
                        ranks=["phylum","class","order","family","genus","species", "Genome"])
# Place the four panels side by side on a shared y scale
alt.hconcat(a, b, c, d).resolve_scale(y="shared")

Median % sum: 0.13271455804598964
Median % sum: 0.07979259154620436
Median % sum: 0.04787247533546993
Median % sum: 0.13074647750319623


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df

The median coverage of MAGs is transformed using the centered log-ratio (CLR) transform (implemented in scikit-bio). We then compute the Aitchison distances between samples as Euclidean distances on the CLR-transformed data.

In [18]:
# Compute Aitchison distances (it will be aitchison because the data is CLR transformed)
# Rows of mag_clr are samples, so this is a sample x sample distance matrix
mag_cov_dist = pd.DataFrame(euclidean_distances(mag_clr), index=mag_clr.index, columns=mag_clr.index)

Then a PCA is computed on the distances to project the samples into 2-dimensional space. We also correlate the MAGs with the first two components.

In [19]:
# Apply PCA
pca = PCA(n_components=3, random_state=42)
# NOTE(review): common.pca_fit is not shown here; the first return value is used
# below as a per-sample table with "PC 1"/"PC 2" columns and the second as the
# per-component explained variance — confirm.
mag_cov_pca, mag_cov_expvar = common.pca_fit(data=mag_cov_dist, info_df=sample_info, pca=pca, n_components=3)

# Pearson correlation of every MAG's CLR values with the first two components
mag_pca_corr = pd.merge(pd.DataFrame(mag_clr.corrwith(mag_cov_pca["PC 1"], method="pearson"), columns=["PC 1"]), 
         pd.DataFrame(mag_clr.corrwith(mag_cov_pca["PC 2"], method="pearson"), columns=["PC 2"]), left_index=True, right_index=True)
# Attach taxonomy to the correlation table
mag_pca_corr = pd.merge(mag_pca_corr, mag_tax, left_index=True, right_index=True)

In [20]:
# Left panel: samples on PC 1 / PC 2, coloured by treatment, shaped by generation.
# Axis titles show the explained variance as a percentage.
a = alt.Chart(mag_cov_pca.reset_index(), title="MAG abundance").mark_point().encode(
    x=alt.X("PC 1").title(f"PC 1 ({round(mag_cov_expvar[0]*100, 1)}%)"), 
    y=alt.Y("PC 2").title(f"PC 2 ({round(mag_cov_expvar[1]*100, 1)}%)"),
    color=alt.Color("Treatment", sort=["C","L","H"]), shape="Generation",
    tooltip = ["sample_id","Group","Treatment","Generation"]
)

# Right panel: MAGs with |correlation| > 0.5 to PC 1 or PC 2, coloured by taxonomic order
color="order"
source = mag_pca_corr.loc[(abs(mag_pca_corr["PC 1"])>0.5)|(abs(mag_pca_corr["PC 2"])>0.5)].reset_index()
b = alt.Chart(source, title="MAG PCA correlation").mark_circle(size=80).encode(
    x="PC 1", y="PC 2",
    color=color,
    tooltip = ranks+["Genome"]
).interactive()

# The two panels use unrelated colour/shape encodings, so keep their scales separate
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


This indicates that there is a shift from Firmicutes in F0 towards Bacteroidota in F1, and even more so in F2.

In [21]:
ft="mag"

# MixOmics: sample and feature tables, joined with sample info / MAG taxonomy
gen_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.samples.tsv", sep="\t", index_col=0)
gen_mag_samples = pd.merge(gen_mag_samples, sample_df, left_index=True, right_index=True)
gen_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.features.tsv", header=0, sep="\t", index_col=0)
gen_mag_features = pd.merge(gen_mag_features, mag_tax, left_index=True, right_index=True)

# DESeq2: one result table per generation contrast
gen_mag_deseq_F2vsF0 = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.F2vsF0.tsv", sep="\t", index_col=0)
gen_mag_deseq_F1vsF0 = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.F1vsF0.tsv", sep="\t", index_col=0)
gen_mag_deseq_F2vsF1 = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.F2vsF1.tsv", sep="\t", index_col=0)
id_vars=["feature","isDE","log2FoldChange","padj"]
# melt + groupby("feature").first() collapses each table to one row per feature.
# The contrast label is assigned as a scalar so it always matches the table's own
# length (the labels were previously sized from a different table).
source1 = pd.melt(gen_mag_deseq_F2vsF0.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "F2vsF0"
source2 = pd.melt(gen_mag_deseq_F1vsF0.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "F1vsF0"
source3 = pd.melt(gen_mag_deseq_F2vsF1.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "F2vsF1"
# Stack the three contrasts into one long table and attach taxonomy
gen_mag_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
gen_mag_deseq = pd.merge(gen_mag_deseq, mag_tax, left_on="feature", right_index=True)


# ALDEx2: effect sizes and Holm-adjusted p-values of the generation terms
gen_mag_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.tsv", sep="\t", index_col=0)
gen_mag_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.tsv", sep="\t", index_col=0)
source1 = pd.merge(gen_mag_aldex_effect.loc[:, ["generationF1.diff.btw", "generationF1.effect"]],
         gen_mag_aldex_test.loc[:, ["generationF1:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "F1vsF0"
source2 = pd.merge(gen_mag_aldex_effect.loc[:, ["generationF2.diff.btw", "generationF2.effect"]],
         gen_mag_aldex_test.loc[:, ["generationF2:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "F2vsF0"
# Long table with one row per feature and contrast, plus taxonomy
gen_mag_aldex = pd.concat(
    [
        source1, source2
    ]
)
gen_mag_aldex = pd.merge(gen_mag_aldex, mag_tax, left_index=True, right_index=True)

In [22]:
ft="uniref"

# MixOmics: sample and feature tables, joined with sample info / UniRef taxonomy
gen_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.samples.tsv", sep="\t", index_col=0)
gen_uniref_samples = pd.merge(gen_uniref_samples, sample_df, left_index=True, right_index=True)
gen_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.features.tsv", header=0, sep="\t", index_col=0)
gen_uniref_features = pd.merge(gen_uniref_features, uniref_tax, left_index=True, right_index=True)
# Name the index "index" so reset_index() yields the "index" column used in chart tooltips
gen_uniref_features.index.name="index"

# DESeq2: one result table per generation contrast
gen_uniref_deseq_F2vsF0 = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.F2vsF0.tsv", sep="\t", index_col=0)
gen_uniref_deseq_F1vsF0 = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.F1vsF0.tsv", sep="\t", index_col=0)
gen_uniref_deseq_F2vsF1 = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.F2vsF1.tsv", sep="\t", index_col=0)
id_vars=["feature","isDE","log2FoldChange","padj"]
# melt + groupby("feature").first() collapses each table to one row per feature.
# The contrast label is assigned as a scalar so it always matches the table's own
# length (the labels were previously sized from a different table).
source1 = pd.melt(gen_uniref_deseq_F2vsF0.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "F2vsF0"
source2 = pd.melt(gen_uniref_deseq_F1vsF0.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "F1vsF0"
source3 = pd.melt(gen_uniref_deseq_F2vsF1.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "F2vsF1"
# Stack the three contrasts into one long table and attach taxonomy
gen_uniref_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
gen_uniref_deseq = pd.merge(gen_uniref_deseq, uniref_tax, left_on="feature", right_index=True)


# ALDEx2: effect sizes and Holm-adjusted p-values of the generation terms
gen_uniref_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.tsv", sep="\t", index_col=0)
gen_uniref_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.tsv", sep="\t", index_col=0)
source1 = pd.merge(gen_uniref_aldex_effect.loc[:, ["generationF1.diff.btw", "generationF1.effect"]],
         gen_uniref_aldex_test.loc[:, ["generationF1:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "F1vsF0"
source2 = pd.merge(gen_uniref_aldex_effect.loc[:, ["generationF2.diff.btw", "generationF2.effect"]],
         gen_uniref_aldex_test.loc[:, ["generationF2:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "F2vsF0"
# Long table with one row per feature and contrast, plus taxonomy
gen_uniref_aldex = pd.concat(
    [
        source1, source2
    ]
)
gen_uniref_aldex = pd.merge(gen_uniref_aldex, uniref_tax, left_index=True, right_index=True)

In [23]:
# Taxonomic rank used to colour the feature panel
rank = "order"
# Left panel: samples on the first two mixOmics components
a = alt.Chart(gen_mag_samples.reset_index(),
             title="mixOmics sample variates").mark_point(size=120).encode(
    x='comp1',
    y='comp2',
    color=alt.Color("Treatment", sort=["C","L","H"]),
    shape="Generation",
    tooltip=['Sample', 'Generation', 'Treatment']
).interactive()

# Right panel: MAGs with |correlation| > 0.5 to either component
b = alt.Chart(gen_mag_features.loc[(abs(gen_mag_features.corr_comp1)>0.5)|((abs(gen_mag_features.corr_comp2)>0.5))].reset_index(),
             title="mixOmics MAG correlations").mark_circle(size=60).encode(
    x="corr_comp1",
    y="corr_comp2",
    color=rank,
    #size="Stability",
    tooltip=["index","stability_comp1", "stability_comp2", "phylum","class","order","family","genus","species"]
).interactive()


# The two panels use unrelated colour/shape encodings, so keep their scales separate
alt.hconcat(a,b).resolve_scale(
    color='independent',
    shape='independent'
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


The mixOmics package identified several Bacteroidota MAGs (primarily of the Muribaculaceae family) as characterizing the F2 generation.

In [24]:
# Taxonomic rank used to colour the feature panel
rank = "order"
# Left panel: samples on the first two mixOmics components (UniRef species data)
a = alt.Chart(gen_uniref_samples.reset_index(),
             title="mixOmics sample variates (UniRef species)").mark_point(size=120).encode(
    x='comp1',
    y='comp2',
    color=alt.Color("Treatment", sort=["C","L","H"]),
    shape="Generation",
    tooltip=['Sample', 'Generation', 'Treatment']
).interactive()

# Right panel: features with |correlation| > 0.5 to either component
b = alt.Chart(gen_uniref_features.loc[(abs(gen_uniref_features.corr_comp1)>0.5)|((abs(gen_uniref_features.corr_comp2)>0.5))].reset_index(),
             title="mixOmics correlations (UniRef Species)").mark_circle(size=60).encode(
    x="corr_comp1",
    y="corr_comp2",
    color=rank,
    #size="Stability",
    tooltip=["index","stability_comp1", "stability_comp2", "phylum","class","order","family","genus","species"]
).interactive()


# The two panels use unrelated colour/shape encodings, so keep their scales separate
alt.hconcat(a,b).resolve_scale(
    color='independent',
    shape='independent'
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [25]:
# Number of MAGs flagged isDE by DESeq2 in each contrast, smallest first
gen_mag_deseq.loc[gen_mag_deseq.isDE==True].groupby(["contrast"]).size().sort_values()

contrast
F1vsF0    120
F2vsF1    132
F2vsF0    184
dtype: int64

DESeq2 found 120, 184 and 132 (out of 232 tested) MAGs that were differentially abundant (at adjusted p-value <0.1) when comparing F1 vs F0, F2 vs F0 and F2 vs F1, respectively. Below we plot the log2FoldChange for the ones with adjusted p-value<0.05 and abs(log2FoldChange) > 4.

DESeq2 also identified MAGs of Bacteroidota (primarily Muribaculaceae and Rikenellaceae) to be more abundant in F1 and F2 compared to F0 while the opposite was true for Firmicutes such as Lachnospiraceae and Ruminococcaceae.

In [26]:
# Number of UniRef rows flagged isDE by DESeq2 in each contrast, smallest first
gen_uniref_deseq.loc[gen_uniref_deseq.isDE==True].groupby(["contrast"]).size().sort_values()

contrast
F2vsF1     58
F1vsF0     73
F2vsF0    110
dtype: int64

In the UniRef species data, DESeq2 identified 73 and 110 species that differed significantly from F0 in F1 and F2, respectively.

In [27]:
# Number of MAGs with ALDEx2 adjusted p-value < 0.1 in each contrast
gen_mag_aldex.loc[gen_mag_aldex.padj<0.1].groupby("contrast").size()

contrast
F1vsF0    32
F2vsF0    72
dtype: int64

ALDEx2 identified 32 and 72 MAGs with a significant (adjusted p-value < 0.1) difference in abundance in F1 and F2 generations, respectively, compared to F0.

In [28]:
# Number of UniRef rows with ALDEx2 adjusted p-value < 0.1 in each contrast
gen_uniref_aldex.loc[gen_uniref_aldex.padj<0.1].groupby("contrast").size()

contrast
F1vsF0    19
F2vsF0    35
dtype: int64

In the UniRef species data, ALDEx2 found 19 and 35 differentially abundant species (adjusted p-value < 0.1) in the F1 and F2 generations, respectively, compared to F0.

Below we visualize the MAGs by plotting the effect versus the median difference between the generations.

In [29]:
# NOTE: source1 is the same object as gen_mag_aldex (no copy), so the "isDE"
# column added below is also added to gen_mag_aldex.
source1 = gen_mag_aldex
source1["isDE"] = [False] * source1.shape[0]
# MAGs: flagged when adjusted p-value < 0.05 OR |effect| > 1
source1.loc[(source1.padj<0.05)|(abs(source1.effect)>1), "isDE"] = True
a_1 = alt.Chart(source1.reset_index(), title="ALDEx2 MAGs").mark_point().encode(
    x = "diff_btw", y="effect", shape="contrast", color="isDE",
    tooltip=["effect","diff_btw", "contrast", "feature"]+ranks
).properties(
    width=300,
    height=300
)

# Only the flagged MAGs, coloured by phylum
b_1 = alt.Chart(source1.loc[source1.isDE==True].reset_index(), title="DE MAGs").mark_point().encode(
    x="diff_btw", y="effect", shape="contrast", color="phylum",
    tooltip=["effect","diff_btw", "contrast", "feature"]+ranks
)
ab_1 = alt.hconcat(a_1,b_1).resolve_scale(
    color='independent',
    shape='independent'
)

# NOTE: source2 is the same object as gen_uniref_aldex (no copy), so the "isDE"
# column added below is also added to gen_uniref_aldex.
source2 = gen_uniref_aldex
source2["isDE"] = [False] * source2.shape[0]
# Species: flagged on adjusted p-value < 0.05 only
# NOTE(review): unlike the MAG rule above there is no |effect| criterion — confirm intended.
source2.loc[source2.padj<0.05, "isDE"] = True
a_2 = alt.Chart(source2.reset_index(), title="ALDEx2 species").mark_point().encode(
    x = "diff_btw", y="effect", shape="contrast", color="isDE",
    tooltip=["effect","diff_btw", "contrast", "feature"]+uniref_ranks
).properties(
    width=300,
    height=300
)

# Only the flagged species, coloured by phylum
b_2 = alt.Chart(source2.loc[source2.isDE==True].reset_index(), title="DE species").mark_point().encode(
    x="diff_btw", y="effect", shape="contrast", color="phylum",
    tooltip=["effect","diff_btw", "contrast", "feature"]+uniref_ranks
)
ab_2 = alt.hconcat(a_2,b_2).resolve_scale(
    color='independent',
    shape='independent'
)

# MAG row on top, species row below
alt.vconcat(ab_1, ab_2)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


#### F0 generation

Because of the clear differences in MAG abundances between generations, we here zoom in on the F0 generation.

In [30]:
ft="mag"
generation="F0"
# MixOmics: sample variates and feature statistics of the final splsda models.
# Sample tables are joined to the sample metadata, feature tables to the MAG
# taxonomy, both on the index.
F0_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F0_mag_samples = pd.merge(F0_mag_samples, sample_df, left_index=True, right_index=True)
F0_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F0_mag_features = pd.merge(F0_mag_features, mag_tax, left_index=True, right_index=True)
# Second model ("_2" files); its samples get a Treatment_2 column where C and L
# are pooled into a single "C+L" level.
F0_2_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F0_2_mag_samples = pd.merge(F0_2_mag_samples, sample_df, left_index=True, right_index=True)
F0_2_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F0_2_mag_features = pd.merge(F0_2_mag_features, mag_tax, left_index=True, right_index=True)
F0_2_mag_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F0_2_mag_samples.Treatment]


# DESeq2: one result table per contrast, e.g. res.mag.F0_HvsF0_C.tsv
F0_mag_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_C.tsv", sep="\t", index_col=0)
F0_mag_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF0_C.tsv", sep="\t", index_col=0)
F0_mag_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_L.tsv", sep="\t", index_col=0)
F0_mag_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_CL.tsv", sep="\t", index_col=0)
# Reduce each table to one row per feature holding only the id_vars columns:
# melt the remaining columns away, keep the first row per feature, then drop
# the melt artefacts. The contrast label is assigned as a scalar so that it is
# broadcast to the length of the frame it is attached to (the tables are not
# guaranteed to have the same number of rows).
id_vars=["feature","isDE","log2FoldChange","padj"]
source1 = pd.melt(F0_mag_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F0_mag_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F0_mag_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F0_mag_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable","value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
# Stack the four contrasts into one long table and annotate with MAG taxonomy.
F0_mag_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source4.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
F0_mag_deseq = pd.merge(F0_mag_deseq, mag_tax, left_on="feature", right_index=True)

# ALDEx2 glm output: per-treatment difference/effect columns joined with the
# Holm-corrected p-value of the matching term, plus the "_2" table (HvsCL).
F0_mag_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F0_mag_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F0_mag_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F0_mag_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F0_mag_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F0_mag_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = "HvsCL"
F0_mag_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
# The glm-based table is kept under its own name; F0_mag_aldex is rebound
# below to the pairwise (HC / LC) results.
F0_mag_aldex_glm = pd.merge(F0_mag_aldex, mag_tax, left_index=True, right_index=True)
# ALDEx2 pairwise tables; dots in column names are replaced by underscores so
# the columns can be used as attributes / chart fields (e.g. diff_btw, wi_eBH).
F0_mag_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F0_mag_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F0_mag_aldex_HC["contrast"] = "HvsC"
F0_mag_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_mag_aldex_LC["contrast"] = "LvsC"
F0_mag_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_mag_aldex = pd.concat([F0_mag_aldex_HC, F0_mag_aldex_LC])
F0_mag_aldex = pd.merge(F0_mag_aldex, mag_tax, left_index=True, right_index=True)

# Flag a feature as DE when |effect| > 1 or wi_eBH < 0.1.
F0_mag_aldex["isDE"] = False
F0_mag_aldex.loc[(abs(F0_mag_aldex.effect)>1)|((F0_mag_aldex.wi_eBH<0.1)), "isDE"] = True

# Write the annotated tables to disk.
F0_mag_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F0_mag_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F0_mag_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F0_mag_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

In [31]:
ft="uniref"
generation="F0"
# MixOmics: sample variates and feature statistics of the final splsda models.
# Sample tables are joined to the sample metadata, feature tables to the
# UniRef taxonomy, both on the index.
F0_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F0_uniref_samples = pd.merge(F0_uniref_samples, sample_df, left_index=True, right_index=True)
F0_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F0_uniref_features = pd.merge(F0_uniref_features, uniref_tax, left_index=True, right_index=True)
# Name the index so that reset_index() yields a column called "index".
F0_uniref_features.index.name = "index"

# Second model ("_2" files); its samples get a Treatment_2 column where C and L
# are pooled into a single "C+L" level.
F0_2_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F0_2_uniref_samples = pd.merge(F0_2_uniref_samples, sample_df, left_index=True, right_index=True)
F0_2_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F0_2_uniref_features = pd.merge(F0_2_uniref_features, uniref_tax, left_index=True, right_index=True)
F0_2_uniref_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F0_2_uniref_samples.Treatment]
F0_2_uniref_features.index.name = "index"

# DESeq2: one result table per contrast, e.g. res.uniref.F0_HvsF0_C.tsv
F0_uniref_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_C.tsv", sep="\t", index_col=0)
F0_uniref_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF0_C.tsv", sep="\t", index_col=0)
F0_uniref_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_L.tsv", sep="\t", index_col=0)
F0_uniref_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_CL.tsv", sep="\t", index_col=0)
# Reduce each table to one row per feature holding only the id_vars columns:
# melt the remaining columns away, keep the first row per feature, then drop
# the melt artefacts. The contrast label is assigned as a scalar so that it is
# broadcast to the length of the frame it is attached to (the tables are not
# guaranteed to have the same number of rows).
id_vars=["feature","isDE","log2FoldChange","padj"]
source1 = pd.melt(F0_uniref_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F0_uniref_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F0_uniref_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F0_uniref_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable","value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
# Stack the four contrasts into one long table and annotate with taxonomy.
F0_uniref_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source4.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
F0_uniref_deseq = pd.merge(F0_uniref_deseq, uniref_tax, left_on="feature", right_index=True)


# ALDEx2 glm output: per-treatment difference/effect columns joined with the
# Holm-corrected p-value of the matching term, plus the "_2" table (HvsCL).
F0_uniref_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F0_uniref_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F0_uniref_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F0_uniref_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F0_uniref_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F0_uniref_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = "HvsCL"
F0_uniref_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
# The glm-based table is kept under its own name; F0_uniref_aldex is rebound
# below to the pairwise (HC / LC) results.
F0_uniref_aldex_glm = pd.merge(F0_uniref_aldex, uniref_tax, left_index=True, right_index=True)

# ALDEx2 pairwise tables; dots in column names are replaced by underscores so
# the columns can be used as attributes / chart fields (e.g. diff_btw, wi_eBH).
F0_uniref_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F0_uniref_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F0_uniref_aldex_HC["contrast"] = "HvsC"
F0_uniref_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_uniref_aldex_LC["contrast"] = "LvsC"
F0_uniref_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_uniref_aldex = pd.concat([F0_uniref_aldex_HC, F0_uniref_aldex_LC])
F0_uniref_aldex = pd.merge(F0_uniref_aldex, uniref_tax, left_index=True, right_index=True)

# Flag a feature as DE when |effect| > 1 or wi_eBH < 0.1.
F0_uniref_aldex["isDE"] = False
F0_uniref_aldex.loc[(abs(F0_uniref_aldex.effect)>1)|((F0_uniref_aldex.wi_eBH<0.1)), "isDE"] = True

# Write the annotated tables to disk.
F0_uniref_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F0_uniref_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F0_uniref_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F0_uniref_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

In [32]:
ft="uniref.genus"
generation="F0"
# MixOmics: sample variates and feature statistics of the final splsda models.
# Sample tables are joined to the sample metadata, feature tables to the
# genus-level UniRef taxonomy, both on the index.
F0_uniref_genus_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F0_uniref_genus_samples = pd.merge(F0_uniref_genus_samples, sample_df, left_index=True, right_index=True)
F0_uniref_genus_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F0_uniref_genus_features = pd.merge(F0_uniref_genus_features, uniref_genus_tax, left_index=True, right_index=True)
# Name the index so that reset_index() yields a column called "index".
F0_uniref_genus_features.index.name = "index"

# Second model ("_2" files); its samples get a Treatment_2 column where C and L
# are pooled into a single "C+L" level.
F0_2_uniref_genus_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F0_2_uniref_genus_samples = pd.merge(F0_2_uniref_genus_samples, sample_df, left_index=True, right_index=True)
F0_2_uniref_genus_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F0_2_uniref_genus_features = pd.merge(F0_2_uniref_genus_features, uniref_genus_tax, left_index=True, right_index=True)
F0_2_uniref_genus_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F0_2_uniref_genus_samples.Treatment]
F0_2_uniref_genus_features.index.name = "index"

# DESeq2: one result table per contrast, e.g. res.uniref.genus.F0_HvsF0_C.tsv
F0_uniref_genus_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_C.tsv", sep="\t", index_col=0)
F0_uniref_genus_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF0_C.tsv", sep="\t", index_col=0)
F0_uniref_genus_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_L.tsv", sep="\t", index_col=0)
F0_uniref_genus_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_CL.tsv", sep="\t", index_col=0)
# Reduce each table to one row per feature holding only the id_vars columns:
# melt the remaining columns away, keep the first row per feature, then drop
# the melt artefacts. The contrast label is assigned as a scalar so that it is
# broadcast to the length of the frame it is attached to (the tables are not
# guaranteed to have the same number of rows).
id_vars=["feature","isDE","log2FoldChange","padj"]
source1 = pd.melt(F0_uniref_genus_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F0_uniref_genus_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F0_uniref_genus_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F0_uniref_genus_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable","value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
# Stack the four contrasts into one long table and annotate with taxonomy.
F0_uniref_genus_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source4.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
F0_uniref_genus_deseq = pd.merge(F0_uniref_genus_deseq, uniref_genus_tax, left_on="feature", right_index=True)


# ALDEx2 glm output: per-treatment difference/effect columns joined with the
# Holm-corrected p-value of the matching term, plus the "_2" table (HvsCL).
F0_uniref_genus_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F0_uniref_genus_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F0_uniref_genus_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F0_uniref_genus_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F0_uniref_genus_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F0_uniref_genus_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = "HvsCL"
F0_uniref_genus_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
# The glm-based table is kept under its own name; F0_uniref_genus_aldex is
# rebound below to the pairwise (HC / LC) results.
F0_uniref_genus_aldex_glm = pd.merge(F0_uniref_genus_aldex, uniref_genus_tax, left_index=True, right_index=True)

# ALDEx2 pairwise tables; dots in column names are replaced by underscores so
# the columns can be used as attributes / chart fields (e.g. diff_btw, wi_eBH).
F0_uniref_genus_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F0_uniref_genus_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F0_uniref_genus_aldex_HC["contrast"] = "HvsC"
F0_uniref_genus_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_uniref_genus_aldex_LC["contrast"] = "LvsC"
F0_uniref_genus_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_uniref_genus_aldex = pd.concat([F0_uniref_genus_aldex_HC, F0_uniref_genus_aldex_LC])
F0_uniref_genus_aldex = pd.merge(F0_uniref_genus_aldex, uniref_genus_tax, left_index=True, right_index=True)

# Flag a feature as DE when |effect| > 1 or wi_eBH < 0.1.
F0_uniref_genus_aldex["isDE"] = False
F0_uniref_genus_aldex.loc[(abs(F0_uniref_genus_aldex.effect)>1)|((F0_uniref_genus_aldex.wi_eBH<0.1)), "isDE"] = True

# Write the annotated tables to disk.
F0_uniref_genus_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F0_uniref_genus_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F0_uniref_genus_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F0_uniref_genus_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

Below are boxplots of alpha diversity for results from the different datasets.

In [33]:
# Diversity metrics per dataset, as returned by calc_diversity (defined
# elsewhere in the notebook).
F0_mag_div = calc_diversity(mag_cov_F0)
F0_uniref_div = calc_diversity(uniref_cov_F0)
F0_uniref_genus_div = calc_diversity(uniref_genus_cov_F0)

# Attach the sample metadata; reset_index() exposes the sample id as "index".
F0_mag_div = sample_df.merge(F0_mag_div, left_index=True, right_index=True).reset_index()
F0_uniref_div = sample_df.merge(F0_uniref_div, left_index=True, right_index=True).reset_index()
F0_uniref_genus_div = sample_df.merge(F0_uniref_genus_div, left_index=True, right_index=True).reset_index()


def _long_diversity(div):
    """Return the shannon/observed columns of *div* in long format (one row per sample and metric)."""
    keep = ["index", "shannon", "observed", "Treatment", "Reads_pe"]
    return div.loc[:, keep].melt(
        id_vars=["index", "Treatment", "Reads_pe"], var_name="diversity"
    )


def _diversity_boxplot(source, title):
    """Boxplot of diversity values per treatment, one column per metric with its own y scale."""
    order = ["C", "L", "H"]
    chart = alt.Chart(source, title=title).mark_boxplot().encode(
        x=alt.X("Treatment", sort=order),
        y="value",
        column="diversity",
        color=alt.Color("Treatment", sort=order),
    )
    return chart.resolve_scale(y="independent")


source_mag = _long_diversity(F0_mag_div)
source_uniref = _long_diversity(F0_uniref_div)
source_uniref_genus = _long_diversity(F0_uniref_genus_div)

a = _diversity_boxplot(source_mag, "F0 MAGs")
b = _diversity_boxplot(source_uniref, "F0 UniRef species")
c = _diversity_boxplot(source_uniref_genus, "F0 UniRef genera")
alt.hconcat(a,b,c)

NameError: name 'calc_diversity' is not defined

The plot below shows how the final splsda model separates the samples using abundances of the 91 MAGs used in the model. The right plot shows how those MAGs correlate with the two components.

In [None]:
# Taxonomic rank used to colour the feature-correlation panel.
rank = "order"

# Left panel: samples on the first two components, coloured by treatment.
a = (
    alt.Chart(F0_mag_samples.reset_index(), title="mixOmics sample variates (F0)")
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: only MAGs whose absolute correlation with comp1 or comp2 exceeds 0.5.
b = (
    alt.Chart(
        F0_mag_features.loc[
            F0_mag_features.corr_comp1.abs().gt(0.5)
            | F0_mag_features.corr_comp2.abs().gt(0.5)
        ].reset_index(),
        title="mixOmics MAG correlations (F0)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
        ]
        + ranks,
    )
    .interactive()
)

# Side by side, each panel with its own colour and shape scales.
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

For the F0 generation, the mixOmics model separated the High treatment from Low and Control along the first component. L and C were separated along the second component to some degree. At the family level, Lachnospiraceae (*e.g.* Caccovicinus) and Oscillospiraceae MAGs (*e.g.* Dysosmobacter) were positively correlated with component 1 and component 2, indicating they are more prevalent in Control samples while several Muribaculaceae MAGs seemed to be negatively correlated with these components, indicating they are more prevalent in L or H samples.

Below we plot the loadings for the MAGs with the highest loadings in the model. MAGs with the highest absolute loadings are the ones that have the highest importance.

In [None]:
# Ten MAGs with the largest absolute loading on each component.
fts1 = list(F0_mag_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_mag_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
source1 = F0_mag_features.loc[fts1].reset_index()
source2 = F0_mag_features.loc[fts2].reset_index()

# One bar chart per component; bars are ordered by the signed loading.
a = (
    alt.Chart(source1, title="Loadings MAGs comp1 (F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source1.sort_values("loading_comp1")["index"].tolist()),
        y="loading_comp1",
        color="family",
        tooltip=["index"] + ranks,
    )
)
b = (
    alt.Chart(source2, title="Loadings MAGs comp2 (F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source2.sort_values("loading_comp2")["index"].tolist()),
        y="loading_comp2",
        color="family",
        tooltip=["index"] + ranks,
    )
)
alt.hconcat(a, b).resolve_scale(color="shared")

To sanity check this, we also look at the relative abundance of these MAGs across the samples. The plot below shows both relative abundance in % and CLR (centered log-ratio) transformed values.

In [None]:
# Size of each boxplot facet.
w, h = 300, 200

# Ten MAGs with the largest absolute loading on each component.
fts1 = list(F0_mag_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_mag_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)

# Relative abundance (%) of those MAGs in long format, joined to the sample
# metadata and restricted to F0 samples.
_relab = mag_relab_tax.loc[list(set(fts1 + fts2))].reset_index()
_relab = pd.melt(_relab, id_vars=["Genome"] + ranks, var_name="Sample", value_name="%")
_relab = pd.merge(_relab, sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation == "F0"]

# Same MAGs from the F0 CLR table, long format plus sample metadata.
_clr = mag_clr_F0.loc[list(set(fts1 + fts2))].reset_index()
_clr = pd.melt(_clr, id_vars=["index"] + ranks, var_name="Sample", value_name="CLR")
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the stacked bars: C, then L, then H, sorted by name within each.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _relab.loc[_relab.Treatment == treatment].sort_values("Sample").Sample.unique()
]

a = (
    alt.Chart(_relab)
    .mark_boxplot()
    .encode(
        x="Genome",
        y="%",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=["Genome"] + ranks,
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="index",
        y="CLR",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=["index"] + ranks,
    )
    .properties(width=w, height=h)
)
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        y=alt.Y("sum(%)"),
        x=alt.X("Sample", sort=x_order),
        color="genus",
        tooltip=["Genome"] + ranks,
    )
    .properties(width=w * 3)
)

alt.vconcat(a, b, c)

Below is a heatmap of the median difference between groups for the MAGs with highest loadings above.

In [None]:
# Ten MAGs with the largest absolute loading on each component.
fts1 = list(F0_mag_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_mag_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
# ALDEx2 rows for those MAGs (a MAG present in both lists is selected twice).
source1 = F0_mag_aldex.loc[fts1 + fts2].reset_index()
# Heatmap of diff_btw per feature and contrast on a reversed red-blue scale.
(
    alt.Chart(source1)
    .mark_rect()
    .encode(
        x="contrast",
        y=alt.Y("feature", sort="color"),
        color=alt.Color("diff_btw").scale(scheme="redblue", reverse=True),
        tooltip=["diff_btw", "feature"] + ranks,
    )
)

In [None]:
# Taxonomic rank used to colour the feature-correlation panel.
rank = "order"

# Left panel: samples on the first two components, coloured by treatment.
a = (
    alt.Chart(F0_uniref_samples.reset_index(), title="mixOmics sample variates (F0)")
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: only species whose absolute correlation with comp1 or comp2 exceeds 0.5.
b = (
    alt.Chart(
        F0_uniref_features.loc[
            F0_uniref_features.corr_comp1.abs().gt(0.5)
            | F0_uniref_features.corr_comp2.abs().gt(0.5)
        ].reset_index(),
        title="mixOmics species correlations (F0)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
        ]
        + uniref_ranks,
    )
    .interactive()
)

# Side by side, each panel with its own colour and shape scales.
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

The splsda model for species used 98 species to separate the samples. The separation of C and L groups is better here, as we can see from the first panel of sample variates. Component 1 separates Control from Low and High samples, while component 2 separates L from H. The right panel shows that several species in the Eubacteriales order were negatively correlated with component 1, while three Bacteroidales species correlated positively with component 1.

Below we plot the species with highest absolute loadings in the splsda model for UniRef species abundances.

In [None]:
# Ten species with the largest absolute loading on each component.
fts1 = list(F0_uniref_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_uniref_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
source1 = F0_uniref_features.loc[fts1].reset_index()
source2 = F0_uniref_features.loc[fts2].reset_index()

# One bar chart per component; bars are ordered by the signed loading.
# Only the comp1 panel is given an explicit size.
a = (
    alt.Chart(source1, title="Loadings UniRef species comp1 (F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source1.sort_values("loading_comp1")["index"].tolist()),
        y="loading_comp1",
        color="family",
        tooltip=["index"] + uniref_ranks,
    )
    .properties(width=200, height=200)
)
b = (
    alt.Chart(source2, title="Loadings UniRef species comp2 (F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source2.sort_values("loading_comp2")["index"].tolist()),
        y="loading_comp2",
        color="family",
        tooltip=["index"] + uniref_ranks,
    )
)
alt.hconcat(a, b).resolve_scale(color="shared")

And again a plot of relative abundance and CLR values for these species between groups and across samples.

In [None]:
# Size of each boxplot facet.
w, h = 300, 200

# Ten species with the largest absolute loading on each component, de-duplicated.
fts1 = list(F0_uniref_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_uniref_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
unirefs = list(set(fts1 + fts2))
# Column used to colour all three panels.
color = "family"

# Relative abundance (%) of those species in long format, joined to the sample
# metadata and restricted to F0 samples.
_relab = pd.melt(uniref_relab_tax.loc[unirefs], id_vars=uniref_ranks, var_name="Sample", value_name="%")
_relab = pd.merge(_relab, sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation == "F0"]

# Same species from the F0 CLR table, long format plus sample metadata.
_clr = pd.melt(uniref_clr_F0.loc[list(set(fts1 + fts2))], id_vars=uniref_ranks, var_name="Sample", value_name="CLR")
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the stacked bars: C, then L, then H, sorted by name within each.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _relab.loc[_relab.Treatment == treatment].sort_values("Sample").Sample.unique()
]

a = (
    alt.Chart(_relab)
    .mark_boxplot()
    .encode(
        x="species",
        y="%",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        color=color,
        tooltip=uniref_ranks + ["Sample", "%"],
    )
    .resolve_scale(y="independent")
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="species",
        y="CLR",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        color=color,
        tooltip=uniref_ranks + ["Sample", "CLR"],
    )
    .resolve_scale(y="independent")
    .properties(width=w, height=h)
)
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        y=alt.Y("sum(%)"),
        x=alt.X("Sample", sort=x_order),
        color=color,
        tooltip=["Sample", "%"] + uniref_ranks,
    )
    .properties(width=w * 3)
)
alt.vconcat(a, b, c).resolve_scale(color="shared")

The heatmap below shows median difference of CLR values between groups for the species with highest loadings.

In [None]:
# Ten species with the largest absolute loading on each component.
fts1 = list(F0_uniref_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_uniref_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
# ALDEx2 rows for those species (one present in both lists is selected twice).
source = F0_uniref_aldex.loc[fts1 + fts2].reset_index()
# Heatmap of diff_btw per feature and contrast on a reversed red-blue scale.
(
    alt.Chart(source)
    .mark_rect()
    .encode(
        x="contrast",
        y=alt.Y("feature", sort="color"),
        color=alt.Color("diff_btw").scale(scheme="redblue", reverse=True),
        tooltip=["diff_btw", "feature"] + uniref_ranks,
    )
)

Below is a plot of a model using taxonomic abundances summed to the genus level.

In [None]:
# Taxonomic rank used to colour the feature-correlation panel.
rank = "order"

# Left panel: samples on the first two components, coloured by treatment.
a = (
    alt.Chart(
        F0_uniref_genus_samples.reset_index(),
        title="mixOmics sample variates (UniRef genera; F0)",
    )
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: only genera whose absolute correlation with comp1 or comp2 exceeds 0.5.
b = (
    alt.Chart(
        F0_uniref_genus_features.loc[
            F0_uniref_genus_features.corr_comp1.abs().gt(0.5)
            | F0_uniref_genus_features.corr_comp2.abs().gt(0.5)
        ].reset_index(),
        title="mixOmics feature correlations (UniRef genera; F0)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
        ]
        + uniref_genus_ranks,
    )
    .interactive()
)

# Side by side, each panel with its own colour and shape scales.
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

Below are the top UniRef genera from the splsda model using taxonomic abundances summed to genus level.

In [None]:
# Ten genera with the largest absolute loading on each component.
fts1 = list(F0_uniref_genus_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_uniref_genus_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
source1 = F0_uniref_genus_features.loc[fts1].reset_index()
source2 = F0_uniref_genus_features.loc[fts2].reset_index()
# Colour each bar by the feature itself.
color = "index"

# One bar chart per component; bars are ordered by the signed loading.
a = (
    alt.Chart(source1, title="Loadings comp1 (UniRef genera; F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source1.sort_values("loading_comp1")["index"].tolist()),
        y="loading_comp1",
        color=color,
        tooltip=["index"] + uniref_genus_ranks,
    )
    .properties(width=200, height=200)
)
b = (
    alt.Chart(source2, title="Loadings comp2 (UniRef genera; F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source2.sort_values("loading_comp2")["index"].tolist()),
        y="loading_comp2",
        color=color,
        tooltip=["index"] + uniref_genus_ranks,
    )
    .properties(width=200, height=200)
)
alt.hconcat(a, b).resolve_scale(color="shared")

Interestingly, the Adlercreutzia genus contains species which seem to have undergone a name change from Enterorhabdus. This is also reflected above in MAG101, which is classified as *Adlercreutzia mucosicola* and has *Enterorhabdus mucosicola* as a basionym.

Below is a plot of the distribution of genera that have the highest loadings in the mixOmics model. The CLR values are centered log-ratio transformed abundances.

In [None]:
# Size of each boxplot facet.
w, h = 250, 100

# Ten genera with the largest absolute loading on each component, de-duplicated.
fts1 = list(F0_uniref_genus_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_uniref_genus_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
genera = list(set(fts1 + fts2))
# Column used to colour all three panels.
color = "genus"

# Relative abundance (%) of those genera in long format, joined to the sample
# metadata and restricted to F0 samples.
_relab = uniref_genus_relab_tax.loc[genera].reset_index()
_relab = pd.melt(_relab, id_vars=["genus"] + uniref_genus_ranks, var_name="Sample", value_name="%")
_relab = pd.merge(_relab, sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation == "F0"]

# Same genera from the F0 CLR table, long format plus sample metadata.
_clr = uniref_genus_clr_F0.loc[genera].reset_index()
_clr = pd.melt(_clr, id_vars=["genus"] + uniref_genus_ranks, var_name="Sample", value_name="CLR")
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the stacked bars: C, then L, then H, sorted by name within each.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _relab.loc[_relab.Treatment == treatment].sort_values("Sample").Sample.unique()
]

a = (
    alt.Chart(_relab)
    .mark_boxplot()
    .encode(
        x="genus",
        y="%",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        color=color,
        tooltip=uniref_genus_ranks + ["Sample", "%"],
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="genus",
        y="CLR",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        color=color,
        tooltip=uniref_genus_ranks + ["Sample", "CLR"],
    )
    .properties(width=w, height=h)
)
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        y=alt.Y("sum(%)"),
        x=alt.X("Sample", sort=x_order),
        color=color,
        tooltip=["Sample", "%"] + uniref_genus_ranks + ["genus"],
    )
    .properties(width=w * 3)
)
alt.vconcat(a, b, c).resolve_scale(color="shared")

Below is a heatmap of median difference of CLR transformed values between groups for the genera with highest loadings.

In [None]:
# Ten genera with the largest absolute loading on each component.
fts1 = list(F0_uniref_genus_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F0_uniref_genus_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
# ALDEx2 rows for those genera (one present in both lists is selected twice).
source = F0_uniref_genus_aldex.loc[fts1 + fts2].reset_index()
# Heatmap of diff_btw per feature and contrast on a reversed red-blue scale.
(
    alt.Chart(source)
    .mark_rect()
    .encode(
        x="contrast",
        y=alt.Y("feature", sort="color"),
        color=alt.Color("diff_btw").scale(scheme="redblue", reverse=True),
        tooltip=["diff_btw", "feature"] + uniref_genus_ranks,
    )
)

In [None]:
# Number of rows flagged isDE per DESeq2 contrast, smallest count first.
(
    F0_mag_deseq.loc[F0_mag_deseq["isDE"].eq(True)]
    .groupby(["contrast"])
    .size()
    .sort_values()
)

In the F0 generation DESeq2 identified 1 and 4 MAGs that had significantly different abundances in L and H treatments vs control, respectively.

Below are so-called volcano plots of the different comparisons. Each MAG is a point with the magnitude of change on the x-axis and the adjusted p-value on the y-axis. The MAGs with the highest difference are to the left and right, and the most significant ones are at the top.

In [None]:
# Restrict the DESeq2 table to the four contrasts shown as facets.
_ = F0_mag_deseq.loc[
    F0_mag_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
]
# Volcano plot: fold change on x, adjusted p-value on y (axis sorted descending),
# one facet per contrast.
alt.Chart(_, title="DESeq2 (MAGs; F0)").mark_point().encode(
    x=alt.X("log2FoldChange"),
    y=alt.Y("padj", sort="descending"),
    color=alt.Color("isDE"),
    column=alt.Column("contrast", sort=["LvsC", "HvsC", "HvsCL", "HvsL"]),
    tooltip=["feature"] + ranks,
).properties(width=300, height=200)

Below we plot out the relative abundance and CLR values of MAGs identified as differentially abundant by DESeq2

In [None]:
# Facet size (px) for the box plots; the bar chart underneath is 7.5x as wide.
w = 100
h = 200
# MAGs flagged as differentially abundant by DESeq2 in at least one contrast.
mags = F0_mag_deseq.loc[F0_mag_deseq.isDE == True].feature.unique()

# Long-format relative abundances joined with sample metadata, F0 samples only.
_ = (
    mag_relab_tax.loc[mags]
    .reset_index()
    .melt(id_vars=["Genome"] + ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
)
_ = _.loc[_.Generation == "F0"]

# Same reshaping for the CLR-transformed values.
_clr = (
    mag_clr_F0.loc[mags]
    .reset_index()
    .melt(id_vars=["index"] + ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: sorted within treatment, treatments as C, L, H.
x_order = [
    sample
    for treatment in ("C", "L", "H")
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Box plots of relative abundance per MAG, faceted by treatment.
a = alt.Chart(_).mark_boxplot().encode(
    x="Genome",
    y="%",
    color="genus",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["Genome"] + ranks,
).properties(width=w, height=h)
# Box plots of CLR values per MAG, faceted by treatment.
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index",
    y="CLR",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["index"] + ranks,
    color="genus",
).properties(width=w, height=h)

# Stacked bars of summed relative abundance per sample.
c = alt.Chart(_).mark_bar().encode(
    y=alt.Y("sum(%)"),
    x=alt.X("Sample", sort=x_order),
    color="genus",
    tooltip=["Genome"] + ranks,
).properties(width=w * 7.5, height=h)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

In [None]:
# DESeq2 rows flagged as differentially abundant.
source = F0_mag_deseq.loc[F0_mag_deseq.isDE == True]
# One log2FoldChange heatmap per contrast (LvsC, then HvsC), built from the same template.
a1, a2 = (
    alt.Chart(source.loc[source.contrast == contrast]).mark_rect().encode(
        x="contrast",
        y=alt.Y("feature", sort="color"),
        color=alt.Color("log2FoldChange").scale(scheme="redblue", reverse=True),
        tooltip=ranks + ["log2FoldChange"],
    )
    for contrast in ("LvsC", "HvsC")
)
# Each panel keeps its own colour scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

MAGs belonging to Schaedlerella and Bacteroides were more abundant in the High treatment, while the opposite was true for Caccovicinus, Bifidobacterium and Dysosmobacter MAGs.

In [None]:
# Number of UniRef species flagged as differentially abundant (isDE) per contrast, ascending.
(
    F0_uniref_deseq.loc[F0_uniref_deseq.isDE == True]
    .groupby(["contrast"])
    .size()
    .sort_values()
)

In the UniRef data DESeq2 identified 3 and 4 species with significantly different abundances in the L and H treatments. Below is again a volcano plot of the results.

In [None]:
# Restrict the DESeq2 table to the four contrasts shown as facets.
_ = F0_uniref_deseq.loc[
    F0_uniref_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
]
# Volcano plot: fold change on x, adjusted p-value on y (axis sorted descending).
alt.Chart(_).mark_point().encode(
    x=alt.X("log2FoldChange"),
    y=alt.Y("padj", sort="descending"),
    color=alt.Color("isDE"),
    column=alt.Column("contrast", sort=["LvsC", "HvsC", "HvsCL", "HvsL"]),
    tooltip=["feature", "log2FoldChange"] + uniref_ranks,
).properties(width=200, height=200)

Below we plot out the relative abundance and CLR transformed values for the differentially abundant species.

In [None]:
# Facet size (px) for the box plots; the bar chart underneath is 7.5x as wide.
w = 100
h = 150
# Species flagged as differentially abundant by DESeq2 in at least one contrast.
unirefs = F0_uniref_deseq.loc[F0_uniref_deseq.isDE == True].feature.unique()

# Long-format relative abundances joined with sample metadata, F0 samples only.
# The feature index is not carried into the melted frame (only the rank columns are).
_ = (
    uniref_relab_tax.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
)
_ = _.loc[_.Generation == "F0"]

# Same reshaping for the CLR-transformed values.
_clr = (
    uniref_clr_F0.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: sorted within treatment, treatments as C, L, H.
x_order = [
    sample
    for treatment in ("C", "L", "H")
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]
# Box plots of relative abundance per species, faceted by treatment.
a = alt.Chart(_).mark_boxplot().encode(
    x="species",
    y="%",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    color="genus",
    tooltip=uniref_ranks + ["Sample", "%"],
).properties(width=w, height=h)

# Box plots of CLR values per species, faceted by treatment.
b = alt.Chart(_clr).mark_boxplot().encode(
    x="species",
    y="CLR",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    color="genus",
    tooltip=uniref_ranks + ["Sample", "CLR"],
).properties(width=w, height=h)

# Stacked bars of summed relative abundance per sample.
c = alt.Chart(_).mark_bar().encode(
    y=alt.Y("sum(%)"),
    x=alt.X("Sample", sort=x_order),
    color="genus",
    tooltip=["Sample", "%"] + uniref_ranks,
).properties(width=w * 7.5, height=h)
alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

And below is a heatmap of log2FoldChange between the comparisons for these species.

In [None]:
# DESeq2 rows flagged as differentially abundant, with the index moved to a column.
source = F0_uniref_deseq.loc[F0_uniref_deseq.isDE == True].reset_index()

# One log2FoldChange heatmap per contrast (LvsC, then HvsC), built from the same template.
a1, a2 = (
    alt.Chart(source.loc[source.contrast == contrast]).mark_rect().encode(
        x="contrast",
        y=alt.Y("feature", sort="color"),
        color=alt.Color("log2FoldChange").scale(scheme="redblue", reverse=True),
        tooltip=["log2FoldChange", "feature"] + uniref_ranks,
    )
    for contrast in ("LvsC", "HvsC")
)
# Each panel keeps its own colour scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

In [None]:
# Number of UniRef genera flagged as differentially abundant (isDE) per contrast, ascending.
(
    F0_uniref_genus_deseq.loc[F0_uniref_genus_deseq.isDE == True]
    .groupby(["contrast"])
    .size()
    .sort_values()
)

When summing the UniRef abundances to genus level DESeq2 did not identify any genera as differentially abundant.

In [None]:
# Restrict the DESeq2 table to the four contrasts shown as facets.
_ = F0_uniref_genus_deseq.loc[
    F0_uniref_genus_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
]
# Volcano plot: fold change on x, adjusted p-value on y (axis sorted descending).
alt.Chart(_).mark_point().encode(
    x=alt.X("log2FoldChange"),
    y=alt.Y("padj", sort="descending"),
    color=alt.Color("isDE"),
    column=alt.Column("contrast", sort=["LvsC", "HvsC", "HvsCL", "HvsL"]),
    tooltip=["feature"] + uniref_genus_ranks,
).properties(width=200, height=200)

ALDEx2 did not identify any MAGs with significant (at adjusted p-value <0.05) differences in abundance between treatments in the F0 generation. However, the ALDEx2 developers suggest focusing not only on p-values but also on the `effect` (median effect size, defined as the median difference between conditions divided by the maximum difference within conditions). Below are volcano-style plots of adjusted p-values vs. `diff_btw` (median difference in CLR values between conditions) and `effect`. An effect value > 1 or < -1 could mark significant changes.

In [None]:
# Facet size (px) shared by all panels.
w = 200
h = 200

# For each feature type, draw two volcano-style panels from the same template:
# wi_eBH (axis sorted descending) against diff_btw and against effect.

# MAGs
a_1, a_2 = (
    alt.Chart(F0_mag_aldex.reset_index(), title="ALDEx2 (MAGs)").mark_circle().encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        tooltip=["feature", "diff_btw", "effect", "wi_eBH"] + ranks,
        color="isDE",
    ).properties(width=w, height=h)
    for x_field in ("diff_btw", "effect")
)
a = alt.hconcat(a_1, a_2)

# UniRef species
b_1, b_2 = (
    alt.Chart(F0_uniref_aldex.reset_index(), title="ALDEx2 (UniRef species)").mark_circle().encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        tooltip=["feature", "diff_btw", "effect", "wi_eBH"] + uniref_ranks,
        color="isDE",
    ).properties(width=w, height=h)
    for x_field in ("diff_btw", "effect")
)
b = alt.hconcat(b_1, b_2)

# UniRef genera
c_1, c_2 = (
    alt.Chart(F0_uniref_genus_aldex.reset_index(), title="ALDEx2 (UniRef genera)").mark_circle().encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        tooltip=["feature", "diff_btw", "effect", "wi_eBH"] + uniref_genus_ranks,
        color="isDE",
    ).properties(width=w, height=h)
    for x_field in ("diff_btw", "effect")
)
c = alt.hconcat(c_1, c_2)
# One row per feature type.
alt.vconcat(a, b, c)

In [None]:
# Facet size (px) for the box plots; the bar chart underneath is 7x as wide.
h = 100
w = 100
# Species flagged by ALDEx2 (isDE) in at least one contrast; features are the index here.
unirefs = F0_uniref_aldex.loc[F0_uniref_aldex.isDE == True].index.unique()

# Long-format relative abundances joined with sample metadata, F0 samples only.
_ = (
    uniref_relab_tax.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
)
_ = _.loc[_.Generation == "F0"]

# Same reshaping for the CLR-transformed values.
_clr = (
    uniref_clr_F0.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: sorted within treatment, treatments as C, L, H.
x_order = [
    sample
    for treatment in ("C", "L", "H")
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]
# Box plots of relative abundance per species, faceted by treatment.
a = alt.Chart(_).mark_boxplot().encode(
    x="species",
    y="%",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    color="species",
    tooltip=uniref_ranks + ["Sample", "%"],
).properties(width=w, height=h)

# Box plots of CLR values per species, faceted by treatment.
b = alt.Chart(_clr).mark_boxplot().encode(
    x="species",
    y="CLR",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    color="species",
    tooltip=uniref_ranks + ["Sample", "CLR"],
).properties(width=w, height=h)

# Stacked bars of summed relative abundance per sample.
c = alt.Chart(_).mark_bar().encode(
    y=alt.Y("sum(%)"),
    x=alt.X("Sample", sort=x_order),
    color="species",
    tooltip=["Sample", "%"] + uniref_ranks,
).properties(width=w * 7, height=h)
alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

Below are heatmaps of median CLR values for MAGs and species identified by at least 2 of the 3 tools.

In [None]:
# Call ident_fts once per contrast (LvsC, then HvsC) for each feature type.
# mixomics_top is set very high, presumably so that no mixOmics feature is cut off
# by rank -- TODO confirm against ident_fts.

# MAGs (y-axis labelled by species)
a1, a2 = (
    ident_fts(
        mixomics=F0_mag_features,
        deseq=F0_mag_deseq,
        aldex=F0_mag_aldex,
        contrast=contrast,
        tooltip=["feature"] + ranks + ["diff_btw"],
        mixomics_top=1000000,
        y="species",
    )
    for contrast in ("LvsC", "HvsC")
)
# UniRef species
b1, b2 = (
    ident_fts(
        mixomics=F0_uniref_features,
        deseq=F0_uniref_deseq,
        aldex=F0_uniref_aldex,
        contrast=contrast,
        tooltip=["feature"] + uniref_ranks + ["diff_btw"],
        mixomics_top=1000000,
    )
    for contrast in ("LvsC", "HvsC")
)
# Contrasts side by side, feature types stacked, one shared colour scale throughout.
a = alt.hconcat(a1, a2).resolve_scale(color="shared")
b = alt.hconcat(b1, b2).resolve_scale(color="shared")
alt.vconcat(a, b).resolve_scale(color="shared")

#### F1 generation

In [None]:
# Load all F1-generation MAG result tables (mixOmics sPLS-DA, DESeq2, ALDEx2),
# attach sample metadata / MAG taxonomy and write the combined tables to disk.
ft = "mag"
generation = "F1"

# --- MixOmics ---------------------------------------------------------------
# Sample table joined with sample metadata; feature table joined with taxonomy.
F1_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F1_mag_samples = pd.merge(F1_mag_samples, sample_df, left_index=True, right_index=True)
F1_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F1_mag_features = pd.merge(F1_mag_features, mag_tax, left_index=True, right_index=True)

# Second model, for which treatments C and L are pooled into "C+L" (Treatment_2).
# NOTE(review): this cell reads the `_3` files while the UniRef cells read `_2` -- confirm this is intended.
F1_2_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_3.samples.tsv", sep="\t", index_col=0)
F1_2_mag_samples = pd.merge(F1_2_mag_samples, sample_df, left_index=True, right_index=True)
F1_2_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_3.features.tsv", header=0, sep="\t", index_col=0)
F1_2_mag_features = pd.merge(F1_2_mag_features, mag_tax, left_index=True, right_index=True)
F1_2_mag_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F1_2_mag_samples.Treatment]

# --- DESeq2 -----------------------------------------------------------------
# Example file name: res.mag.F1_HvsF1_C.tsv
F1_mag_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_C.tsv", sep="\t", index_col=0)
F1_mag_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF1_C.tsv", sep="\t", index_col=0)
F1_mag_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_L.tsv", sep="\t", index_col=0)
F1_mag_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# melt + groupby().first() collapses each table to one row per feature holding
# the id_vars columns; the leftover "variable"/"value" columns are dropped.
# The contrast label is assigned as a scalar so it broadcasts to the table's own
# length (previously the row count of a *different* table was used for
# source3/source4, which raises ValueError when the tables differ in size).
source1 = pd.melt(F1_mag_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F1_mag_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F1_mag_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F1_mag_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
F1_mag_deseq = pd.concat(
    [
        source1.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source2.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source3.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source4.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
    ]
)
F1_mag_deseq = pd.merge(F1_mag_deseq, mag_tax, left_on="feature", right_index=True)


# --- ALDEx2 (glm) -----------------------------------------------------------
F1_mag_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F1_mag_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
# Per treatment term: effect columns joined with the Holm-corrected p-value column.
source1 = pd.merge(F1_mag_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F1_mag_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw", "effect", "padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F1_mag_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F1_mag_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw", "effect", "padj"]
# Scalar label again: the old code sized this list from source1 instead of source2.
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw", "effect", "wi.eBH"]]
source3.columns = ["diff_btw", "effect", "padj"]
source3["contrast"] = "HvsCL"
# This intermediate F1_mag_aldex only feeds F1_mag_aldex_glm; it is replaced below.
F1_mag_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F1_mag_aldex_glm = pd.merge(F1_mag_aldex, mag_tax, left_index=True, right_index=True)

# --- ALDEx2 (pairwise HC / LC tables) ---------------------------------------
F1_mag_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F1_mag_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F1_mag_aldex_HC["contrast"] = "HvsC"
# Replace "." with "_" in column names so columns can be used as attributes (e.g. wi_eBH).
F1_mag_aldex_HC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F1_mag_aldex_LC["contrast"] = "LvsC"
F1_mag_aldex_LC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F1_mag_aldex = pd.concat([F1_mag_aldex_HC, F1_mag_aldex_LC])
F1_mag_aldex = pd.merge(F1_mag_aldex, mag_tax, left_index=True, right_index=True)

# Flag a feature when |effect| > 1 or wi_eBH < 0.1.
F1_mag_aldex["isDE"] = False
F1_mag_aldex.loc[(abs(F1_mag_aldex.effect) > 1) | (F1_mag_aldex.wi_eBH < 0.1), "isDE"] = True

# --- Export -----------------------------------------------------------------
F1_mag_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F1_mag_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F1_mag_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F1_mag_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

In [None]:
# Load all F1-generation UniRef species result tables (mixOmics sPLS-DA, DESeq2,
# ALDEx2), attach sample metadata / taxonomy and write the combined tables to disk.
ft = "uniref"
generation = "F1"

# --- MixOmics ---------------------------------------------------------------
# Sample table joined with sample metadata; feature table joined with taxonomy.
F1_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F1_uniref_samples = pd.merge(F1_uniref_samples, sample_df, left_index=True, right_index=True)
F1_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F1_uniref_features = pd.merge(F1_uniref_features, uniref_tax, left_index=True, right_index=True)
# Name the index so that reset_index() yields a column called "index".
F1_uniref_features.index.name = "index"

# Second model, for which treatments C and L are pooled into "C+L" (Treatment_2).
F1_2_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F1_2_uniref_samples = pd.merge(F1_2_uniref_samples, sample_df, left_index=True, right_index=True)
F1_2_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F1_2_uniref_features = pd.merge(F1_2_uniref_features, uniref_tax, left_index=True, right_index=True)
F1_2_uniref_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F1_2_uniref_samples.Treatment]
F1_2_uniref_features.index.name = "index"

# --- DESeq2 -----------------------------------------------------------------
# Example file name: res.uniref.F1_HvsF1_C.tsv
F1_uniref_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_C.tsv", sep="\t", index_col=0)
F1_uniref_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF1_C.tsv", sep="\t", index_col=0)
F1_uniref_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_L.tsv", sep="\t", index_col=0)
F1_uniref_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# melt + groupby().first() collapses each table to one row per feature holding
# the id_vars columns; the leftover "variable"/"value" columns are dropped.
# The contrast label is assigned as a scalar so it broadcasts to the table's own
# length (previously the row count of a *different* table was used for
# source3/source4, which raises ValueError when the tables differ in size).
source1 = pd.melt(F1_uniref_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F1_uniref_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F1_uniref_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F1_uniref_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
F1_uniref_deseq = pd.concat(
    [
        source1.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source2.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source3.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source4.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
    ]
)
F1_uniref_deseq = pd.merge(F1_uniref_deseq, uniref_tax, left_on="feature", right_index=True)


# --- ALDEx2 (glm) -----------------------------------------------------------
F1_uniref_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F1_uniref_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
# Per treatment term: effect columns joined with the Holm-corrected p-value column.
source1 = pd.merge(F1_uniref_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F1_uniref_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw", "effect", "padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F1_uniref_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F1_uniref_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw", "effect", "padj"]
# Scalar label again: the old code sized this list from source1 instead of source2.
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw", "effect", "wi.eBH"]]
source3.columns = ["diff_btw", "effect", "padj"]
source3["contrast"] = "HvsCL"
# This intermediate F1_uniref_aldex only feeds F1_uniref_aldex_glm; it is replaced below.
F1_uniref_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F1_uniref_aldex_glm = pd.merge(F1_uniref_aldex, uniref_tax, left_index=True, right_index=True)

# --- ALDEx2 (pairwise HC / LC tables) ---------------------------------------
F1_uniref_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F1_uniref_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F1_uniref_aldex_HC["contrast"] = "HvsC"
# Replace "." with "_" in column names so columns can be used as attributes (e.g. wi_eBH).
F1_uniref_aldex_HC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F1_uniref_aldex_LC["contrast"] = "LvsC"
F1_uniref_aldex_LC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F1_uniref_aldex = pd.concat([F1_uniref_aldex_HC, F1_uniref_aldex_LC])
F1_uniref_aldex = pd.merge(F1_uniref_aldex, uniref_tax, left_index=True, right_index=True)

# Flag a feature when |effect| > 1 or wi_eBH < 0.1.
F1_uniref_aldex["isDE"] = False
F1_uniref_aldex.loc[(abs(F1_uniref_aldex.effect) > 1) | (F1_uniref_aldex.wi_eBH < 0.1), "isDE"] = True

# --- Export -----------------------------------------------------------------
F1_uniref_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F1_uniref_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F1_uniref_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F1_uniref_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

In [None]:
# Load all F1-generation UniRef genus-level result tables (mixOmics sPLS-DA,
# DESeq2, ALDEx2), attach sample metadata / taxonomy and write the combined
# tables to disk.

# Genus-level taxonomy: index by genus, drop the species column and keep the
# first row encountered for each genus.
uniref_genus_tax = uniref_tax.set_index("genus").drop("species", axis=1).groupby(level=0).first()
uniref_genus_ranks = ["superkingdom", "phylum", "class", "order", "family"]
ft = "uniref.genus"
generation = "F1"

# --- MixOmics ---------------------------------------------------------------
# Sample table joined with sample metadata; feature table joined with taxonomy.
F1_uniref_genus_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F1_uniref_genus_samples = pd.merge(F1_uniref_genus_samples, sample_df, left_index=True, right_index=True)
F1_uniref_genus_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F1_uniref_genus_features = pd.merge(F1_uniref_genus_features, uniref_genus_tax, left_index=True, right_index=True)
# Name the index so that reset_index() yields a column called "index".
F1_uniref_genus_features.index.name = "index"

# Second model, for which treatments C and L are pooled into "C+L" (Treatment_2).
F1_2_uniref_genus_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F1_2_uniref_genus_samples = pd.merge(F1_2_uniref_genus_samples, sample_df, left_index=True, right_index=True)
F1_2_uniref_genus_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F1_2_uniref_genus_features = pd.merge(F1_2_uniref_genus_features, uniref_genus_tax, left_index=True, right_index=True)
F1_2_uniref_genus_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F1_2_uniref_genus_samples.Treatment]
F1_2_uniref_genus_features.index.name = "index"

# --- DESeq2 -----------------------------------------------------------------
# Example file name: res.uniref.genus.F1_HvsF1_C.tsv
F1_uniref_genus_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_C.tsv", sep="\t", index_col=0)
F1_uniref_genus_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF1_C.tsv", sep="\t", index_col=0)
F1_uniref_genus_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_L.tsv", sep="\t", index_col=0)
F1_uniref_genus_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# melt + groupby().first() collapses each table to one row per feature holding
# the id_vars columns; the leftover "variable"/"value" columns are dropped.
# The contrast label is assigned as a scalar so it broadcasts to the table's own
# length (previously the row count of a *different* table was used for
# source3/source4, which raises ValueError when the tables differ in size).
source1 = pd.melt(F1_uniref_genus_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F1_uniref_genus_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F1_uniref_genus_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F1_uniref_genus_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
F1_uniref_genus_deseq = pd.concat(
    [
        source1.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source2.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source3.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source4.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
    ]
)
F1_uniref_genus_deseq = pd.merge(F1_uniref_genus_deseq, uniref_genus_tax, left_on="feature", right_index=True)


# --- ALDEx2 (glm) -----------------------------------------------------------
F1_uniref_genus_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F1_uniref_genus_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
# Per treatment term: effect columns joined with the Holm-corrected p-value column.
source1 = pd.merge(F1_uniref_genus_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F1_uniref_genus_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw", "effect", "padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F1_uniref_genus_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F1_uniref_genus_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw", "effect", "padj"]
# Scalar label again: the old code sized this list from source1 instead of source2.
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw", "effect", "wi.eBH"]]
source3.columns = ["diff_btw", "effect", "padj"]
source3["contrast"] = "HvsCL"
# This intermediate F1_uniref_genus_aldex only feeds F1_uniref_genus_aldex_glm; it is replaced below.
F1_uniref_genus_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F1_uniref_genus_aldex_glm = pd.merge(F1_uniref_genus_aldex, uniref_genus_tax, left_index=True, right_index=True)

# --- ALDEx2 (pairwise HC / LC tables) ---------------------------------------
F1_uniref_genus_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F1_uniref_genus_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F1_uniref_genus_aldex_HC["contrast"] = "HvsC"
# Replace "." with "_" in column names so columns can be used as attributes (e.g. wi_eBH).
F1_uniref_genus_aldex_HC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F1_uniref_genus_aldex_LC["contrast"] = "LvsC"
F1_uniref_genus_aldex_LC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F1_uniref_genus_aldex = pd.concat([F1_uniref_genus_aldex_HC, F1_uniref_genus_aldex_LC])
F1_uniref_genus_aldex = pd.merge(F1_uniref_genus_aldex, uniref_genus_tax, left_index=True, right_index=True)

# Flag a feature when |effect| > 1 or wi_eBH < 0.1.
F1_uniref_genus_aldex["isDE"] = False
F1_uniref_genus_aldex.loc[(abs(F1_uniref_genus_aldex.effect) > 1) | (F1_uniref_genus_aldex.wi_eBH < 0.1), "isDE"] = True

# --- Export -----------------------------------------------------------------
F1_uniref_genus_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F1_uniref_genus_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F1_uniref_genus_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F1_uniref_genus_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
# Diversity tables for the three F1 feature tables.
F1_mag_div = calc_diversity(mag_cov_F1)
F1_uniref_div = calc_diversity(uniref_cov_F1)
F1_uniref_genus_div = calc_diversity(uniref_genus_cov_F1)

# Join each with the sample metadata on the index; reset_index() moves the
# sample ids into a regular column.
F1_mag_div = sample_df.merge(F1_mag_div, left_index=True, right_index=True).reset_index()
F1_uniref_div = sample_df.merge(F1_uniref_div, left_index=True, right_index=True).reset_index()
F1_uniref_genus_div = sample_df.merge(F1_uniref_genus_div, left_index=True, right_index=True).reset_index()

# Long format: one row per (sample, diversity metric).
source_mag = F1_mag_div.loc[
    :, ["index", "shannon", "observed", "Treatment", "Reads_pe"]
].melt(id_vars=["index", "Treatment", "Reads_pe"], var_name="diversity")
source_uniref = F1_uniref_div.loc[
    :, ["index", "shannon", "observed", "Treatment", "Reads_pe"]
].melt(id_vars=["index", "Treatment", "Reads_pe"], var_name="diversity")
source_uniref_genus = F1_uniref_genus_div.loc[
    :, ["index", "shannon", "observed", "Treatment", "Reads_pe"]
].melt(id_vars=["index", "Treatment", "Reads_pe"], var_name="diversity")

# One box plot per feature type, faceted by diversity metric with independent y-scales.
a, b, c = (
    alt.Chart(data, title=title).mark_boxplot().encode(
        x=alt.X("Treatment", sort=["C", "L", "H"]),
        y="value",
        column="diversity",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
    ).resolve_scale(y="independent")
    for data, title in (
        (source_mag, "F1 MAGs"),
        (source_uniref, "F1 UniRef species"),
        (source_uniref_genus, "F1 UniRef genera"),
    )
)
alt.hconcat(a, b, c)

In [None]:
# Taxonomic rank used to colour the feature panel.
rank = "order"
# Left panel: samples plotted on comp1 vs comp2, coloured by treatment.
a = alt.Chart(
    F1_mag_samples.reset_index(),
    title="mixOmics sample variates (F1)",
).mark_circle(size=120).encode(
    x="comp1",
    y="comp2",
    color=alt.Color("Treatment", sort=["C", "L", "H"]),
    tooltip=["Sample", "Generation", "Treatment"],
).interactive()

# Right panel: MAGs with an absolute corr_comp1 or corr_comp2 above 0.5.
b = alt.Chart(
    F1_mag_features.loc[
        F1_mag_features.corr_comp1.abs().gt(0.5)
        | F1_mag_features.corr_comp2.abs().gt(0.5)
    ].reset_index(),
    title="mixOmics MAG correlations (F1)",
).mark_circle(size=60).encode(
    x="corr_comp1",
    y="corr_comp2",
    color=rank,
    tooltip=[
        "index",
        "loading_comp1",
        "loading_comp2",
        "stability_comp1",
        "stability_comp2",
        "corr_comp1",
        "corr_comp2",
    ] + ranks,
).interactive()


# The two panels encode different variables, so colour/shape scales stay independent.
alt.hconcat(a, b).resolve_scale(
    color="independent",
    shape="independent",
)

For the F1 generation, the mixOmics model separated the High treatment from Low and Control along the first component. L and C were separated along the second component to some degree. However, there was a high error rate in the model compared to when analyzing the generations with all samples.

Below we plot the loadings for the MAGs with the highest loadings in the model. MAGs with the highest absolute loadings are the ones that have the highest importance.

In [None]:
# Ten MAGs with the largest absolute loading on each of the two components.
fts1 = list(
    F1_mag_features.loading_comp1.abs()
    .sort_values(ascending=False)
    .head(10)
    .index
)
fts2 = list(
    F1_mag_features.loading_comp2.abs()
    .sort_values(ascending=False)
    .head(10)
    .index
)
source1 = F1_mag_features.loc[fts1].reset_index()
source2 = F1_mag_features.loc[fts2].reset_index()
# Bars are ordered along x by the (signed) loading on the plotted component.
a = alt.Chart(source1, title="Loadings MAGs comp1 (F1)").mark_bar().encode(
    x=alt.X("index", sort=list(source1.sort_values("loading_comp1")["index"])),
    y="loading_comp1",
    color="family",
    tooltip=["index"] + ranks,
)
b = alt.Chart(source2, title="Loadings MAGs comp2 (F1)").mark_bar().encode(
    x=alt.X("index", sort=list(source2.sort_values("loading_comp2")["index"])),
    y="loading_comp2",
    color="family",
    tooltip=["index"] + ranks,
)
alt.hconcat(a, b).resolve_scale(
    color="shared",
)

To sanity check this, we also look at the relative abundance of these MAGs across the samples.

In [None]:
# Facet size (px) for the box plots; the bar chart underneath is 3x as wide.
w = 200
h = 100
# Ten MAGs with the largest absolute loading on each of the two components.
fts1 = list(
    F1_mag_features.loading_comp1.abs()
    .sort_values(ascending=False)
    .head(10)
    .index
)
fts2 = list(
    F1_mag_features.loading_comp2.abs()
    .sort_values(ascending=False)
    .head(10)
    .index
)
# Long-format relative abundances for the de-duplicated union of both lists,
# joined with sample metadata and restricted to F1 samples.
_relab = (
    mag_relab_tax.loc[list(set(fts1 + fts2))]
    .reset_index()
    .melt(id_vars=["Genome"] + ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
)
_relab = _relab.loc[_relab.Generation == "F1"]
# Same reshaping for the CLR-transformed values.
_clr = (
    mag_clr_F1.loc[list(set(fts1 + fts2))]
    .reset_index()
    .melt(id_vars=["index"] + ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)
# Sample order for the bar chart: sorted within treatment, treatments as C, L, H.
x_order = [
    sample
    for treatment in ("C", "L", "H")
    for sample in _relab.loc[_relab.Treatment == treatment].sort_values("Sample").Sample.unique()
]
# Box plots of relative abundance per MAG, faceted by treatment.
a = alt.Chart(_relab).mark_boxplot().encode(
    x="Genome",
    y="%",
    color="genus",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["Genome"] + ranks,
).properties(width=w, height=h)
# Box plots of CLR values per MAG, faceted by treatment.
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index",
    y="CLR",
    color="genus",
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["index"] + ranks,
).properties(width=w, height=h)
# Stacked bars of summed relative abundance per sample.
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y("sum(%)"),
    x=alt.X("Sample", sort=x_order),
    color="genus",
    tooltip=["Genome"] + ranks,
).properties(width=w * 3)

alt.vconcat(a, b, c)

Below is a similar plot of a model using UniRef species abundances.

In [None]:
# Taxonomic rank used to colour the feature panel.
rank = "order"

# Left panel: sample variates (comp1 vs comp2), coloured by treatment.
a = (
    alt.Chart(
        F1_uniref_samples.reset_index(),
        title="mixOmics sample variates (F1)",
    )
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: features whose absolute correlation exceeds 0.5 on either component.
b = (
    alt.Chart(
        F1_uniref_features.loc[
            (F1_uniref_features.corr_comp1.abs() > 0.5)
            | (F1_uniref_features.corr_comp2.abs() > 0.5)
        ].reset_index(),
        title="mixOmics species correlations (F1)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        # size="Stability",  (disabled)
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
        ]
        + uniref_ranks,
    )
    .interactive()
)

# The two panels encode different variables, so colour/shape scales stay independent.
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

The splsda model for species used 48 species to separate the samples. Here there's a similar separation of the treatment groups compared to the MAG data. The right panel shows that several species in the Eubacteriales order were negatively correlated with component1, while three Bacteroidales species correlated positively with component1.

Below we plot the species with highest absolute loadings in the splsda model for UniRef species abundances.

In [None]:
# Ten UniRef species with the largest absolute loading on each component.
fts1 = list(
    F1_uniref_features.loading_comp1.abs().sort_values(ascending=False).head(10).index
)
fts2 = list(
    F1_uniref_features.loading_comp2.abs().sort_values(ascending=False).head(10).index
)
source1 = F1_uniref_features.loc[fts1].reset_index()
source2 = F1_uniref_features.loc[fts2].reset_index()

# One bar chart per component; bars are ordered along x by ascending (signed)
# loading and coloured by family.
a = (
    alt.Chart(source1, title="Loadings UniRef species comp1 (F1)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=list(source1.sort_values("loading_comp1")["index"])),
        y="loading_comp1",
        color="family",
        tooltip=["index"] + uniref_ranks,
    )
)
b = (
    alt.Chart(source2, title="Loadings UniRef species comp2 (F1)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=list(source2.sort_values("loading_comp2")["index"])),
        y="loading_comp2",
        color="family",
        tooltip=["index"] + uniref_ranks,
    )
)

# Side by side, with one colour scale shared by both panels.
alt.hconcat(a, b).resolve_scale(color="shared")

In [None]:
# Panel width / height (pixels) used by the charts below.
w = 250
h = 100

# Union of the ten highest-|loading| UniRef species on each component.
fts1 = list(
    F1_uniref_features.loading_comp1.abs().sort_values(ascending=False).head(10).index
)
fts2 = list(
    F1_uniref_features.loading_comp2.abs().sort_values(ascending=False).head(10).index
)
unirefs = list(set(fts1 + fts2))

# Column used for colouring all three charts.
color = "family"

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_relab = (
    uniref_relab_tax.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same species.
_clr = (
    uniref_clr_F1.loc[list(set(fts1 + fts2))]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _relab.loc[_relab.Treatment == treatment]
    .sort_values("Sample")
    .Sample.unique()
]

# Boxplots per species, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_relab)
    .mark_boxplot()
    .encode(
        x="species",
        y="%",
        color=color,
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_ranks + ["Sample", "%"],
    )
    .resolve_scale(y="independent")
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="species",
        y="CLR",
        color=color,
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_ranks + ["Sample", "CLR"],
    )
    .resolve_scale(y="independent")
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the selected species per sample.
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color=color,
        tooltip=["Sample", "%"] + uniref_ranks,
    )
    .properties(width=w * 3)
)

alt.vconcat(a, b, c).resolve_scale(color="shared")

Below is a plot of a model using taxonomic abundances summed to the genus level.

In [None]:
# Taxonomic rank used to colour the feature panel.
rank = "order"

# Left panel: sample variates (comp1 vs comp2), coloured by treatment.
a = (
    alt.Chart(
        F1_uniref_genus_samples.reset_index(),
        title="mixOmics sample variates (UniRef genera; F1)",
    )
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: features whose absolute correlation exceeds 0.5 on either component.
b = (
    alt.Chart(
        F1_uniref_genus_features.loc[
            (F1_uniref_genus_features.corr_comp1.abs() > 0.5)
            | (F1_uniref_genus_features.corr_comp2.abs() > 0.5)
        ].reset_index(),
        title="mixOmics feature correlations (UniRef genera; F1)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        # size="Stability",  (disabled)
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
        ]
        + uniref_genus_ranks,
    )
    .interactive()
)

# The two panels encode different variables, so colour/shape scales stay independent.
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

And for the genera summed abundances.

In [None]:
# Panel width / height (pixels) used by the charts below.
w = 250
h = 100

# Union of the ten highest-|loading| genera on each component.
fts1 = list(
    F1_uniref_genus_features.loading_comp1.abs()
    .sort_values(ascending=False)
    .head(10)
    .index
)
fts2 = list(
    F1_uniref_genus_features.loading_comp2.abs()
    .sort_values(ascending=False)
    .head(10)
    .index
)
genera = list(set(fts1 + fts2))

# Column used for colouring all three charts.
color = "genus"

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_relab = (
    uniref_genus_relab_tax.loc[genera]
    .reset_index()
    .melt(id_vars=["genus"] + uniref_genus_ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same genera.
_clr = (
    uniref_genus_clr_F1.loc[genera]
    .reset_index()
    .melt(id_vars=["genus"] + uniref_genus_ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _relab.loc[_relab.Treatment == treatment]
    .sort_values("Sample")
    .Sample.unique()
]

# Boxplots per genus, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_relab)
    .mark_boxplot()
    .encode(
        x="genus",
        y="%",
        color=color,
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_genus_ranks + ["Sample", "%"],
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="genus",
        y="CLR",
        color=color,
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_genus_ranks + ["Sample", "CLR"],
    )
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the selected genera per sample.
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color=color,
        tooltip=["Sample", "%"] + uniref_genus_ranks + ["genus"],
    )
    .properties(width=w * 3)
)

alt.vconcat(a, b, c).resolve_scale(color="shared")

In [None]:
# Number of rows flagged isDE in the DESeq2 MAG results, per contrast (ascending).
(
    F1_mag_deseq[F1_mag_deseq.isDE == True]
    .groupby(["contrast"])
    .size()
    .sort_values()
)

In the F1 generation DESeq2 identified 1 MAG that had significantly different abundances in H treatment vs control. This was MAG124 classified to genus Coproplasma which was less abundant in the High sample group.

Below are so-called volcano plots of the different comparisons. Each MAG is a point, with the magnitude of change (log2 fold change) on the x-axis and the adjusted p-value on the y-axis. The MAGs with the largest differences are furthest to the left and right, and the most significant ones are at the top.

In [None]:
# Keep the four contrasts that are shown as separate columns.
_ = F1_mag_deseq.loc[
    F1_mag_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
]
# Optional extra filter (disabled): _ = _.loc[abs(_.log2FoldChange)>1]

# Volcano plot; the padj axis is sorted descending so small values end up at the top.
(
    alt.Chart(_)
    .mark_point()
    .encode(
        x="log2FoldChange",
        y=alt.Y("padj", sort="descending"),
        color="isDE",
        column=alt.Column("contrast", sort=["LvsC", "HvsC", "HvsCL", "HvsL"]),
        tooltip=["feature"] + ranks,
    )
    .properties(width=200)
)

Below we plot out the relative abundance and CLR transformed values of the MAG identified as differentially abundant by DESeq2

In [None]:
# Panel width / height (pixels) used by the charts below.
w = 100
h = 100

# MAGs flagged isDE by DESeq2 in any contrast.
mags = F1_mag_deseq.loc[F1_mag_deseq.isDE == True].feature.unique()

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_ = (
    mag_relab_tax.loc[mags]
    .reset_index()
    .melt(id_vars=["Genome"] + ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same MAGs.
_clr = (
    mag_clr_F1.loc[mags]
    .reset_index()
    .melt(id_vars=["index"] + ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Boxplots per MAG, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_)
    .mark_boxplot()
    .encode(
        x="Genome",
        y="%",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=["Genome"] + ranks,
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="index",
        y="CLR",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=["index"] + ranks,
    )
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the DE MAGs per sample.
c = (
    alt.Chart(_)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color="genus",
        tooltip=["Genome"] + ranks,
    )
    .properties(width=w * 7.5, height=h)
)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

In [None]:
# Number of rows flagged isDE in the DESeq2 UniRef species results, per contrast (ascending).
(
    F1_uniref_deseq[F1_uniref_deseq.isDE == True]
    .groupby(["contrast"])
    .size()
    .sort_values()
)

In the UniRef data DESeq2 identified 4 and 11 species with significantly different abundances in the H and L treatments, respectively. Below is again a volcano plot of the results.

In [None]:
# Keep the four contrasts that are shown as separate columns.
_ = F1_uniref_deseq.loc[
    F1_uniref_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
]

# Volcano plot; the padj axis is sorted descending so small values end up at the top.
(
    alt.Chart(_)
    .mark_point()
    .encode(
        x="log2FoldChange",
        y=alt.Y("padj", sort="descending"),
        color="isDE",
        column=alt.Column("contrast", sort=["LvsC", "HvsC", "HvsCL", "HvsL"]),
        tooltip=["feature"] + uniref_ranks,
    )
    .properties(width=200, height=200)
)

In [None]:
# Panel width / height (pixels) used by the charts below.
w = 200
h = 150

# UniRef species flagged isDE by DESeq2 in any contrast.
unirefs = F1_uniref_deseq.loc[F1_uniref_deseq.isDE == True].feature.unique()

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_ = (
    uniref_relab_tax.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same species.
_clr = (
    uniref_clr_F1.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Boxplots per species, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_)
    .mark_boxplot()
    .encode(
        x="species",
        y="%",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_ranks + ["Sample", "%"],
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="species",
        y="CLR",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_ranks + ["Sample", "CLR"],
    )
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the DE species per sample.
c = (
    alt.Chart(_)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color="genus",
        tooltip=["Sample", "%"] + uniref_ranks,
    )
    .properties(width=w * 6.5, height=h)
)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

In [None]:
# Number of rows flagged isDE in the DESeq2 UniRef genus results, per contrast (ascending).
(
    F1_uniref_genus_deseq[F1_uniref_genus_deseq.isDE == True]
    .groupby(["contrast"])
    .size()
    .sort_values()
)

When summing the UniRef abundances to genus level, DESeq2 identified 2 and 6 genera as significant in H and L groups, respectively.

In [None]:
# Keep the four contrasts that are shown as separate columns.
_ = F1_uniref_genus_deseq.loc[
    F1_uniref_genus_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
]

# Volcano plot; the padj axis is sorted descending so small values end up at the top.
(
    alt.Chart(_)
    .mark_point()
    .encode(
        x="log2FoldChange",
        y=alt.Y("padj", sort="descending"),
        color="isDE",
        column=alt.Column("contrast", sort=["LvsC", "HvsC", "HvsCL", "HvsL"]),
        tooltip=["feature"] + uniref_genus_ranks,
    )
    .properties(width=200, height=200)
)

In [None]:
# Panel width / height (pixels) used by the charts below.
w = 150
h = 100

# Genera flagged isDE by DESeq2 in any contrast.
uniref_genera = F1_uniref_genus_deseq.loc[
    F1_uniref_genus_deseq.isDE == True
].feature.unique()

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_ = (
    uniref_genus_relab_tax.loc[uniref_genera]
    .reset_index()
    .melt(id_vars=uniref_genus_ranks + ["genus"], var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same genera.
_clr = (
    uniref_genus_clr_F1.loc[uniref_genera]
    .reset_index()
    .melt(id_vars=uniref_genus_ranks + ["genus"], var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Boxplots per genus, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_)
    .mark_boxplot()
    .encode(
        x="genus",
        y="%",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_genus_ranks + ["Sample", "%"],
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="genus",
        y="CLR",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_genus_ranks + ["Sample", "CLR"],
    )
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the DE genera per sample.
c = (
    alt.Chart(_)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color="genus",
        tooltip=["Sample", "%"] + uniref_genus_ranks,
    )
    .properties(width=w * 6, height=h)
)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

Below are volcano plots for the ALDEx2 analysis for F1.

In [None]:
# For each feature type, two charts are drawn side by side: diff_btw (left) and
# effect (right) on x, wi_eBH on y (sorted descending), one column per contrast.

# MAGs
a_1 = (
    alt.Chart(F1_mag_aldex.reset_index(), title="ALDEx2 (MAGs; F1)")
    .mark_circle()
    .encode(
        x="diff_btw",
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature"] + ranks,
    )
    .properties(width=200, height=200)
)
a_2 = (
    alt.Chart(F1_mag_aldex.reset_index(), title="ALDEx2 (MAGs; F1)")
    .mark_circle()
    .encode(
        x="effect",
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature"] + ranks,
    )
    .properties(width=200, height=200)
)
a = alt.hconcat(a_1, a_2)

# UniRef species
b_1 = (
    alt.Chart(F1_uniref_aldex.reset_index(), title="ALDEx2 (UniRef species; F1)")
    .mark_circle()
    .encode(
        x="diff_btw",
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature"] + uniref_ranks,
    )
    .properties(width=200, height=200)
)
b_2 = (
    alt.Chart(F1_uniref_aldex.reset_index(), title="ALDEx2 (UniRef species; F1)")
    .mark_circle()
    .encode(
        x="effect",
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature"] + uniref_ranks,
    )
    .properties(width=200, height=200)
)
b = alt.hconcat(b_1, b_2)

# UniRef genera
c_1 = (
    alt.Chart(F1_uniref_genus_aldex.reset_index(), title="ALDEx2 (UniRef genera; F1)")
    .mark_circle()
    .encode(
        x="diff_btw",
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature"] + uniref_genus_ranks,
    )
    .properties(width=200, height=200)
)
c_2 = (
    alt.Chart(F1_uniref_genus_aldex.reset_index(), title="ALDEx2 (UniRef genera; F1)")
    .mark_circle()
    .encode(
        x="effect",
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature"] + uniref_genus_ranks,
    )
    .properties(width=200, height=200)
)
c = alt.hconcat(c_1, c_2)

# One row per feature type.
alt.vconcat(a, b, c)

In [None]:
# Panel width / height (pixels). The original cell never set these and silently
# reused whatever w/h a previously executed cell had left behind. They are set
# explicitly here to the values in effect when the notebook is run top to
# bottom (150 / 100), so the output no longer depends on execution order.
w = 150
h = 100

# MAGs flagged isDE by ALDEx2 in any contrast. `mags` is an Index, and the
# id_vars below expect the reset index column to be called "feature".
mags = F1_mag_aldex.loc[F1_mag_aldex.isDE == True].index.unique()

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_ = mag_relab_tax.loc[mags]
_ = pd.merge(
    pd.melt(_.reset_index(), id_vars=["feature"] + ranks, var_name="Sample", value_name="%"),
    sample_df,
    left_on="Sample",
    right_index=True,
)
_ = _.loc[_.Generation == "F1"]

# Same reshaping for the CLR values of the same MAGs.
_clr = mag_clr_F1.loc[mags]
_clr = pd.merge(
    pd.melt(_clr.reset_index(), id_vars=["feature"] + ranks, var_name="Sample", value_name="CLR"),
    sample_df,
    left_on="Sample",
    right_index=True,
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Boxplots per MAG, one column per treatment: relative abundance (a) and CLR (b).
a = alt.Chart(_).mark_boxplot().encode(
    x="feature", y="%", color="genus", column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["feature"] + ranks,
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="feature", y="CLR", color="genus", column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["feature"] + ranks,
).properties(width=w, height=h)

# Stacked bars: summed relative abundance of the DE MAGs per sample.
c = alt.Chart(_).mark_bar().encode(
    y=alt.Y("sum(%)"), x=alt.X("Sample", sort=x_order), color="genus",
    tooltip=["feature"] + ranks,
).properties(width=w * 7.5, height=h)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

In [None]:
# Panel height / width (pixels) used by the charts below.
h = 100
w = 100

# UniRef species flagged isDE by ALDEx2 in any contrast.
unirefs = F1_uniref_aldex.loc[F1_uniref_aldex.isDE == True].index.unique()

# Long-format relative abundance (%) joined with sample metadata, F1 rows only.
_ = (
    uniref_relab_tax.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same species.
_clr = (
    uniref_clr_F1.loc[unirefs]
    .melt(id_vars=uniref_ranks, var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Boxplots per species, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_)
    .mark_boxplot()
    .encode(
        x="species",
        y="%",
        color="species",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_ranks + ["Sample", "%"],
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="species",
        y="CLR",
        color="species",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_ranks + ["Sample", "CLR"],
    )
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the DE species per sample.
c = (
    alt.Chart(_)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color="species",
        tooltip=["Sample", "%"] + uniref_ranks,
    )
    .properties(width=w * 7, height=h)
)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

In [None]:
# Panel height / width (pixels) used by the charts below.
h = 100
w = 100

# Genera flagged isDE by ALDEx2 in any contrast.
uniref_genera = F1_uniref_genus_aldex.loc[
    F1_uniref_genus_aldex.isDE == True
].index.unique()

# Relative abundance of those genera; the index is named "genus" so that
# reset_index() yields the "genus" column the melt and the charts rely on.
_ = uniref_genus_relab_tax.loc[uniref_genera].rename_axis("genus")
_ = (
    _.reset_index()
    .melt(id_vars=uniref_genus_ranks + ["genus"], var_name="Sample", value_name="%")
    .merge(sample_df, left_on="Sample", right_index=True)
    .loc[lambda df: df.Generation == "F1"]
)

# Same reshaping for the CLR values of the same genera.
_clr = uniref_genus_clr_F1.loc[uniref_genera].rename_axis("genus")
_clr = (
    _clr.reset_index()
    .melt(id_vars=uniref_genus_ranks + ["genus"], var_name="Sample", value_name="CLR")
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Sample order for the bar chart: C samples, then L, then H, each sorted by name.
x_order = [
    sample
    for treatment in ["C", "L", "H"]
    for sample in _.loc[_.Treatment == treatment].sort_values("Sample").Sample.unique()
]

# Boxplots per genus, one column per treatment: relative abundance (a) and CLR (b).
a = (
    alt.Chart(_)
    .mark_boxplot()
    .encode(
        x="genus",
        y="%",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_genus_ranks + ["Sample", "%"],
    )
    .properties(width=w, height=h)
)
b = (
    alt.Chart(_clr)
    .mark_boxplot()
    .encode(
        x="genus",
        y="CLR",
        color="genus",
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=uniref_genus_ranks + ["Sample", "CLR"],
    )
    .properties(width=w, height=h)
)

# Stacked bars: summed relative abundance of the DE genera per sample.
c = (
    alt.Chart(_)
    .mark_bar()
    .encode(
        x=alt.X("Sample", sort=x_order),
        y=alt.Y("sum(%)"),
        color="genus",
        tooltip=["Sample", "%"] + uniref_genus_ranks,
    )
    .properties(width=w * 7, height=h)
)

alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

Below are heatmaps of median CLR values for MAGs and species identified by at least 2 of the 3 tools.

In [None]:
# One ident_fts chart per (feature type, contrast). mixomics_top is set to
# 1000000 in every call (presumably so the mixOmics list is not truncated —
# confirm against ident_fts). Only the MAG calls pass y="species".

# MAGs
a1 = ident_fts(
    mixomics=F1_mag_features,
    deseq=F1_mag_deseq,
    aldex=F1_mag_aldex,
    contrast="LvsC",
    tooltip=["feature"] + ranks + ["diff_btw"],
    mixomics_top=1000000,
    y="species",
)
a2 = ident_fts(
    mixomics=F1_mag_features,
    deseq=F1_mag_deseq,
    aldex=F1_mag_aldex,
    contrast="HvsC",
    tooltip=["feature"] + ranks + ["diff_btw"],
    mixomics_top=1000000,
    y="species",
)

# UniRef species
b1 = ident_fts(
    mixomics=F1_uniref_features,
    deseq=F1_uniref_deseq,
    aldex=F1_uniref_aldex,
    contrast="LvsC",
    tooltip=["feature"] + uniref_ranks + ["diff_btw"],
    mixomics_top=1000000,
)
b2 = ident_fts(
    mixomics=F1_uniref_features,
    deseq=F1_uniref_deseq,
    aldex=F1_uniref_aldex,
    contrast="HvsC",
    tooltip=["feature"] + uniref_ranks + ["diff_btw"],
    mixomics_top=1000000,
)

# UniRef genera
c1 = ident_fts(
    mixomics=F1_uniref_genus_features,
    deseq=F1_uniref_genus_deseq,
    aldex=F1_uniref_genus_aldex,
    contrast="LvsC",
    tooltip=["feature"] + uniref_genus_ranks + ["diff_btw"],
    mixomics_top=1000000,
)
c2 = ident_fts(
    mixomics=F1_uniref_genus_features,
    deseq=F1_uniref_genus_deseq,
    aldex=F1_uniref_genus_aldex,
    contrast="HvsC",
    tooltip=["feature"] + uniref_genus_ranks + ["diff_btw"],
    mixomics_top=1000000,
)

# LvsC next to HvsC for each feature type, then the three rows stacked.
a = alt.hconcat(a1, a2).resolve_scale(color="shared")
b = alt.hconcat(b1, b2).resolve_scale(color="shared")
c = alt.hconcat(c1, c2).resolve_scale(color="shared")
alt.vconcat(a, b, c).resolve_scale(color="shared")

#### F2 generation

In [None]:
# Load the F2 mixOmics, DESeq2 and ALDEx2 results for MAGs, attach sample
# metadata / MAG taxonomy, and write combined tables to ../atlas/stats/.
ft = "mag"
generation = "F2"

# ---- mixOmics ----
# Sample variates joined with sample metadata; feature table joined with MAG taxonomy.
F2_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F2_mag_samples = pd.merge(F2_mag_samples, sample_df, left_index=True, right_index=True)
F2_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F2_mag_features = pd.merge(F2_mag_features, mag_tax, left_index=True, right_index=True)

# Same for the "<generation>_2" model files; Treatment_2 pools C and L into "C+L".
F2_2_mag_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F2_2_mag_samples = pd.merge(F2_2_mag_samples, sample_df, left_index=True, right_index=True)
F2_2_mag_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F2_2_mag_features = pd.merge(F2_2_mag_features, mag_tax, left_index=True, right_index=True)
F2_2_mag_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F2_2_mag_samples.Treatment]

# ---- DESeq2 ----
# Result files are named like res.mag.F2_HvsF2_C.tsv
F2_mag_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_C.tsv", sep="\t", index_col=0)
F2_mag_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF2_C.tsv", sep="\t", index_col=0)
F2_mag_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_L.tsv", sep="\t", index_col=0)
F2_mag_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_CL.tsv", sep="\t", index_col=0)

# Each table is reduced to one row per feature holding only the id_vars
# columns (melt, then first row per feature), and tagged with its contrast.
# The label is assigned as a scalar so it always matches the table's own row
# count; building it from another table's length raises a ValueError as soon
# as the contrasts differ in number of features.
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
source1 = pd.melt(F2_mag_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F2_mag_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F2_mag_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F2_mag_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"

# Stack the four contrasts and attach MAG taxonomy.
F2_mag_deseq = pd.concat(
    [
        source1.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source2.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source3.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source4.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
    ]
)
F2_mag_deseq = pd.merge(F2_mag_deseq, mag_tax, left_on="feature", right_index=True)


# ---- ALDEx2 (glm) ----
# Effect sizes and Holm-adjusted p-values for the treatmentL / treatmentH
# terms, plus the "<generation>_2" table labelled HvsCL.
F2_mag_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F2_mag_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F2_mag_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F2_mag_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw", "effect", "padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F2_mag_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F2_mag_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw", "effect", "padj"]
# Scalar label, independent of source1's length (see DESeq2 note above).
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw", "effect", "wi.eBH"]]
source3.columns = ["diff_btw", "effect", "padj"]
source3["contrast"] = "HvsCL"
F2_mag_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F2_mag_aldex_glm = pd.merge(F2_mag_aldex, mag_tax, left_index=True, right_index=True)

# ---- ALDEx2 (pairwise HC / LC tables) ----
# NOTE: from here on F2_mag_aldex is rebound to the pairwise results; the glm
# results remain available as F2_mag_aldex_glm.
F2_mag_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F2_mag_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F2_mag_aldex_HC["contrast"] = "HvsC"
F2_mag_aldex_HC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F2_mag_aldex_LC["contrast"] = "LvsC"
F2_mag_aldex_LC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F2_mag_aldex = pd.concat([F2_mag_aldex_HC, F2_mag_aldex_LC])
F2_mag_aldex = pd.merge(F2_mag_aldex, mag_tax, left_index=True, right_index=True)

# A feature is flagged isDE when |effect| > 1 or wi_eBH < 0.1.
F2_mag_aldex["isDE"] = False
F2_mag_aldex.loc[(abs(F2_mag_aldex.effect) > 1) | (F2_mag_aldex.wi_eBH < 0.1), "isDE"] = True

# ---- Write combined tables ----
F2_mag_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F2_mag_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F2_mag_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F2_mag_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

In [None]:
# Load the F2 mixOmics, DESeq2 and ALDEx2 results for UniRef species, attach
# sample metadata / UniRef taxonomy, and write combined tables to ../atlas/stats/.
ft = "uniref"
generation = "F2"

# ---- mixOmics ----
# Sample variates joined with sample metadata; feature table joined with
# UniRef taxonomy. The feature index is named "index" (the column name it gets
# after reset_index()).
F2_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F2_uniref_samples = pd.merge(F2_uniref_samples, sample_df, left_index=True, right_index=True)
F2_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F2_uniref_features = pd.merge(F2_uniref_features, uniref_tax, left_index=True, right_index=True)
F2_uniref_features.index.name = "index"

# Same for the "<generation>_2" model files; Treatment_2 pools C and L into "C+L".
F2_2_uniref_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F2_2_uniref_samples = pd.merge(F2_2_uniref_samples, sample_df, left_index=True, right_index=True)
F2_2_uniref_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F2_2_uniref_features = pd.merge(F2_2_uniref_features, uniref_tax, left_index=True, right_index=True)
F2_2_uniref_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F2_2_uniref_samples.Treatment]
F2_2_uniref_features.index.name = "index"

# ---- DESeq2 ----
# Result files are named like res.uniref.F2_HvsF2_C.tsv
F2_uniref_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_C.tsv", sep="\t", index_col=0)
F2_uniref_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF2_C.tsv", sep="\t", index_col=0)
F2_uniref_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_L.tsv", sep="\t", index_col=0)
F2_uniref_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_CL.tsv", sep="\t", index_col=0)

# Each table is reduced to one row per feature holding only the id_vars
# columns (melt, then first row per feature), and tagged with its contrast.
# The label is assigned as a scalar so it always matches the table's own row
# count; building it from another table's length raises a ValueError as soon
# as the contrasts differ in number of features.
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
source1 = pd.melt(F2_uniref_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F2_uniref_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F2_uniref_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F2_uniref_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"

# Stack the four contrasts and attach UniRef taxonomy.
F2_uniref_deseq = pd.concat(
    [
        source1.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source2.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source3.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
        source4.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]],
    ]
)
F2_uniref_deseq = pd.merge(F2_uniref_deseq, uniref_tax, left_on="feature", right_index=True)


# ---- ALDEx2 (glm) ----
# Effect sizes and Holm-adjusted p-values for the treatmentL / treatmentH
# terms, plus the "<generation>_2" table labelled HvsCL.
F2_uniref_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F2_uniref_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F2_uniref_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F2_uniref_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw", "effect", "padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F2_uniref_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F2_uniref_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw", "effect", "padj"]
# Scalar label, independent of source1's length (see DESeq2 note above).
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw", "effect", "wi.eBH"]]
source3.columns = ["diff_btw", "effect", "padj"]
source3["contrast"] = "HvsCL"
F2_uniref_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F2_uniref_aldex_glm = pd.merge(F2_uniref_aldex, uniref_tax, left_index=True, right_index=True)

# ---- ALDEx2 (pairwise HC / LC tables) ----
# NOTE: from here on F2_uniref_aldex is rebound to the pairwise results; the
# glm results remain available as F2_uniref_aldex_glm.
F2_uniref_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F2_uniref_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F2_uniref_aldex_HC["contrast"] = "HvsC"
F2_uniref_aldex_HC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F2_uniref_aldex_LC["contrast"] = "LvsC"
F2_uniref_aldex_LC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F2_uniref_aldex = pd.concat([F2_uniref_aldex_HC, F2_uniref_aldex_LC])
F2_uniref_aldex = pd.merge(F2_uniref_aldex, uniref_tax, left_index=True, right_index=True)

# A feature is flagged isDE when |effect| > 1 or wi_eBH < 0.1.
F2_uniref_aldex["isDE"] = False
F2_uniref_aldex.loc[(abs(F2_uniref_aldex.effect) > 1) | (F2_uniref_aldex.wi_eBH < 0.1), "isDE"] = True

# ---- Write combined tables ----
F2_uniref_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F2_uniref_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F2_uniref_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F2_uniref_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

In [None]:
# Load, annotate and export the F2 results for UniRef abundances summed to
# genus level: mixOmics sPLS-DA tables, DESeq2 contrasts and ALDEx2 results.

# Genus-level taxonomy lookup: index on genus, drop the species column and keep
# the first row encountered for each genus.
uniref_genus_tax = uniref_tax.set_index("genus").drop("species", axis=1).groupby(level=0).first()
uniref_genus_ranks = ["superkingdom","phylum","class","order","family"]
ft="uniref.genus"
generation="F2"
# MixOmics
# Sample and feature tables from the final.splsda.* files, joined with the
# sample metadata and the genus taxonomy respectively.
F2_uniref_genus_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F2_uniref_genus_samples = pd.merge(F2_uniref_genus_samples, sample_df, left_index=True, right_index=True)
F2_uniref_genus_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F2_uniref_genus_features = pd.merge(F2_uniref_genus_features, uniref_genus_tax, left_index=True, right_index=True)
F2_uniref_genus_features.index.name = "index"

# Same tables for the "<generation>_2" model; Treatment_2 pools C and L into "C+L".
F2_2_uniref_genus_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F2_2_uniref_genus_samples = pd.merge(F2_2_uniref_genus_samples, sample_df, left_index=True, right_index=True)
F2_2_uniref_genus_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F2_2_uniref_genus_features = pd.merge(F2_2_uniref_genus_features, uniref_genus_tax, left_index=True, right_index=True)
F2_2_uniref_genus_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F2_2_uniref_genus_samples.Treatment]
F2_2_uniref_genus_features.index.name = "index"

# DESeq2
# Example file name: res.uniref.genus.F2_HvsF2_C.tsv
F2_uniref_genus_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_C.tsv", sep="\t", index_col=0)
F2_uniref_genus_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF2_C.tsv", sep="\t", index_col=0)
F2_uniref_genus_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_L.tsv", sep="\t", index_col=0)
F2_uniref_genus_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_CL.tsv", sep="\t", index_col=0)
id_vars=["feature","isDE","log2FoldChange","padj"]
# For each contrast: melt on the id columns, keep the first melted row per
# feature and drop the melt helper columns, leaving one row per feature.
# The contrast label is assigned as a scalar so it always matches the frame it
# is added to (the previous list-based form sized some labels from a different
# frame, which fails when the contrasts have different numbers of features).
source1 = pd.melt(F2_uniref_genus_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F2_uniref_genus_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F2_uniref_genus_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F2_uniref_genus_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable","value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
F2_uniref_genus_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source4.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
F2_uniref_genus_deseq = pd.merge(F2_uniref_genus_deseq, uniref_genus_tax, left_on="feature", right_index=True)


# ALDEx2
# GLM results: per-treatment effect columns joined with the Holm-adjusted
# p-value column, renamed to a common (diff_btw, effect, padj) layout.
F2_uniref_genus_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F2_uniref_genus_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F2_uniref_genus_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F2_uniref_genus_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F2_uniref_genus_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F2_uniref_genus_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = "HvsCL"
# NOTE: F2_uniref_genus_aldex is reassigned further down; the GLM-based table
# survives only as F2_uniref_genus_aldex_glm.
F2_uniref_genus_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F2_uniref_genus_aldex_glm = pd.merge(F2_uniref_genus_aldex, uniref_genus_tax, left_index=True, right_index=True)

# Pairwise ALDEx2 tables (H vs C and L vs C); dots in column names are replaced
# by underscores so columns such as wi.eBH become wi_eBH.
F2_uniref_genus_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F2_uniref_genus_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F2_uniref_genus_aldex_HC["contrast"] = "HvsC"
F2_uniref_genus_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F2_uniref_genus_aldex_LC["contrast"] = "LvsC"
F2_uniref_genus_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F2_uniref_genus_aldex = pd.concat([F2_uniref_genus_aldex_HC, F2_uniref_genus_aldex_LC])
F2_uniref_genus_aldex = pd.merge(F2_uniref_genus_aldex, uniref_genus_tax, left_index=True, right_index=True)

# A feature is flagged when |effect| > 1 OR wi_eBH < 0.1.
F2_uniref_genus_aldex["isDE"] = False
F2_uniref_genus_aldex.loc[(abs(F2_uniref_genus_aldex.effect)>1)|((F2_uniref_genus_aldex.wi_eBH<0.1)), "isDE"] = True

# Export the annotated tables.
F2_uniref_genus_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F2_uniref_genus_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F2_uniref_genus_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F2_uniref_genus_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
# Alpha diversity for the F2 generation at three feature levels (MAGs, UniRef
# species, UniRef genera), shown as boxplots per treatment.
_div_columns = ["index","shannon","observed","Treatment","Reads_pe"]
_div_id_vars = ["index","Treatment","Reads_pe"]


def _diversity_boxplot(source, title):
    """Boxplot of `value` per Treatment, faceted by diversity measure."""
    return alt.Chart(source, title=title).mark_boxplot().encode(
        x=alt.X("Treatment", sort=["C","L","H"]),
        y="value",
        column="diversity",
        color=alt.Color("Treatment",sort=["C","L","H"]),
    ).resolve_scale(y="independent")


# Diversity tables from calc_diversity(), joined with the sample metadata.
F2_mag_div = pd.merge(
    sample_df, calc_diversity(mag_cov_F2), left_index=True, right_index=True
).reset_index()
F2_uniref_div = pd.merge(
    sample_df, calc_diversity(uniref_cov_F2), left_index=True, right_index=True
).reset_index()
F2_uniref_genus_div = pd.merge(
    sample_df, calc_diversity(uniref_genus_cov_F2), left_index=True, right_index=True
).reset_index()

# Long format: one row per sample and diversity measure.
source_mag = pd.melt(
    F2_mag_div.loc[:, _div_columns], id_vars=_div_id_vars, var_name="diversity"
)
source_uniref = pd.melt(
    F2_uniref_div.loc[:, _div_columns], id_vars=_div_id_vars, var_name="diversity"
)
source_uniref_genus = pd.melt(
    F2_uniref_genus_div.loc[:, _div_columns], id_vars=_div_id_vars, var_name="diversity"
)

a = _diversity_boxplot(source_mag, "F2 MAGs")
b = _diversity_boxplot(source_uniref, "F2 UniRef species")
c = _diversity_boxplot(source_uniref_genus, "F2 UniRef genera")
alt.hconcat(a,b,c)

In [None]:
rank = "order"

# Left panel: F2 samples on the first two mixOmics components.
a = (
    alt.Chart(F2_mag_samples.reset_index(), title="mixOmics sample variates (F2)")
    .mark_circle(size=120)
    .encode(
        x='comp1',
        y='comp2',
        color=alt.Color("Treatment", sort=["C","L","H"]),
        tooltip=['Sample', 'Generation', 'Treatment'],
    )
    .interactive()
)

# Right panel: only MAGs whose absolute correlation with comp1 or comp2 exceeds 0.5.
_abs_corr1 = F2_mag_features.corr_comp1.abs()
_abs_corr2 = F2_mag_features.corr_comp2.abs()
_correlated = F2_mag_features.loc[(_abs_corr1 > 0.5) | (_abs_corr2 > 0.5)].reset_index()
_feature_tooltip = [
    "index",
    "loading_comp1", "loading_comp2",
    "stability_comp1", "stability_comp2",
    "corr_comp1", "corr_comp2",
] + ranks
b = (
    alt.Chart(_correlated, title="mixOmics MAG correlations (F2)")
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=_feature_tooltip,
    )
    .interactive()
)

# The two panels colour by different fields, so their scales stay independent.
alt.hconcat(a, b).resolve_scale(color='independent', shape='independent')

Below we plot the loadings for the MAGs with the highest loadings in the model. MAGs with the highest absolute loadings are the ones that have the highest importance.

In [None]:
# Ten MAGs with the largest absolute loading on each component.
fts1 = list(F2_mag_features.loading_comp1.abs().sort_values(ascending=False).index[:10])
fts2 = list(F2_mag_features.loading_comp2.abs().sort_values(ascending=False).index[:10])
source1 = F2_mag_features.loc[fts1].reset_index()
source2 = F2_mag_features.loc[fts2].reset_index()

# Bars are ordered along x by increasing signed loading.
_order_comp1 = list(source1.sort_values("loading_comp1")["index"])
_order_comp2 = list(source2.sort_values("loading_comp2")["index"])

a = alt.Chart(source1, title="Loadings MAGs comp1 (F2)").mark_bar().encode(
    x=alt.X("index", sort=_order_comp1),
    y="loading_comp1",
    color="family",
    tooltip=["index"]+ranks,
)
b = alt.Chart(source2, title="Loadings MAGs comp2 (F2)").mark_bar().encode(
    x=alt.X("index", sort=_order_comp2),
    y="loading_comp2",
    color="family",
    tooltip=["index"]+ranks,
)
alt.hconcat(a, b).resolve_scale(color='shared')

To sanity check this, we also look at the relative abundance of these MAGs across the samples.

In [None]:
w = 250
h=100
fts1 = list(F2_mag_features.loading_comp1.abs().sort_values(ascending=False).index[:10])
fts2 = list(F2_mag_features.loading_comp2.abs().sort_values(ascending=False).index[:10])
# Union of the top MAGs of both components; set() removes duplicates.
_top_mags = list(set(fts1+fts2))

# Relative abundance in long format with sample metadata, F2 samples only.
_relab = pd.melt(
    mag_relab_tax.loc[_top_mags].reset_index(),
    id_vars=["Genome"]+ranks, var_name="Sample", value_name="%",
)
_relab = pd.merge(_relab, sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation=="F2"]

# CLR values for the same MAGs in long format with sample metadata.
_clr = pd.melt(
    mag_clr_F2.loc[_top_mags].reset_index(),
    id_vars=["index"]+ranks, var_name="Sample", value_name="CLR",
)
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for _trt in ["C","L","H"]:
    x_order.extend(_relab.loc[_relab.Treatment==_trt].sort_values("Sample").Sample.unique())

a = alt.Chart(_relab).mark_boxplot().encode(
    x="Genome",
    y="%",
    color="genus",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["Genome"]+ranks,
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index",
    y="CLR",
    color="genus",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["index"]+ranks,
).properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y('sum(%)'),
    x=alt.X("Sample", sort=x_order),
    color="genus",
    tooltip=["Genome"]+ranks,
).properties(width=w*3)

alt.vconcat(a,b, c)

Below is a similar plot of a model using UniRef species abundances.

In [None]:
rank = "order"

# Left panel: F2 samples on the first two mixOmics components (UniRef species model).
a = (
    alt.Chart(F2_uniref_samples.reset_index(), title="mixOmics sample variates (F2)")
    .mark_circle(size=120)
    .encode(
        x='comp1',
        y='comp2',
        color=alt.Color("Treatment", sort=["C","L","H"]),
        tooltip=['Sample', 'Generation', 'Treatment'],
    )
    .interactive()
)

# Right panel: only species whose absolute correlation with comp1 or comp2 exceeds 0.5.
_abs_corr1 = F2_uniref_features.corr_comp1.abs()
_abs_corr2 = F2_uniref_features.corr_comp2.abs()
_correlated = F2_uniref_features.loc[(_abs_corr1 > 0.5) | (_abs_corr2 > 0.5)].reset_index()
_feature_tooltip = [
    "index",
    "loading_comp1", "loading_comp2",
    "stability_comp1", "stability_comp2",
    "corr_comp1", "corr_comp2",
] + uniref_ranks
b = (
    alt.Chart(_correlated, title="mixOmics species correlations (F2)")
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=_feature_tooltip,
    )
    .interactive()
)

# The two panels colour by different fields, so their scales stay independent.
alt.hconcat(a, b).resolve_scale(color='independent', shape='independent')

The sPLS-DA model for species used 48 species to separate the samples. The separation of the treatment groups is similar to that seen with the MAG data. The right panel shows that several species in the Eubacteriales order were negatively correlated with component 1, while three Bacteroidales species were positively correlated with component 1.

Below we plot the species with highest absolute loadings in the splsda model for UniRef species abundances.

In [None]:
# Ten UniRef species with the largest absolute loading on each component.
fts1 = list(F2_uniref_features.loading_comp1.abs().sort_values(ascending=False).index[:10])
fts2 = list(F2_uniref_features.loading_comp2.abs().sort_values(ascending=False).index[:10])
source1 = F2_uniref_features.loc[fts1].reset_index()
source2 = F2_uniref_features.loc[fts2].reset_index()

# Bars are ordered along x by increasing signed loading.
_order_comp1 = list(source1.sort_values("loading_comp1")["index"])
_order_comp2 = list(source2.sort_values("loading_comp2")["index"])

a = alt.Chart(source1, title="Loadings UniRef species comp1 (F2)").mark_bar().encode(
    x=alt.X("index", sort=_order_comp1),
    y="loading_comp1",
    color="family",
    tooltip=["index"]+uniref_ranks,
)
b = alt.Chart(source2, title="Loadings UniRef species comp2 (F2)").mark_bar().encode(
    x=alt.X("index", sort=_order_comp2),
    y="loading_comp2",
    color="family",
    tooltip=["index"]+uniref_ranks,
)
alt.hconcat(a, b).resolve_scale(color='shared')

In [None]:
w=250
h=100
fts1 = list(F2_uniref_features.loading_comp1.abs().sort_values(ascending=False).index[:10])
fts2 = list(F2_uniref_features.loading_comp2.abs().sort_values(ascending=False).index[:10])
# Union of the top species of both components; set() removes duplicates.
unirefs = list(set(fts1+fts2))
color="family"

# Relative abundance in long format with sample metadata, F2 samples only.
_relab = pd.melt(
    uniref_relab_tax.loc[unirefs],
    id_vars=uniref_ranks, var_name="Sample", value_name="%",
)
_relab = pd.merge(_relab, sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation=="F2"]

# CLR values for the same species in long format with sample metadata.
_clr = pd.melt(
    uniref_clr_F2.loc[unirefs],
    id_vars=uniref_ranks, var_name="Sample", value_name="CLR",
)
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for _trt in ["C","L","H"]:
    x_order.extend(_relab.loc[_relab.Treatment==_trt].sort_values("Sample").Sample.unique())

a = alt.Chart(_relab).mark_boxplot().encode(
    x="species",
    y="%",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color=color,
    tooltip=uniref_ranks+["Sample","%"],
).resolve_scale(y="independent").properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="species",
    y="CLR",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color=color,
    tooltip=uniref_ranks+["Sample","CLR"],
).resolve_scale(y="independent").properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y('sum(%)'),
    x=alt.X("Sample", sort=x_order),
    color=color,
    tooltip=["Sample","%"]+uniref_ranks,
).properties(width=w*3)
alt.vconcat(a, b, c).resolve_scale(color="shared")

Below is a plot of a model using taxonomic abundances summed to the genus level.

In [None]:
rank = "order"

# Left panel: F2 samples on the first two mixOmics components (UniRef genus model).
a = (
    alt.Chart(
        F2_uniref_genus_samples.reset_index(),
        title="mixOmics sample variates (UniRef genera; F2)",
    )
    .mark_circle(size=120)
    .encode(
        x='comp1',
        y='comp2',
        color=alt.Color("Treatment", sort=["C","L","H"]),
        tooltip=['Sample', 'Generation', 'Treatment'],
    )
    .interactive()
)

# Right panel: only genera whose absolute correlation with comp1 or comp2 exceeds 0.5.
_abs_corr1 = F2_uniref_genus_features.corr_comp1.abs()
_abs_corr2 = F2_uniref_genus_features.corr_comp2.abs()
_correlated = F2_uniref_genus_features.loc[(_abs_corr1 > 0.5) | (_abs_corr2 > 0.5)].reset_index()
_feature_tooltip = [
    "index",
    "loading_comp1", "loading_comp2",
    "stability_comp1", "stability_comp2",
    "corr_comp1", "corr_comp2",
] + uniref_genus_ranks
b = (
    alt.Chart(_correlated, title="mixOmics feature correlations (UniRef genera; F2)")
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=_feature_tooltip,
    )
    .interactive()
)

# The two panels colour by different fields, so their scales stay independent.
alt.hconcat(a, b).resolve_scale(color='independent', shape='independent')

Below are the corresponding loadings for the model based on genus-level summed abundances.

In [None]:
# Ten genera with the largest absolute loading on each component.
fts1 = list(F2_uniref_genus_features.loading_comp1.abs().sort_values(ascending=False).index[:10])
fts2 = list(F2_uniref_genus_features.loading_comp2.abs().sort_values(ascending=False).index[:10])
source1 = F2_uniref_genus_features.loc[fts1].reset_index()
source2 = F2_uniref_genus_features.loc[fts2].reset_index()

# Bars are ordered along x by increasing signed loading.
_order_comp1 = list(source1.sort_values("loading_comp1")["index"])
_order_comp2 = list(source2.sort_values("loading_comp2")["index"])

a = alt.Chart(source1, title="Loadings comp1 (UniRef genera; F2)").mark_bar().encode(
    x=alt.X("index", sort=_order_comp1),
    y="loading_comp1",
    color="family",
    tooltip=["index"]+uniref_genus_ranks,
)
b = alt.Chart(source2, title="Loadings comp2 (UniRef genera; F2)").mark_bar().encode(
    x=alt.X("index", sort=_order_comp2),
    y="loading_comp2",
    color="family",
    tooltip=["index"]+uniref_genus_ranks,
)
alt.hconcat(a, b).resolve_scale(color='shared')

In [None]:
w=250
h=100
fts1 = list(F2_uniref_genus_features.loading_comp1.abs().sort_values(ascending=False).index[:10])
fts2 = list(F2_uniref_genus_features.loading_comp2.abs().sort_values(ascending=False).index[:10])
# Union of the top genera of both components; set() removes duplicates.
genera = list(set(fts1+fts2))
color="genus"

# Relative abundance in long format with sample metadata, F2 samples only.
_relab = pd.melt(
    uniref_genus_relab_tax.loc[genera].reset_index(),
    id_vars=["genus"]+uniref_genus_ranks, var_name="Sample", value_name="%",
)
_relab = pd.merge(_relab, sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation=="F2"]

# CLR values for the same genera in long format with sample metadata.
_clr = pd.melt(
    uniref_genus_clr_F2.loc[genera].reset_index(),
    id_vars=["genus"]+uniref_genus_ranks, var_name="Sample", value_name="CLR",
)
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for _trt in ["C","L","H"]:
    x_order.extend(_relab.loc[_relab.Treatment==_trt].sort_values("Sample").Sample.unique())

a = alt.Chart(_relab).mark_boxplot().encode(
    x="genus",
    y="%",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color=color,
    tooltip=uniref_genus_ranks+["Sample","%"],
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="genus",
    y="CLR",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color=color,
    tooltip=uniref_genus_ranks+["Sample","CLR"],
).properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y('sum(%)'),
    x=alt.X("Sample", sort=x_order),
    color=color,
    tooltip=["Sample","%"]+uniref_genus_ranks+["genus"],
).properties(width=w*3)
alt.vconcat(a, b, c).resolve_scale(color="shared")

In [None]:
# Number of DESeq2 rows flagged isDE per contrast, smallest count first.
_flagged = F2_mag_deseq["isDE"] == True
F2_mag_deseq.loc[_flagged].groupby(["contrast"]).size().sort_values()

Below are so-called volcano plots of the different comparisons. Each MAG is a point, with the magnitude of change on the x-axis and the adjusted p-value on the y-axis. The MAGs with the largest differences are furthest to the left and right, and the most significant ones are at the top.

In [None]:
# Volcano plot per contrast: log2 fold change against adjusted p-value, with the
# y-axis sorted descending so the smallest p-values are drawn at the top.
_contrasts = ["HvsC","LvsC","HvsCL", "HvsL"]
_ = F2_mag_deseq.loc[F2_mag_deseq.contrast.isin(_contrasts)]
alt.Chart(_).mark_point().encode(
    x="log2FoldChange",
    y=alt.Y("padj", sort="descending"),
    color="isDE",
    column=alt.Column("contrast", sort=["LvsC","HvsC", "HvsCL", "HvsL"]),
    tooltip=["feature"]+ranks,
).properties(width=400)

Below we plot the relative abundance of the MAGs identified as differentially abundant by DESeq2.

In [None]:
w = 400
h=100
color="family"
# MAGs flagged isDE in at least one DESeq2 contrast.
mags = F2_mag_deseq.loc[F2_mag_deseq.isDE==True].feature.unique()

# Relative abundance in long format with sample metadata, F2 samples only.
_ = pd.melt(
    mag_relab_tax.loc[mags].reset_index(),
    id_vars=["Genome"]+ranks, var_name="Sample", value_name="%",
)
_ = pd.merge(_, sample_df, left_on="Sample", right_index=True)
_ = _.loc[_.Generation=="F2"]

# CLR values for the same MAGs in long format with sample metadata.
_clr = pd.melt(
    mag_clr_F2.loc[mags].reset_index(),
    id_vars=["index"]+ranks, var_name="Sample", value_name="CLR",
)
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for _trt in ["C","L","H"]:
    x_order.extend(_.loc[_.Treatment==_trt].sort_values("Sample").Sample.unique())

a = alt.Chart(_).mark_boxplot().encode(
    x="Genome",
    y="%",
    color=color,
    column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["Genome"]+ranks,
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index",
    y="CLR",
    color=color,
    column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["index"]+ranks,
).properties(width=w, height=h)

c = alt.Chart(_).mark_bar().encode(
    y=alt.Y('sum(%)'),
    x=alt.X("Sample", sort=x_order),
    color=color,
    tooltip=["Genome"]+ranks,
).properties(width=w*3, height=h)

alt.vconcat(a,b, c).resolve_scale(color="shared")

In [None]:
# Number of DESeq2 rows flagged isDE per contrast, smallest count first.
_flagged = F2_uniref_deseq["isDE"] == True
F2_uniref_deseq.loc[_flagged].groupby(["contrast"]).size().sort_values()

In [None]:
# Volcano plot per contrast: log2 fold change against adjusted p-value, with the
# y-axis sorted descending so the smallest p-values are drawn at the top.
_contrasts = ["HvsC","LvsC","HvsCL", "HvsL"]
_ = F2_uniref_deseq.loc[F2_uniref_deseq.contrast.isin(_contrasts)]
alt.Chart(_).mark_point().encode(
    x="log2FoldChange",
    y=alt.Y("padj", sort="descending"),
    color="isDE",
    column=alt.Column("contrast", sort=["LvsC","HvsC", "HvsCL", "HvsL"]),
    tooltip=["feature"]+uniref_ranks,
).properties(width=200, height=200)

In [None]:
w = 100
h = 150
# UniRef species flagged isDE in at least one DESeq2 contrast.
unirefs = F2_uniref_deseq.loc[F2_uniref_deseq.isDE==True].feature.unique()

# Relative abundance in long format with sample metadata, F2 samples only.
_ = pd.melt(
    uniref_relab_tax.loc[unirefs],
    id_vars=uniref_ranks, var_name="Sample", value_name="%",
)
_ = pd.merge(_, sample_df, left_on="Sample", right_index=True)
_ = _.loc[_.Generation=="F2"]

# CLR values for the same species in long format with sample metadata.
_clr = pd.melt(
    uniref_clr_F2.loc[unirefs],
    id_vars=uniref_ranks, var_name="Sample", value_name="CLR",
)
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for _trt in ["C","L","H"]:
    x_order.extend(_.loc[_.Treatment==_trt].sort_values("Sample").Sample.unique())

a = alt.Chart(_).mark_boxplot().encode(
    x="species",
    y="%",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color="genus",
    tooltip=uniref_ranks+["Sample","%"],
).properties(width=w, height=h)

b = alt.Chart(_clr).mark_boxplot().encode(
    x="species",
    y="CLR",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color="genus",
    tooltip=uniref_ranks+["Sample","CLR"],
).properties(width=w, height=h)

c = alt.Chart(_).mark_bar().encode(
    y=alt.Y('sum(%)'),
    x=alt.X("Sample", sort=x_order),
    color="genus",
    tooltip=["Sample","%"]+uniref_ranks,
).properties(width=w*6.5, height=h)
alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

In [None]:
# Number of DESeq2 rows flagged isDE per contrast, smallest count first.
_flagged = F2_uniref_genus_deseq["isDE"] == True
F2_uniref_genus_deseq.loc[_flagged].groupby(["contrast"]).size().sort_values()

In [None]:
# Volcano plot per contrast: log2 fold change against adjusted p-value, with the
# y-axis sorted descending so the smallest p-values are drawn at the top.
_contrasts = ["HvsC","LvsC","HvsCL", "HvsL"]
_ = F2_uniref_genus_deseq.loc[F2_uniref_genus_deseq.contrast.isin(_contrasts)]
alt.Chart(_).mark_point().encode(
    x="log2FoldChange",
    y=alt.Y("padj", sort="descending"),
    color="isDE",
    column=alt.Column("contrast", sort=["LvsC","HvsC", "HvsCL", "HvsL"]),
    tooltip=["feature"]+uniref_genus_ranks,
).properties(width=200, height=200)

In [None]:
w = 100
h = 150
# Genera flagged isDE in at least one DESeq2 contrast.
uniref_genera = F2_uniref_genus_deseq.loc[F2_uniref_genus_deseq.isDE==True].feature.unique()

# Relative abundance in long format with sample metadata, F2 samples only.
_ = pd.melt(
    uniref_genus_relab_tax.loc[uniref_genera].reset_index(),
    id_vars=uniref_genus_ranks+["genus"], var_name="Sample", value_name="%",
)
_ = pd.merge(_, sample_df, left_on="Sample", right_index=True)
_ = _.loc[_.Generation=="F2"]

# CLR values for the same genera in long format with sample metadata.
_clr = pd.melt(
    uniref_genus_clr_F2.loc[uniref_genera].reset_index(),
    id_vars=uniref_genus_ranks+["genus"], var_name="Sample", value_name="CLR",
)
_clr = pd.merge(_clr, sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for _trt in ["C","L","H"]:
    x_order.extend(_.loc[_.Treatment==_trt].sort_values("Sample").Sample.unique())

a = alt.Chart(_).mark_boxplot().encode(
    x="genus",
    y="%",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color="genus",
    tooltip=uniref_genus_ranks+["Sample","%"],
).properties(width=w, height=h)

b = alt.Chart(_clr).mark_boxplot().encode(
    x="genus",
    y="CLR",
    column=alt.Column("Treatment", sort=["C","L","H"]),
    color="genus",
    tooltip=uniref_genus_ranks+["Sample","CLR"],
).properties(width=w, height=h)

c = alt.Chart(_).mark_bar().encode(
    y=alt.Y('sum(%)'),
    x=alt.X("Sample", sort=x_order),
    color="genus",
    tooltip=["Sample","%"]+uniref_genus_ranks,
).properties(width=w*6, height=h)
alt.vconcat(alt.hconcat(a, b), c).resolve_scale(color="shared")

Below are ALDEx2 volcano plots for F2.

In [None]:
def _aldex_volcano(table, x_field, title, tooltip):
    """One ALDEx2 panel: `x_field` against wi_eBH, faceted by contrast."""
    return alt.Chart(table.reset_index(), title=title).mark_circle().encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        tooltip=tooltip,
        color="isDE",
    ).properties(width=200, height=200)


# Each row pairs a diff_btw panel with an effect panel for one feature level.
a_1 = _aldex_volcano(F2_mag_aldex, "diff_btw", "ALDEx2 (MAGs; F2)", ["feature"]+ranks)
a_2 = _aldex_volcano(F2_mag_aldex, "effect", "ALDEx2 (MAGs; F2)", ["feature"]+ranks)
a = alt.hconcat(a_1, a_2)

b_1 = _aldex_volcano(F2_uniref_aldex, "diff_btw", "ALDEx2 (UniRef species; F2)", ["feature"]+uniref_ranks)
b_2 = _aldex_volcano(F2_uniref_aldex, "effect", "ALDEx2 (UniRef species; F2)", ["feature"]+uniref_ranks)
b = alt.hconcat(b_1, b_2)

c_1 = _aldex_volcano(F2_uniref_genus_aldex, "diff_btw", "ALDEx2 (UniRef genera; F2)", ["feature"]+uniref_genus_ranks)
c_2 = _aldex_volcano(F2_uniref_genus_aldex, "effect", "ALDEx2 (UniRef genera; F2)", ["feature"]+uniref_genus_ranks)
c = alt.hconcat(c_1, c_2)
alt.vconcat(a,b,c)

Below are heatmaps of median CLR values for MAGs and species identified by at least 2 of the 3 tools.

In [None]:
def _contrast_pair(rank_cols, **tables):
    """Call ident_fts for LvsC and then HvsC with otherwise identical arguments."""
    return [
        ident_fts(
            contrast=_contrast,
            tooltip=["feature"]+rank_cols+["diff_btw"],
            mixomics_top=1000000,
            **tables,
        )
        for _contrast in ("LvsC", "HvsC")
    ]


# One row per feature level: MAGs, UniRef species, UniRef genera.
a1, a2 = _contrast_pair(
    ranks, mixomics=F2_mag_features, deseq=F2_mag_deseq, aldex=F2_mag_aldex, y="species"
)
b1, b2 = _contrast_pair(
    uniref_ranks, mixomics=F2_uniref_features, deseq=F2_uniref_deseq, aldex=F2_uniref_aldex
)
c1, c2 = _contrast_pair(
    uniref_genus_ranks,
    mixomics=F2_uniref_genus_features,
    deseq=F2_uniref_genus_deseq,
    aldex=F2_uniref_genus_aldex,
)
a = alt.hconcat(a1, a2).resolve_scale(color="shared")
b = alt.hconcat(b1, b2).resolve_scale(color="shared")
c = alt.hconcat(c1, c2).resolve_scale(color="shared")
alt.vconcat(a, b, c).resolve_scale(color="shared")

### PFAMs

In [None]:
import gzip as gz
from urllib.request import urlretrieve

# Download the Pfam-A annotation file of the current Pfam release into the
# working directory; the trailing semicolon suppresses the cell output.
filename = "Pfam-A.hmm.dat.gz"
url = "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz"
urlretrieve(url, filename=filename);

In [None]:
# Parse the downloaded Pfam-A.hmm.dat.gz into one record per family, keyed by
# the value of its "#=GF ID" line and holding the AC, DE and CL annotations
# (CL defaults to the string "None" when a family has no CL line).
pfam_dat = {}
pfam_id = None
with gz.open(filename, 'rt') as fhin:
    for line in fhin:
        fields = line.split()
        # Blank lines and single-token lines (including the "//" record
        # terminator) carry no annotation key.
        if len(fields) < 2:
            continue
        if line.startswith("#=GF ID"):
            pfam_id = fields[-1]
            pfam_dat[pfam_id] = {"AC": "", "DE": "", "CL": "None"}
            continue
        key = fields[1]
        if key not in ("AC", "DE", "CL"):
            continue
        if pfam_id is None:
            # An annotation before any ID line has no record to attach to.
            raise ValueError(f"{filename}: '{key}' annotation found before any '#=GF ID' line")
        pfam_dat[pfam_id][key] = " ".join(fields[2:])
# Rows are Pfam IDs, columns are AC / DE / CL.
pfam_info = pd.DataFrame(pfam_dat).T

In [None]:
# PFAM median-coverage table (first column as index), plus a copy joined with
# the Pfam annotations on the shared index.
_pfam_cov_path = "../atlas/Genecatalog/counts/PFAMs.median_coverage.tsv"
pfam_cov = pd.read_csv(_pfam_cov_path, sep="\t", index_col=0)
pfam_cov_info = pfam_info.merge(pfam_cov, left_index=True, right_index=True)

In [None]:
# Convert each column to percentages of its column total, then attach the
# Pfam annotations on the shared index.
_column_totals = pfam_cov.sum()
pfam_relab = pfam_cov.div(_column_totals) * 100
pfam_relab_info = pfam_info.merge(pfam_relab, left_index=True, right_index=True)

In [None]:
# Per-generation PFAM tables: filter with filter_data() on the samples of each
# generation, then CLR-transform and attach the Pfam annotations.
pfam_cov_F0, pfam_cov_F1, pfam_cov_F2 = [
    filter_data(pfam_cov, sample_df, "Generation", _gen)
    for _gen in ("F0", "F1", "F2")
]

pfam_clr_F0, pfam_clr_F1, pfam_clr_F2 = [
    pd.merge(clr_transform(_cov), pfam_info, left_index=True, right_index=True)
    for _cov in (pfam_cov_F0, pfam_cov_F1, pfam_cov_F2)
]

#### F0 generation

In [None]:
ft="pfam"
generation="F0"
# MixOmics
F0_pfam_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F0_pfam_samples = pd.merge(F0_pfam_samples, sample_df, left_index=True, right_index=True)
F0_pfam_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F0_pfam_features = pd.merge(F0_pfam_features, pfam_info, left_index=True, right_index=True)
F0_2_pfam_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F0_2_pfam_samples = pd.merge(F0_2_pfam_samples, sample_df, left_index=True, right_index=True)
F0_2_pfam_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F0_2_pfam_features = pd.merge(F0_2_pfam_features, pfam_info, left_index=True, right_index=True)
F0_2_pfam_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F0_2_pfam_samples.Treatment]


# DESeq2
# One result table per contrast; files are named like res.pfam.F0_HvsF0_C.tsv
F0_pfam_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_C.tsv", sep="\t", index_col=0)
F0_pfam_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF0_C.tsv", sep="\t", index_col=0)
F0_pfam_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_L.tsv", sep="\t", index_col=0)
F0_pfam_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# Reduce each table to one row per feature: melt to long form, take the first
# non-null entry per feature, then discard the melted variable/value columns.
# The contrast label is assigned as a scalar so pandas broadcasts it to the
# length of the frame it is assigned to (the labels for source3/source4 were
# previously sized from source2, which fails when the tables differ in length).
source1 = pd.melt(F0_pfam_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F0_pfam_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F0_pfam_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F0_pfam_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
# Stack the four contrasts into one long table and add the PFAM annotation.
F0_pfam_deseq = pd.concat(
    [
        src.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]]
        for src in (source1, source2, source3, source4)
    ]
)
F0_pfam_deseq = pd.merge(F0_pfam_deseq, pfam_info, left_on="feature", right_index=True)

# ALDEx2
F0_pfam_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F0_pfam_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F0_pfam_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F0_pfam_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = ["LvsC"]*source1.shape[0]
source2 = pd.merge(F0_pfam_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F0_pfam_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = ["HvsC"]*source1.shape[0]
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = ["HvsCL"]*source3.shape[0]
F0_pfam_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F0_pfam_aldex_glm = pd.merge(F0_pfam_aldex, pfam_info, left_index=True, right_index=True)
F0_pfam_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F0_pfam_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F0_pfam_aldex_HC["contrast"] = ["HvsC"]*F0_pfam_aldex_HC.shape[0]
F0_pfam_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_pfam_aldex_LC["contrast"] = ["LvsC"]*F0_pfam_aldex_LC.shape[0]
F0_pfam_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_pfam_aldex = pd.concat([F0_pfam_aldex_HC, F0_pfam_aldex_LC])
F0_pfam_aldex = pd.merge(F0_pfam_aldex, pfam_info, left_index=True, right_index=True)

# Write the combined F0 tables next to the per-tool results.
F0_pfam_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F0_pfam_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F0_pfam_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
# A PFAM is flagged when |effect| > 1 or its adjusted p-value (wi_eBH) is below 0.1.
F0_pfam_aldex["isDE"] = (F0_pfam_aldex.effect.abs() > 1) | (F0_pfam_aldex.wi_eBH < 0.1)
F0_pfam_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
# Diversity metrics for the F0 samples, joined onto the sample metadata.
F0_pfam_div = sample_df.merge(
    calc_diversity(pfam_cov_F0), left_index=True, right_index=True
).reset_index()

# Long format: one row per sample and diversity metric.
source_pfam = F0_pfam_div[["index", "shannon", "observed", "Treatment", "Reads_pe"]].melt(
    id_vars=["index", "Treatment", "Reads_pe"],
    var_name="diversity",
)

# One boxplot panel per metric, each with its own y-axis.
alt.Chart(source_pfam, title="F0 PFAMs").mark_boxplot().encode(
    x=alt.X("Treatment", sort=["C", "L", "H"]),
    y="value",
    column="diversity",
    color=alt.Color("Treatment", sort=["C", "L", "H"]),
).resolve_scale(y="independent")

In [None]:
rank = "CL"
# Left panel: samples on mixOmics components 1 and 2, coloured by treatment.
a = (
    alt.Chart(
        F0_pfam_samples.reset_index(),
        title="mixOmics sample variates (PFAMs; F0)",
    )
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: only features whose correlation with either component exceeds 0.5 in absolute value.
b = (
    alt.Chart(
        F0_pfam_features.loc[
            (F0_pfam_features.corr_comp1.abs() > 0.5)
            | (F0_pfam_features.corr_comp2.abs() > 0.5)
        ].reset_index(),
        title="mixOmics feature correlations (PFAMs; F0)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
            "DE",
            "CL",
            "AC",
        ],
    )
    .interactive()
)

alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

In [None]:
# Ten features with the largest absolute loading on each component.
fts1 = F0_pfam_features.loading_comp1.abs().sort_values(ascending=False).index[:10].tolist()
fts2 = F0_pfam_features.loading_comp2.abs().sort_values(ascending=False).index[:10].tolist()
source1 = F0_pfam_features.loc[fts1].reset_index()
source2 = F0_pfam_features.loc[fts2].reset_index()
# Bars are ordered by the signed loading, from most negative to most positive.
a = alt.Chart(source1, title="Loadings PFAMs comp1 (F0)").mark_bar().encode(
    x=alt.X("index", sort=source1.sort_values("loading_comp1")["index"].tolist()),
    y="loading_comp1",
    color="CL",
    tooltip=["index", "DE", "CL", "AC"],
)
b = alt.Chart(source2, title="Loadings PFAMs comp2 (F0)").mark_bar().encode(
    x=alt.X("index", sort=source2.sort_values("loading_comp2")["index"].tolist()),
    y="loading_comp2",
    color="CL",
    tooltip=["index", "DE", "CL", "AC"],
)
alt.hconcat(a, b).resolve_scale(color="shared")

In [None]:
w = 250
h = 100
# Ten features with the largest absolute loading on each mixOmics component.
fts1 = list(abs(F0_pfam_features.loading_comp1).sort_values(ascending=False).head(10).index)
fts2 = list(abs(F0_pfam_features.loading_comp2).sort_values(ascending=False).head(10).index)
# De-duplicate once, keeping first-seen order; a set would reorder the rows
# arbitrarily between runs and make the rendered chart data non-reproducible.
_fts = list(dict.fromkeys(fts1 + fts2))

# Relative abundance (%) of the selected features, long format, F0 samples only.
_relab = pfam_relab_info.loc[_fts]
_relab = pd.merge(
    pd.melt(_relab.reset_index(), id_vars=["index", "DE", "AC", "CL"], var_name="Sample", value_name="%"),
    sample_df, left_on="Sample", right_index=True,
)
_relab = _relab.loc[_relab.Generation == "F0"]

# CLR values of the same features (pfam_clr_F0 is built from the F0-filtered table).
_clr = pfam_clr_F0.loc[_fts]
_clr = pd.merge(
    pd.melt(_clr.reset_index(), id_vars=["index", "DE", "AC", "CL"], var_name="Sample", value_name="CLR"),
    sample_df, left_on="Sample", right_index=True,
)

# Sample order for the bar chart: C, then L, then H; sorted by name within each treatment.
x_order = [
    sample
    for treatment in ("C", "L", "H")
    for sample in sorted(_relab.loc[_relab.Treatment == treatment, "Sample"].unique())
]
a = alt.Chart(_relab).mark_boxplot().encode(
    x="index", y="%", color="CL", column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["index", "DE", "AC", "CL"],
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index", y="CLR", color="CL", column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=["index", "DE", "AC", "CL"],
).properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y('sum(%)'), x=alt.X("Sample", sort=x_order), color="CL",
    tooltip=["index", "DE", "AC", "CL"],
).properties(width=w*3)

alt.vconcat(a, b, c)

In [None]:
# Number of PFAMs flagged by DESeq2 in each contrast, smallest count first.
F0_pfam_deseq[F0_pfam_deseq["isDE"] == True].groupby(["contrast"]).size().sort_values()

In the F0 generation DESeq2 identified 22 PFAMs that differed between Low and Control and 46 that differed between High and Low.

In [None]:
# Rows from the four listed contrasts whose |log2FoldChange| exceeds 1.
_ = F0_pfam_deseq.loc[
    F0_pfam_deseq.contrast.isin(["HvsC", "LvsC", "HvsCL", "HvsL"])
    & (F0_pfam_deseq.log2FoldChange.abs() > 1)
]
# Fold change against adjusted p-value (y-axis reversed), one panel per contrast.
alt.Chart(_, title="DESeq2 (pfams; F0)").mark_point().encode(
    x="log2FoldChange",
    y=alt.Y("padj", sort="descending"),
    color="isDE",
    column=alt.Column("contrast", sort=["HvsC", "LvsC", "HvsL", "HvsCL"]),
    tooltip=["feature", "DE", "AC"],
).properties(width=200, height=200)

Below are heatmaps showing log2FoldChange for significant PFAMs between the LvsC and HvsC comparisons.

In [None]:
# PFAMs flagged by DESeq2; one single-column heatmap per contrast, coloured by log2 fold change.
source = F0_pfam_deseq[F0_pfam_deseq.isDE == True]
a1, a2 = (
    alt.Chart(source[source.contrast == contrast]).mark_rect().encode(
        x="contrast",
        y=alt.Y("DE", sort="color"),
        color=alt.Color("log2FoldChange").scale(scheme="redblue", reverse=True),
        tooltip=["AC", "DE", "CL", "log2FoldChange"],
    )
    for contrast in ("LvsC", "HvsC")
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

In [None]:
# Number of PFAMs flagged by the ALDEx2 isDE rule in each contrast, smallest count first.
F0_pfam_aldex[F0_pfam_aldex["isDE"] == True].groupby(["contrast"]).size().sort_values()

ALDEx2 identified 10 and 15 PFAMs as differentially abundant in the High and Low treatments, respectively. Note that a PFAM is counted here if either `abs(effect)>1` or its adjusted p-value is < 0.1.

In [None]:
# Keep only features whose between-group difference exceeds 1 in absolute value.
source = F0_pfam_aldex.reset_index().loc[lambda df: df.diff_btw.abs() > 1]
# Same plot twice, with diff_btw and then effect on the x-axis.
a_1, a_2 = (
    alt.Chart(source, title="ALDEx2 (pfams; F0)").mark_circle().encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature", "DE", "AC", "effect", "wi_eBH", "isDE"],
    ).properties(width=200, height=200)
    for x_field in ("diff_btw", "effect")
)
alt.hconcat(a_1, a_2)

In [None]:
# PFAMs flagged by the ALDEx2 isDE rule; one single-column heatmap per contrast, coloured by diff_btw.
source = F0_pfam_aldex[F0_pfam_aldex.isDE == True]
a1, a2 = (
    alt.Chart(source[source.contrast == contrast]).mark_rect().encode(
        x="contrast",
        y=alt.Y("DE", sort="color"),
        color=alt.Color("diff_btw").scale(scheme="redblue", reverse=True),
        tooltip=["AC", "DE", "CL", "diff_btw", "effect", "wi_eBH"],
    )
    for contrast in ("LvsC", "HvsC")
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

Below are heatmaps of PFAMs identified as important/significant by at least two of the tools.

In [None]:
# One ident_fts() chart per contrast (LvsC, then HvsC) built from the F0 tables.
a1, a2 = (
    ident_fts(
        mixomics=F0_pfam_features,
        deseq=F0_pfam_deseq,
        aldex=F0_pfam_aldex,
        contrast=contrast,
        tooltip=["feature", "AC", "DE", "CL", "diff_btw", "wi_eBH", "effect"],
        mixomics_top=100000,
    )
    for contrast in ("LvsC", "HvsC")
)
# resolve_scale() takes one keyword per channel. A bare positional "independent"
# is not applied to the colour channel, so name it explicitly, as the other
# heatmap cells do, to give each panel its own colour scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

Below we plot features marked as differentially abundant by both ALDEx2 **and** DESeq2 **and** that were in the top 20 features for components 1 and 2 in the mixOmics analysis.

#### F1 generation

In [None]:
ft="pfam"
generation="F1"
# MixOmics
F1_pfam_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F1_pfam_samples = pd.merge(F1_pfam_samples, sample_df, left_index=True, right_index=True)
F1_pfam_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F1_pfam_features = pd.merge(F1_pfam_features, pfam_info, left_index=True, right_index=True)
F1_2_pfam_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F1_2_pfam_samples = pd.merge(F1_2_pfam_samples, sample_df, left_index=True, right_index=True)
F1_2_pfam_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F1_2_pfam_features = pd.merge(F1_2_pfam_features, pfam_info, left_index=True, right_index=True)
F1_2_pfam_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F1_2_pfam_samples.Treatment]


# DESeq2
# One result table per contrast; files are named like res.pfam.F1_HvsF1_C.tsv
F1_pfam_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_C.tsv", sep="\t", index_col=0)
F1_pfam_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF1_C.tsv", sep="\t", index_col=0)
F1_pfam_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_L.tsv", sep="\t", index_col=0)
F1_pfam_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# Reduce each table to one row per feature: melt to long form, take the first
# non-null entry per feature, then discard the melted variable/value columns.
# The contrast label is assigned as a scalar so pandas broadcasts it to the
# length of the frame it is assigned to (the labels for source3/source4 were
# previously sized from source2, which fails when the tables differ in length).
source1 = pd.melt(F1_pfam_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F1_pfam_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F1_pfam_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F1_pfam_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
# Stack the four contrasts into one long table and add the PFAM annotation.
F1_pfam_deseq = pd.concat(
    [
        src.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]]
        for src in (source1, source2, source3, source4)
    ]
)
F1_pfam_deseq = pd.merge(F1_pfam_deseq, pfam_info, left_on="feature", right_index=True)

# ALDEx2
F1_pfam_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F1_pfam_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F1_pfam_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F1_pfam_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = ["LvsC"]*source1.shape[0]
source2 = pd.merge(F1_pfam_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F1_pfam_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = ["HvsC"]*source1.shape[0]
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = ["HvsCL"]*source3.shape[0]
F1_pfam_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F1_pfam_aldex_glm = pd.merge(F1_pfam_aldex, pfam_info, left_index=True, right_index=True)
F1_pfam_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F1_pfam_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F1_pfam_aldex_HC["contrast"] = ["HvsC"]*F1_pfam_aldex_HC.shape[0]
F1_pfam_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F1_pfam_aldex_LC["contrast"] = ["LvsC"]*F1_pfam_aldex_LC.shape[0]
F1_pfam_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F1_pfam_aldex = pd.concat([F1_pfam_aldex_HC, F1_pfam_aldex_LC])
F1_pfam_aldex = pd.merge(F1_pfam_aldex, pfam_info, left_index=True, right_index=True)

# Write the combined F1 tables next to the per-tool results.
F1_pfam_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F1_pfam_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F1_pfam_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
# A PFAM is flagged when |effect| > 1 or its adjusted p-value (wi_eBH) is below 0.1.
F1_pfam_aldex["isDE"] = (F1_pfam_aldex.effect.abs() > 1) | (F1_pfam_aldex.wi_eBH < 0.1)
F1_pfam_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
F1_pfam_div = calc_diversity(pfam_cov_F1)

F1_pfam_div = pd.merge(sample_df, F1_pfam_div, left_index=True, right_index=True).reset_index()

source_pfam = pd.melt(F1_pfam_div.loc[:, ["index","shannon","observed","Treatment","Reads_pe"]], 
        id_vars=["index","Treatment","Reads_pe"], var_name="diversity")

alt.Chart(source_pfam, title="F1 PFAMs").mark_boxplot().encode(
    x=alt.X("Treatment", sort=["C","L","H"]), y="value", column="diversity", 
    color=alt.Color("Treatment",sort=["C","L","H"]),
).resolve_scale(y="independent")

In [None]:
rank = "CL"
a = alt.Chart(F1_pfam_samples.reset_index(),
             title="mixOmics sample variates (PFAMs; F1)").mark_circle(size=120).encode(
    x='comp1',
    y='comp2',
    color=alt.Color("Treatment", sort=["C","L","H"]),
    tooltip=['Sample', 'Generation', 'Treatment']
).interactive()

b = alt.Chart(F1_pfam_features.loc[(abs(F1_pfam_features.corr_comp1)>0.5)|((abs(F1_pfam_features.corr_comp2)>0.5))].reset_index(),
             title="mixOmics feature correlations (PFAMs; F1)").mark_circle(size=60).encode(
    x="corr_comp1",
    y="corr_comp2",
    color=rank,
    #size="Stability",
    tooltip=["index","loading_comp1","loading_comp2","stability_comp1","stability_comp2","corr_comp1","corr_comp2"]+ ["DE","CL","AC"]
).interactive()


alt.hconcat(a,b).resolve_scale(
    color='independent',
    shape='independent'
)

In [None]:
fts1 = list(abs(F1_pfam_features.loading_comp1).sort_values(ascending=False).head(10).index)
fts2 = list(abs(F1_pfam_features.loading_comp2).sort_values(ascending=False).head(10).index)
source1 = F1_pfam_features.loc[fts1].reset_index()
source2 = F1_pfam_features.loc[fts2].reset_index()
a = alt.Chart(source1, title="Loadings PFAMs comp1 (F1)").mark_bar().encode(
    x=alt.X("index", sort=list(source1.sort_values("loading_comp1")["index"])), y="loading_comp1", color="CL",
    tooltip=["index"]+["DE","CL","AC"]
)
b = alt.Chart(source2, title="Loadings PFAMs comp2 (F1)").mark_bar().encode(
    x=alt.X("index", sort=list(source2.sort_values("loading_comp2")["index"])), y="loading_comp2", color="CL",
    tooltip=["index"]+["DE","CL","AC"]
)
alt.hconcat(a,b).resolve_scale(
    color='shared',
)

In [None]:
w = 250
h=100
fts1 = list(abs(F1_pfam_features.loading_comp1).sort_values(ascending=False).head(10).index)
fts2 = list(abs(F1_pfam_features.loading_comp2).sort_values(ascending=False).head(10).index)
_relab = pfam_relab_info.loc[list(set(list(fts1+fts2)))]
_relab = pd.merge(pd.melt(_relab.reset_index(), id_vars=["index","DE","AC","CL"], var_name="Sample", value_name="%"),
             sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation=="F1"]
_clr = pfam_clr_F1.loc[list(set(list(fts1+fts2)))]
_clr = pd.merge(pd.melt(_clr.reset_index(), id_vars=["index","DE","AC","CL"], var_name="Sample", value_name="CLR"),
             sample_df, left_on="Sample", right_index=True)
x_order = [x for x in _relab.loc[_relab.Treatment=="C"].sort_values("Sample").Sample.unique()] + [x for x in _relab.loc[_relab.Treatment=="L"].sort_values("Sample").Sample.unique()] + [x for x in _relab.loc[_relab.Treatment=="H"].sort_values("Sample").Sample.unique()]
a = alt.Chart(_relab).mark_boxplot().encode(
    x="index", y="%", color="CL", column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["index","DE","AC","CL"],
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index", y="CLR", color="CL", column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["index","DE","AC","CL"]
).properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y('sum(%)'), x=alt.X("Sample", sort=x_order), color="CL",
    tooltip=["index","DE","AC","CL"]
).properties(width=w*3)

alt.vconcat(a,b, c)

In [None]:
F1_pfam_deseq.loc[F1_pfam_deseq.isDE==True].groupby(["contrast"]).size().sort_values()

In the F1 generation DESeq2 identified no PFAMs with a significant difference between treatments.

In [None]:
_ = F1_pfam_deseq.loc[F1_pfam_deseq.contrast.isin(["HvsC","LvsC","HvsCL", "HvsL"])]
_ = _.loc[abs(_.log2FoldChange)>1]
alt.Chart(_, title="DESeq2 (pfams; F1)").mark_point().encode(
    x="log2FoldChange", y=alt.Y("padj", sort="descending"),color="isDE",
    column=alt.Column("contrast", sort=["HvsC","LvsC", "HvsL", "HvsCL"]),
    tooltip=["feature", "DE", "AC"]
).properties(width=200, height=200)

In [None]:
F1_pfam_aldex.loc[F1_pfam_aldex.isDE==True].groupby(["contrast"]).size().sort_values()

ALDEx2 identified 32 and 51 PFAMs as differentially abundant in the High and Low treatments, respectively. Note that a PFAM is counted here if either `abs(effect)>1` or its adjusted p-value is < 0.1.

In [None]:
source = F1_pfam_aldex.reset_index()
source = source.loc[abs(source.diff_btw)>1]
a_1 = alt.Chart(source, title="ALDEx2 (pfams; F1)").mark_circle().encode(
    x = "diff_btw", y=alt.Y("wi_eBH", sort="descending"), 
    column="contrast", color="isDE",
    tooltip=["feature", "DE","AC", "effect", "wi_eBH", "isDE"],
).properties(width=200, height=200)
a_2 = alt.Chart(source, title="ALDEx2 (pfams; F1)").mark_circle().encode(
    x = "effect", y=alt.Y("wi_eBH", sort="descending"), column="contrast", color="isDE",
    tooltip=["feature", "DE","AC", "effect", "wi_eBH", "isDE"]
).properties(width=200, height=200)
alt.hconcat(a_1, a_2)

In [None]:
source = F1_pfam_aldex.loc[F1_pfam_aldex.isDE==True]
a1 = alt.Chart(source.loc[source.contrast=="LvsC"]).mark_rect().encode(
    x = "contrast", y=alt.Y("DE", sort="color"), color=alt.Color("diff_btw").scale(scheme='redblue', reverse=True),
    tooltip=["AC","DE","CL","diff_btw", "effect", "wi_eBH"]
)
a2 = alt.Chart(source.loc[source.contrast=="HvsC"]).mark_rect().encode(
    x = "contrast", y=alt.Y("DE",sort="color"), color=alt.Color("diff_btw").scale(scheme='redblue', reverse=True),
    tooltip=["AC","DE","CL","diff_btw","effect", "wi_eBH"]
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

Below are heatmaps of PFAMs identified as important/significant by at least two of the tools.

In [None]:
# One ident_fts() chart per contrast (LvsC, then HvsC) built from the F1 tables.
a1, a2 = (
    ident_fts(
        mixomics=F1_pfam_features,
        deseq=F1_pfam_deseq,
        aldex=F1_pfam_aldex,
        contrast=contrast,
        tooltip=["feature", "AC", "DE", "CL", "diff_btw", "wi_eBH", "effect"],
        mixomics_top=100000,
    )
    for contrast in ("LvsC", "HvsC")
)
# resolve_scale() takes one keyword per channel. A bare positional "independent"
# is not applied to the colour channel, so name it explicitly, as the other
# heatmap cells do, to give each panel its own colour scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

#### F2 generation

In [None]:
ft="pfam"
generation="F2"
# MixOmics
F2_pfam_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F2_pfam_samples = pd.merge(F2_pfam_samples, sample_df, left_index=True, right_index=True)
F2_pfam_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F2_pfam_features = pd.merge(F2_pfam_features, pfam_info, left_index=True, right_index=True)
F2_2_pfam_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F2_2_pfam_samples = pd.merge(F2_2_pfam_samples, sample_df, left_index=True, right_index=True)
F2_2_pfam_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F2_2_pfam_features = pd.merge(F2_2_pfam_features, pfam_info, left_index=True, right_index=True)
F2_2_pfam_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F2_2_pfam_samples.Treatment]


# DESeq2
# One result table per contrast; files are named like res.pfam.F2_HvsF2_C.tsv
F2_pfam_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_C.tsv", sep="\t", index_col=0)
F2_pfam_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF2_C.tsv", sep="\t", index_col=0)
F2_pfam_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_L.tsv", sep="\t", index_col=0)
F2_pfam_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# Reduce each table to one row per feature: melt to long form, take the first
# non-null entry per feature, then discard the melted variable/value columns.
# The contrast label is assigned as a scalar so pandas broadcasts it to the
# length of the frame it is assigned to (the labels for source3/source4 were
# previously sized from source2, which fails when the tables differ in length).
source1 = pd.melt(F2_pfam_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F2_pfam_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F2_pfam_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F2_pfam_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
# Stack the four contrasts into one long table and add the PFAM annotation.
F2_pfam_deseq = pd.concat(
    [
        src.loc[:, ["feature", "isDE", "padj", "log2FoldChange", "contrast"]]
        for src in (source1, source2, source3, source4)
    ]
)
F2_pfam_deseq = pd.merge(F2_pfam_deseq, pfam_info, left_on="feature", right_index=True)

# ALDEx2
F2_pfam_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F2_pfam_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F2_pfam_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F2_pfam_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = ["LvsC"]*source1.shape[0]
source2 = pd.merge(F2_pfam_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F2_pfam_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = ["HvsC"]*source1.shape[0]
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = ["HvsCL"]*source3.shape[0]
F2_pfam_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F2_pfam_aldex_glm = pd.merge(F2_pfam_aldex, pfam_info, left_index=True, right_index=True)
F2_pfam_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F2_pfam_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F2_pfam_aldex_HC["contrast"] = ["HvsC"]*F2_pfam_aldex_HC.shape[0]
F2_pfam_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F2_pfam_aldex_LC["contrast"] = ["LvsC"]*F2_pfam_aldex_LC.shape[0]
F2_pfam_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F2_pfam_aldex = pd.concat([F2_pfam_aldex_HC, F2_pfam_aldex_LC])
F2_pfam_aldex = pd.merge(F2_pfam_aldex, pfam_info, left_index=True, right_index=True)

# Flag ALDEx2 hits once (this computation used to be repeated verbatim further down):
# a PFAM is flagged when |effect| > 1 or its adjusted p-value (wi_eBH) is below 0.1.
F2_pfam_aldex["isDE"] = (F2_pfam_aldex.effect.abs() > 1) | (F2_pfam_aldex.wi_eBH < 0.1)

# Write the combined F2 tables next to the per-tool results.
F2_pfam_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F2_pfam_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F2_pfam_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
F2_pfam_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
F2_pfam_div = calc_diversity(pfam_cov_F2)

F2_pfam_div = pd.merge(sample_df, F2_pfam_div, left_index=True, right_index=True).reset_index()

source_pfam = pd.melt(F2_pfam_div.loc[:, ["index","shannon","observed","Treatment","Reads_pe"]], 
        id_vars=["index","Treatment","Reads_pe"], var_name="diversity")

alt.Chart(source_pfam, title="F2 PFAMs").mark_boxplot(size=25).encode(
    x=alt.X("Treatment", sort=["C","L","H"]), y="value", column="diversity", 
    color=alt.Color("Treatment",sort=["C","L","H"]),
).resolve_scale(y="independent").properties(width=100, height=200)

In [None]:
rank = "CL"
a = alt.Chart(F2_pfam_samples.reset_index(),
             title="mixOmics sample variates (PFAMs; F2)").mark_circle(size=120).encode(
    x='comp1',
    y='comp2',
    color=alt.Color("Treatment", sort=["C","L","H"]),
    tooltip=['Sample', 'Generation', 'Treatment']
).interactive()

b = alt.Chart(F2_pfam_features.loc[(abs(F2_pfam_features.corr_comp1)>0.5)|((abs(F2_pfam_features.corr_comp2)>0.5))].reset_index(),
             title="mixOmics feature correlations (PFAMs; F2)").mark_circle(size=60).encode(
    x="corr_comp1",
    y="corr_comp2",
    color=rank,
    #size="Stability",
    tooltip=["index","loading_comp1","loading_comp2","stability_comp1","stability_comp2","corr_comp1","corr_comp2"]+ ["DE","CL","AC"]
).interactive()


alt.hconcat(a,b).resolve_scale(
    color='independent',
    shape='independent'
)

In [None]:
fts1 = list(abs(F2_pfam_features.loading_comp1).sort_values(ascending=False).head(10).index)
fts2 = list(abs(F2_pfam_features.loading_comp2).sort_values(ascending=False).head(10).index)
source1 = F2_pfam_features.loc[fts1].reset_index()
source2 = F2_pfam_features.loc[fts2].reset_index()
a = alt.Chart(source1, title="Loadings PFAMs comp1 (F2)").mark_bar().encode(
    x=alt.X("index", sort=list(source1.sort_values("loading_comp1")["index"])), y="loading_comp1", color="CL",
    tooltip=["index"]+["DE","CL","AC"]
)
b = alt.Chart(source2, title="Loadings PFAMs comp2 (F2)").mark_bar().encode(
    x=alt.X("index", sort=list(source2.sort_values("loading_comp2")["index"])), y="loading_comp2", color="CL",
    tooltip=["index"]+["DE","CL","AC"]
)
alt.hconcat(a,b).resolve_scale(
    color='shared',
)

In [None]:
w = 250
h=100
fts1 = list(abs(F2_pfam_features.loading_comp1).sort_values(ascending=False).head(10).index)
fts2 = list(abs(F2_pfam_features.loading_comp2).sort_values(ascending=False).head(10).index)
_relab = pfam_relab_info.loc[list(set(list(fts1+fts2)))]
_relab = pd.merge(pd.melt(_relab.reset_index(), id_vars=["index","DE","AC","CL"], var_name="Sample", value_name="%"),
             sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation=="F2"]
_clr = pfam_clr_F2.loc[list(set(list(fts1+fts2)))]
_clr = pd.merge(pd.melt(_clr.reset_index(), id_vars=["index","DE","AC","CL"], var_name="Sample", value_name="CLR"),
             sample_df, left_on="Sample", right_index=True)
x_order = [x for x in _relab.loc[_relab.Treatment=="C"].sort_values("Sample").Sample.unique()] + [x for x in _relab.loc[_relab.Treatment=="L"].sort_values("Sample").Sample.unique()] + [x for x in _relab.loc[_relab.Treatment=="H"].sort_values("Sample").Sample.unique()]
a = alt.Chart(_relab).mark_boxplot().encode(
    x="index", y="%", color="CL", column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["index","DE","AC","CL"],
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index", y="CLR", color="CL", column=alt.Column("Treatment", sort=["C","L","H"]),
    tooltip=["index","DE","AC","CL"]
).properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    y=alt.Y('sum(%)'), x=alt.X("Sample", sort=x_order), color="CL",
    tooltip=["index","DE","AC","CL"]
).properties(width=w*3)

alt.vconcat(a,b, c)

In [None]:
F2_pfam_deseq.loc[F2_pfam_deseq.isDE==True].groupby(["contrast"]).size().sort_values()

In the F2 generation DESeq2 identified 10 and 17 PFAMs that differed between Low vs Control and High vs Control, respectively.

In [None]:
_ = F2_pfam_deseq.loc[F2_pfam_deseq.contrast.isin(["HvsC","LvsC","HvsCL", "HvsL"])]
_ = _.loc[abs(_.log2FoldChange)>1]
alt.Chart(_, title="DESeq2 (pfams; F2)").mark_point().encode(
    x="log2FoldChange", y=alt.Y("padj", sort="descending"),color="isDE",
    column=alt.Column("contrast", sort=["HvsC","LvsC", "HvsL", "HvsCL"]),
    tooltip=["feature", "DE", "AC"]
).properties(width=200, height=200)

In [None]:
source = F2_pfam_deseq.loc[F2_pfam_deseq.isDE==True]
a1 = alt.Chart(source.loc[source.contrast=="LvsC"]).mark_rect().encode(
    x = "contrast", y=alt.Y("DE", sort="color"), color=alt.Color("log2FoldChange").scale(scheme='redblue', reverse=True),
    tooltip=["AC","DE","CL","log2FoldChange"]
)
a2 = alt.Chart(source.loc[source.contrast=="HvsC"]).mark_rect().encode(
    x = "contrast", y=alt.Y("DE",sort="color"), color=alt.Color("log2FoldChange").scale(scheme='redblue', reverse=True),
    tooltip=["AC","DE","CL","log2FoldChange"]
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

For F2, ALDEx2 did not identify any significant pfams.

In [None]:
source = F2_pfam_aldex.reset_index()
source = source.loc[abs(source.diff_btw)>1]
a_1 = alt.Chart(source, title="ALDEx2 (pfams; F2)").mark_circle().encode(
    x = "diff_btw", y=alt.Y("wi_eBH", sort="descending"), 
    column="contrast", color="isDE",
    tooltip=["feature", "DE","AC", "effect", "wi_eBH", "isDE"],
).properties(width=200, height=200)
a_2 = alt.Chart(source, title="ALDEx2 (pfams; F2)").mark_circle().encode(
    x = "effect", y=alt.Y("wi_eBH", sort="descending"), column="contrast", color="isDE",
    tooltip=["feature", "DE","AC", "effect", "wi_eBH", "isDE"]
).properties(width=200, height=200)
alt.hconcat(a_1, a_2)

Below are heatmaps of PFAMs identified as important/significant by at least two of the tools. Note that here the intersection is taken with all PFAMs used in the mixOmics model, not only the top-ranked ones.

In [None]:
# One ident_fts() chart per contrast (LvsC, then HvsC) built from the F2 tables.
a1, a2 = (
    ident_fts(
        mixomics=F2_pfam_features,
        deseq=F2_pfam_deseq,
        aldex=F2_pfam_aldex,
        contrast=contrast,
        tooltip=["feature", "AC", "DE", "CL", "diff_btw", "wi_eBH", "effect"],
        mixomics_top=10000000,
    )
    for contrast in ("LvsC", "HvsC")
)
# resolve_scale() takes one keyword per channel. A bare positional "independent"
# is not applied to the colour channel, so name it explicitly, as the other
# heatmap cells do, to give each panel its own colour scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

### KEGG orthologs

In [None]:
import json
def get_kegg_ortholog_hierarchy(s):
    """Flatten a parsed KEGG ``ko00001`` hierarchy into a per-KO lookup.

    The tree is walked three category levels deep below the root; the children
    of each third-level node are treated as KO entries.

    Parameters
    ----------
    s : dict
        Root node of the hierarchy. Every node is a dict with a ``"name"``
        string and, unless it is a leaf, a ``"children"`` list of nodes.

    Returns
    -------
    dict
        Maps each KO id (the first space-separated token of the entry name) to
        a list holding one dict per placement of that KO in the tree. Each dict
        has the keys ``"KO_category1"``, ``"KO_category2"``, ``"pathway"``,
        ``"name"`` and ``"enzymes"`` (list of EC number strings; empty when the
        name carries no ``[EC:...]`` annotation).

    Raises
    ------
    KeyError
        If the root has no ``"children"`` or a visited node has no ``"name"``.
    """
    hier = {}
    # First level is 'ko00001'
    for d1 in s['children']:
        c1 = d1['name']
        # Below the root, a node without children is skipped at every level
        # instead of only at the third one.
        for d2 in d1.get('children', []):
            c2 = d2['name']
            for d3 in d2.get('children', []):
                c3 = d3['name']
                for ko in d3.get('children', []):
                    # Anything after a tab is not part of the KO name
                    ko_name = ko['name'].split("\t")[0]
                    ko_id = ko_name.split(" ")[0]
                    if "[EC:" in ko_name:
                        # Cut on the literal "[EC:" marker: str.lstrip("EC:")
                        # strips a character set rather than a prefix, and
                        # taking the last "[" picks the wrong bracket when
                        # another one follows the EC annotation.
                        ec_field = ko_name.split("[EC:", 1)[1].split("]", 1)[0]
                        enzymes = ec_field.split()
                    else:
                        enzymes = []
                    d = {"KO_category1": c1, "KO_category2": c2, "pathway": c3, "name": ko_name, "enzymes": enzymes}
                    hier.setdefault(ko_id, []).append(d)
    return hier
# Parse the KEGG ko00001 hierarchy and flatten it to a per-KO lookup.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open("ko00001.keg.json", 'r', encoding="utf-8") as fhin:
    s = json.load(fhin)
kegg_hier = get_kegg_ortholog_hierarchy(s)

In [None]:
from collections import Counter

# For every KO keep the single most frequent (KO_category1, KO_category2)
# pair among all of its placements in the KEGG hierarchy.
ko2cat = {}

for key, l in kegg_hier.items():
    # Count the pairs as tuples, so category names are never joined and split
    # again on "|".
    cat_counts = Counter((item["KO_category1"], item["KO_category2"]) for item in l)
    # most_common() returns equal counts in first-seen order, so ties resolve
    # the same way on every run (set iteration order does not).
    cat1, cat2 = cat_counts.most_common(1)[0][0]
    ko2cat[key] = {"KO_category1": cat1, "KO_category2": cat2}

In [None]:
# KO annotation table (lines starting with "#" are skipped) joined on its
# index with the per-KO categories; index labels then get a "ko:" prefix.
ko = pd.read_csv("kegg_kos.tsv", sep="\t", index_col=0, comment="#")
ko_info = ko.merge(pd.DataFrame(ko2cat).T, left_index=True, right_index=True)
ko_info = ko_info.rename(index=lambda x: f"ko:{x}")

In [None]:
# KO coverage table, plus a copy with the KO annotation columns in front
# (inner join on the index).
ko_cov = pd.read_csv(
    "../atlas/Genecatalog/counts/KO.median_coverage.tsv",
    sep="\t",
    index_col=0,
)
ko_cov_info = ko_info.merge(ko_cov, left_index=True, right_index=True)

In [None]:
# Column-wise relative abundance in percent (each column sums to 100),
# plus a copy joined with the KO annotation.
ko_relab = ko_cov / ko_cov.sum() * 100
ko_relab_info = ko_info.merge(ko_relab, left_index=True, right_index=True)

In [None]:
# Filtered KO coverage tables, one per generation
ko_cov_F0, ko_cov_F1, ko_cov_F2 = (
    filter_data(ko_cov, sample_df, "Generation", gen)
    for gen in ("F0", "F1", "F2")
)

# CLR-transformed tables with the KO annotation columns appended
ko_clr_F0, ko_clr_F1, ko_clr_F2 = (
    clr_transform(cov).merge(ko_info, left_index=True, right_index=True)
    for cov in (ko_cov_F0, ko_cov_F1, ko_cov_F2)
)

#### F0 generation

In [None]:
# Load the F0 KEGG-ortholog results of the three tools (mixOmics sPLS-DA,
# DESeq2, ALDEx2), annotate them with ko_info and write combined tables to
# ../atlas/stats/.
ft="ko"
generation="F0"
# MixOmics
F0_ko_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F0_ko_samples = pd.merge(F0_ko_samples, sample_df, left_index=True, right_index=True)
F0_ko_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F0_ko_features = pd.merge(F0_ko_features, ko_info, left_index=True, right_index=True)
# Second mixOmics run ("_2" files); its samples get a Treatment_2 label in
# which C and L are merged into "C+L".
F0_2_ko_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F0_2_ko_samples = pd.merge(F0_2_ko_samples, sample_df, left_index=True, right_index=True)
F0_2_ko_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F0_2_ko_features = pd.merge(F0_2_ko_features, ko_info, left_index=True, right_index=True)
F0_2_ko_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F0_2_ko_samples.Treatment]


# DESeq2
#res.ko.F0_HvsF0_C.tsv
F0_ko_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_C.tsv", sep="\t", index_col=0)
F0_ko_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF0_C.tsv", sep="\t", index_col=0)
F0_ko_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_L.tsv", sep="\t", index_col=0)
F0_ko_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF0_CL.tsv", sep="\t", index_col=0)
# Reduce every result table to one row per feature carrying the id_vars
# columns, then tag it with its contrast. The label is assigned as a scalar so
# it always matches the frame's own length; the previous list sized from
# source2 raised ValueError whenever the tables differed in row count.
id_vars=["feature","isDE","log2FoldChange","padj"]
source1 = pd.melt(F0_ko_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F0_ko_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F0_ko_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F0_ko_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable","value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
F0_ko_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source4.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
F0_ko_deseq = pd.merge(F0_ko_deseq, ko_info, left_on="feature", right_index=True)

# ALDEx2
# glm results: per-treatment effect columns joined with the Holm-corrected
# p-values, plus the "_2" table labelled HvsCL.
F0_ko_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F0_ko_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F0_ko_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F0_ko_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F0_ko_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F0_ko_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = "HvsCL"
F0_ko_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
# The glm-based table is kept under its own name; F0_ko_aldex is rebuilt below
# from the pairwise HC / LC result files.
F0_ko_aldex_glm = pd.merge(F0_ko_aldex, ko_info, left_index=True, right_index=True)
F0_ko_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F0_ko_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F0_ko_aldex_HC["contrast"] = "HvsC"
# Dots in column names become underscores (e.g. wi.eBH -> wi_eBH)
F0_ko_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_ko_aldex_LC["contrast"] = "LvsC"
F0_ko_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F0_ko_aldex = pd.concat([F0_ko_aldex_HC, F0_ko_aldex_LC])
F0_ko_aldex = pd.merge(F0_ko_aldex, ko_info, left_index=True, right_index=True)

F0_ko_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F0_ko_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F0_ko_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
# A feature is flagged when |effect| > 1 OR wi_eBH < 0.1
F0_ko_aldex["isDE"] = False
F0_ko_aldex.loc[(abs(F0_ko_aldex.effect)>1)|((F0_ko_aldex.wi_eBH<0.1)), "isDE"] = True
F0_ko_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
# Diversity metrics from calc_diversity for the F0 KO table, joined with the
# sample metadata and drawn as one boxplot panel per metric.
F0_ko_div = calc_diversity(ko_cov_F0)
F0_ko_div = sample_df.merge(F0_ko_div, left_index=True, right_index=True).reset_index()

# Long format: one row per (sample, metric)
source_ko = F0_ko_div[["index", "shannon", "observed", "Treatment", "Reads_pe"]].melt(
    id_vars=["index", "Treatment", "Reads_pe"],
    var_name="diversity",
)

(
    alt.Chart(source_ko, title="F0 kos")
    .mark_boxplot(size=25)
    .encode(
        x=alt.X("Treatment", sort=["C", "L", "H"]),
        y=alt.Y("value"),
        column=alt.Column("diversity"),
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
    )
    .resolve_scale(y="independent")
    .properties(width=100, height=200)
)

In [None]:
# Left: samples on mixOmics components 1 and 2, coloured by treatment.
# Right: features with |correlation| > 0.5 on either component, coloured by `rank`.
rank = "KO_category1"
a = (
    alt.Chart(
        F0_ko_samples.reset_index(),
        title="mixOmics sample variates (kos; F0)",
    )
    .mark_circle(size=120)
    .encode(
        x='comp1',
        y='comp2',
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=['Sample', 'Generation', 'Treatment'],
    )
    .interactive()
)

b = (
    alt.Chart(
        F0_ko_features[
            F0_ko_features[["corr_comp1", "corr_comp2"]].abs().gt(0.5).any(axis=1)
        ].reset_index(),
        title="mixOmics feature correlations (kos; F0)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1", "loading_comp2",
            "stability_comp1", "stability_comp2",
            "corr_comp1", "corr_comp2",
            "KO_name", "KO_category1", "KO_category2",
        ],
    )
    .interactive()
)

alt.hconcat(a, b).resolve_scale(color='independent', shape='independent')

In [None]:
# Bar charts of the ten features with the largest absolute loading on each of
# the first two mixOmics components; bars are ordered by signed loading.
color = "KO_category1"
fts1 = F0_ko_features["loading_comp1"].abs().sort_values(ascending=False).head(10).index.tolist()
fts2 = F0_ko_features["loading_comp2"].abs().sort_values(ascending=False).head(10).index.tolist()
source1 = F0_ko_features.loc[fts1].reset_index()
source2 = F0_ko_features.loc[fts2].reset_index()
a = (
    alt.Chart(source1, title="Loadings kos comp1 (F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source1.sort_values("loading_comp1")["index"].tolist()),
        y="loading_comp1",
        color=color,
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
)
b = (
    alt.Chart(source2, title="Loadings kos comp2 (F0)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source2.sort_values("loading_comp2")["index"].tolist()),
        y="loading_comp2",
        color=color,
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
)
alt.hconcat(a, b).resolve_scale(color='shared')

In [None]:
# Abundance of the top-loading features (ten per component, union of both):
# boxplots of relative abundance (%) and CLR values per treatment, and a
# stacked bar of summed % per sample.
color = "KO_category2"
w, h = 250, 100
fts1 = F0_ko_features["loading_comp1"].abs().sort_values(ascending=False).head(10).index.tolist()
fts2 = F0_ko_features["loading_comp2"].abs().sort_values(ascending=False).head(10).index.tolist()

# Long-format relative abundances joined with sample metadata, F0 samples only
_relab = (
    ko_relab_info.loc[list(set(fts1 + fts2))]
    .reset_index()
    .melt(
        id_vars=["index", "KO_name", "KO_category1", "KO_category2"],
        var_name="Sample",
        value_name="%",
    )
    .merge(sample_df, left_on="Sample", right_index=True)
)
_relab = _relab[_relab["Generation"] == "F0"]

# Same layout for the CLR values
_clr = (
    ko_clr_F0.loc[list(set(fts1 + fts2))]
    .reset_index()
    .melt(
        id_vars=["index", "KO_name", "KO_category1", "KO_category2"],
        var_name="Sample",
        value_name="CLR",
    )
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Samples ordered C, then L, then H, sorted by name inside each treatment
x_order = [
    x
    for treatment in ("C", "L", "H")
    for x in _relab[_relab["Treatment"] == treatment].sort_values("Sample")["Sample"].unique()
]

a, b = (
    alt.Chart(data)
    .mark_boxplot()
    .encode(
        x="index",
        y=value_col,
        color=color,
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
    .properties(width=w, height=h)
    for data, value_col in ((_relab, "%"), (_clr, "CLR"))
)
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        y=alt.Y('sum(%)'),
        x=alt.X("Sample", sort=x_order),
        color=color,
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
    .properties(width=w * 3)
)

alt.vconcat(a, b, c)

In [None]:
# Number of DESeq2-flagged (isDE) KOs per contrast, smallest first
F0_ko_deseq[F0_ko_deseq["isDE"] == True].groupby(["contrast"]).size().sort_values()

In the F0 generation, DESeq2 identified 19 KOs that differed between Low and Control and 1 that differed between High and Control.

In [None]:
# DESeq2 results for F0 KOs: padj against log2FoldChange, one panel per
# contrast; only features with |log2FoldChange| > 1 are drawn.
_ = F0_ko_deseq[F0_ko_deseq["contrast"].isin(["HvsC", "LvsC", "HvsCL", "HvsL"])]
_ = _[_["log2FoldChange"].abs() > 1]
(
    alt.Chart(_, title="DESeq2 (kos; F0)")
    .mark_point()
    .encode(
        x="log2FoldChange",
        y=alt.Y("padj", sort="descending"),
        color="isDE",
        column=alt.Column("contrast", sort=["HvsC", "LvsC", "HvsL", "HvsCL"]),
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2"],
    )
    .properties(width=200, height=200)
)

Below are heatmaps showing log2FoldChange for significant KOs in each comparison.

In [None]:
# log2FoldChange heatmaps of the DESeq2-flagged KOs in F0, one single-column
# panel per contrast, each with its own colour scale.
source = F0_ko_deseq[F0_ko_deseq["isDE"] == True]
a1, a2 = (
    alt.Chart(source[source["contrast"] == contrast])
    .mark_rect()
    .encode(
        x="contrast",
        y=alt.Y("KO_name", sort="color"),
        color=alt.Color("log2FoldChange").scale(scheme='redblue', reverse=True),
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "log2FoldChange", "padj"],
    )
    for contrast in ("LvsC", "HvsC")
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

In [None]:
# Number of ALDEx2-flagged (isDE) KOs per contrast, smallest first
F0_ko_aldex[F0_ko_aldex["isDE"] == True].groupby(["contrast"]).size().sort_values()

ALDEx2 identified 5 and 3 KEGG orthologs in L vs C and H vs C, respectively.

In [None]:
# ALDEx2 results for F0 KOs: wi_eBH against diff_btw (left) and against
# effect (right), faceted by contrast; only |diff_btw| > 0.5 is drawn.
source = F0_ko_aldex.reset_index()
source = source[source["diff_btw"].abs() > 0.5]
a_1, a_2 = (
    alt.Chart(source, title="ALDEx2 (kos; F0)")
    .mark_circle()
    .encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH"],
    )
    .properties(width=200, height=200)
    for x_field in ("diff_btw", "effect")
)
alt.hconcat(a_1, a_2)

In [None]:
# diff_btw heatmaps of the ALDEx2-flagged KOs in F0, one single-column panel
# per contrast, each with its own colour scale.
source = F0_ko_aldex[F0_ko_aldex["isDE"] == True]
a1, a2 = (
    alt.Chart(source[source["contrast"] == contrast].reset_index())
    .mark_rect()
    .encode(
        x="contrast",
        y=alt.Y("KO_name", sort="color"),
        color=alt.Color("diff_btw").scale(scheme='redblue', reverse=True),
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH"],
    )
    for contrast in ("LvsC", "HvsC")
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

Below are heatmaps of median difference between groups for KOs identified by at least two of the tools.

In [None]:
# One chart per contrast from ident_fts, combining the mixOmics, DESeq2 and
# ALDEx2 results for F0 KOs, with KO_name on the y axis.
# NOTE(review): mixomics_top is set far above any realistic feature count,
# presumably so that no mixOmics feature is cut off -- confirm against ident_fts.
a1, a2 = (
    ident_fts(
        mixomics=F0_ko_features,
        deseq=F0_ko_deseq,
        aldex=F0_ko_aldex,
        contrast=contrast,
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH", "effect"],
        mixomics_top=1000000,
        y="KO_name",
    )
    for contrast in ("LvsC", "HvsC")
)
# resolve_scale() expects the channel as a keyword. A bare positional
# "independent" is bound to the first channel of ScaleResolveMap (angle in
# Altair 5), not to color, so the panels would not get separate colour scales.
alt.hconcat(a1, a2).resolve_scale(color="independent")

#### F1 generation

In [None]:
# Load the F1 KEGG-ortholog results of the three tools (mixOmics sPLS-DA,
# DESeq2, ALDEx2), annotate them with ko_info and write combined tables to
# ../atlas/stats/.
ft="ko"
generation="F1"
# MixOmics
F1_ko_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F1_ko_samples = pd.merge(F1_ko_samples, sample_df, left_index=True, right_index=True)
F1_ko_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F1_ko_features = pd.merge(F1_ko_features, ko_info, left_index=True, right_index=True)
# Second mixOmics run ("_2" files); its samples get a Treatment_2 label in
# which C and L are merged into "C+L".
F1_2_ko_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F1_2_ko_samples = pd.merge(F1_2_ko_samples, sample_df, left_index=True, right_index=True)
F1_2_ko_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F1_2_ko_features = pd.merge(F1_2_ko_features, ko_info, left_index=True, right_index=True)
F1_2_ko_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F1_2_ko_samples.Treatment]


# DESeq2
#res.ko.F1_HvsF1_C.tsv
F1_ko_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_C.tsv", sep="\t", index_col=0)
F1_ko_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF1_C.tsv", sep="\t", index_col=0)
F1_ko_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_L.tsv", sep="\t", index_col=0)
F1_ko_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF1_CL.tsv", sep="\t", index_col=0)
# Reduce every result table to one row per feature carrying the id_vars
# columns, then tag it with its contrast. The label is assigned as a scalar so
# it always matches the frame's own length; the previous list sized from
# source2 raised ValueError whenever the tables differed in row count.
id_vars=["feature","isDE","log2FoldChange","padj"]
source1 = pd.melt(F1_ko_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable","value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F1_ko_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable","value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F1_ko_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable","value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F1_ko_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable","value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
F1_ko_deseq = pd.concat(
    [
    source1.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source2.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source3.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]],
    source4.loc[:, ["feature","isDE","padj","log2FoldChange","contrast"]]
    ]
)
F1_ko_deseq = pd.merge(F1_ko_deseq, ko_info, left_on="feature", right_index=True)

# ALDEx2
# glm results: per-treatment effect columns joined with the Holm-corrected
# p-values, plus the "_2" table labelled HvsCL.
F1_ko_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F1_ko_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F1_ko_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F1_ko_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw","effect","padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F1_ko_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F1_ko_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw","effect","padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw","effect","wi.eBH"]]
source3.columns = ["diff_btw","effect","padj"]
source3["contrast"] = "HvsCL"
F1_ko_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
# The glm-based table is kept under its own name; F1_ko_aldex is rebuilt below
# from the pairwise HC / LC result files.
F1_ko_aldex_glm = pd.merge(F1_ko_aldex, ko_info, left_index=True, right_index=True)
F1_ko_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F1_ko_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F1_ko_aldex_HC["contrast"] = "HvsC"
# Dots in column names become underscores (e.g. wi.eBH -> wi_eBH)
F1_ko_aldex_HC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F1_ko_aldex_LC["contrast"] = "LvsC"
F1_ko_aldex_LC.rename(columns=lambda x: x.replace(".","_"), inplace=True)
F1_ko_aldex = pd.concat([F1_ko_aldex_HC, F1_ko_aldex_LC])
F1_ko_aldex = pd.merge(F1_ko_aldex, ko_info, left_index=True, right_index=True)

F1_ko_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F1_ko_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F1_ko_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
# A feature is flagged when |effect| > 1 OR wi_eBH < 0.1
F1_ko_aldex["isDE"] = False
F1_ko_aldex.loc[(abs(F1_ko_aldex.effect)>1)|((F1_ko_aldex.wi_eBH<0.1)), "isDE"] = True
F1_ko_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
# Diversity metrics from calc_diversity for the F1 KO table, joined with the
# sample metadata and drawn as one boxplot panel per metric.
F1_ko_div = calc_diversity(ko_cov_F1)
F1_ko_div = sample_df.merge(F1_ko_div, left_index=True, right_index=True).reset_index()

# Long format: one row per (sample, metric)
source_ko = F1_ko_div[["index", "shannon", "observed", "Treatment", "Reads_pe"]].melt(
    id_vars=["index", "Treatment", "Reads_pe"],
    var_name="diversity",
)

(
    alt.Chart(source_ko, title="F1 kos")
    .mark_boxplot(size=25)
    .encode(
        x=alt.X("Treatment", sort=["C", "L", "H"]),
        y=alt.Y("value"),
        column=alt.Column("diversity"),
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
    )
    .resolve_scale(y="independent")
    .properties(width=100, height=200)
)

In [None]:
# Left: samples on mixOmics components 1 and 2, coloured by treatment.
# Right: features with |correlation| > 0.5 on either component, coloured by `rank`.
rank = "KO_category1"
a = (
    alt.Chart(
        F1_ko_samples.reset_index(),
        title="mixOmics sample variates (kos; F1)",
    )
    .mark_circle(size=120)
    .encode(
        x='comp1',
        y='comp2',
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=['Sample', 'Generation', 'Treatment'],
    )
    .interactive()
)

b = (
    alt.Chart(
        F1_ko_features[
            F1_ko_features[["corr_comp1", "corr_comp2"]].abs().gt(0.5).any(axis=1)
        ].reset_index(),
        title="mixOmics feature correlations (kos; F1)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1", "loading_comp2",
            "stability_comp1", "stability_comp2",
            "corr_comp1", "corr_comp2",
            "KO_name", "KO_category1", "KO_category2",
        ],
    )
    .interactive()
)

alt.hconcat(a, b).resolve_scale(color='independent', shape='independent')

In [None]:
# Bar charts of the ten features with the largest absolute loading on each of
# the first two mixOmics components; bars are ordered by signed loading.
color = "KO_category1"
fts1 = F1_ko_features["loading_comp1"].abs().sort_values(ascending=False).head(10).index.tolist()
fts2 = F1_ko_features["loading_comp2"].abs().sort_values(ascending=False).head(10).index.tolist()
source1 = F1_ko_features.loc[fts1].reset_index()
source2 = F1_ko_features.loc[fts2].reset_index()
a = (
    alt.Chart(source1, title="Loadings kos comp1 (F1)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source1.sort_values("loading_comp1")["index"].tolist()),
        y="loading_comp1",
        color=color,
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
)
b = (
    alt.Chart(source2, title="Loadings kos comp2 (F1)")
    .mark_bar()
    .encode(
        x=alt.X("index", sort=source2.sort_values("loading_comp2")["index"].tolist()),
        y="loading_comp2",
        color=color,
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
)
alt.hconcat(a, b).resolve_scale(color='shared')

In [None]:
# Abundance of the top-loading features (ten per component, union of both):
# boxplots of relative abundance (%) and CLR values per treatment, and a
# stacked bar of summed % per sample.
color = "KO_category2"
w, h = 250, 100
fts1 = F1_ko_features["loading_comp1"].abs().sort_values(ascending=False).head(10).index.tolist()
fts2 = F1_ko_features["loading_comp2"].abs().sort_values(ascending=False).head(10).index.tolist()

# Long-format relative abundances joined with sample metadata, F1 samples only
_relab = (
    ko_relab_info.loc[list(set(fts1 + fts2))]
    .reset_index()
    .melt(
        id_vars=["index", "KO_name", "KO_category1", "KO_category2"],
        var_name="Sample",
        value_name="%",
    )
    .merge(sample_df, left_on="Sample", right_index=True)
)
_relab = _relab[_relab["Generation"] == "F1"]

# Same layout for the CLR values
_clr = (
    ko_clr_F1.loc[list(set(fts1 + fts2))]
    .reset_index()
    .melt(
        id_vars=["index", "KO_name", "KO_category1", "KO_category2"],
        var_name="Sample",
        value_name="CLR",
    )
    .merge(sample_df, left_on="Sample", right_index=True)
)

# Samples ordered C, then L, then H, sorted by name inside each treatment
x_order = [
    x
    for treatment in ("C", "L", "H")
    for x in _relab[_relab["Treatment"] == treatment].sort_values("Sample")["Sample"].unique()
]

a, b = (
    alt.Chart(data)
    .mark_boxplot()
    .encode(
        x="index",
        y=value_col,
        color=color,
        column=alt.Column("Treatment", sort=["C", "L", "H"]),
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
    .properties(width=w, height=h)
    for data, value_col in ((_relab, "%"), (_clr, "CLR"))
)
c = (
    alt.Chart(_relab)
    .mark_bar()
    .encode(
        y=alt.Y('sum(%)'),
        x=alt.X("Sample", sort=x_order),
        color=color,
        tooltip=["index", "KO_name", "KO_category1", "KO_category2"],
    )
    .properties(width=w * 3)
)

alt.vconcat(a, b, c)

In [None]:
# Number of DESeq2-flagged (isDE) KOs per contrast, smallest first
F1_ko_deseq[F1_ko_deseq["isDE"] == True].groupby(["contrast"]).size().sort_values()

In the F1 generation DESeq2 did not identify any significantly different KOs.

In [None]:
# DESeq2 results for F1 KOs: padj against log2FoldChange, one panel per
# contrast; only features with |log2FoldChange| > 1 are drawn.
_ = F1_ko_deseq[F1_ko_deseq["contrast"].isin(["HvsC", "LvsC", "HvsCL", "HvsL"])]
_ = _[_["log2FoldChange"].abs() > 1]
(
    alt.Chart(_, title="DESeq2 (kos; F1)")
    .mark_point()
    .encode(
        x="log2FoldChange",
        y=alt.Y("padj", sort="descending"),
        color="isDE",
        column=alt.Column("contrast", sort=["HvsC", "LvsC", "HvsL", "HvsCL"]),
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2"],
    )
    .properties(width=200, height=200)
)

In [None]:
# Number of ALDEx2-flagged (isDE) KOs per contrast, smallest first
F1_ko_aldex[F1_ko_aldex["isDE"] == True].groupby(["contrast"]).size().sort_values()

ALDEx2 identified 59 and 23 KEGG orthologs in L vs C and H vs C, respectively.

In [None]:
# ALDEx2 results for F1 KOs: wi_eBH against diff_btw (left) and against
# effect (right), faceted by contrast; only |diff_btw| > 0.5 is drawn.
source = F1_ko_aldex.reset_index()
source = source[source["diff_btw"].abs() > 0.5]
a_1, a_2 = (
    alt.Chart(source, title="ALDEx2 (kos; F1)")
    .mark_circle()
    .encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH"],
    )
    .properties(width=200, height=200)
    for x_field in ("diff_btw", "effect")
)
alt.hconcat(a_1, a_2)

In [None]:
# Colour lookup for the top-level KO categories (seaborn "Set1" palette, paired
# in order of first appearance) and the resulting colour for every KO.
ko_cat1_lut = {
    category: rgb
    for category, rgb in zip(ko_info["KO_category1"].unique(), sns.color_palette("Set1"))
}
ko_cat1_colors = ko_info["KO_category1"].map(ko_cat1_lut)

In [None]:
# diff_btw heatmaps of the ALDEx2-flagged KOs in F1, one single-column panel
# per contrast, each with its own colour scale.
source = F1_ko_aldex[F1_ko_aldex["isDE"] == True]
a1, a2 = (
    alt.Chart(source[source["contrast"] == contrast].reset_index())
    .mark_rect()
    .encode(
        x="contrast",
        y=alt.Y("KO_name", sort="color"),
        color=alt.Color("diff_btw").scale(scheme='redblue', reverse=True),
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH"],
    )
    for contrast in ("LvsC", "HvsC")
)
alt.hconcat(a1, a2).resolve_scale(color="independent")

Below are heatmaps of median difference between groups for KOs identified by at least two of the tools.

In [None]:
# One chart per contrast from ident_fts, combining the mixOmics, DESeq2 and
# ALDEx2 results for F1 KOs, with KO_name on the y axis.
# NOTE(review): mixomics_top is set far above any realistic feature count,
# presumably so that no mixOmics feature is cut off -- confirm against ident_fts.
a1, a2 = (
    ident_fts(
        mixomics=F1_ko_features,
        deseq=F1_ko_deseq,
        aldex=F1_ko_aldex,
        contrast=contrast,
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH", "effect"],
        mixomics_top=100000,
        y="KO_name",
    )
    for contrast in ("LvsC", "HvsC")
)
# resolve_scale() expects the channel as a keyword. A bare positional
# "independent" is bound to the first channel of ScaleResolveMap (angle in
# Altair 5), not to color, so the panels would not get separate colour scales.
alt.hconcat(a1, a2).resolve_scale(color="independent")

#### F2 generation

In [None]:
# Load the F2-generation KEGG ortholog ("ko") results written by mixOmics,
# DESeq2 and ALDEx2, annotate them with sample_df / ko_info, and write the
# combined tables to ../atlas/stats/.
ft = "ko"
generation = "F2"

# MixOmics: sPLS-DA sample variates and feature tables, for the main run and
# for the "_2" run.
F2_ko_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.samples.tsv", sep="\t", index_col=0)
F2_ko_samples = pd.merge(F2_ko_samples, sample_df, left_index=True, right_index=True)
F2_ko_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}.features.tsv", header=0, sep="\t", index_col=0)
F2_ko_features = pd.merge(F2_ko_features, ko_info, left_index=True, right_index=True)
F2_2_ko_samples = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.samples.tsv", sep="\t", index_col=0)
F2_2_ko_samples = pd.merge(F2_2_ko_samples, sample_df, left_index=True, right_index=True)
F2_2_ko_features = pd.read_csv(f"../atlas/stats/mixOmics/final.splsda.{ft}.{generation}_2.features.tsv", header=0, sep="\t", index_col=0)
F2_2_ko_features = pd.merge(F2_2_ko_features, ko_info, left_index=True, right_index=True)
# Pool the C and L treatments into a single "C+L" label; other treatments are kept as-is.
F2_2_ko_samples["Treatment_2"] = ["C+L" if x in ["C", "L"] else x for x in F2_2_ko_samples.Treatment]


# DESeq2: one result table per contrast.
# Example file name: res.ko.F2_HvsF2_C.tsv
F2_ko_deseq_HvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_C.tsv", sep="\t", index_col=0)
F2_ko_deseq_LvsC = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_LvsF2_C.tsv", sep="\t", index_col=0)
F2_ko_deseq_HvsL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_L.tsv", sep="\t", index_col=0)
F2_ko_deseq_HvsCL = pd.read_csv(f"../atlas/stats/DESeq2/res.{ft}.{generation}_HvsF2_CL.tsv", sep="\t", index_col=0)
id_vars = ["feature", "isDE", "log2FoldChange", "padj"]
# Melting on id_vars and taking the first row per feature leaves one row per
# feature holding only the id_vars columns once "variable"/"value" are dropped.
# The contrast label is assigned as a scalar so it always matches the length of
# the table it is attached to (the labels used to be sized from a different table).
source1 = pd.melt(F2_ko_deseq_HvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source1.drop(["variable", "value"], axis=1, inplace=True)
source1["contrast"] = "HvsC"
source2 = pd.melt(F2_ko_deseq_LvsC.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source2.drop(["variable", "value"], axis=1, inplace=True)
source2["contrast"] = "LvsC"
source3 = pd.melt(F2_ko_deseq_HvsL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source3.drop(["variable", "value"], axis=1, inplace=True)
source3["contrast"] = "HvsL"
source4 = pd.melt(F2_ko_deseq_HvsCL.reset_index(), id_vars=id_vars).groupby("feature").first().reset_index()
source4.drop(["variable", "value"], axis=1, inplace=True)
source4["contrast"] = "HvsCL"
deseq_cols = ["feature", "isDE", "padj", "log2FoldChange", "contrast"]
F2_ko_deseq = pd.concat(
    [
        source1.loc[:, deseq_cols],
        source2.loc[:, deseq_cols],
        source3.loc[:, deseq_cols],
        source4.loc[:, deseq_cols],
    ]
)
F2_ko_deseq = pd.merge(F2_ko_deseq, ko_info, left_on="feature", right_index=True)

# ALDEx2 (glm): effect sizes and Holm-adjusted p-values for the L and H treatment
# terms, plus the separate "_2" table labelled HvsCL. The combined table is kept
# as F2_ko_aldex_glm.
F2_ko_aldex_test = pd.read_csv(f"../atlas/stats/ALDEx2/glm.test.{ft}.{generation}.tsv", sep="\t", index_col=0)
F2_ko_aldex_effect = pd.read_csv(f"../atlas/stats/ALDEx2/glm.effect.{ft}.{generation}.tsv", sep="\t", index_col=0)
source1 = pd.merge(F2_ko_aldex_effect.loc[:, ["treatmentL.diff.btw", "treatmentL.effect"]],
         F2_ko_aldex_test.loc[:, ["treatmentL:pval.holm"]], left_index=True, right_index=True)
source1.columns = ["diff_btw", "effect", "padj"]
source1["contrast"] = "LvsC"
source2 = pd.merge(F2_ko_aldex_effect.loc[:, ["treatmentH.diff.btw", "treatmentH.effect"]],
         F2_ko_aldex_test.loc[:, ["treatmentH:pval.holm"]], left_index=True, right_index=True)
source2.columns = ["diff_btw", "effect", "padj"]
source2["contrast"] = "HvsC"
source3 = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_2.tsv", sep="\t", index_col=0)
source3 = source3.loc[:, ["diff.btw", "effect", "wi.eBH"]]
source3.columns = ["diff_btw", "effect", "padj"]
source3["contrast"] = "HvsCL"
F2_ko_aldex = pd.concat(
    [
        source1, source2, source3
    ]
)
F2_ko_aldex_glm = pd.merge(F2_ko_aldex, ko_info, left_index=True, right_index=True)

# ALDEx2 (pairwise): F2_ko_aldex is rebound here to the HC and LC tables, with
# dots in column names replaced by underscores (e.g. wi.eBH -> wi_eBH).
F2_ko_aldex_HC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_HC.tsv", sep="\t", index_col=0)
F2_ko_aldex_LC = pd.read_csv(f"../atlas/stats/ALDEx2/{ft}.{generation}_LC.tsv", sep="\t", index_col=0)
F2_ko_aldex_HC["contrast"] = "HvsC"
F2_ko_aldex_HC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F2_ko_aldex_LC["contrast"] = "LvsC"
F2_ko_aldex_LC.rename(columns=lambda x: x.replace(".", "_"), inplace=True)
F2_ko_aldex = pd.concat([F2_ko_aldex_HC, F2_ko_aldex_LC])
F2_ko_aldex = pd.merge(F2_ko_aldex, ko_info, left_index=True, right_index=True)

# Write the combined tables.
F2_ko_samples.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.samples.tsv", sep="\t")
F2_ko_features.to_csv(f"../atlas/stats/{generation}_{ft}.mixomics.features.tsv", sep="\t")
F2_ko_deseq.set_index("feature").to_csv(f"../atlas/stats/{generation}_{ft}_deseq.tsv", sep="\t")
# Flag a KO as differentially abundant when |effect| > 1 or wi_eBH < 0.1.
F2_ko_aldex["isDE"] = False
F2_ko_aldex.loc[(abs(F2_ko_aldex.effect) > 1) | (F2_ko_aldex.wi_eBH < 0.1), "isDE"] = True
F2_ko_aldex.to_csv(f"../atlas/stats/{generation}_{ft}_aldex.tsv", sep="\t")

##### Alpha diversity

In [None]:
# Diversity metrics for the F2 KO table, joined to the sample metadata on the
# sample index and flattened so the index becomes an "index" column.
F2_ko_div = pd.merge(
    sample_df,
    calc_diversity(ko_cov_F2),
    left_index=True,
    right_index=True,
).reset_index()

# Long format: one row per (sample, diversity metric).
diversity_cols = ["index", "shannon", "observed", "Treatment", "Reads_pe"]
source_ko = F2_ko_div.loc[:, diversity_cols].melt(
    id_vars=["index", "Treatment", "Reads_pe"],
    var_name="diversity",
)

# One boxplot panel per metric, each with its own y scale.
treatment_order = ["C", "L", "H"]
alt.Chart(source_ko, title="F2 kos").mark_boxplot(size=25).encode(
    x=alt.X("Treatment", sort=treatment_order),
    y="value",
    column="diversity",
    color=alt.Color("Treatment", sort=treatment_order),
).resolve_scale(y="independent").properties(width=100, height=200)

In [None]:
rank = "KO_category1"

# Left panel: samples on the first two mixOmics components.
a = (
    alt.Chart(F2_ko_samples.reset_index(), title="mixOmics sample variates (kos; F2)")
    .mark_circle(size=120)
    .encode(
        x="comp1",
        y="comp2",
        color=alt.Color("Treatment", sort=["C", "L", "H"]),
        tooltip=["Sample", "Generation", "Treatment"],
    )
    .interactive()
)

# Right panel: only features whose absolute correlation with either component exceeds 0.5.
strongly_correlated = (
    F2_ko_features.corr_comp1.abs().gt(0.5)
    | F2_ko_features.corr_comp2.abs().gt(0.5)
)
b = (
    alt.Chart(
        F2_ko_features.loc[strongly_correlated].reset_index(),
        title="mixOmics feature correlations (kos; F2)",
    )
    .mark_circle(size=60)
    .encode(
        x="corr_comp1",
        y="corr_comp2",
        color=rank,
        tooltip=[
            "index",
            "loading_comp1",
            "loading_comp2",
            "stability_comp1",
            "stability_comp2",
            "corr_comp1",
            "corr_comp2",
            "KO_name",
            "KO_category1",
            "KO_category2",
        ],
    )
    .interactive()
)

# The two panels color by different fields, so keep their scales separate.
alt.hconcat(a, b).resolve_scale(color="independent", shape="independent")

In [None]:
color = "KO_category1"
ko_annotation = ["KO_name", "KO_category1", "KO_category2"]

# Ten features with the largest absolute loading on each component.
abs_loading1 = F2_ko_features.loading_comp1.abs().sort_values(ascending=False)
abs_loading2 = F2_ko_features.loading_comp2.abs().sort_values(ascending=False)
fts1 = list(abs_loading1.head(10).index)
fts2 = list(abs_loading2.head(10).index)
source1 = F2_ko_features.loc[fts1].reset_index()
source2 = F2_ko_features.loc[fts2].reset_index()

# Bars are ordered along x by the signed loading value.
order1 = source1.sort_values("loading_comp1")["index"].tolist()
order2 = source2.sort_values("loading_comp2")["index"].tolist()

a = alt.Chart(source1, title="Loadings kos comp1 (F2)").mark_bar().encode(
    x=alt.X("index", sort=order1),
    y="loading_comp1",
    color=color,
    tooltip=["index"] + ko_annotation,
)
b = alt.Chart(source2, title="Loadings kos comp2 (F2)").mark_bar().encode(
    x=alt.X("index", sort=order2),
    y="loading_comp2",
    color=color,
    tooltip=["index"] + ko_annotation,
)
alt.hconcat(a, b).resolve_scale(color="shared")

In [None]:
color = "KO_category2"
w, h = 250, 100

# Union of the ten highest-|loading| features on each component.
fts1 = list(F2_ko_features.loading_comp1.abs().sort_values(ascending=False).head(10).index)
fts2 = list(F2_ko_features.loading_comp2.abs().sort_values(ascending=False).head(10).index)
top_kos = list(set(fts1 + fts2))
ko_id_vars = ["index", "KO_name", "KO_category1", "KO_category2"]

# Relative abundance (%) in long format with sample metadata, F2 samples only.
_relab = ko_relab_info.loc[top_kos].reset_index().melt(
    id_vars=ko_id_vars, var_name="Sample", value_name="%"
)
_relab = _relab.merge(sample_df, left_on="Sample", right_index=True)
_relab = _relab.loc[_relab.Generation == "F2"]

# CLR values in long format with sample metadata.
_clr = ko_clr_F2.loc[top_kos].reset_index().melt(
    id_vars=ko_id_vars, var_name="Sample", value_name="CLR"
)
_clr = _clr.merge(sample_df, left_on="Sample", right_index=True)

# Sample order for the bar chart: C, then L, then H, sorted by name within each.
x_order = []
for treatment in ["C", "L", "H"]:
    in_treatment = _relab.loc[_relab.Treatment == treatment]
    x_order.extend(in_treatment.sort_values("Sample").Sample.unique())

treatment_panels = alt.Column("Treatment", sort=["C", "L", "H"])
a = alt.Chart(_relab).mark_boxplot().encode(
    x="index",
    y="%",
    color=color,
    column=treatment_panels,
    tooltip=ko_id_vars,
).properties(width=w, height=h)
b = alt.Chart(_clr).mark_boxplot().encode(
    x="index",
    y="CLR",
    color=color,
    column=alt.Column("Treatment", sort=["C", "L", "H"]),
    tooltip=ko_id_vars,
).properties(width=w, height=h)
c = alt.Chart(_relab).mark_bar().encode(
    x=alt.X("Sample", sort=x_order),
    y=alt.Y("sum(%)"),
    color=color,
    tooltip=ko_id_vars,
).properties(width=w * 3)

alt.vconcat(a, b, c)

In [None]:
# Number of KOs flagged isDE by DESeq2 in each contrast (F2), smallest count first.
F2_ko_deseq[F2_ko_deseq["isDE"].eq(True)].groupby(["contrast"]).size().sort_values()

In [None]:
# Keep the four plotted contrasts and only features with |log2FoldChange| > 1.
shown_contrasts = ["HvsC", "LvsC", "HvsCL", "HvsL"]
_ = F2_ko_deseq[F2_ko_deseq["contrast"].isin(shown_contrasts)]
_ = _[_["log2FoldChange"].abs() > 1]

# One panel per contrast; the y axis is sorted descending so small padj sits at the top.
alt.Chart(_, title="DESeq2 (kos; F2)").mark_point().encode(
    x="log2FoldChange",
    y=alt.Y("padj", sort="descending"),
    color="isDE",
    column=alt.Column("contrast", sort=["HvsC", "LvsC", "HvsL", "HvsCL"]),
    tooltip=["feature", "KO_name", "KO_category1", "KO_category2"],
).properties(width=200, height=200)

Below are heatmaps showing log2FoldChange for significant KOs in each comparison.

In [None]:
# Restrict to KOs flagged isDE by DESeq2, then draw one heatmap column per contrast.
source = F2_ko_deseq[F2_ko_deseq["isDE"].eq(True)]
a1, a2 = (
    alt.Chart(source[source["contrast"] == contrast]).mark_rect().encode(
        x="contrast",
        y=alt.Y("KO_name", sort="color"),
        color=alt.Color("log2FoldChange").scale(scheme="redblue", reverse=True),
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "log2FoldChange", "padj"],
    )
    for contrast in ("LvsC", "HvsC")
)
# Each contrast gets its own color scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

In [None]:
# Number of KOs flagged isDE by ALDEx2 in each contrast (F2), smallest count first.
F2_ko_aldex[F2_ko_aldex["isDE"].eq(True)].groupby(["contrast"]).size().sort_values()

ALDEx2 found no differentially abundant KEGG orthologs in F2.

In [None]:
# Only plot features whose absolute between-group difference exceeds 0.5.
source = F2_ko_aldex.reset_index()
source = source[source["diff_btw"].abs() > 0.5]

# Two sets of per-contrast panels sharing the same y axis (wi_eBH, descending):
# the first uses diff_btw on x, the second uses effect.
a_1, a_2 = (
    alt.Chart(source, title="ALDEx2 (kos; F2)").mark_circle().encode(
        x=x_field,
        y=alt.Y("wi_eBH", sort="descending"),
        column="contrast",
        color="isDE",
        tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH"],
    ).properties(width=200, height=200)
    for x_field in ("diff_btw", "effect")
)
alt.hconcat(a_1, a_2)

Below are heatmaps of median difference between groups for KOs identified by at least two of the tools.

In [None]:
# Build one panel per contrast (LvsC, HvsC) for the F2 KOs from the mixOmics,
# DESeq2 and ALDEx2 result tables. ident_fts is defined elsewhere in this
# notebook; presumably it returns an Altair chart -- confirm against its definition.
a1 = ident_fts(
    mixomics=F2_ko_features,
    deseq=F2_ko_deseq,
    aldex=F2_ko_aldex,
    contrast="LvsC",
    tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH", "effect"],
    mixomics_top=100000,
    y="KO_name",
)
a2 = ident_fts(
    mixomics=F2_ko_features,
    deseq=F2_ko_deseq,
    aldex=F2_ko_aldex,
    contrast="HvsC",
    tooltip=["feature", "KO_name", "KO_category1", "KO_category2", "diff_btw", "wi_eBH", "effect"],
    mixomics_top=100000,
    y="KO_name",
)
# resolve_scale expects per-channel keyword arguments; name the color channel
# explicitly (as the other heatmap cells do) so each panel gets its own color scale.
alt.hconcat(a1, a2).resolve_scale(color="independent")

### Resistance genes

Resistance genes were identified both in MAGs and in the full GeneCatalog using the [Resistance Gene Identifier tool](https://github.com/arpcard/rgi). Below is a quick look at the results from the MAGs.

In [None]:
# Per-MAG genome quality table, indexed by its first column.
mag_stats = pd.read_csv(
    "../atlas/genomes/genome_quality.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

In [None]:
# RGI hits for the MAGs; the MAG id is the part of ORF_ID before the first underscore.
rgi_mags = pd.read_csv("../atlas/genomes/annotations/rgi/rgi.out.txt", sep="\t")
rgi_mags["MAG"] = [orf_id.partition("_")[0] for orf_id in rgi_mags["ORF_ID"]]
# Attach the MAG taxonomy.
rgi_mags = rgi_mags.merge(mag_tax, left_on="MAG", right_index=True)

Sum the number of resistance genes (RGs) per MAG, then normalize by the total number of coding sequences.

In [None]:
# RGI hit count per MAG, outer-joined with the genome stats; missing values
# produced by the outer join are set to 0.
rg_per_mag = rgi_mags.groupby("MAG").size().to_frame(name="RGs")
mag_rgi_counts = rg_per_mag.merge(mag_stats, left_index=True, right_index=True, how="outer")
mag_rgi_counts = mag_rgi_counts.fillna(0)
mag_rgi_counts = mag_rgi_counts.merge(mag_tax, left_index=True, right_index=True)

# Resistance genes as a percentage of all coding sequences in the MAG.
mag_rgi_counts["%RGs"] = mag_rgi_counts["RGs"].div(mag_rgi_counts["total_coding_sequences"]).mul(100)
top_rg_mags = mag_rgi_counts.loc[mag_rgi_counts["%RGs"] > 0.4].index

# CLR values of the top RG MAGs in long format, annotated with sample metadata and taxonomy.
_ = (
    mag_clr.loc[:, top_rg_mags]
    .reset_index()
    .melt(id_vars=["index"], var_name="MAG")
    .merge(sample_df, left_on="index", right_index=True)
    .merge(mag_tax, left_on="MAG", right_index=True)
)

Below are all MAGs plotted with total coding sequences vs % resistance genes.

In [None]:
# Every MAG: % resistance genes (x) against total coding sequences (y), colored by phylum.
rg_tooltip = ["index"] + ranks + ["RGs", "%RGs"]
alt.Chart(mag_rgi_counts.reset_index()).mark_circle().encode(
    x="%RGs",
    y="total_coding_sequences",
    color="phylum",
    tooltip=rg_tooltip,
)

Most MAGs had RG hits below 0.3% of their total coding sequences. Five MAGs had hits above 0.4%:

- MAG143 (Unclassified Scatovivens (Clostridia))
- MAG136 (Unclassified CAG-269 (Clostridia))
- MAG146 (Unclassified CAG-269 (Clostridia))
- MAG262 (Unclassified UBA2730 (Bacilli))
- MAG260 (Klebsiella oxytoca (Gammaproteobacteria))

Below is the relative abundance of these 5 MAGs per Treatment and Generation. Each point is one of the MAGs in one sample.

In [None]:
# Relative abundance of the top RG MAGs in long format (one row per MAG and sample),
# annotated first with taxonomy and then with sample metadata.
source = mag_relab.loc[top_rg_mags].reset_index().melt(
    id_vars=["index"], value_name="%", var_name="Sample"
)
source = source.merge(mag_tax, left_on="index", right_index=True)
source = source.merge(sample_df, left_on="Sample", right_index=True)

alt.Chart(source).mark_circle(size=60).encode(
    x=alt.X("Treatment", sort=["C", "L", "H"]),
    xOffset="jitter:Q",
    y="%",
    column="Generation",
    color="index",
    tooltip=["Sample", "index", "species", "%"],
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
)
