In [2]:
# ------------ helper: common imports & 90-mm figure width -------------
import pandas as pd, numpy as np, matplotlib.pyplot as plt, pathlib, textwrap
import matplotlib as mpl
mpl.rcParams["pdf.fonttype"] = 42   # editable text in Illustrator
FIG_W = 3.54        # 90 mm → inches (3.54")


In [3]:
# ── CONFIG ───────────────────────────────────────────────────────────
presence_file = "/home/jovyan/fig3_2_presence_absence.tsv"    # ← edit if needed
# ─────────────────────────────────────────────────────────────

from upsetplot import UpSet, from_indicators

# 1. load presence/absence table
pres = pd.read_csv(presence_file, sep="\t")
up_df = from_indicators(pres[["in_oesophagus","in_oral"]], pres["species"])

# 2. UpSet plot
fig = plt.figure(figsize=(FIG_W, 2.8))
UpSet(up_df, subset_size="count", show_counts=True).plot(fig=fig)
plt.tight_layout()
plt.savefig("fig3_2A_upset.pdf"); plt.savefig("fig3_2A_upset.svg")
plt.close()
print("✓ fig3_2A_upset.* saved")

# 3. lollipop (Jaccard index)
intersect = (pres["in_oesophagus"] & pres["in_oral"]).sum()
jaccard = intersect / (
    pres["in_oesophagus"].sum() + pres["in_oral"].sum() - intersect
)

plt.figure(figsize=(FIG_W, 1.5))
plt.stem([0], [jaccard], basefmt=" "); plt.xlim(-0.5, 0.5)
plt.ylabel("Jaccard index"); plt.xticks([])
plt.title("Species overlap (oesophagus ∩ oral)")
plt.tight_layout()
plt.savefig("fig3_2B_lollipop.pdf"); plt.savefig("fig3_2B_lollipop.svg")
plt.close()
print("✓ fig3_2B_lollipop.* saved (Jaccard ≈ {:.3f})".format(jaccard))


ValueError: The indicators must all be boolean

In [4]:
import pandas as pd, matplotlib.pyplot as plt
from upsetplot import from_indicators, UpSet

presence_file = "/home/jovyan/fig3_2_presence_absence.tsv"  # adjust if needed
pres = pd.read_csv(presence_file, sep="\t")

# convert 0/1 → False/True
indicators = pres[["in_oesophagus", "in_oral"]].astype(bool)

# build UpSet data and plot
up_df = from_indicators(indicators, pres["species"])

fig = plt.figure(figsize=(FIG_W, 2.8))          # FIG_W = 3.54 from helper cell
UpSet(up_df, subset_size="count", show_counts=True).plot(fig=fig)
plt.tight_layout()
plt.savefig("fig3_2A_upset.pdf"); plt.savefig("fig3_2A_upset.svg")
plt.close()
print("✓ fig3_2A_upset.* saved")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  styles["linewidth"].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  styles["facecolor"].fillna(self._facecolor, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

✓ fig3_2A_upset.* saved


In [5]:
# --- lollipop plot of species overlap (Figure 3·2 B) -----------------
import matplotlib.pyplot as plt
import pathlib, numpy as np

# re-use the Boolean indicators already loaded
intersect = (indicators["in_oesophagus"] & indicators["in_oral"]).sum()
jaccard   = intersect / (
    indicators["in_oesophagus"].sum()
    + indicators["in_oral"].sum()
    - intersect
)

plt.figure(figsize=(FIG_W, 1.4))       # FIG_W = 3.54 from helper cell
plt.stem([0], [jaccard], basefmt=" ", use_line_collection=True)
plt.xlim(-0.5, 0.5); plt.xticks([])
plt.ylabel("Jaccard index"); plt.title("Oesophagus ∩ Oral")
plt.tight_layout()

out = pathlib.Path("fig3_2B_lollipop")
plt.savefig(out.with_suffix(".pdf")); plt.savefig(out.with_suffix(".svg"))
plt.close()
print(f"✓ {out.name}.* saved  (Jaccard ≈ {jaccard:.3f})")


TypeError: stem() got an unexpected keyword argument 'use_line_collection'

<Figure size 354x140 with 0 Axes>

In [6]:
# --- lollipop plot of species overlap (Figure 3·2 B) -----------------
import matplotlib.pyplot as plt
import pathlib

# indicators is already defined from the UpSet cell
intersect = (indicators["in_oesophagus"] & indicators["in_oral"]).sum()
jaccard   = intersect / (
    indicators["in_oesophagus"].sum()
    + indicators["in_oral"].sum()
    - intersect
)

plt.figure(figsize=(FIG_W, 1.4))      # FIG_W = 3.54 from helper cell
plt.stem([0], [jaccard], basefmt=" ") # remove use_line_collection for old MPL
plt.xlim(-0.5, 0.5); plt.xticks([])
plt.ylabel("Jaccard index"); plt.title("Oesophagus ∩ Oral")
plt.tight_layout()

out = pathlib.Path("fig3_2B_lollipop")
plt.savefig(out.with_suffix(".pdf")); plt.savefig(out.with_suffix(".svg"))
plt.close()
print(f"✓ {out.name}.* saved  (Jaccard ≈ {jaccard:.3f})")


✓ fig3_2B_lollipop.* saved  (Jaccard ≈ 1.000)


In [7]:
# ---------- CONFIG ----------
module_matrix = "/full/path/to/your/module_matrix.tsv"   # ← edit
# ----------------------------

import pandas as pd, seaborn as sns, matplotlib.pyplot as plt, pathlib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

FIG_W = 3.54  # 90 mm

mat = pd.read_csv(module_matrix, sep="\t", index_col=0)

# --- heat-map (top-50 variable modules) ---
top50 = mat.var().nlargest(50).index
plt.figure(figsize=(FIG_W, 4.5))
sns.heatmap(mat[top50], cmap="viridis", cbar_kws={"label":"Completeness"})
plt.ylabel("MAG"); plt.xlabel("KEGG module (top 50 var)")
plt.tight_layout()
plt.savefig("fig3_3A_heatmap.pdf"); plt.savefig("fig3_3A_heatmap.svg")
plt.close(); print("✓ fig3_3A_heatmap.* saved")

# --- simple PCoA (PCA on scaled data) ---
X = StandardScaler().fit_transform(mat)
pca = PCA(n_components=2).fit_transform(X)
plt.figure(figsize=(FIG_W, 3))
plt.scatter(pca[:,0], pca[:,1], s=30)
plt.xlabel("PC1"); plt.ylabel("PC2")
plt.tight_layout()
plt.savefig("fig3_3C_pcoa.pdf"); plt.savefig("fig3_3C_pcoa.svg")
plt.close(); print("✓ fig3_3C_pcoa.* saved")


ModuleNotFoundError: No module named 'sklearn'

In [8]:
# ---------- CONFIG ----------
module_matrix = "/full/path/to/your/module_matrix.tsv"   # ← edit
# ----------------------------

import pandas as pd, seaborn as sns, matplotlib.pyplot as plt, pathlib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

FIG_W = 3.54  # 90 mm

mat = pd.read_csv(module_matrix, sep="\t", index_col=0)

# --- heat-map (top-50 variable modules) ---
top50 = mat.var().nlargest(50).index
plt.figure(figsize=(FIG_W, 4.5))
sns.heatmap(mat[top50], cmap="viridis", cbar_kws={"label":"Completeness"})
plt.ylabel("MAG"); plt.xlabel("KEGG module (top 50 var)")
plt.tight_layout()
plt.savefig("fig3_3A_heatmap.pdf"); plt.savefig("fig3_3A_heatmap.svg")
plt.close(); print("✓ fig3_3A_heatmap.* saved")

# --- simple PCoA (PCA on scaled data) ---
X = StandardScaler().fit_transform(mat)
pca = PCA(n_components=2).fit_transform(X)
plt.figure(figsize=(FIG_W, 3))
plt.scatter(pca[:,0], pca[:,1], s=30)
plt.xlabel("PC1"); plt.ylabel("PC2")
plt.tight_layout()
plt.savefig("fig3_3C_pcoa.pdf"); plt.savefig("fig3_3C_pcoa.svg")
plt.close(); print("✓ fig3_3C_pcoa.* saved")


ModuleNotFoundError: No module named 'sklearn'

In [9]:
# ═════════════════  LAST-STEP ONE-CELL  (Fig 3·3 A & C)  ═════════════════
# Path to your DRAM/METABOLIC module matrix (rows = MAGs, cols = KEGG modules)
module_matrix = "/full/path/to/your/module_matrix.tsv"           # ← edit here
# (If you do not have it yet, skip this cell and finish later.)

# 1 ─ Install any missing libraries right inside this kernel
import subprocess, sys, importlib
for pkg in ["scikit-learn", "seaborn"]:
    try:
        importlib.import_module(pkg.replace("-", "_"))
    except ModuleNotFoundError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", pkg])

# 2 ─ Imports (now guaranteed to work)
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt, pathlib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

FIG_W = 3.54   # 90 mm journal column width

# 3 ─ Load the module-completeness matrix
mat = pd.read_csv(module_matrix, sep="\t", index_col=0)

# 4 ─ Heat-map (top-50 most variable modules)
top50 = mat.var().nlargest(50).index
plt.figure(figsize=(FIG_W, 4.5))
sns.heatmap(mat[top50], cmap="viridis", cbar_kws={"label": "Completeness"})
plt.ylabel("MAG"); plt.xlabel("KEGG module (top 50 var)")
plt.tight_layout()
plt.savefig("fig3_3A_heatmap.pdf"); plt.savefig("fig3_3A_heatmap.svg")
plt.close()
print("✓ fig3_3A_heatmap.* saved")

# 5 ─ Simple PCoA (PCA) scatter
X = StandardScaler().fit_transform(mat)
pc1, pc2 = PCA(n_components=2).fit_transform(X).T
plt.figure(figsize=(FIG_W, 3))
plt.scatter(pc1, pc2, s=30)
plt.xlabel("PC1"); plt.ylabel("PC2")
plt.tight_layout()
plt.savefig("fig3_3C_pcoa.pdf"); plt.savefig("fig3_3C_pcoa.svg")
plt.close()
print("✓ fig3_3C_pcoa.* saved")
# ════════════════════════════════════════════════════════════════════════


FileNotFoundError: [Errno 2] No such file or directory: '/full/path/to/your/module_matrix.tsv'