# 1. Import Packages

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import qiime2 as q2
from qiime2 import Visualization
from scipy.stats import shapiro, kruskal, f_oneway

# 2. Data Directionary

In [2]:
# Location
data_dir = "Project_data/FUNGuild"
! mkdir -p "$data_dir"

In [3]:
# Paths to project inputs
input_table    = "Project_data/Taxonomy/table_filtered.qza"
input_taxonomy = "Project_data/Taxonomy/taxonomy_pretrained.qza"
input_metadata = "Project_data/Metadata/updated_fungut_metadata.tsv"

# 3. Export QIIME2 Artifacts to TSV

In [4]:
# Export taxonomy file
! qiime tools export \
    --input-path "$input_taxonomy" \
    --output-path "$data_dir"

# Export feature table (BIOM)
! qiime tools export \
    --input-path "$input_table" \
    --output-path "$data_dir"

  import pkg_resources
[32mExported Project_data/Taxonomy/taxonomy_pretrained.qza as TSVTaxonomyDirectoryFormat to directory Project_data/FUNGuild[0m
  import pkg_resources
[32mExported Project_data/Taxonomy/table_filtered.qza as BIOMV210DirFmt to directory Project_data/FUNGuild[0m
[0m[?25hAll QIIME2 artifacts exported into: Project_data/FUNGuild


In [5]:
# Convert BIOM → TSV
biom_path = f"{data_dir}/feature-table.biom"
tsv_path = f"{data_dir}/feature-table.tsv"

! biom convert \
    -i "$biom_path" \
    -o "$tsv_path" \
    --to-tsv

# 4. Build FUNGuild input table

In [15]:
feature_tsv = f"{data_dir}/feature-table.tsv"

# Load feature table
feature_df = pd.read_csv(feature_tsv, sep="\t", skiprows=[0], index_col=0)

print("Feature table shape:", feature_df.shape)
feature_df.head()

Feature table shape: (895, 150)


Unnamed: 0_level_0,ERR5327198,ERR5327199,ERR5327266,ERR5327282,ERR5327284,ERR5327285,ERR5327287,ERR5327288,ERR5327289,ERR5327300,...,ERR5327586,ERR5327587,ERR5327591,ERR5327592,ERR5327596,ERR5327599,ERR5327604,ERR5327605,ERR5327615,ERR5327620
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4306469410ceb9944820bcea0e5952b2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f817ec6ce9c52036235cf4002fffa65e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,16772.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6ccbc16d6454d876ebb3cc5428d78b65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
633ec982f89c4772064757cec6e7d3ad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ffafb7e58eb5ffd894cf6b5a4610dc89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
tax_tsv     = f"{data_dir}/taxonomy.tsv"

# Load taxonomy table
tax_df = pd.read_csv(tax_tsv, sep='\t', comment='#')
tax_df = tax_df.rename(columns={"Feature ID": "feature_id",
                                "Taxon": "taxonomy"})

tax_df.head()

Unnamed: 0,feature_id,taxonomy,Confidence
0,f872ab159e2219de905e49b556b85d05,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...,0.999972
1,d3b20e3fa30a65662fc3a6e0057d6e90,k__Fungi;p__Ascomycota;c__Ascomycota_cls_Incer...,0.999984
2,333df8f222ab30bfdffd5b7d3c5a789b,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...,0.918087
3,beba691fc7ee5c7219589a54ec45b0bd,k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eu...,0.841928
4,5e7d76c82d92bc95d366c4c5796d3b21,Unassigned,0.380525


In [19]:
# Merge counts + taxonomy into one table

# Make taxonomy a Series indexed by feature_id
tax_series = tax_df.set_index("feature_id")["taxonomy"]

# Join feature table with taxonomy using the feature IDs
funguild_input = feature_df.join(tax_series, how="left")

# Drop features without taxonomy
#funguild_input = funguild_input.dropna(subset=["taxonomy"])

print("FUNGuild input shape:", funguild_input.shape)
funguild_input.head()

FUNGuild input shape: (895, 151)


Unnamed: 0_level_0,ERR5327198,ERR5327199,ERR5327266,ERR5327282,ERR5327284,ERR5327285,ERR5327287,ERR5327288,ERR5327289,ERR5327300,...,ERR5327587,ERR5327591,ERR5327592,ERR5327596,ERR5327599,ERR5327604,ERR5327605,ERR5327615,ERR5327620,taxonomy
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4306469410ceb9944820bcea0e5952b2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
f817ec6ce9c52036235cf4002fffa65e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16772.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6ccbc16d6454d876ebb3cc5428d78b65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
633ec982f89c4772064757cec6e7d3ad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
ffafb7e58eb5ffd894cf6b5a4610dc89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [None]:
# Save table
funguild_input_path = f"{data_dir}/funguild_input.txt"
funguild_input.to_csv(funguild_input_path, sep="\t")

print("Saved FUNGuild input to:", funguild_input_path)

In [20]:
print("Feature table index (first 5):")
print(feature_df.index[:5])
print("dtype:", feature_df.index.dtype)

print("\nTaxonomy feature_id (first 5):")
print(tax_df["feature_id"].head())
print("dtype:", tax_df["feature_id"].dtype)

Feature table index (first 5):
Index(['4306469410ceb9944820bcea0e5952b2', 'f817ec6ce9c52036235cf4002fffa65e',
       '6ccbc16d6454d876ebb3cc5428d78b65', '633ec982f89c4772064757cec6e7d3ad',
       'ffafb7e58eb5ffd894cf6b5a4610dc89'],
      dtype='object', name='#OTU ID')
dtype: object

Taxonomy feature_id (first 5):
0    f872ab159e2219de905e49b556b85d05
1    d3b20e3fa30a65662fc3a6e0057d6e90
2    333df8f222ab30bfdffd5b7d3c5a789b
3    beba691fc7ee5c7219589a54ec45b0bd
4    5e7d76c82d92bc95d366c4c5796d3b21
Name: feature_id, dtype: object
dtype: object


In [21]:
shared_ids = set(feature_df.index) & set(tax_df["feature_id"])
print("Number of shared IDs:", len(shared_ids))

Number of shared IDs: 0
