# 1. Import Packages

In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import qiime2 as q2
from qiime2 import Visualization
from scipy.stats import shapiro, kruskal, f_oneway

# 2. Data Directionary

In [6]:
# Location
data_dir = "Project_data/FungalTrait"
! mkdir -p "$data_dir"

In [7]:
# Paths to project inputs
input_table    = "Project_data/Taxonomy/table_filtered.qza"
input_taxonomy = "Project_data/Taxonomy/taxonomy_pretrained.qza"
input_metadata = "Project_data/Metadata/updated_fungut_metadata.tsv"

# 3. Export QIIME2 Artifacts to TSV

In [24]:
# Export taxonomy file
! qiime tools export \
    --input-path "$input_taxonomy" \
    --output-path "$data_dir"

# Export feature table (BIOM)
! qiime tools export \
    --input-path "$input_table" \
    --output-path "$data_dir"

  import pkg_resources
[32mExported Project_data/Taxonomy/taxonomy_pretrained.qza as TSVTaxonomyDirectoryFormat to directory Project_data/FungalTrait[0m
  import pkg_resources
[32mExported Project_data/Taxonomy/table_filtered.qza as BIOMV210DirFmt to directory Project_data/FungalTrait[0m
[0m[?25h

In [25]:
# Convert BIOM -> TSV
biom_path = f"{data_dir}/feature-table.biom"
tsv_path = f"{data_dir}/feature-table.tsv"

! biom convert \
    -i "$biom_path" \
    -o "$tsv_path" \
    --to-tsv

# 4. Build the input table

In [26]:
feature_tsv = f"{data_dir}/feature-table.tsv"

# Load feature table
feature_df = pd.read_csv(
    feature_tsv,
    sep="\t",
    skiprows=[0],      
    index_col=0     
)

feature_df.index.name = "feature_id"
counts_df = feature_df.reset_index() 

print("Feature table shape:", feature_df.shape)
feature_df.head()

Feature table shape: (895, 150)


Unnamed: 0_level_0,ERR5327198,ERR5327199,ERR5327266,ERR5327282,ERR5327284,ERR5327285,ERR5327287,ERR5327288,ERR5327289,ERR5327300,...,ERR5327586,ERR5327587,ERR5327591,ERR5327592,ERR5327596,ERR5327599,ERR5327604,ERR5327605,ERR5327615,ERR5327620
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
212d6d39ee0685d7a60db6f139264523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7a536ff3a619af0125d63375a045715b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f99a453bb41ad922be6277dbcfd21424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
523fc0b6d591c50e1ba33176259742ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
04205cbd96d2c4c30512706ed168536c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
tax_tsv     = f"{data_dir}/taxonomy.tsv"

# Load taxonomy table
tax_df = pd.read_csv(
    tax_tsv,
    sep="\t",
    comment="#"
)
tax_df = tax_df.rename(columns={
    "Feature ID": "feature_id",
    "Taxon": "taxonomy"
})

# Standardise column names
tax_df = tax_df.rename(columns={
    "Feature ID": "feature_id",
    "Taxon": "taxonomy"
})

print("Taxonomy table shape:", tax_df.shape)
tax_df.head()

Taxonomy table shape: (993, 3)


Unnamed: 0,feature_id,taxonomy,Confidence
0,f1e00c6f31a5546a15c206010ff3583c,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...,0.737076
1,1ada978c8f3ff08af2668393869257ac,Unassigned,0.369232
2,d43c87d3898407682adec71ba5b27fa4,k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eu...,0.793387
3,f931ac9a7305cdd99a02c25a624e5bb8,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...,0.950055
4,21c0d99a18a5bd7179fdac2b3f37ece4,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...,0.999989


In [28]:
feature_ids = set(feature_df.index)
tax_ids     = set(tax_df["feature_id"])

shared_ids = feature_ids & tax_ids
print("Number of IDs in feature table:", len(feature_ids))
print("Number of IDs in taxonomy   :", len(tax_ids))
print("Number of shared IDs        :", len(shared_ids))

Number of IDs in feature table: 895
Number of IDs in taxonomy   : 993
Number of shared IDs        : 895


In [29]:
# Merge counts + taxonomy into one table
fungaltrait_input = counts_df.merge(
    tax_df[["feature_id", "taxonomy"]],
    on="feature_id",
    how="left"
)

# Drop features without taxonomy
fungaltrait_input = fungaltrait_input.dropna(subset=["taxonomy"])

# rename feature_id 
fungaltrait_input = fungaltrait_input.rename(columns={"feature_id": "ASV_ID"})

print("FungalTrait input shape:", fungaltrait_input.shape)
fungaltrait_input.head()

FungalTrait input shape: (895, 152)


Unnamed: 0,ASV_ID,ERR5327198,ERR5327199,ERR5327266,ERR5327282,ERR5327284,ERR5327285,ERR5327287,ERR5327288,ERR5327289,...,ERR5327587,ERR5327591,ERR5327592,ERR5327596,ERR5327599,ERR5327604,ERR5327605,ERR5327615,ERR5327620,taxonomy
0,212d6d39ee0685d7a60db6f139264523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Fungi;p__Basidiomycota;c__Tremellomycetes;o...
1,7a536ff3a619af0125d63375a045715b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...
2,f99a453bb41ad922be6277dbcfd21424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Fungi;p__Basidiomycota;c__Tremellomycetes;o...
3,523fc0b6d591c50e1ba33176259742ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...
4,04205cbd96d2c4c30512706ed168536c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...


In [30]:
# Save table
fungaltrait_input_path = f"{data_dir}/fungaltrait_input.txt"
fungaltrait_input.to_csv(fungaltrait_input_path, sep="\t", index=False)

print("Saved FungalTrait input to:", fungaltrait_input_path)

Saved FungalTrait input to: Project_data/FungalTrait/fungaltrait_input.txt


# 5. Extract genus & species from taxonomy

In [32]:
def extract_genus_species(tax_str):
    if pd.isna(tax_str):
        return pd.Series({"genus": np.nan, "species": np.nan})

    parts = [p.strip() for p in tax_str.split(";")]

    # find g__ and s__ entries
    genus   = next((p[3:] for p in parts if p.startswith("g__")), np.nan)
    species = next((p[3:] for p in parts if p.startswith("s__")), np.nan)

    # FungalTraits expects species names with spaces instead of underscores
    if isinstance(species, str) and species not in ("", "unassigned", "Unassigned", "s__"):
        species_clean = species.replace("_", " ")
    else:
        species_clean = np.nan

    return pd.Series({"genus": genus, "species": species_clean})


# Apply genus/species extraction
tax_parsed = fungaltrait_input["taxonomy"].apply(extract_genus_species)

# Overwrite (or create) genus/species columns
fungaltrait_input[["genus", "species"]] = tax_parsed

print(fungaltrait_input[["ASV_ID", "taxonomy", "genus", "species"]].head())

                             ASV_ID  \
0  212d6d39ee0685d7a60db6f139264523   
1  7a536ff3a619af0125d63375a045715b   
2  f99a453bb41ad922be6277dbcfd21424   
3  523fc0b6d591c50e1ba33176259742ab   
4  04205cbd96d2c4c30512706ed168536c   

                                            taxonomy              genus  \
0  k__Fungi;p__Basidiomycota;c__Tremellomycetes;o...       Papiliotrema   
1  k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...         Geotrichum   
2  k__Fungi;p__Basidiomycota;c__Tremellomycetes;o...  Cystofilobasidium   
3  k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...            Candida   
4  k__Fungi;p__Ascomycota;c__Saccharomycetes;o__S...         Geotrichum   

                       species  
0      Papiliotrema flavescens  
1                Geotrichum sp  
2  Cystofilobasidium capitatum  
3             Candida albicans  
4                          NaN  


In [33]:
# only keep ASVs with species-level IDs
fungaltrait_input_species = fungaltrait_input.dropna(subset=["species"])

print("Rows before species filter:", fungaltrait_input.shape[0])
print("Rows after species filter :", fungaltrait_input_species.shape[0])

Rows before species filter: 895
Rows after species filter : 657


# 6. Create mapping table for FungalTraits

In [34]:
# ASV ↔ species mapping
asv_species_map = fungaltrait_input_species[["ASV_ID", "genus", "species"]].drop_duplicates()

asv_species_path = f"{data_dir}/fungaltrait_asv_species.tsv"
asv_species_map.to_csv(asv_species_path, sep="\t", index=False)

print("Saved ASV–species map to:", asv_species_path)
asv_species_map.head()

Saved ASV–species map to: Project_data/FungalTrait/fungaltrait_asv_species.tsv


Unnamed: 0,ASV_ID,genus,species
0,212d6d39ee0685d7a60db6f139264523,Papiliotrema,Papiliotrema flavescens
1,7a536ff3a619af0125d63375a045715b,Geotrichum,Geotrichum sp
2,f99a453bb41ad922be6277dbcfd21424,Cystofilobasidium,Cystofilobasidium capitatum
3,523fc0b6d591c50e1ba33176259742ab,Candida,Candida albicans
5,c84004c0020de7d6ff5bde95677e8688,Atractiella,Atractiella solani


In [35]:
# Count table (ASVs × samples) with IDs
sample_cols = [c for c in fungaltrait_input_species.columns
               if c not in ["taxonomy", "genus", "species"]]

counts_only = fungaltrait_input_species[sample_cols]

counts_path = f"{data_dir}/fungaltrait_counts.tsv"
counts_only.to_csv(counts_path, sep="\t", index=False)

print("Saved counts table to:", counts_path)
counts_only.head()

Saved counts table to: Project_data/FungalTrait/fungaltrait_counts.tsv


Unnamed: 0,ASV_ID,ERR5327198,ERR5327199,ERR5327266,ERR5327282,ERR5327284,ERR5327285,ERR5327287,ERR5327288,ERR5327289,...,ERR5327586,ERR5327587,ERR5327591,ERR5327592,ERR5327596,ERR5327599,ERR5327604,ERR5327605,ERR5327615,ERR5327620
0,212d6d39ee0685d7a60db6f139264523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7a536ff3a619af0125d63375a045715b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f99a453bb41ad922be6277dbcfd21424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,523fc0b6d591c50e1ba33176259742ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,c84004c0020de7d6ff5bde95677e8688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 7. FungalTraits

```
### FungalTraits done in R

# 0. Packages
library(readr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(tidyr)


# 1. Load data

# ASV ↔ species mapping (ASV_ID, genus, species)
asv_species <- read_tsv("fungaltrait_asv_species.tsv")

# Count table (ASV_ID + sample columns)
asv_counts  <- read_tsv("fungaltrait_counts.tsv")

# Metadata
metadata <- read_tsv("updated_fungut_metadata.tsv")

head(asv_species)
head(asv_counts)[, 1:5]



# 2. Load FungalTrait database

# FungalTraits table downloaded from Google Sheets as CSV
fungaltraits <- read_csv("FungalTraits_v1.2.csv")

# Clean column names a bit: no spaces, consistent case
names(fungaltraits) <- gsub("\\s+", "_", names(fungaltraits))

# Make sure there is a column called "species"
if ("Species" %in% names(fungaltraits) && !"species" %in% names(fungaltraits)) {
  fungaltraits <- fungaltraits |> rename(species = Species)
}

# Same for genus if necessary (GENUS → genus)
if ("GENUS" %in% names(fungaltraits) && !"genus" %in% names(fungaltraits)) {
  fungaltraits <- fungaltraits |> rename(genus = GENUS)
}

# Have a look:
head(fungaltraits)



# 3. Attach traits to each ASV (species-level join)

# Join by species name: every ASV gets the traits of its species
asv_with_traits <- asv_species %>%
  left_join(fungaltraits, by = "genus")

# Check: do we see trait columns attached?
head(asv_with_traits[, c("ASV_ID", "genus", "species", "primary_lifestyle")])



# 4. Merges traits with the ASV abundance counts
asv_trait_counts <- asv_with_traits %>%
  left_join(asv_counts, by = "ASV_ID")

# Check
head(asv_trait_counts[, c("ASV_ID", "genus", "primary_lifestyle", "ERR5327198")])



# 5. Compute Lifestyle Abundance per Sample

# keep only columns that are numeric abundance values
sample_cols <- names(asv_trait_counts)[sapply(asv_trait_counts, is.numeric)]

head(sample_cols)
length(sample_cols)

# Summarize abundance per lifestyle
trophic_by_sample <- asv_trait_counts %>%
  filter(!is.na(primary_lifestyle)) %>%
  group_by(primary_lifestyle) %>%
  summarise(across(all_of(sample_cols), sum)) %>%
  ungroup()

# Absolute counts per lifestyle per sample
head(trophic_by_sample[, 1:6])

# Convert to relative abundance
trophic_rel <- trophic_by_sample
trophic_rel[sample_cols] <- apply(trophic_rel[sample_cols], 2, function(x) x / sum(x))

# Relative abundance per lifestyle per sample
head(trophic_rel[, 1:6])

# transpose: samples as rows, lifestyles as columns
trophic_rel_t <- t(trophic_rel[sample_cols])
colnames(trophic_rel_t) <- trophic_rel$primary_lifestyle
trophic_rel_t <- as.data.frame(trophic_rel_t)

# add sample IDs as a column
trophic_rel_t$SampleID <- rownames(trophic_rel_t)

# Check
head(trophic_rel_t)



# 6. Attach Sample Metadata
colnames(metadata)
metadata <- metadata %>%
  rename(SampleID = ID)

trophic_final <- trophic_rel_t %>%
  left_join(metadata, by = "SampleID")

# Check
head(trophic_final)



# 7. Visualizations

# Select the lifestyle columns
meta_cols <- colnames(metadata)
lifestyle_cols <- setdiff(colnames(trophic_final),
                          c("SampleID", meta_cols))

lifestyle_cols

trophic_long <- trophic_final %>%
  select(SampleID, all_of(lifestyle_cols)) %>%
  pivot_longer(
    cols      = all_of(lifestyle_cols),
    names_to  = "Lifestyle",
    values_to = "RelAbundance"
  ) %>%
  left_join(metadata, by = "SampleID")


# IBD group
trophic_group <- trophic_long %>% 
  group_by(ibd_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_ibd <- ggplot(trophic_group,
       aes(x = ibd_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (IBD)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Gluten Status
trophic_group_2 <- trophic_long %>% 
  group_by(gluten_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_gluten <- ggplot(trophic_group_2,
       aes(x = gluten_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Gluten Status)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Diet
trophic_group_3 <- trophic_long %>% 
  group_by(diet_type_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_diet <- ggplot(trophic_group_3,
       aes(x = diet_type_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Diet Type)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Gender
trophic_group_4 <- trophic_long %>% 
  group_by(sex_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_sex <- ggplot(trophic_group_4,
       aes(x = sex_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Gender)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# BMI
trophic_group_5 <- trophic_long %>% 
  group_by(bmi_category, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_bmi <- ggplot(trophic_group_5,
       aes(x = bmi_category, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (BMI)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Continent
trophic_group_6 <- trophic_long %>% 
  group_by(continent, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_continent <- ggplot(trophic_group_6,
       aes(x = continent, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Continent)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))



# 8. Statistical testing

# metadata variables you analysed in Step 7
meta_vars <- c("ibd_sample", "gluten_sample", "diet_type_sample",
               "sex_sample", "bmi_category", "continent")

# all lifestyle columns
lifestyle_cols <- setdiff(colnames(trophic_final),
                          c("SampleID", colnames(metadata)))

# Kruskal–Wallis
run_kw <- function(meta_var) {
  results <- lapply(lifestyle_cols, function(lf) {
    
    # remove samples with missing metadata
    df <- trophic_final %>%
      filter(!is.na(.data[[meta_var]]))
    
    test <- kruskal.test(df[[lf]] ~ df[[meta_var]])
    
    data.frame(
      Lifestyle = lf,
      Metadata  = meta_var,
      p_value   = test$p.value
    )
  })
  
  do.call(rbind, results)
}

kw_results <- lapply(meta_vars, run_kw)
kw_results <- do.call(rbind, kw_results)

# adjust for multiple testing
kw_results$padj <- p.adjust(kw_results$p_value, method = "fdr")

kw_results

# significant results
subset(kw_results, padj < 0.05)



# 9. Export Plots
dir.create("figures", showWarnings = FALSE)

ggsave("figures/plot_ibd.png", plot_ibd, width = 8, height = 5, dpi = 300)
ggsave("figures/plot_gluten.png", plot_gluten, width = 8, height = 5, dpi = 300)
ggsave("figures/plot_diet.png", plot_diet, width = 8, height = 5, dpi = 300)
ggsave("figures/plot_sex.png", plot_sex, width = 8, height = 5, dpi = 300)
ggsave("figures/plot_bmi.png", plot_bmi, width = 8, height = 5, dpi = 300)
ggsave("figures/plot_continent.png", plot_continent, width = 8, height = 5, dpi = 300)
```

In [9]:
%load_ext rpy2.ipython

In [10]:
%%R
install.packages("readr")
install.packages("dplyr")
install.packages("reshape2")
install.packages("ggplot2")
install.packages("tidyr")

--- Please select a CRAN mirror for use in this session ---
Secure CRAN mirrors 

 1: 0-Cloud [https]                   2: Australia (Canberra) [https]   
 3: Australia (Melbourne 1) [https]   4: Australia (Melbourne 2) [https]
 5: Austria (Wien) [https]            6: Belgium (Brussels) [https]     
 7: Brazil (PR) [https]               8: Brazil (SP 1) [https]          
 9: Brazil (SP 2) [https]            10: Bulgaria [https]               
11: Canada (MB) [https]              12: Canada (ON 1) [https]          
13: Canada (ON 2) [https]            14: Chile (Santiago) [https]       
15: China (Beijing 1) [https]        16: China (Beijing 2) [https]      
17: China (Beijing 3) [https]        18: China (Hefei) [https]          
19: China (Hong Kong) [https]        20: China (Lanzhou) [https]        
21: China (Nanjing) [https]          22: China (Shanghai 2) [https]     
23: China (Shenzhen) [https]         24: China (Wuhan) [https]          
25: Colombia (Cali) [https]          26: C

Selection:  60


R[write to console]: trying URL 'https://stat.ethz.ch/CRAN/src/contrib/readr_2.1.6.tar.gz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 299193 bytes (292 KB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]

x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG  -I'/opt/conda/lib/R/library/cpp11/include' -I'/opt/conda/lib/R/library/tzdb/include' -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -fdebug-prefix-map=/home/conda/feedstock_root/build_artifacts/r-base-split_1714471492496/work=/usr/local/src/conda/r-base-4.3.3 -fdebug-prefix-map=/opt/conda=/usr/local/src/conda-prefix  -c Collector.cpp -o Collector.o
x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG  -I'/opt/conda/lib/R/library/cpp11/include' -I'/opt/conda/lib/R/library/tzdb/include' -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-in

installing to /opt/conda/lib/R/library/00LOCK-readr/00new/readr/libs
** R
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (readr)
R[write to console]: 

R[write to console]: 
R[write to console]: The downloaded source packages are in
	‘/tmp/RtmpWbz0cn/downloaded_packages’
R[write to console]: 
R[write to console]: 

R[write to console]: Updating HTML index of packages in '.Library'

R[write to console]: Making 'packages.html' ...
R[write to console]:  done

R[write to console]: trying URL 'https://stat.ethz.ch/CRAN/src/contrib/dplyr_1.1.4.tar.gz'

R[write to console]: Content type 'ap

x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG   -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -fdebug-prefix-map=/home/conda/feedstock_root/build_artifacts/r-base-split_1714471492496/work=/usr/local/src/conda/r-base-4.3.3 -fdebug-prefix-map=/opt/conda=/usr/local/src/conda-prefix  -c chop.cpp -o chop.o
x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG   -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -

installing to /opt/conda/lib/R/library/00LOCK-dplyr/00new/dplyr/libs
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (dplyr)
R[write to console]: 

R[write to console]: 
R[write to console]: The downloaded source packages are in
	‘/tmp/RtmpWbz0cn/downloaded_packages’
R[write to console]: 
R[write to console]: 

R[write to console]: Updating HTML index of packages in '.Library'

R[write to console]: Making 'packages.html' ...
R[write to console]:  done

R[write to console]: trying URL 'https://stat.ethz.ch/CRAN/src/contrib/reshape2_1.4.5.

x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG  -I'/opt/conda/lib/R/library/Rcpp/include' -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -fdebug-prefix-map=/home/conda/feedstock_root/build_artifacts/r-base-split_1714471492496/work=/usr/local/src/conda/r-base-4.3.3 -fdebug-prefix-map=/opt/conda=/usr/local/src/conda-prefix  -c RcppExports.cpp -o RcppExports.o
x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG  -I'/opt/conda/lib/R/library/Rcpp/include' -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fP

installing to /opt/conda/lib/R/library/00LOCK-reshape2/00new/reshape2/libs
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (reshape2)
R[write to console]: 

R[write to console]: 
R[write to console]: The downloaded source packages are in
	‘/tmp/RtmpWbz0cn/downloaded_packages’
R[write to console]: 
R[write to console]: 

R[write to console]: Updating HTML index of packages in '.Library'

R[write to console]: Making 'packages.html' ...
R[write to console]:  done

R[write to console]: also installing the dependency ‘S7’


R[write to console]: trying URL 'https://stat.ethz.ch/CRAN/src

x86_64-conda-linux-gnu-cc -I"/opt/conda/lib/R/include" -DNDEBUG   -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -fdebug-prefix-map=/home/conda/feedstock_root/build_artifacts/r-base-split_1714471492496/work=/usr/local/src/conda/r-base-4.3.3 -fdebug-prefix-map=/opt/conda=/usr/local/src/conda-prefix  -c init.c -o init.o
x86_64-conda-linux-gnu-cc -I"/opt/conda/lib/R/include" -DNDEBUG   -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -fdebug-prefix-map=/home/conda/feedstock_root/build_artifacts/r-base-split_1714471492496/work=/usr/local/src/conda/r-base-4.3.3

installing to /opt/conda/lib/R/library/00LOCK-S7/00new/S7/libs
** R
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (S7)
* installing *source* package ‘ggplot2’ ...
** package ‘ggplot2’ successfully unpacked and MD5 sums checked
** using staged installation
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** testing if installed package can be loaded from final location


x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG  -I'/opt/conda/lib/R/library/cpp11/include' -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /opt/conda/include -fdebug-prefix-map=/home/conda/feedstock_root/build_artifacts/r-base-split_1714471492496/work=/usr/local/src/conda/r-base-4.3.3 -fdebug-prefix-map=/opt/conda=/usr/local/src/conda-prefix  -c cpp11.cpp -o cpp11.o
x86_64-conda-linux-gnu-c++ -std=gnu++17 -I"/opt/conda/lib/R/include" -DNDEBUG  -I'/opt/conda/lib/R/library/cpp11/include' -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /opt/conda/include -I/opt/conda/include -Wl,-rpath-link,/opt/conda/lib    -fpic  -fvisibility-inlines-hidden  -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack

installing to /opt/conda/lib/R/library/00LOCK-tidyr/00new/tidyr/libs
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (tidyr)
R[write to console]: 

R[write to console]: 
R[write to console]: The downloaded source packages are in
	‘/tmp/RtmpWbz0cn/downloaded_packages’
R[write to console]: 
R[write to console]: 

R[write to console]: Updating HTML index of packages in '.Library'

R[write to console]: Making 'packages.html' ...
R[write to console]:  done



In [None]:
%%R

In [16]:
%%R
### FungalTraits done in R

# 0. Packages
library(readr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(tidyr)


# 1. Load data

# ASV ↔ species mapping (ASV_ID, genus, species)
asv_species <- read_tsv("Project_data/FungalTrait/fungaltrait_asv_species.tsv")

# Count table (ASV_ID + sample columns)
asv_counts  <- read_tsv("Project_data/FungalTrait/fungaltrait_counts.tsv")

# Metadata
metadata    <- read_tsv("Project_data/Metadata/updated_fungut_metadata.tsv")

head(asv_species)
head(asv_counts)[, 1:5]



# 2. Load FungalTrait database

# FungalTraits table downloaded from Google Sheets as CSV
fungaltraits <- read_csv("Project_data/FungalTrait/FungalTraits_v1.2.csv")

# Clean column names a bit: no spaces, consistent case
names(fungaltraits) <- gsub("\\s+", "_", names(fungaltraits))

# Make sure there is a column called "species"
if ("Species" %in% names(fungaltraits) && !"species" %in% names(fungaltraits)) {
  fungaltraits <- fungaltraits |> rename(species = Species)
}

# Same for genus if necessary (GENUS → genus)
if ("GENUS" %in% names(fungaltraits) && !"genus" %in% names(fungaltraits)) {
  fungaltraits <- fungaltraits |> rename(genus = GENUS)
}

# Have a look:
head(fungaltraits)



# 3. Attach traits to each ASV (species-level join)

# Join by species name: every ASV gets the traits of its species
asv_with_traits <- asv_species %>%
  left_join(fungaltraits, by = "genus")

# Check: do we see trait columns attached?
head(asv_with_traits[, c("ASV_ID", "genus", "species", "primary_lifestyle")])



# 4. Merges traits with the ASV abundance counts
asv_trait_counts <- asv_with_traits %>%
  left_join(asv_counts, by = "ASV_ID")

# Check
head(asv_trait_counts[, c("ASV_ID", "genus", "primary_lifestyle", "ERR5327198")])



# 5. Compute Lifestyle Abundance per Sample

# keep only columns that are numeric abundance values
sample_cols <- names(asv_trait_counts)[sapply(asv_trait_counts, is.numeric)]

head(sample_cols)
length(sample_cols)

# Summarize abundance per lifestyle
trophic_by_sample <- asv_trait_counts %>%
  filter(!is.na(primary_lifestyle)) %>%
  group_by(primary_lifestyle) %>%
  summarise(across(all_of(sample_cols), sum)) %>%
  ungroup()

# Absolute counts per lifestyle per sample
head(trophic_by_sample[, 1:6])

# Convert to relative abundance
trophic_rel <- trophic_by_sample
trophic_rel[sample_cols] <- apply(trophic_rel[sample_cols], 2, function(x) x / sum(x))

# Relative abundance per lifestyle per sample
head(trophic_rel[, 1:6])

# transpose: samples as rows, lifestyles as columns
trophic_rel_t <- t(trophic_rel[sample_cols])
colnames(trophic_rel_t) <- trophic_rel$primary_lifestyle
trophic_rel_t <- as.data.frame(trophic_rel_t)

# add sample IDs as a column
trophic_rel_t$SampleID <- rownames(trophic_rel_t)

# Check
head(trophic_rel_t)



# 6. Attach Sample Metadata
colnames(metadata)
metadata <- metadata %>%
  rename(SampleID = ID)

trophic_final <- trophic_rel_t %>%
  left_join(metadata, by = "SampleID")

# Check
head(trophic_final)



# 7. Visualizations

# Select the lifestyle columns
meta_cols <- colnames(metadata)
lifestyle_cols <- setdiff(colnames(trophic_final),
                          c("SampleID", meta_cols))

lifestyle_cols

trophic_long <- trophic_final %>%
  select(SampleID, all_of(lifestyle_cols)) %>%
  pivot_longer(
    cols      = all_of(lifestyle_cols),
    names_to  = "Lifestyle",
    values_to = "RelAbundance"
  ) %>%
  left_join(metadata, by = "SampleID")


# IBD group
trophic_group <- trophic_long %>% 
  group_by(ibd_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

ggplot(trophic_group,
       aes(x = ibd_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (IBD)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Gluten Status
trophic_group_2 <- trophic_long %>% 
  group_by(gluten_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_gluten <- ggplot(trophic_group_2,
       aes(x = gluten_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Gluten Status)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Diet
trophic_group_3 <- trophic_long %>% 
  group_by(diet_type_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_diet <- ggplot(trophic_group_3,
       aes(x = diet_type_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Diet Type)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Gender
trophic_group_4 <- trophic_long %>% 
  group_by(sex_sample, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_sex <- ggplot(trophic_group_4,
       aes(x = sex_sample, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Gender)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# BMI
trophic_group_5 <- trophic_long %>% 
  group_by(bmi_category, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_bmi <- ggplot(trophic_group_5,
       aes(x = bmi_category, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (BMI)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Continent
trophic_group_6 <- trophic_long %>% 
  group_by(continent, Lifestyle) %>%           
  summarise(mean_rel = mean(RelAbundance),
            .groups = "drop")

plot_continent <- ggplot(trophic_group_6,
       aes(x = continent, y = mean_rel, fill = Lifestyle)) +
  geom_col(position = "fill") +              
  ylab("Mean relative abundance") +
  xlab("Environment (Continent)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))



# 8. Statistical testing

# metadata variables you analysed in Step 7
meta_vars <- c("ibd_sample", "gluten_sample", "diet_type_sample",
               "sex_sample", "bmi_category", "continent")

# all lifestyle columns
lifestyle_cols <- setdiff(colnames(trophic_final),
                          c("SampleID", colnames(metadata)))

# Kruskal–Wallis
run_kw <- function(meta_var) {
  results <- lapply(lifestyle_cols, function(lf) {
    
    # remove samples with missing metadata
    df <- trophic_final %>%
      filter(!is.na(.data[[meta_var]]))
    
    test <- kruskal.test(df[[lf]] ~ df[[meta_var]])
    
    data.frame(
      Lifestyle = lf,
      Metadata  = meta_var,
      p_value   = test$p.value
    )
  })
  
  do.call(rbind, results)
}

kw_results <- lapply(meta_vars, run_kw)
kw_results <- do.call(rbind, kw_results)

# adjust for multiple testing
kw_results$padj <- p.adjust(kw_results$p_value, method = "fdr")

kw_results

# significant results
subset(kw_results, padj < 0.05)



# 9. Export Plots
dir.create("figures", showWarnings = FALSE)

#ggsave("figures/plot_ibd.png", plot_ibd, width = 8, height = 5, dpi = 300)
#ggsave("figures/plot_gluten.png", plot_gluten, width = 8, height = 5, dpi = 300)
#ggsave("figures/plot_diet.png", plot_diet, width = 8, height = 5, dpi = 300)
#ggsave("figures/plot_sex.png", plot_sex, width = 8, height = 5, dpi = 300)
#ggsave("figures/plot_bmi.png", plot_bmi, width = 8, height = 5, dpi = 300)
#ggsave("figures/plot_continent.png", plot_continent, width = 8, height = 5, dpi = 300)


Rows: 657 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): ASV_ID, genus, species

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 657 Columns: 151
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr   (1): ASV_ID
dbl (150): ERR5327198, ERR5327199, ERR5327266, ERR5327282, ERR5327284, ERR53...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 150 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (10): ID, country_sample, state_sample, sex_sample, diet_type_sample, ib...
dbl  (6): latitude_sample, longitude_sample, age_years_sample, height_cm_sam...

ℹ Use `spec()` to retrieve the full column