In [None]:
library(data.table)
library(dplyr)
library(nparLD)
library(missForest)
library(parallel)
library(doParallel)
library(stats)
library(Mfuzz)
library(Biobase)
library(ggplot2)
library(ggrepel)

# HuMet DATASET

In this notebook, we analyze a subset of the HuMet dataset, focusing on **plasma** samples across three platforms:

- Metabolon HD4 (nt-ms)
- Biocrates p150 (t-ms)
- In-house biochemistry (chem.)

## Loading the Data

In [None]:
# Read the raw measurement table and the metabolite annotation table.
# `fill = TRUE` pads rows that have fewer fields than the header.
met_data <- fread(
  file = "../input/raw/humet_data_raw_none_subjects15_tp57.csv",
  sep = ",",
  fill = TRUE
)
info_data <- fread(
  file = "../input/humet_info.csv",
  sep = ",",
  fill = TRUE
)

# Preview both tables
head(met_data)
head(info_data)

## Data Analysis

In [None]:
# Every column other than the two identifier columns counts as a metabolite
num_metabolite_columns <- length(setdiff(colnames(met_data), c("time", "subject")))

# Distinct time points and subjects present in the raw table
unique_time_values <- unique(met_data[["time"]])
unique_subject_values <- unique(met_data[["subject"]])

# Report the three summaries, one line each
print(paste("Number of metabolites:", num_metabolite_columns))
print(paste("Unique time values:", toString(unique_time_values)))
print(paste("Unique subject values:", toString(unique_subject_values)))

In [None]:
# Tabulate how many annotated metabolites belong to each platform
platform_counts <- table(info_data[["platform_name"]])
print(platform_counts)

- Targeted: 132
- Non-Targeted: 502
- Insulin (Hormone): 1

## Preprocessing

## Adding Challenge information

**Relevant time intervals for our analysis**:

Since the original dataset lacked challenge information, we assigned it based on the time column:

- **Fasting**: Time points 1–10
- **Physical Activity**: Time points 33–39
- **Oral Lipid Tolerance Test (OLTT)**: Time points 40–50

In [None]:
# Label every row with the challenge its time point belongs to.
# Rows outside the three intervals are kept and tagged "Other".
met_data <- mutate(
  met_data,
  challenge = case_when(
    1 <= time & time <= 10 ~ "Fasting",
    33 <= time & time <= 39 ~ "Physical Activity",
    40 <= time & time <= 50 ~ "OLTT", # crosscheck what happens if OLTT ends at 48
    TRUE ~ "Other" # kept here; removed in a later step
  )
)

# Preview the labelled table
#tail(met_data_all)
head(met_data)

## Removing Metabolites with > 30% missing values

In [271]:
# Drop metabolite columns whose share of missing values exceeds `threshold`.
#
# met_data:    data.table with identifier columns time / subject / challenge
#              plus one column per metabolite.
# threshold:   maximum tolerated fraction of NA values per metabolite column.
# output_file: text file receiving the names of the dropped metabolites; it is
#              only written when at least one metabolite is dropped.
#
# Returns a data.table without the dropped columns.
remove_high_na_metabolites <- function(met_data, threshold = 0.3, output_file = "removed_metabolites.txt") {
  id_cols <- c("time", "subject", "challenge")
  candidate_cols <- setdiff(colnames(met_data), id_cols)

  # Fraction of missing entries per metabolite column
  missing_fraction <- colMeans(is.na(met_data[, ..candidate_cols]))

  # Names of the metabolites above the threshold
  to_drop <- names(missing_fraction)[missing_fraction > threshold]

  # Keep a record of what was removed
  if (length(to_drop) > 0) {
    writeLines(to_drop, output_file)
  }

  met_data[, !to_drop, with = FALSE]
}

# Filter the working table in place of the unfiltered one
met_data <- remove_high_na_metabolites(met_data)

#### Leaving Only Relevant Time Points

In [272]:
# Keep only rows that belong to one of the three challenges
met_data <- filter(met_data, challenge != "Other")

## Splitting the dataset based on platform

In [273]:
# Metabolite columns = all columns except the three identifier columns
metabolite_columns <- setdiff(
  names(met_data),
  c("time", "subject", "challenge")
)

# Regular expressions matching the platform tag embedded in the column names
platforms <- list(
  metabolon = "\\[P, nt-ms\\]",
  biocrates = "\\[P, t-ms\\]"
)

# Return the identifier columns plus every metabolite column whose name
# matches `pattern`. Reads the notebook-level `met_data` and
# `metabolite_columns`.
filter_metabolites <- function(pattern) {
  matching <- grep(pattern, metabolite_columns, value = TRUE)
  keep <- c("time", "subject", "challenge", matching)
  met_data[, ..keep]
}

# Return the identifier columns plus every metabolite column that matches
# none of the patterns in `platforms` (i.e. the in-house measurements).
# Reads the notebook-level `met_data`, `metabolite_columns` and `platforms`.
filter_inhouse_metabolites <- function() {
  # One logical mask per platform, combined with OR
  platform_masks <- lapply(platforms, grepl, x = metabolite_columns)
  belongs_to_platform <- Reduce(`|`, platform_masks)

  keep <- c("time", "subject", "challenge", metabolite_columns[!belongs_to_platform])
  met_data[, ..keep]
}

# One table per platform
met_data_metabolon <- filter_metabolites(pattern = platforms[["metabolon"]])
met_data_biocrates <- filter_metabolites(pattern = platforms[["biocrates"]])
met_data_inhouse <- filter_inhouse_metabolites() # whatever matches neither pattern


## Handle Missing Values - missForest

In [None]:
# Count missing values in each platform table before imputation.
# The tables used here are the ones created by the platform split above and
# passed to missForest below.
sum(is.na(met_data_metabolon))
sum(is.na(met_data_biocrates))
sum(is.na(met_data_inhouse))

In [None]:
# Turn the identifier columns (challenge, time, subject) and any remaining
# character column into factors.
convert_to_factors <- function(data) {
  with_id_factors <- mutate(
    data,
    challenge = as.factor(challenge),
    time = as.factor(time),
    subject = as.factor(subject)
  )
  mutate(with_id_factors, across(where(is.character), as.factor))
}

# Impute missing values of one dataset with missForest.
#
# data_subset: table to impute (identifier columns already converted to factors).
# ntree_val:   forwarded to missForest as `ntree`.
#
# Returns the imputed table (`$ximp` of the missForest result).
#
# A worker cluster is created only when more than one core can be used, and it
# is always stopped on exit, including when missForest fails.
perform_missForest <- function(data_subset, ntree_val = 10) {
  num_vars <- ncol(data_subset)

  # detectCores() may return NA; treat that as a single core
  available_cores <- detectCores()
  if (is.na(available_cores)) {
    available_cores <- 1L
  }

  # Leave one core free, never use more workers than variables, never fewer than 1
  num_cores <- max(1L, min(available_cores - 1L, num_vars))

  # missForest parallelises over variables; fall back to sequential otherwise
  parallel_option <- if (num_cores > 1) "variables" else "no"

  if (parallel_option != "no") {
    # FORK clusters are not available on Windows
    cluster_type <- if (.Platform$OS.type == "windows") "PSOCK" else "FORK"
    cl <- makeCluster(num_cores, type = cluster_type)
    on.exit(stopCluster(cl), add = TRUE)
    registerDoParallel(cl)
  }

  # NOTE(review): the seed is set in the calling process only; whether the
  # worker processes' random streams are governed by it should be verified
  # before relying on bit-identical results in parallel mode.
  set.seed(42)
  imputed_data <- missForest(data_subset, ntree = ntree_val, parallelize = parallel_option, verbose = TRUE)

  imputed_data$ximp
}

# Prepare and impute a list of metabolite tables.
# Each element is first passed through convert_to_factors() and then through
# perform_missForest(); the list of imputed tables is returned.
data_pipeline <- function(metabolite_datasets) {
  factorised <- lapply(metabolite_datasets, convert_to_factors)
  lapply(factorised, perform_missForest, ntree_val = 10) #400
}

# Platform tables to impute, keyed by platform
metabolite_datasets <- list(
  metabolon = met_data_metabolon,
  biocrates = met_data_biocrates,
  inhouse   = met_data_inhouse
)

# Impute every platform table
imputed_metabolite_data <- data_pipeline(metabolite_datasets = metabolite_datasets)

In [None]:
# Display the list of imputed platform tables (metabolon, biocrates, inhouse)
imputed_metabolite_data

In [113]:
### AFTER REMOVING OTHER

# 50 trees => 0.603862
# 68 trees => 0.6200632
# 69 trees => 0.594013
# 70 trees => 0.5919029
# 75 trees => 0.5951333
# 76 trees => 0.5837592
# 77 trees => 0.5951333
# 78 trees => 0.5852654
# 80 trees => 0.6201631
# 100 trees => 0.5938395
# 120 trees => 0.6130575
# 400 trees => 0.5863569

#### Merging platform-separated datasets together 

In [None]:
# Full outer join of the three imputed platform tables on the identifier columns
combined_data <- Reduce(
  f = function(left, right) {
    merge(left, right, by = c("time", "subject", "challenge"), all = TRUE)
  },
  x = list(
    imputed_metabolite_data[["metabolon"]],
    imputed_metabolite_data[["biocrates"]],
    imputed_metabolite_data[["inhouse"]]
  )
)

# Preview the merged table
head(combined_data)

## Correlation

In [None]:
# Correlate every numeric column of `data` with one target column.
#
# target_variable: name of the column every other numeric column is tested against.
# exclude_columns: column names to leave out of the analysis.
# method:          passed to cor.test() (e.g. "pearson").
# p_adjust_method: passed to p.adjust() (e.g. "bonferroni").
# data:            table to analyse; defaults to the notebook-level `combined_data`.
# output_path:     file the full, sorted result table is written to with fwrite().
#
# Returns the ten rows with the smallest adjusted p-values
# (columns: Metabolite, Correlation, p_value, p_adjusted).
correlation_analysis <- function(target_variable,
                                 exclude_columns = c(),
                                 method = "pearson",
                                 p_adjust_method = "bonferroni",
                                 data = combined_data,
                                 output_path = "../results/correlation_results") {

  df <- data

  # Fail early with a clear message instead of a length mismatch inside cor.test()
  if (!target_variable %in% names(df)) {
    stop("Target variable '", target_variable, "' not found in data", call. = FALSE)
  }

  # Numeric columns, minus the target itself and anything explicitly excluded
  numeric_columns <- vapply(df, is.numeric, logical(1))
  metabolite_columns <- setdiff(names(df)[numeric_columns], c(target_variable, exclude_columns))

  # One correlation test per metabolite column
  target_values <- df[[target_variable]]
  tests <- lapply(metabolite_columns, function(metabolite) {
    cor.test(df[[metabolite]], target_values, method = method)
  })

  cor_results <- data.frame(
    Metabolite = metabolite_columns,
    Correlation = vapply(tests, function(test) unname(test$estimate), numeric(1)),
    p_value = vapply(tests, function(test) test$p.value, numeric(1))
  )

  # Multiple-testing correction
  cor_results$p_adjusted <- p.adjust(cor_results$p_value, method = p_adjust_method)

  # Most significant first
  cor_results <- cor_results[order(cor_results$p_adjusted), ]

  fwrite(cor_results, output_path)

  # Top 10 results
  head(cor_results, 10)
}

# Example call: correlate every metabolite with insulin
result <- correlation_analysis(
  "Insulin [P, chem.]",                                 # target variable
  exclude_columns = c("time", "subject", "challenge"),  # identifier columns
  method = "pearson",
  p_adjust_method = "bonferroni"
)

# Show the ten strongest hits
print(result)

## Reformatting the Table

In [None]:
# Reshape a wide metabolite table into long format and tag each row with its
# platform.
#
# data: data.table with identifier columns time / subject / challenge and one
#       column per metabolite; the platform tag is part of the column name.
#
# Returns a long data.table with columns time, subject, challenge, metabolite,
# response and platform_name. The input table is left untouched.
reshape_long <- function(data) {
  # `:=` below modifies by reference; copy first so the caller's table
  # (e.g. combined_data) is not altered as a side effect.
  data <- copy(data)

  # Identify metabolite columns (exclude time, subject, challenge)
  metabolite_columns <- setdiff(names(data), c("time", "subject", "challenge"))

  # Convert all metabolite columns to numeric (preserves NA values)
  data[, (metabolite_columns) := lapply(.SD, as.numeric), .SDcols = metabolite_columns]

  # Reshape into long format, keeping NA responses
  long_data <- melt(data,
                    id.vars = c("time", "subject", "challenge"),
                    measure.vars = metabolite_columns,
                    variable.name = "metabolite",
                    value.name = "response",
                    na.rm = FALSE)

  # Derive the platform name from the tag in the metabolite column name.
  # The dot in "chem." is escaped so it only matches a literal dot.
  long_data[, platform_name := case_when(
    grepl("\\[P, t-ms\\]", metabolite) ~ "Biocrates p150 [t-ms]",
    grepl("\\[P, nt-ms\\]", metabolite) ~ "Metabolon HD4 [nt-ms]",
    grepl("\\[P, chem\\.\\]", metabolite) ~ "In-house biochemistry [chem.]",
    TRUE ~ "Unknown"  # anything that carries none of the three tags
  )]

  return(long_data)
}

# Reshape the combined dataset into long format
met_data <- reshape_long(combined_data)


# Print first rows of the final combined dataset
head(met_data)

Clean up the metabolite names

In [None]:
# Normalise metabolite names in one pass:
# strip the bracketed platform tag, trim surrounding whitespace, lower-case.
met_data[, metabolite := tolower(trimws(gsub("\\[.*?\\]", "", metabolite)))]

# Check the result
head(met_data)


Some information is missing from our met_data dataset (e.g. super_pathway and sub_pathway). We need to add it for further analysis, and we take it from info_data.

Clean up of info_data:

In [None]:
# Restrict the annotation table to plasma measurements
info_data <- info_data[fluid == "plasma"]

# Normalise the metabolite names so they can be matched against met_data
info_data[, metabolite := {
  cleaned <- gsub("[*]", "", metabolite)         # drop asterisks
  cleaned <- gsub("\u200B", "", cleaned)         # drop zero-width spaces
  cleaned <- gsub("[[:space:]]+$", "", cleaned)  # drop trailing whitespace
  tolower(trimws(cleaned))                       # trim both ends, lower-case
}]

head(info_data)

Merging the data 

In [None]:
# Left join: attach super_pathway / sub_pathway from info_data to every
# met_data row, matching on metabolite name and platform.
met_data <- merge(
  x = met_data,
  y = info_data[, c("metabolite", "platform_name", "super_pathway", "sub_pathway"), with = FALSE],
  by = c("metabolite", "platform_name"),
  all.x = TRUE
)

# Check the merged table
head(met_data)

## Hypothesis Testing

RUN THIS CELL TO EXCLUDE THE IN-HOUSE BIOCHEMISTRY MEASUREMENTS (E.G. INSULIN) FROM THE ANALYSIS

In [None]:
# Drop every row measured on the in-house biochemistry platform
met_data_filtered <- met_data[platform_name != "In-house biochemistry [chem.]"]

# Preview the filtered table
head(met_data_filtered)

RUN THIS CELL INSTEAD TO KEEP ALL PLATFORMS, INCLUDING THE IN-HOUSE BIOCHEMISTRY MEASUREMENTS (E.G. INSULIN)

In [280]:
# Keep all platforms. copy() is required: plain assignment would make
# met_data_filtered an alias of met_data, so later `:=` updates on
# met_data_filtered would silently modify met_data as well.
met_data_filtered <- copy(met_data)

For the ANOVA and t-test we only want targeted and non-targeted metabolites, so molecules measured by the in-house biochemistry platform (e.g. insulin) are removed, since these are not metabolites.

In [None]:
# Number of distinct (metabolite, platform) pairs in the table under test
num_unique_metabolite_platforms <- uniqueN(
  met_data_filtered,
  by = c("metabolite", "platform_name")
)
print(num_unique_metabolite_platforms)

## Anova-Test

In [None]:
# Bonferroni-style significance threshold (0.05 divided by 634 tests)
p_threshold <- 0.05 / 634

# time and subject are treated as categorical in the test below
met_data_filtered[, c("time", "subject") := lapply(.SD, as.factor),
                  .SDcols = c("time", "subject")]

# One table per challenge
metabolite_data_fasting <- met_data_filtered[challenge == "Fasting"]
metabolite_data_pat     <- met_data_filtered[challenge == "Physical Activity"]
metabolite_data_oltt    <- met_data_filtered[challenge == "OLTT"]

# Run nparLD's ld.f1 test once per (metabolite, platform) pair of one challenge.
#
# metabolite_data: long table with columns metabolite, platform_name,
#                  super_pathway, sub_pathway, response, time, subject.
# challenge_name:  label stored in the `challenge` column of the result.
# threshold:       p-value cut-off for the `significant` flag; defaults to the
#                  notebook-level `p_threshold`.
#
# Returns a data.table with one row per tested pair and the columns
# challenge, metabolite, platform_name, super_pathway, sub_pathway, p_value,
# significant. If no pair has enough rows to be tested, an empty table with
# the same columns is returned.
run_anova_like_test <- function(metabolite_data, challenge_name, threshold = p_threshold) {

    # Per-pair results, keyed by "<metabolite>_<platform>"
    results <- list()

    # Unique metabolite/platform combinations (with their pathway annotation)
    unique_metabolites <- unique(metabolite_data[, .(metabolite, platform_name, super_pathway, sub_pathway)])

    for (i in seq_len(nrow(unique_metabolites))) {

        met <- unique_metabolites$metabolite[i]
        plat <- unique_metabolites$platform_name[i]
        super_path <- unique_metabolites$super_pathway[i]
        sub_path <- unique_metabolites$sub_pathway[i]

        # Rows belonging to this metabolite on this platform
        subset_data <- metabolite_data[metabolite == met & platform_name == plat]

        # Skip pairs with too few observations
        if (nrow(subset_data) > 2) {

            test_result <- ld.f1(y = subset_data$response,
                                 time = subset_data$time,
                                 subject = subset_data$subject,
                                 description=FALSE)

            # p-value of the ANOVA-type statistic
            p_value <- test_result$ANOVA.test$`p-value`

            results[[paste(met, plat, sep = "_")]] <- data.table(
              challenge = challenge_name,
              metabolite = met,
              platform_name = plat,
              super_pathway = super_path,
              sub_pathway = sub_path,
              p_value = p_value
            )
        }
    }

    # Nothing was tested: rbindlist() would yield a table without a p_value
    # column and the `significant` step below would fail.
    if (length(results) == 0L) {
        return(data.table(
          challenge = character(),
          metabolite = character(),
          platform_name = character(),
          super_pathway = character(),
          sub_pathway = character(),
          p_value = numeric(),
          significant = logical()
        ))
    }

    anova_results <- rbindlist(results, fill = TRUE)

    # Flag significant time effects
    anova_results[, significant := p_value < threshold]

    return(anova_results)
}

# One test run per challenge
anova_results_fasting <- run_anova_like_test(
  metabolite_data = metabolite_data_fasting,
  challenge_name = "Fasting"
)
anova_results_pat <- run_anova_like_test(
  metabolite_data = metabolite_data_pat,
  challenge_name = "Physical Activity"
)
anova_results_oltt <- run_anova_like_test(
  metabolite_data = metabolite_data_oltt,
  challenge_name = "OLTT"
)

# Combine all results into one table
#final_anova_results <- rbind(anova_results_ogtt, anova_results_oltt, anova_results_sld, fill = TRUE)
#final_anova_results

#### Significant effect of time on metabolite levels during at least one challenge

In [None]:
# One row per distinct metabolite / platform / pathway combination
all_metabolites <- unique(
  met_data_filtered[, c("metabolite", "platform_name", "super_pathway", "sub_pathway"), with = FALSE]
)

# Order by super_pathway, then sub_pathway, then case-insensitive metabolite name
all_metabolites <- all_metabolites[order(super_pathway, sub_pathway, tolower(metabolite))]

# Start with "not significant anywhere"; flipped to TRUE further below
all_metabolites[, significant_any_challenge := FALSE]

# (metabolite, platform) pairs flagged significant in each challenge
significant_fasting <- anova_results_fasting[(significant), .(metabolite, platform_name)]
significant_pat     <- anova_results_pat[(significant), .(metabolite, platform_name)]
significant_oltt    <- anova_results_oltt[(significant), .(metabolite, platform_name)]

# Set significant_any_challenge to TRUE, by reference, for every row of
# `met_data_filtered` whose (metabolite, platform_name) pair occurs in
# `sig_data`. Does nothing when `sig_data` has no rows.
update_significance <- function(met_data_filtered, sig_data) {
    if (nrow(sig_data) == 0) {
        return(invisible(NULL))
    }
    met_data_filtered[sig_data, on = .(metabolite,platform_name), significant_any_challenge := TRUE]
}

# Flag every metabolite that is significant in at least one challenge
invisible(lapply(
  list(significant_fasting, significant_pat, significant_oltt),
  function(sig_pairs) update_significance(all_metabolites, sig_pairs)
))

# Save the output file
#fwrite(all_metabolites, "../results/anova_results_significant_in_at_least_one_challenge.csv")

# Preview the flagged table
head(all_metabolites)

### Figure 2 

#### Putting together a table

In [None]:
# Step-by-step construction of the Figure 2 table, with intermediate output
# for inspection.

# Merge the per-challenge ANOVA results into one wide table
anova_results_combined <- merge(anova_results_fasting, anova_results_pat, by = c("metabolite", "platform_name", "super_pathway", "sub_pathway"), all = TRUE)
anova_results_combined <- merge(anova_results_combined, anova_results_oltt, by = c("metabolite", "platform_name", "super_pathway", "sub_pathway"), all = TRUE)

# After the two merges the p-value columns carry the suffixes .x (fasting),
# .y (physical activity) and none (OLTT)
setnames(anova_results_combined,
         c("p_value.x", "p_value.y", "p_value"),
         c("p_value_fasting", "p_value_pat", "p_value_oltt"))

# The per-challenge label and flag columns are not needed in the wide table
anova_results_combined[, c("challenge.x", "challenge.y", "challenge", "significant.x", "significant.y", "significant") := NULL]

# Attach the p-values to the metabolite list
all_metabolites_fig <- merge(all_metabolites, anova_results_combined, by = c("metabolite", "platform_name", "super_pathway", "sub_pathway"), all.x = TRUE)

# Keep only the non-targeted and in-house platforms
all_metabolites_fig <- all_metabolites_fig[
  platform_name %in% c("Metabolon HD4 [nt-ms]", "In-house biochemistry [chem.]")
]

# Attach the individual measurements
all_metabolites_fig <- merge(all_metabolites_fig, met_data[, .(metabolite, platform_name, subject, time,
                            challenge, response)],
                              by = c("metabolite", "platform_name"), all.x = TRUE)

# Mean response over subjects per metabolite, platform, challenge and time
all_metabolites_fig[, mean_response := mean(response, na.rm = TRUE),
                       by = .(metabolite, platform_name, challenge, time)]

print("after mean")
head(all_metabolites_fig)

# Collapse to one row per metabolite / platform / challenge / time
all_metabolites_fig <- all_metabolites_fig[, !c("subject", "response"), with = FALSE]
all_metabolites_fig <- unique(all_metabolites_fig)

# Change of the mean response relative to the first time point of the same
# challenge. A fixed baseline such as time == 1 only exists in the Fasting
# block, so the earliest time point within each group is used instead.
# NOTE(review): the difference is only a log2 fold change if `response` is
# already on a log2 scale; confirm against the input data.
all_metabolites_fig[, log2_foldchange := {
  time_numeric <- as.numeric(as.character(time))
  baseline_idx <- which.min(time_numeric)
  baseline <- if (length(baseline_idx) == 1L) mean_response[baseline_idx] else NA_real_
  mean_response - baseline
}, by = .(metabolite, platform_name, challenge)]

# Replace NA in mean_response with 0 if time is 0
all_metabolites_fig[time == 0 & is.na(mean_response), mean_response := 0]

# Pick the p-value that belongs to the row's challenge
all_metabolites_fig[, p_value :=
    ifelse(challenge == "Fasting", p_value_fasting,
    ifelse(challenge == "Physical Activity", p_value_pat,
    ifelse(challenge == "OLTT", p_value_oltt, NA)))]

# The per-challenge p-value columns are no longer needed
all_metabolites_fig[, c("p_value_fasting", "p_value_pat", "p_value_oltt") := NULL]

# Derived columns used by the plots
all_metabolites_fig[, neg_log10_p_value := -log10(p_value)]
all_metabolites_fig[, abs_log2_foldchange := abs(log2_foldchange)]

unique(all_metabolites_fig$significant_any_challenge)

In [None]:
# Build the table behind Figure 2 and write it to `output_path`.
#
# anova_results_fasting / _pat / _oltt: per-challenge results of
#     run_anova_like_test() (columns metabolite, platform_name, super_pathway,
#     sub_pathway, challenge, p_value, significant).
# all_metabolites: one row per metabolite/platform with the
#     significant_any_challenge flag.
# met_data:        long measurement table (metabolite, platform_name, subject,
#                  time, challenge, response).
# output_path:     CSV file the final table is written to.
#
# Returns the filtered table, sorted by decreasing absolute fold change.
generate_figure2_data <- function(anova_results_fasting, anova_results_pat, anova_results_oltt, 
                                     all_metabolites, met_data, output_path = "../results/figure_2_table.csv") {

  key_cols <- c("metabolite", "platform_name", "super_pathway", "sub_pathway")

  # Merge the per-challenge ANOVA results into one wide table
  anova_results_combined <- merge(anova_results_fasting, anova_results_pat, by = key_cols, all = TRUE)
  anova_results_combined <- merge(anova_results_combined, anova_results_oltt, by = key_cols, all = TRUE)

  # After the two merges the p-value columns carry the suffixes .x (fasting),
  # .y (physical activity) and none (OLTT)
  setnames(anova_results_combined,
           c("p_value.x", "p_value.y", "p_value"),
           c("p_value_fasting", "p_value_pat", "p_value_oltt"))

  # The per-challenge label and flag columns are not needed in the wide table
  anova_results_combined[, c("challenge.x", "challenge.y", "challenge", "significant.x", "significant.y", "significant") := NULL]

  # Attach the p-values to the metabolite list
  all_metabolites_fig <- merge(all_metabolites, anova_results_combined, by = key_cols, all.x = TRUE)

  # Keep only the non-targeted and in-house platforms
  all_metabolites_fig <- all_metabolites_fig[
    platform_name %in% c("Metabolon HD4 [nt-ms]", "In-house biochemistry [chem.]")
  ]

  # Attach the individual measurements
  all_metabolites_fig <- merge(all_metabolites_fig,
                               met_data[, .(metabolite, platform_name, subject, time, challenge, response)],
                               by = c("metabolite", "platform_name"), all.x = TRUE)

  # Mean response over subjects per metabolite, platform, challenge and time
  all_metabolites_fig[, mean_response := mean(response, na.rm = TRUE),
                         by = .(metabolite, platform_name, challenge, time)]

  # Collapse to one row per metabolite / platform / challenge / time
  all_metabolites_fig <- all_metabolites_fig[, !c("subject", "response"), with = FALSE]
  all_metabolites_fig <- unique(all_metabolites_fig)

  # Change of the mean response relative to the first time point of the same
  # challenge. No challenge contains a time point 0 (the intervals are 1-10,
  # 33-39 and 40-50), so the earliest time point within each group is used
  # as baseline.
  # NOTE(review): the difference is only a log2 fold change if `response` is
  # already on a log2 scale; confirm against the input data.
  all_metabolites_fig[, log2_foldchange := {
    time_numeric <- as.numeric(as.character(time))
    baseline_idx <- which.min(time_numeric)
    baseline <- if (length(baseline_idx) == 1L) mean_response[baseline_idx] else NA_real_
    mean_response - baseline
  }, by = .(metabolite, platform_name, challenge)]

  # Replace NA in mean_response with 0 if time is 0
  all_metabolites_fig[time == 0 & is.na(mean_response), mean_response := 0]

  # Pick the p-value that belongs to the row's challenge
  all_metabolites_fig[, p_value :=
      ifelse(challenge == "Fasting", p_value_fasting,
      ifelse(challenge == "Physical Activity", p_value_pat,
      ifelse(challenge == "OLTT", p_value_oltt, NA)))]

  # The per-challenge p-value columns are no longer needed
  all_metabolites_fig[, c("p_value_fasting", "p_value_pat", "p_value_oltt") := NULL]

  # Derived columns used by the plots
  all_metabolites_fig[, neg_log10_p_value := -log10(p_value)]
  all_metabolites_fig[, abs_log2_foldchange := abs(log2_foldchange)]

  # Keep rows that are either (significant in any challenge AND change by more
  # than 1 in absolute value) OR have -log10(p) above 40
  all_metabolites_fig <- all_metabolites_fig[(significant_any_challenge == TRUE & abs_log2_foldchange > 1) | neg_log10_p_value > 40]

  # Largest absolute change first
  setorder(all_metabolites_fig, -abs_log2_foldchange)

  fwrite(all_metabolites_fig, output_path)

  return(all_metabolites_fig)
}

# Build the Figure 2 table and write it to ../results/fig_2_table.csv
all_metabolites_fig <- generate_figure2_data(
  anova_results_fasting = anova_results_fasting,
  anova_results_pat = anova_results_pat,
  anova_results_oltt = anova_results_oltt,
  all_metabolites = all_metabolites,
  met_data = met_data,
  output_path = "../results/fig_2_table.csv"
)
head(all_metabolites_fig)

In [None]:
# Rows kept in the figure table although not significant in any challenge
all_metabolites_fig[significant_any_challenge == FALSE]

#### Volcano Plot

In [None]:
 # Colour per challenge. The names must equal the values of the `challenge`
 # column ("Fasting", "Physical Activity", "OLTT"); otherwise
 # scale_color_manual() cannot match any point to a colour.
  challenge_colors <- c("Fasting" = "red",
                        "Physical Activity" = "blue",
                        "OLTT" = "#BA8E23")

  # Shape per time point.
  # NOTE(review): the names (15 ... 240) look like minutes after challenge
  # start, while the `time` column of this dataset holds time-point indices
  # (1-10, 33-39, 40-50). Confirm which variable the plots should map to shape
  # and adjust the names accordingly.
  time_point_shapes <- c(
    "15" = 0,   # Empty square
    "30" = 1,   # Empty circle
    "45" = 2,   # Empty triangle facing up
    "60" = 15,  # Filled square
    "90" = 6,   # Empty triangle facing down
    "120" = 16, # Filled circle
    "180" = 17, # Filled triangle facing up
    "240" = 18  # Filled rhombus
  )

  # Legend labels per challenge
  challenge_labels <- c("Fasting" = "Fasting",
                        "Physical Activity" = "Physical Activity",
                        "OLTT" = "Oral lipid tolerance test")

# Draw the volcano plot (fold change vs. -log10 p-value) and save it.
#
# all_metabolites_fig: table with columns log2_foldchange, neg_log10_p_value,
#                      challenge and challenge_time.
# output_path:         image file written by ggsave().
#
# Uses the notebook-level challenge_colors, challenge_labels and
# time_point_shapes. Returns the ggplot object.
#
# NOTE(review): the shape aesthetic reads a column `challenge_time`, which the
# table returned by generate_figure2_data() does not contain (it has `time`).
# Decide which column should drive the shapes before running this.
generate_figure2a <- function(all_metabolites_fig, output_path = "../results/plots/volcano_plot_figure.png") {

  # Integer x breaks spanning the observed fold changes; NA / infinite values
  # are ignored, and ggplot's default breaks are used if nothing is left.
  finite_fc <- all_metabolites_fig$log2_foldchange[is.finite(all_metabolites_fig$log2_foldchange)]
  x_breaks <- if (length(finite_fc) > 0) {
    seq(floor(min(finite_fc)), ceiling(max(finite_fc)), by = 1)
  } else {
    waiver()
  }

  volcano_plot <- ggplot(all_metabolites_fig, aes(x = log2_foldchange, y = neg_log10_p_value, 
                     color = challenge, shape = as.factor(challenge_time))) +
    # Shaded band for |fold change| <= 1
    annotate("rect", xmin = -1, xmax = 1, ymin = 0, ymax = Inf, alpha = 0.2, fill = "blue") +
    # Dashed guides at x = -1, x = 1 and at the corrected significance level
    geom_vline(xintercept = -1, linetype = "dashed", color = "black", size = 0.5) +
    geom_vline(xintercept = 1, linetype = "dashed", color = "black", size = 0.5) +
    geom_hline(yintercept = -log10(0.05/634), linetype = "dashed", color = "black", size = 0.5) +
    # Points are added after the band and guides so they are drawn on top
    geom_point(size = 3, alpha = 1) +
    scale_color_manual(values = challenge_colors, labels = challenge_labels, 
                       guide = "none") +
    scale_x_continuous(breaks = x_breaks) +
    scale_shape_manual(values = time_point_shapes) +
    # Second horizontal guide at -log10(p) = 40
    geom_hline(yintercept = 40, linetype = "dashed", color = "black", size = 0.5) +
    theme_minimal() +
    labs(x = "log2 fold change", 
         y = "-log10(p-value)") +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 85)) +
    theme(
      legend.position = "none",
      axis.line = element_line(color = "black", size = 1),
      axis.ticks = element_line(color = "black", size = 1),
      axis.title = element_text(size = 14, face = "bold"),
      axis.text = element_text(size = 14)
    )

  # Save the plot (no legend) on a white background
  ggsave(output_path, plot = volcano_plot, bg = "white")

  return(volcano_plot)
}

# Example of how to call the function
generate_figure2a(all_metabolites_fig, "../results/plots/fig_2_A_volcano_plot_logfold_pvalue.png")

#### Forest Plot

In [None]:
# Draw the forest plot (fold change per metabolite, faceted by super_pathway)
# and save it.
#
# all_metabolites_fig: table with columns log2_foldchange, metabolite,
#                      super_pathway, challenge and challenge_time.
# output_path:         image file written by ggsave().
#
# Uses the notebook-level challenge_colors, challenge_labels and
# time_point_shapes. Returns the ggplot object.
#
# NOTE(review): the shape aesthetic reads a column `challenge_time`, which the
# table returned by generate_figure2_data() does not contain (it has `time`).
# Decide which column should drive the shapes before running this.
generate_figure2b <- function(all_metabolites_fig, output_path = "../results/plots/forest_plot.png") {
  
  # Display order of the super_pathway facets; values not listed here become NA
  pathway_order <- c("Hormones", "Carbohydrate", "Nucleotide", "Xenobiotics", 
                     "Amino Acid", "Lipid", "Peptide")
  
  all_metabolites_fig$super_pathway <- factor(all_metabolites_fig$super_pathway, levels = pathway_order)
  
  # Sort by super_pathway, then alphabetically by metabolite
  all_metabolites_fig <- all_metabolites_fig[order(all_metabolites_fig$super_pathway, all_metabolites_fig$metabolite), ]
  
  # Fix the metabolite order on the y axis to the sorted order
  all_metabolites_fig$metabolite <- factor(all_metabolites_fig$metabolite, 
                                            levels = unique(all_metabolites_fig$metabolite[order(all_metabolites_fig$super_pathway, all_metabolites_fig$metabolite)]))
  
  # Largest observed fold change, ignoring NA (rows without a baseline)
  max_fc <- max(all_metabolites_fig$log2_foldchange, na.rm = TRUE)
  
  # Bracket extent per super_pathway; factor() re-levels within each group, so
  # ymin / ymax are positions inside that group's facet
  bracket_data <- all_metabolites_fig %>%
    group_by(super_pathway) %>%
    summarise(ymin = min(as.numeric(factor(metabolite))), 
              ymax = max(as.numeric(factor(metabolite)))) %>%
    mutate(x = max_fc + 1.2)  # to the right of all points
  
  # Right-most x position used for breaks and axis expansion
  max_x_value <- max_fc + 1.5  
  
  # Label position: slightly right of the bracket, vertically centred
  label_data <- bracket_data %>%
    mutate(label_x = x + 0.15,
           label_y = (ymin + ymax) / 2)
  
  forest_plot <- ggplot(all_metabolites_fig, aes(x = log2_foldchange, y = metabolite)) +
    # Shaded band for |fold change| <= 1 and a dashed zero line
    annotate("rect", xmin = -1, xmax = 1, ymin = -Inf, ymax = Inf, alpha = 0.3, fill = "gray80") +
    geom_vline(xintercept = 0, linetype = "dashed", color = "black", size = 0.3) +
    geom_point(aes(color = challenge, shape = as.factor(challenge_time)), size = 2, alpha = 0.8) +
  
    # Colour and shape scales with their legend titles
    scale_color_manual(
      values = challenge_colors, 
      labels = challenge_labels, 
      guide = guide_legend(title = "Challenge", order = 1)
    ) +
    scale_shape_manual(
      values = time_point_shapes,
      guide = guide_legend(title = "Challenge time [min]", order = 2)
    ) +
  
    labs(x = "log2 fold change",
         y = "Metabolites",
         color = "Challenge", 
         shape = "Challenge time [min]") +
  
    theme_bw() +
    theme(
      legend.position = "top",
      legend.title = element_text(size = 12, face = "bold"),
      legend.text = element_text(size = 10),
      
      axis.text.x = element_text(size = 14),
      axis.text.y = element_text(size = 12),
      axis.title.x = element_text(size = 16, face = "bold"),
      axis.title.y = element_text(size = 16, face = "bold"),
      
      strip.background = element_blank(),
      strip.text = element_text(size = 14, face = "bold"),
      
      panel.grid = element_blank(),
      panel.border = element_blank(),
      panel.spacing = unit(0.01, "null"), 
      axis.line = element_line(color = "black"),
      
      strip.text.y.left = element_blank(),
      strip.placement = "outside"
    ) +
  
    # Two separate legends: challenge (1 column) and challenge time (2 columns)
    guides(
      color = guide_legend(ncol = 1, order = 1, title.position = "top"),
      shape = guide_legend(ncol = 2, order = 2, title.position = "top")
    ) +
  
    # One facet row per super_pathway, each sized by its number of metabolites
    facet_grid(rows = vars(super_pathway), scales = "free_y", space = "free_y", switch = "y") +
  
    # x axis: integer breaks and room on the right for brackets and labels
    scale_x_continuous(breaks = seq(-2, max_x_value, by = 1)) +
    expand_limits(x = c(0, max_x_value + 1)) +
  
    # Brackets on the right side: vertical bar plus two short ticks
    geom_segment(data = bracket_data, aes(x = x, xend = x, y = ymin, yend = ymax), size = 0.5) +
    geom_segment(data = bracket_data, aes(x = x, xend = x - 0.2, y = ymin, yend = ymin), size = 0.5) +
    geom_segment(data = bracket_data, aes(x = x, xend = x - 0.2, y = ymax, yend = ymax), size = 0.5) +
  
    # super_pathway labels next to the brackets
    geom_text(data = label_data, aes(x = label_x, y = label_y, label = super_pathway),
              hjust = 0, vjust = 0.5, size = 5)

  # Fixed size so the labels on the right are not cut off
  ggsave(output_path, plot = forest_plot, bg = "white", width = 12, height = 13, units = "in")
  
  return(forest_plot)
}

# Example of how to call the function
generate_figure2b(all_metabolites_fig, "../results/plots/fig_2_B_forest_plot_logfold.png")

#### Significant effect of time on metabolite levels per challenge and across challenges

In [284]:
# Add one logical flag per challenge and per overlap category, all FALSE
all_metabolites[, c(
  "significant_fasting",
  "significant_pat",
  "significant_oltt",
  "significant_fasting_pat",
  "significant_fasting_oltt",
  "significant_pat_oltt",
  "significant_fasting_pat_oltt"
) := FALSE]

# Set column `column_name` to TRUE, by reference, for every row of `met_data`
# whose (metabolite, platform_name) pair occurs in `sig_data`.
# Does nothing when `sig_data` has no rows.
update_significance <- function(met_data, sig_data, column_name) {
    if (nrow(sig_data) == 0) {
        return(invisible(NULL))
    }
    met_data[sig_data, on = .(metabolite, platform_name), (column_name) := TRUE]
}

# Per-challenge flags
update_significance(met_data = all_metabolites, sig_data = significant_fasting, column_name = "significant_fasting")
update_significance(met_data = all_metabolites, sig_data = significant_pat, column_name = "significant_pat")
update_significance(met_data = all_metabolites, sig_data = significant_oltt, column_name = "significant_oltt")

# Overlap flags: each pairwise flag excludes the third challenge, so the four
# categories are mutually exclusive
all_metabolites[, c(
  "significant_fasting_pat",
  "significant_fasting_oltt",
  "significant_pat_oltt",
  "significant_fasting_pat_oltt"
) := list(
  significant_fasting & significant_pat & !significant_oltt,
  significant_fasting & significant_oltt & !significant_pat,
  significant_pat & significant_oltt & !significant_fasting,
  significant_fasting & significant_pat & significant_oltt
)]

# Save the updated table
#fwrite(all_metabolites, "results/all_metabolites_with_significance.csv")

#### Input for Venn Diagram

In [None]:
# Summarise the significance flags of `all_metabolites` as counts for a Venn diagram.
#
# Reads the logical columns significant_fasting, significant_pat, significant_oltt,
# the four overlap columns and significant_any_challenge, and returns a data.table
# with one row per category (`Category`, `Count`).
create_venn_table <- function(all_metabolites) {

  # Per-challenge totals
  n_fasting <- sum(all_metabolites$significant_fasting)
  n_pat <- sum(all_metabolites$significant_pat)
  n_oltt <- sum(all_metabolites$significant_oltt)

  # Overlap totals as stored in the overlap flag columns
  n_fasting_pat <- sum(all_metabolites$significant_fasting_pat)
  n_fasting_oltt <- sum(all_metabolites$significant_fasting_oltt)
  n_pat_oltt <- sum(all_metabolites$significant_pat_oltt)
  n_all_three <- sum(all_metabolites$significant_fasting_pat_oltt)

  # Overall totals
  n_metabolites <- nrow(all_metabolites)
  n_any <- sum(all_metabolites$significant_any_challenge)

  venn_data <- c(
    "Fasting" = n_fasting,
    "PAT" = n_pat,
    "OLTT" = n_oltt,
    "Fasting ∩ PAT" = n_fasting_pat,
    "Fasting ∩ OLTT" = n_fasting_oltt,
    "PAT ∩ OLTT" = n_pat_oltt,
    "Fasting ∩ PAT ∩ OLTT" = n_all_three,
    # Exclusive counts: challenge total minus every overlap that involves it
    "Fasting (exclusive)" = n_fasting - n_fasting_pat - n_fasting_oltt - n_all_three,
    "PAT (exclusive)" = n_pat - n_pat_oltt - n_fasting_pat - n_all_three,
    "OLTT (exclusive)" = n_oltt - n_fasting_oltt - n_pat_oltt - n_all_three,
    "Total Metabolites" = n_metabolites,
    "Total Significant in Any Challenge" = n_any,
    "Total Not Significant" = n_metabolites - n_any
  )

  # Named vector -> two-column table (category label, count)
  data.table(Category = names(venn_data), Count = venn_data)
}

# Build the Venn diagram count table for all metabolites
create_venn_table(all_metabolites = all_metabolites)

#### Input for Nested Donut - Metabolite Responses by Class

In [None]:
# Count metabolites per super_pathway (total and significant in any challenge)
# and write the counts to a CSV file for the nested donut plot.
#
# Arguments:
#   all_metabolites  table with the columns `super_pathway` and
#                    `significant_any_challenge` (logical).
#   output_path      CSV file to write; defaults to the path used so far.
#
# Returns the count table (super_pathway, total, significant).
create_nested_donut_table <- function(all_metabolites,
                                      output_path = "../results/nested_donut_counts_per_pathway.csv") {
  # Total number of metabolites per super_pathway
  super_pathway_counts <- table(all_metabolites$super_pathway)

  # Number of metabolites per super_pathway that are significant in any challenge
  sig_super_pathway_counts <- table(all_metabolites$super_pathway[all_metabolites$significant_any_challenge])

  # Look the significant counts up by pathway name so both columns line up row by
  # row; the original intent is that pathways without significant metabolites
  # are reported as NA rather than dropped.
  nested_donut_table <- data.table(
    super_pathway = names(super_pathway_counts),
    total = as.integer(super_pathway_counts),
    significant = as.integer(sig_super_pathway_counts[names(super_pathway_counts)])
  )

  # Persist the table and report where it went
  fwrite(nested_donut_table, file = output_path)
  message("Nested Donut input saved: ", output_path)
  nested_donut_table
}

# Build the nested donut count table and write it to disk
create_nested_donut_table(all_metabolites = all_metabolites)

## T-Test

In [None]:
# Paired t-tests comparing the Physical Activity baseline (time 33) against the
# Fasting baseline (time 1) for every metabolite/platform combination.
#
# Arguments:
#   met_data                    long-format data.table; the columns challenge, time,
#                               subject, metabolite, platform_name and response are read.
#   all_metabolites             per-metabolite table providing super_pathway,
#                               sub_pathway and the significance flags.
#   significant_challenge_type  "all" uses significant_fasting_pat_oltt; any other
#                               value (default "any") uses significant_any_challenge.
#   filter_significant          if TRUE, keep only rows whose chosen flag is TRUE.
#   output_path                 CSV file the (filtered) results are written to.
#
# Returns (invisibly) the table written to `output_path`, so the result can be
# assigned by the caller.
perform_paired_ttests <- function(met_data, all_metabolites,
                                  significant_challenge_type = "any",
                                  filter_significant = TRUE,
                                  output_path = "../results/paired_ttest_results.csv") {

  # Baseline measurements of the two challenges being compared
  baseline_fasting <- met_data[challenge == "Fasting" & time == "1"]
  baseline_pat <- met_data[challenge == "Physical Activity" & time == "33"]

  # Keep only subjects that have a baseline in both challenges
  common_subjects <- intersect(baseline_fasting$subject, baseline_pat$subject)
  baseline_fasting <- baseline_fasting[subject %in% common_subjects]
  baseline_pat <- baseline_pat[subject %in% common_subjects]

  # Significance flag reported alongside each test
  significance_column <- if (significant_challenge_type == "all") {
    "significant_fasting_pat_oltt"
  } else {
    "significant_any_challenge"
  }

  # Unique metabolite-platform combinations
  metabolites <- unique(met_data[, .(metabolite, platform_name)])

  results <- lapply(seq_len(nrow(metabolites)), function(i) {
    met_name <- metabolites$metabolite[i]
    met_platform <- metabolites$platform_name[i]

    fasting_rows <- baseline_fasting[metabolite == met_name & platform_name == met_platform]
    pat_rows <- baseline_pat[metabolite == met_name & platform_name == met_platform]

    # A paired test matches values by position, so order both sides by subject
    # instead of relying on the row order of met_data.
    fasting_values <- fasting_rows[order(subject), response]
    pat_values <- pat_rows[order(subject), response]

    # Need at least two values on each side for a meaningful comparison
    if (!(length(fasting_values) > 1 && length(pat_values) > 1)) {
      return(NULL)
    }

    mean_diff_pat <- mean(pat_values, na.rm = TRUE) - mean(fasting_values, na.rm = TRUE)

    # t.test() fails e.g. on unequal lengths or constant differences; report NA then
    p_val_pat <- tryCatch(
      t.test(pat_values, fasting_values, paired = TRUE, var.equal = FALSE)$p.value,
      error = function(e) NA_real_
    )

    # Annotation and significance flag for this metabolite/platform
    annotation <- all_metabolites[metabolite == met_name & platform_name == met_platform]

    data.table(
      metabolite = met_name,
      platform_name = met_platform,
      super_pathway = annotation$super_pathway,
      sub_pathway = annotation$sub_pathway,
      mean_diff_PAT_FASTING = mean_diff_pat,
      pvalue_PAT_FASTING = p_val_pat,
      significant_response = annotation[[significance_column]]
    )
  })

  # Drop skipped combinations and stack the rest
  results <- rbindlist(Filter(Negate(is.null), results), fill = TRUE)

  # Optionally keep only metabolites flagged as significant; an empty result has
  # no `significant_response` column to filter on.
  if (filter_significant && nrow(results) > 0) {
    filtered_results <- results[significant_response == TRUE]
  } else {
    filtered_results <- results
  }

  # Save filtered results to a CSV file
  fwrite(filtered_results, output_path)
  message("Filtered T-Test completed! Filtered results saved in: ", output_path)

  # Print summary of the filtered results
  num_significant <- nrow(filtered_results)
  total_tests <- nrow(results)
  percentage <- if (total_tests > 0) (num_significant / total_tests) * 100 else 0

  cat("Number of significant metabolites (filtered):", num_significant, "\n")
  cat("Percentage of significant results (filtered):", round(percentage, 2), "%\n")

  invisible(filtered_results)
}

# Run the paired t-tests and keep the function's return value in `filtered_results`
filtered_results <- perform_paired_ttests(
  met_data = met_data,
  all_metabolites = all_metabolites,
  significant_challenge_type = "any", # either "any" or "all"
  filter_significant = TRUE,          # TRUE keeps only rows flagged as significant
  output_path = "../results/paired_ttest_results.csv"
)


In [None]:
### OLD TTEST
# Top-level version of the paired t-test (Physical Activity baseline vs. Fasting
# baseline). It reports the flag `significant_fasting_pat_oltt` and keeps only
# metabolites for which that flag is TRUE. All objects are created globally.

# Select baseline data for the two challenges being compared
baseline_fasting <- met_data[challenge == "Fasting" & time == "1"]
baseline_pat  <- met_data[challenge == "Physical Activity"  & time == "33"]

# Ensure only subjects present in both baselines are used
common_subjects <- Reduce(intersect, list(baseline_fasting$subject, baseline_pat$subject))
baseline_fasting <- baseline_fasting[subject %in% common_subjects]
baseline_pat <- baseline_pat[subject %in% common_subjects]

# Get unique metabolite-platform combinations
metabolites <- unique(met_data[, .(metabolite, platform_name)])

# Perform paired t-tests and keep significance + pathway info
results <- lapply(seq_len(nrow(metabolites)), function(i) {
  met_name <- metabolites$metabolite[i]
  met_platform <- metabolites$platform_name[i]

  # Subset using both metabolite name and platform; order by subject so that
  # the paired test matches the two measurements of the same subject
  fasting_values <- baseline_fasting[metabolite == met_name & platform_name == met_platform][order(subject), response]
  pat_values  <- baseline_pat[metabolite == met_name & platform_name == met_platform][order(subject), response]

  # Ensure valid comparisons (at least two values in each group)
  if (length(fasting_values) > 1 && length(pat_values) > 1) {
    mean_diff_pat <- mean(pat_values, na.rm = TRUE) - mean(fasting_values, na.rm = TRUE)

    # Perform the paired t-test; a failing test is reported as NA
    p_val_pat <- tryCatch(
      t.test(pat_values, fasting_values, paired = TRUE, var.equal = FALSE)$p.value,
      error = function(e) NA_real_
    )

    # Retrieve significance info and pathway details
    anova_significance <- all_metabolites[metabolite == met_name & platform_name == met_platform, significant_fasting_pat_oltt]
    pathway_info <- all_metabolites[metabolite == met_name & platform_name == met_platform, 
                                    .(platform_name, super_pathway, sub_pathway, significant_fasting_pat_oltt)]

    # Return structured result
    return(data.table(
      metabolite = met_name,
      platform_name = met_platform,
      super_pathway = pathway_info$super_pathway,
      sub_pathway = pathway_info$sub_pathway,
      mean_diff_PAT_FASTING = mean_diff_pat,
      pvalue_PAT_FASTING = p_val_pat,
      significant_response = anova_significance
    ))
  } else {
    return(NULL)
  }
})

# Remove NULL results safely
results <- rbindlist(Filter(Negate(is.null), results), fill = TRUE)

# Keep rows whose significant_response (taken from significant_fasting_pat_oltt) is TRUE
filtered_results <- results[significant_response == TRUE]

# Save filtered results to a CSV file
fwrite(filtered_results, "../results/paired_ttest_results.csv")
message("Filtered T-Test completed! Filtered results saved in: ../results/paired_ttest_results.csv")

# Print summary of the filtered results
num_significant <- nrow(filtered_results)
total_tests <- nrow(results)
percentage <- (num_significant / total_tests) * 100

cat("Number of significant metabolites (filtered):", num_significant, "\n")
cat("Percentage of significant results (filtered):", round(percentage, 2), "%\n")


#### Input for Volcano Plot - Carryover Effect Analysis

In [None]:
# Build the input table for the carryover volcano plot: one paired t-test per
# metabolite/platform comparing the Physical Activity baseline (time 33) with
# the Fasting baseline (time 1), without any significance filtering.
#
# Arguments:
#   met_data         long-format data.table; the columns challenge, time, subject,
#                    metabolite, platform_name and response are read.
#   all_metabolites  per-metabolite table providing super_pathway, sub_pathway
#                    and significant_any_challenge.
#   output_path      CSV file to write; defaults to the path used so far.
#
# Returns (invisibly) the table written to `output_path`, so the result can be
# assigned by the caller.
create_volcano_carryover_input <- function(met_data, all_metabolites,
                                           output_path = "../results/volcano_ttest_inputs.csv") {
  # Baseline measurements of the two challenges being compared
  baseline_fasting <- met_data[challenge == "Fasting" & time == "1"]
  baseline_pat <- met_data[challenge == "Physical Activity" & time == "33"]

  # Keep only subjects that have a baseline in both challenges
  common_subjects <- intersect(baseline_fasting$subject, baseline_pat$subject)
  baseline_fasting <- baseline_fasting[subject %in% common_subjects]
  baseline_pat <- baseline_pat[subject %in% common_subjects]

  # Unique metabolite-platform combinations
  metabolites <- unique(met_data[, .(metabolite, platform_name)])

  ttest_results <- lapply(seq_len(nrow(metabolites)), function(i) {
    met_name <- metabolites$metabolite[i]
    met_platform <- metabolites$platform_name[i]

    fasting_rows <- baseline_fasting[metabolite == met_name & platform_name == met_platform]
    pat_rows <- baseline_pat[metabolite == met_name & platform_name == met_platform]

    # A paired test matches values by position, so order both sides by subject
    # instead of relying on the row order of met_data.
    fasting_values <- fasting_rows[order(subject), response]
    pat_values <- pat_rows[order(subject), response]

    # Need at least two values on each side for a meaningful comparison
    if (!(length(fasting_values) > 1 && length(pat_values) > 1)) {
      return(NULL)
    }

    mean_diff_pat <- mean(pat_values, na.rm = TRUE) - mean(fasting_values, na.rm = TRUE)

    # A failing test (e.g. unequal lengths) is reported as NA
    p_val_pat <- tryCatch(
      t.test(pat_values, fasting_values, paired = TRUE, var.equal = FALSE)$p.value,
      error = function(e) NA_real_
    )

    pathway_info <- all_metabolites[metabolite == met_name & platform_name == met_platform,
                                    .(super_pathway, sub_pathway, significant_any_challenge)]

    data.table(
      metabolite = met_name,
      platform_name = met_platform,
      super_pathway = pathway_info$super_pathway,
      sub_pathway = pathway_info$sub_pathway,
      mean_diff_PAT_FASTING = mean_diff_pat,
      pvalue_PAT_FASTING = p_val_pat,
      significant_response = pathway_info$significant_any_challenge
    )
  })

  # Drop skipped combinations, stack the rest and persist the table
  ttest_results <- rbindlist(Filter(Negate(is.null), ttest_results), fill = TRUE)
  fwrite(ttest_results, output_path)
  message("Carryover Volcano input saved: ", output_path)

  invisible(ttest_results)
}

# Generate and save the volcano plot input; the return value is kept in `data_volcano`
data_volcano <- create_volcano_carryover_input(met_data = met_data, all_metabolites = all_metabolites)

#### Create Volcano Plot

In [None]:
# Draw the carryover volcano plot (mean difference PAT vs. Fasting baseline on
# x, -log10 p-value on y), print it and save it as an image.
#
# Arguments (all optional; the defaults reproduce the previous hard-coded values):
#   input_path   CSV with the columns metabolite, super_pathway,
#                mean_diff_PAT_FASTING, pvalue_PAT_FASTING, significant_response.
#   output_path  image file the plot is saved to.
#   n_tests      number of tests used for the Bonferroni-style threshold.
#   alpha        significance level divided by `n_tests`.
create_volcano_carryover_plot <- function(
    input_path = "../results/volcano_ttest_inputs.csv",
    output_path = "../results/plots/sup_fig_4_volcano_plot_carryover_pat_fasting.png",
    n_tests = 634,
    alpha = 0.05) {

  # Load data
  data <- fread(input_path)

  # Add -log10(p) and order by it, highest first
  data_transformed <- data %>%
    mutate(log_pvalue = -log10(pvalue_PAT_FASTING)) %>%
    arrange(desc(log_pvalue))

  # Define colors for each super-pathway
  super_pathway_colors <- c(
    "Amino Acids" = "#d62728",
    "Carbohydrates" = "#7f7f7f",
    "Cofactors and Vitamins" = "#9467bd",
    "Energy" = "#8c564b",
    "Lipids" = "#ba8e23",
    "Nucleotides" = "#e377c2",
    "Peptides" = "#2ca02c",
    "Xenobiotics" = "#1f77b4"
  )

  # Multiple-testing threshold on the -log10 scale (dashed line in the plot)
  sig_threshold <- -log10(alpha / n_tests)
  threshold_label <- paste(
    paste0("-log10(", alpha, "/", n_tests, ") ="),
    round(sig_threshold, 2)
  )

  # Metabolites above the threshold get a text label
  top_metabolites <- data_transformed %>% filter(log_pvalue > sig_threshold)

  # Create volcano plot
  volcano_plot <- ggplot(data_transformed, aes(x = mean_diff_PAT_FASTING, y = log_pvalue, color = super_pathway)) +
    # Points below the threshold are faded; shape encodes significant_response
    geom_point(aes(alpha = log_pvalue > sig_threshold, shape = significant_response), size = 2.5) +
    scale_shape_manual(values = c(16, 1), name = "Significant response", labels = c("yes", "no"), breaks = c(TRUE, FALSE)) +
    scale_alpha_manual(values = c(0.2, 1.0), guide = 'none') +
    scale_color_manual(values = super_pathway_colors) +
    guides(shape = guide_legend(order = 1), color = guide_legend(order = 2)) +
    labs(
      x = "Mean difference of challenge baseline",
      y = "-log10(p-value)",
      color = "Super-pathways"
    ) +

    # Fixed axis ranges; points outside these limits are not drawn
    scale_y_continuous(limits = c(0, 9), breaks = seq(0, 9, 1), expand = c(0, 0)) +
    scale_x_continuous(limits = c(-11, 12), breaks = seq(-10, 10, 5), expand = c(0, 0)) +

    theme_minimal() +
    theme(
      panel.grid.major = element_line(color = "gray80", linewidth = 0.3),
      panel.grid.minor = element_blank(),  # Remove minor grid lines
      legend.position = "right",
      legend.title = element_text(size = 12, face = "bold"),
      legend.text = element_text(size = 10),
      plot.title = element_text(hjust = 0.5, size = 14, margin = margin(b = 10)),
      plot.margin = margin(20, 5, 5, 5),
      axis.title = element_text(size = 12, face = "bold"),
      axis.text = element_text(size = 10)
    ) +

    # Reference lines: threshold (dashed) plus solid lines along the two axes
    geom_hline(yintercept = sig_threshold, linetype = "dashed", color = "black") +
    geom_hline(yintercept = 0, linetype = "solid", color = "black") +
    geom_vline(xintercept = -11, linetype = "solid", color = "black") +

    # Threshold annotation just above the dashed line
    annotate("text", x = 9, y = sig_threshold + 0.15, label = threshold_label,
             size = 3.5, color = "black", fontface = "bold") +

    # Labels for metabolites above the threshold
    geom_text_repel(data = top_metabolites, aes(label = metabolite), size = 3, box.padding = 0.4, max.overlaps = 20)

  # Display the plot in the notebook
  print(volcano_plot)

  # Save the plot as an image
  ggsave(output_path, plot = volcano_plot, width = 10, height = 12, dpi = 300, bg = "white")
}

# Draw the carryover volcano plot: the function reads the t-test CSV, prints
# the plot and saves it as a PNG under ../results/plots/.
create_volcano_carryover_plot()

# Clustering

### Clustering Input

In [None]:
# Attach the per-sample measurements (response, subject, challenge, time) from
# met_data_filtered to every row of filtered_results, matching on metabolite and
# platform_name. Rows of filtered_results without a match are kept (all.x = TRUE).
clustering_input <- merge(
  x = filtered_results,
  y = met_data_filtered[
    ,
    c("metabolite", "platform_name", "response", "subject", "challenge", "time"),
    with = FALSE
  ],
  by = c("metabolite", "platform_name"),
  all.x = TRUE
)

# Save the updated results
#fwrite(clustering_input, "../results/clustering_input.csv")

# Preview the first rows of the merged table
head(clustering_input)

### Run Clustering

In [None]:
# Fuzzy c-means clustering (Mfuzz) of the metabolites in the global
# `clustering_input` table; the result is written to ../results/mfuzz_results.csv.
#
# Arguments:
#   c_num  number of clusters passed to mfuzz().
#   m      fuzzification parameter passed to mfuzz().
#   seed   random seed set before clustering, for reproducibility.
run_clustering <- function(c_num = 8, m = 1.25, seed = 123) {

  # Set the seed for reproducibility
  set.seed(seed)

  # 1. Load and preprocess the data
  df <- clustering_input

  # Make sure the columns used below are present
  required_columns <- c("metabolite", "platform_name", "super_pathway", "sub_pathway",
                        "response", "subject", "challenge")
  if (!all(required_columns %in% colnames(df))) {
    stop("Not all required columns are present in the dataset!")
  }

  # Filtering and preprocessing
  #df <- df[!(df$challenge == "OGTT" & df$response == 240), ] # remove specific rows
  df$challenge <- NULL # drop the 'challenge' column

  # Replace missing 'response' values with the overall mean response
  df$response[is.na(df$response)] <- mean(df$response, na.rm = TRUE)

  # Z-score normalisation per metabolite
  df$response <- ave(df$response, df$metabolite, FUN = function(x) scale(x, center = TRUE, scale = TRUE))

  # Keep the pathway annotation for the final table
  df_meta <- unique(df[, c("metabolite", "super_pathway", "sub_pathway")])

  # 2. Prepare the clustering matrix
  # NOTE(review): the matrix built here has one column per platform, not per time
  # point, and responses were z-scored per metabolite just above, so a metabolite
  # measured on a single platform gets a mean of ~0. `time` is available in
  # clustering_input but unused - confirm this is the intended clustering input.
  # Mean 'response' per 'metabolite' and 'platform_name'
  df_agg <- aggregate(response ~ metabolite + platform_name, data = df, FUN = mean)

  # Full grid of all 'metabolite' x 'platform_name' combinations
  all_combinations <- expand.grid(metabolite = unique(df_agg$metabolite),
                                   platform_name = unique(df_agg$platform_name))

  # Join the aggregated values onto the full grid
  df_agg <- merge(all_combinations, df_agg, by = c("metabolite", "platform_name"), all.x = TRUE)

  # Combinations without data get a response of 0
  df_agg$response[is.na(df_agg$response)] <- 0

  # Reshape to wide format and convert to a matrix (rows = metabolites)
  response_matrix <- reshape(df_agg, idvar = "metabolite", timevar = "platform_name", direction = "wide")
  rownames(response_matrix) <- response_matrix$metabolite
  response_matrix <- response_matrix[, -1] # drop the first column (metabolite)
  response_matrix <- as.matrix(response_matrix)

  # Wrap the matrix in an ExpressionSet as required by mfuzz()
  expr_set <- new("ExpressionSet", exprs = response_matrix)

  # 3. Run the Mfuzz clustering
  cl <- mfuzz(expr_set, c = c_num, m = m)

  # 4. Format the results: one membership column per cluster
  cluster_assignments <- data.frame(
    metabolite = rownames(response_matrix),
    cl$membership
  )

  # Cluster with the highest membership value for each metabolite
  cluster_assignments$Assigned_Cluster <- apply(cl$membership, 1, which.max)

  # Add super pathway and sub pathway
  cluster_assignments <- merge(cluster_assignments, df_meta, by = "metabolite", all.x = TRUE)

  # Name the membership columns after the actual number of clusters so that
  # values of c_num other than 8 work as well
  cluster_columns <- paste0("Cluster_", seq_len(ncol(cl$membership)))
  colnames(cluster_assignments) <- c(
    "Metabolite", cluster_columns, "Assigned_Cluster", "Super_Pathway", "Sub_Pathway"
  )

  # Save the results
  fwrite(cluster_assignments, "../results/mfuzz_results.csv")
  print("Clustering abgeschlossen! Ergebnisse gespeichert.")
}

# Example call: 8 clusters, fuzzification parameter 1.25, fixed seed for reproducibility
run_clustering(c_num = 8, m = 1.25, seed = 42)