In [111]:
library(data.table)
library(dplyr)
library(nparLD)
library(missForest)
library(parallel)
library(doParallel)
library(Mfuzz)
library(Biobase)

# HuMet DATASET

In this notebook, we analyze a subset of the HuMet dataset, focusing on **plasma** samples across three platforms:

- Metabolon HD4 (nt-ms)
- Biocrates p150 (t-ms)
- In-house biochemistry (chem.)

## Loading the Data

In [112]:
# Wide measurement table: one row per (time, subject) and one column per metabolite.
# `fill = TRUE` pads rows that have fewer fields than the header with NA.
met_data <- fread("../input/raw/humet_data_raw_none_subjects15_tp57.csv", sep = ",", fill = TRUE)
# Annotation table: one row per metabolite (pathways, fluid, platform, external IDs).
info_data <- fread("../input/humet_info.csv", sep = ",", fill = TRUE)
# Preview both tables
head(met_data)
head(info_data)

time,subject,"1-(1-enyl-oleoyl)-GPC (P-18:1) [P, nt-ms]","1-(1-enyl-oleoyl)-GPE (P-18:1) [P, nt-ms]","1-(1-enyl-palmitoyl)-GPC (P-16:0) [P, nt-ms]","1-(1-enyl-palmitoyl)-GPE (P-16:0) [P, nt-ms]","1-(1-enyl-stearoyl)-GPC (P-18:0) [P, nt-ms]","1-(1-enyl-stearoyl)-GPE (P-18:0) [P, nt-ms]","1-adrenoyl-GPC (22:4) [P, nt-ms]","1-arachidonoyl-GPC (20:4n6) [P, nt-ms]",...,"SM C24:1 [P, t-ms]","SM C26:1 [P, t-ms]","threonine [P, t-ms]","tryptophan [P, t-ms]","tyrosine [P, t-ms]","valine [P, t-ms]","Glucose [P, chem.]","Insulin [P, chem.]","Lactate [P, chem.]","NEFA [P, chem.]"
<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,0.4938,1.0205,0.8303,1.058,0.913,1.0212,0.5614,0.5686,...,24.0,0.185,225.2,83.7,93.2,426.5,103.0,6.85,8.9,0.38
2,1,,0.4827,0.7422,0.8171,0.6513,1.0613,0.4811,0.4346,...,25.4,0.214,188.0,75.5,69.8,317.0,95.0,5.42,7.85,0.52
3,1,0.5089,0.7566,0.7895,0.8633,0.8431,1.0804,0.5727,0.5011,...,27.8,0.183,245.3,92.2,82.9,436.1,87.0,3.99,6.8,0.66
4,1,0.5022,0.6572,0.7738,0.7747,0.7635,0.9441,0.5314,0.5299,...,27.5,0.255,160.8,70.6,61.3,279.5,89.8,4.09,6.5,0.58
5,1,0.5717,0.6865,0.9145,0.8733,0.9987,1.0984,0.661,0.6097,...,28.8,0.241,263.4,101.1,91.5,396.7,92.7,4.18,6.2,0.5
6,1,0.5622,0.7593,0.9676,0.9409,1.0,1.0538,0.7443,0.6149,...,28.8,0.247,200.0,87.7,80.7,371.8,91.2,3.88,6.0,0.59


metabolite,super_pathway,sub_pathway,fluid,platform_name,platform_unit,retention_index,CAS,PubChem,ChEBI
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
1-(1-enyl-oleoyl)-GPC (P-18:1)*,Lipids,Lysoplasmalogen,plasma,Metabolon HD4 [nt-ms],normalized counts,1547,,,CHEBI:136125
1-(1-enyl-oleoyl)-GPE (P-18:1)*,Lipids,Lysoplasmalogen,plasma,Metabolon HD4 [nt-ms],normalized counts,6250,,,CHEBI:133229
1-(1-enyl-palmitoyl)-GPC (P-16:0)*,Lipids,Lysoplasmalogen,plasma,Metabolon HD4 [nt-ms],normalized counts,1547,,,CHEBI:73841
1-(1-enyl-palmitoyl)-GPE (P-16:0)*,Lipids,Lysoplasmalogen,plasma,Metabolon HD4 [nt-ms],normalized counts,1558,,42607469.0,CHEBI:138421
1-(1-enyl-stearoyl)-GPC (P-18:0) *,Lipids,Lysoplasmalogen,plasma,Metabolon HD4 [nt-ms],normalized counts,1560,,24779527.0,CHEBI:88779
1-(1-enyl-stearoyl)-GPE (P-18:0)*,Lipids,Lysoplasmalogen,plasma,Metabolon HD4 [nt-ms],normalized counts,6500,,42607470.0,CHEBI:87823


## Data Analysis

In [None]:
# Count metabolite columns
num_metabolite_columns <- length(setdiff(names(met_data), c("time", "subject")))

unique_time_values <- unique(met_data$time)
unique_subject_values <- unique(met_data$subject)

print(paste("Number of metabolites:", num_metabolite_columns))
print(paste("Unique time values:", paste(unique_time_values, collapse = ", ")))
print(paste("Unique subject values:", paste(unique_subject_values, collapse = ", ")))

In [None]:
# Count occurrences of each unique platform
platform_counts <- table(info_data$platform_name)
print(platform_counts)

- Targeted: 132
- Non-Targeted: 502
- Insulin (Hormone): 1

## Preprocessing

## Adding Challenge information

**Relevant time intervals for our analysis**:

Since the original dataset lacked challenge information, we assigned it based on the time column:

- **Fasting**: Time points 1–10
- **Physical Activity**: Time points 33–39
- **Oral Lipid Tolerance Test (OLTT)**: Time points 40–50

In [113]:
# Label every row with the challenge its time point belongs to
met_data <- met_data %>%
  mutate(challenge = case_when(
    time >= 1 & time <= 10 ~ "Fasting",
    time >= 33 & time <= 39 ~ "Physical Activity",
    time >= 40 & time <= 50 ~ "OLTT", # TODO: cross-check what happens if OLTT ends at 48
    TRUE ~ "Other"  # Rows outside the three intervals are kept and labelled "Other"
  ))

# Display the first rows of the annotated dataset
head(met_data)

time,subject,"1-(1-enyl-oleoyl)-GPC (P-18:1) [P, nt-ms]","1-(1-enyl-oleoyl)-GPE (P-18:1) [P, nt-ms]","1-(1-enyl-palmitoyl)-GPC (P-16:0) [P, nt-ms]","1-(1-enyl-palmitoyl)-GPE (P-16:0) [P, nt-ms]","1-(1-enyl-stearoyl)-GPC (P-18:0) [P, nt-ms]","1-(1-enyl-stearoyl)-GPE (P-18:0) [P, nt-ms]","1-adrenoyl-GPC (22:4) [P, nt-ms]","1-arachidonoyl-GPC (20:4n6) [P, nt-ms]",...,"SM C26:1 [P, t-ms]","threonine [P, t-ms]","tryptophan [P, t-ms]","tyrosine [P, t-ms]","valine [P, t-ms]","Glucose [P, chem.]","Insulin [P, chem.]","Lactate [P, chem.]","NEFA [P, chem.]",challenge
<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,0.4938,1.0205,0.8303,1.058,0.913,1.0212,0.5614,0.5686,...,0.185,225.2,83.7,93.2,426.5,103.0,6.85,8.9,0.38,Fasting
2,1,,0.4827,0.7422,0.8171,0.6513,1.0613,0.4811,0.4346,...,0.214,188.0,75.5,69.8,317.0,95.0,5.42,7.85,0.52,Fasting
3,1,0.5089,0.7566,0.7895,0.8633,0.8431,1.0804,0.5727,0.5011,...,0.183,245.3,92.2,82.9,436.1,87.0,3.99,6.8,0.66,Fasting
4,1,0.5022,0.6572,0.7738,0.7747,0.7635,0.9441,0.5314,0.5299,...,0.255,160.8,70.6,61.3,279.5,89.8,4.09,6.5,0.58,Fasting
5,1,0.5717,0.6865,0.9145,0.8733,0.9987,1.0984,0.661,0.6097,...,0.241,263.4,101.1,91.5,396.7,92.7,4.18,6.2,0.5,Fasting
6,1,0.5622,0.7593,0.9676,0.9409,1.0,1.0538,0.7443,0.6149,...,0.247,200.0,87.7,80.7,371.8,91.2,3.88,6.0,0.59,Fasting


## Removing Metabolites with > 30% missing values

In [114]:
# Drop metabolite columns whose share of missing values exceeds `threshold`.
#
# Args:
#   met_data:    data.table; every column other than time, subject and
#                challenge is treated as a metabolite.
#   threshold:   columns with an NA fraction strictly above this are dropped.
#   output_file: text file receiving the dropped column names, one per line;
#                it is only written when at least one column is dropped.
# Returns:
#   `met_data` without the dropped columns.
remove_high_na_metabolites <- function(met_data, threshold = 0.3, output_file = "removed_metabolites.txt") {
  id_columns <- c("time", "subject", "challenge")
  measure_columns <- setdiff(colnames(met_data), id_columns)

  # Fraction of NA entries per metabolite column
  missing_flags <- is.na(met_data[, ..measure_columns])
  na_fraction <- colMeans(missing_flags)

  # Names of the columns that exceed the tolerated NA fraction
  columns_to_drop <- names(na_fraction)[na_fraction > threshold]

  # Keep a record of what was removed
  if (length(columns_to_drop) > 0) {
    writeLines(columns_to_drop, output_file)
  }

  met_data[, !columns_to_drop, with = FALSE]
}

# Apply function to clean met_data
met_data <- remove_high_na_metabolites(met_data)

#### Leaving Only Relevant Time Points

In [115]:
# Create a dataset with only relevant time intervals
met_data <- met_data %>%
  filter(challenge != "Other")

## Splitting the dataset based on platform

In [116]:
# Identify metabolite columns (excluding time, subject, and challenge)
metabolite_columns <- setdiff(colnames(met_data), c("time", "subject", "challenge"))

# Define platform patterns for Metabolon and Biocrates
platforms <- list(
  metabolon = "\\[P, nt-ms\\]",
  biocrates = "\\[P, t-ms\\]"
)

# Select the identifier columns plus every metabolite column whose name
# matches `pattern`.
#
# Args:
#   pattern: regular expression matched against the metabolite column names.
#   data:    data.table to subset; defaults to the global `met_data`, so
#            existing one-argument calls keep working.
#   columns: candidate metabolite column names; defaults to every column of
#            `data` that is not time, subject or challenge.
# Returns:
#   data.table with time, subject, challenge and the matching columns.
filter_metabolites <- function(pattern,
                               data = met_data,
                               columns = setdiff(colnames(data), c("time", "subject", "challenge"))) {
  selected_cols <- c("time", "subject", "challenge", columns[grepl(pattern, columns)])
  data[, ..selected_cols]
}

# Select the identifier columns plus every metabolite column that matches
# none of the known platform patterns (i.e. the in-house measurements).
#
# Args:
#   data:              data.table to subset; defaults to the global `met_data`.
#   platform_patterns: list/vector of regular expressions identifying the
#                      platforms to exclude; defaults to the global `platforms`.
# Returns:
#   data.table with time, subject, challenge and the unmatched columns.
filter_inhouse_metabolites <- function(data = met_data, platform_patterns = platforms) {
  id_cols <- c("time", "subject", "challenge")
  columns <- setdiff(colnames(data), id_cols)

  # TRUE for every column claimed by at least one platform pattern
  is_known_platform <- Reduce(
    `|`,
    lapply(platform_patterns, grepl, x = columns),
    rep(FALSE, length(columns))
  )

  selected_cols <- c(id_cols, columns[!is_known_platform])
  data[, ..selected_cols]
}

# Create datasets
met_data_metabolon <- filter_metabolites(platforms$metabolon)
met_data_biocrates <- filter_metabolites(platforms$biocrates)
met_data_inhouse <- filter_inhouse_metabolites()  # Everything else


## Handle Missing Values - missForest

In [None]:
# Count missing values in each platform-specific dataset before imputation.
# (The previously referenced data_fasting / data_exercise / data_oltt objects
# are never created in this notebook.)
vapply(
  list(
    metabolon = met_data_metabolon,
    biocrates = met_data_biocrates,
    inhouse = met_data_inhouse
  ),
  function(dataset) sum(is.na(dataset)),
  integer(1)
)

In [117]:
# Turn the identifier columns (challenge, time, subject) into factors, then
# do the same for any column that is still of type character.
convert_to_factors <- function(data) {
  id_columns <- c("challenge", "time", "subject")
  with_factor_ids <- mutate(data, across(all_of(id_columns), as.factor))
  mutate(with_factor_ids, across(where(is.character), as.factor))
}

# Impute missing values of one dataset with missForest, parallelising over
# variables whenever more than one worker can be used.
#
# Args:
#   data_subset: dataset to impute (passed to missForest unchanged).
#   ntree_val:   number of trees, forwarded as missForest's `ntree`.
# Returns:
#   The imputed dataset (the `ximp` element of the missForest result).
perform_missForest <- function(data_subset, ntree_val = 10) {
  num_vars <- ncol(data_subset)

  # detectCores() can return NA; treat that as a single available core
  available_cores <- detectCores()
  if (is.na(available_cores)) {
    available_cores <- 1L
  }

  # Leave one core free, use at most one worker per variable, never fewer than 1
  num_cores <- max(1L, min(available_cores - 1L, num_vars))

  parallel_option <- if (num_cores > 1) "variables" else "no"

  # Only start workers when they will actually be used
  if (parallel_option != "no") {
    cl <- makeCluster(num_cores, type = "FORK")
    # Release the workers and reset the foreach backend even if missForest fails
    on.exit(
      {
        stopCluster(cl)
        registerDoSEQ()
      },
      add = TRUE
    )
    registerDoParallel(cl)
  }

  # NOTE(review): with parallel workers this seed may not fully determine the
  # result, since the workers draw their own random numbers - confirm.
  set.seed(42)
  imputed_data <- missForest(data_subset, ntree = ntree_val, parallelize = parallel_option, verbose = TRUE)

  imputed_data$ximp
}

# Prepare and impute a list of metabolite datasets.
#
# Args:
#   metabolite_datasets: (named) list of datasets; names are preserved.
#   ntree_val:           number of trees used by missForest for every dataset.
#                        Defaults to 10, the value previously hard-coded here
#                        (400 was noted as an alternative).
# Returns:
#   List of imputed datasets in the same order as the input.
data_pipeline <- function(metabolite_datasets, ntree_val = 10) {
  # Identifier and character columns must be factors for missForest
  prepared_datasets <- lapply(metabolite_datasets, convert_to_factors)

  # Impute each dataset independently
  lapply(prepared_datasets, perform_missForest, ntree_val = ntree_val)
}

# List of metabolite datasets
metabolite_datasets <- list(
  metabolon = met_data_metabolon,
  biocrates = met_data_biocrates,
  inhouse = met_data_inhouse
)

# Apply pipeline to each dataset
imputed_metabolite_data <- data_pipeline(metabolite_datasets)

  parallelizing over the variables of the input data matrix 'xmis'
  missForest iteration 1 in progress...done!
    estimated error(s): 0.6648431 0 
    difference(s): 0.00363823 0 
    time: 16.418 seconds

  missForest iteration 2 in progress...done!
    estimated error(s): 0.6872388 0 
    difference(s): 0.002561098 0 
    time: 16.172 seconds

  missForest iteration 3 in progress...done!
    estimated error(s): 0.6856945 0 
    difference(s): 0.002923526 0 
    time: 15.492 seconds

  parallelizing over the variables of the input data matrix 'xmis'
  missForest iteration 1 in progress...done!
    estimated error(s): 0.1267831 0 
    difference(s): 5.811953e-06 0 
    time: 1.947 seconds

  missForest iteration 2 in progress...done!
    estimated error(s): 0.1186948 0 
    difference(s): 1.483467e-05 0 
    time: 1.886 seconds

  parallelizing over the variables of the input data matrix 'xmis'
  missForest iteration 1 in progress...done!
    estimated error(s): 0 0 
    difference(s

In [None]:
imputed_metabolite_data

In [113]:
### AFTER REMOVING OTHER

# 50 trees => 0.603862
# 68 trees => 0.6200632
# 69 trees => 0.594013
# 70 trees => 0.5919029
# 75 trees => 0.5951333
# 76 trees => 0.5837592
# 77 trees => 0.5951333
# 78 trees => 0.5852654
# 80 trees => 0.6201631
# 100 trees => 0.5938395
# 120 trees => 0.6130575
# 400 trees => 0.5863569

#### Merging platform-separated datasets together 

In [118]:
# Merge datasets based on time, subject, and challenge
combined_data <- Reduce(function(x, y) {
  merge(x, y, by = c("time", "subject", "challenge"), all = TRUE)
}, list(imputed_metabolite_data$metabolon, 
        imputed_metabolite_data$biocrates, 
        imputed_metabolite_data$inhouse))

# Print the first rows of the combined dataset
head(combined_data)

time,subject,challenge,"1-(1-enyl-oleoyl)-GPC (P-18:1) [P, nt-ms]","1-(1-enyl-oleoyl)-GPE (P-18:1) [P, nt-ms]","1-(1-enyl-palmitoyl)-GPC (P-16:0) [P, nt-ms]","1-(1-enyl-palmitoyl)-GPE (P-16:0) [P, nt-ms]","1-(1-enyl-stearoyl)-GPC (P-18:0) [P, nt-ms]","1-(1-enyl-stearoyl)-GPE (P-18:0) [P, nt-ms]","1-adrenoyl-GPC (22:4) [P, nt-ms]",...,"SM C24:0 [P, t-ms]","SM C24:1 [P, t-ms]","SM C26:1 [P, t-ms]","threonine [P, t-ms]","tryptophan [P, t-ms]","tyrosine [P, t-ms]","valine [P, t-ms]","Glucose [P, chem.]","Insulin [P, chem.]","Lactate [P, chem.]"
<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,Fasting,0.4938,1.0205,0.8303,1.058,0.913,1.0212,0.5614,...,11.28,24.0,0.185,225.2,83.7,93.2,426.5,103.0,6.85,8.9
1,2,Fasting,1.2173,1.2686,1.175,1.1096,1.1053,1.2545,1.2713,...,22.5,49.2,0.435,82.1,85.7,76.4,229.6,89.1,4.78,7.9
1,3,Fasting,1.1172,3.0355,1.4331,1.7547,1.6649,2.5967,1.0602,...,20.69,47.3,0.475,86.0,81.5,62.8,233.3,86.2,7.31,10.6
1,4,Fasting,1.2224,1.157,1.14865,1.1423,1.11795,1.0157,1.11795,...,11.81,30.6,0.345,66.3,74.9,66.5,232.6,79.2,3.29,7.1
1,5,Fasting,1.3746,1.2251,1.3744,1.4259,1.4916,1.0728,1.3278,...,19.91,46.2,0.421,94.1,76.2,61.4,271.6,93.8,6.06,11.7
1,6,Fasting,1.0824,1.3979,1.1735,0.8401,1.1971,1.4994,1.0394,...,13.99,33.8,0.303,89.2,75.7,78.8,248.1,85.6,6.2,9.6


## Correlation (FOR LAURA)

In [119]:
fwrite(combined_data, "../results/correlation_input.csv")
  print("correlatin input saved")

[1] "correlatin input saved"


## Reformatting the Table

In [None]:
# Reshape a wide metabolite table into long format and tag each row with the
# platform encoded in the metabolite column name.
#
# Args:
#   data: wide table with identifier columns time, subject, challenge and one
#         column per metabolite. The input is not modified.
# Returns:
#   Long data.table with columns time, subject, challenge, metabolite,
#   response and platform_name.
reshape_long <- function(data) {
  id_columns <- c("time", "subject", "challenge")

  # Work on a private copy: `:=` below updates by reference and would
  # otherwise change the caller's table as a side effect
  data <- if (is.data.table(data)) copy(data) else as.data.table(data)

  # Identify metabolite columns (exclude time, subject, challenge)
  metabolite_columns <- setdiff(names(data), id_columns)

  # Convert all metabolite columns to numeric (preserves NA values)
  data[, (metabolite_columns) := lapply(.SD, as.numeric), .SDcols = metabolite_columns]

  # Reshape into long format, keeping NA responses
  long_data <- melt(data,
                    id.vars = id_columns,
                    measure.vars = metabolite_columns,
                    variable.name = "metabolite",
                    value.name = "response",
                    na.rm = FALSE)

  # Derive the platform from the "[P, <platform>]" suffix of the column name.
  # The dot in "chem." is escaped so it only matches a literal dot.
  long_data[, platform_name := case_when(
    grepl("\\[P, t-ms\\]", metabolite) ~ "Biocrates p150 [t-ms]",
    grepl("\\[P, nt-ms\\]", metabolite) ~ "Metabolon HD4 [nt-ms]",
    grepl("\\[P, chem\\.\\]", metabolite) ~ "In-house biochemistry [chem.]",
    TRUE ~ "Unknown"  # Default case for anything that doesn't match
  )]

  return(long_data)
}

# Reshape the combined dataset into long format
met_data <- reshape_long(combined_data)


# Print first rows of the final combined dataset
head(met_data)

Clean up the metabolite names

In [None]:
# Clean metabolite names by removing anything inside square brackets and trimming whitespace
met_data[, metabolite := gsub("\\[.*?\\]", "", metabolite)]  # Remove text inside brackets
met_data[, metabolite := trimws(metabolite)]  # Trim leading/trailing spaces
met_data[, metabolite := tolower(metabolite)]  # Convert to lowercase

# Print first rows to verify changes
head(met_data)


Some information is missing from our met_data dataset (e.g. super_pathway and sub_pathway) and needs to be added for further analysis. We take it from info_data.

Clean up of info_data:

In [None]:
# Keep only rows where fluid == "plasma"
info_data <- info_data[fluid == "plasma"]

# Ensure correct encoding and remove asterisks
info_data$metabolite <- gsub("[*]", "", info_data$metabolite)  # Remove all asterisks
info_data$metabolite <- gsub("\u200B", "", info_data$metabolite)  # Remove zero-width spaces (if present)
info_data$metabolite <- gsub("[[:space:]]+$", "", info_data$metabolite)  # Trim trailing spaces
info_data$metabolite <- trimws(info_data$metabolite)  # Remove any remaining spaces
info_data$metabolite <- tolower(info_data$metabolite)  # Convert to lowercase

head(info_data)

Merging the data 

In [None]:
# Merge met_data with info_data based on metabolite and platform_name
met_data <- merge(met_data, 
                  info_data[, .(metabolite, platform_name, super_pathway, sub_pathway)], 
                  by = c("metabolite", "platform_name"), 
                  all.x = TRUE)  # Keep all rows in met_data

# Print first rows to verify the merge
head(met_data)

## Hypothesis Testing

In [None]:
# Create a new dataset excluding rows where platform is "In-house biochemistry [chem.]"
met_data_filtered <- met_data[platform_name != "In-house biochemistry [chem.]", ]

# Print the first few rows of the new dataset
head(met_data_filtered)

For the ANOVA and the t-test we only use targeted and non-targeted metabolites. Measurements from the in-house biochemistry platform are removed, since these are not metabolites (e.g. insulin).

In [None]:
num_unique_metabolite_platforms <- nrow(unique(met_data_filtered[, .(metabolite, platform_name)]))
print(num_unique_metabolite_platforms)

## Anova-Test

In [None]:
# Bonferroni-style significance threshold: 0.05 divided by the number of tests.
# NOTE(review): 634 presumably is 132 targeted + 502 non-targeted metabolites as
# counted before the >30% NA filter - confirm it still matches the number of
# metabolite-platform pairs actually tested.
p_threshold <- 0.05 / 634

# Make sure time and subject are categorical
met_data_filtered[, time := as.factor(time)]
met_data_filtered[, subject := as.factor(subject)]

# Subset data by challenge
metabolite_data_fasting <- met_data_filtered[challenge == "Fasting"]
metabolite_data_pat <- met_data_filtered[challenge == "Physical Activity"]
metabolite_data_oltt <- met_data_filtered[challenge == "OLTT"]

# Run the nparLD `ld.f1` test for every metabolite-platform pair of one challenge.
#
# Args:
#   metabolite_data: long-format data.table with columns metabolite,
#                    platform_name, super_pathway, sub_pathway, response,
#                    time and subject.
#   challenge_name:  label written to the `challenge` column of the result.
#   alpha:           p-value cut-off for the `significant` flag; defaults to
#                    the global `p_threshold`.
# Returns:
#   data.table with one row per tested pair and the columns challenge,
#   metabolite, platform_name, super_pathway, sub_pathway, p_value and
#   significant. Pairs with two or fewer rows are skipped; pairs for which
#   ld.f1 fails get p_value = NA and a warning.
run_anova_like_test <- function(metabolite_data, challenge_name, alpha = p_threshold) {

    # Unique metabolite-platform combinations (with their pathway annotation)
    unique_metabolites <- unique(metabolite_data[, .(metabolite, platform_name, super_pathway, sub_pathway)])

    # Test a single combination; returns NULL when there is too little data
    test_one <- function(i) {
        met <- unique_metabolites$metabolite[i]
        plat <- unique_metabolites$platform_name[i]

        # Subset data for this metabolite and platform
        subset_data <- metabolite_data[metabolite == met & platform_name == plat]

        # Ensure sufficient data points for analysis
        if (nrow(subset_data) <= 2) {
            return(NULL)
        }

        # NOTE(review): ld.f1 runs with its default plotting options; if it
        # draws one plot per call, consider switching that off - confirm.
        p_value <- tryCatch(
            {
                test_result <- ld.f1(y = subset_data$response,
                                     time = subset_data$time,
                                     subject = subset_data$subject,
                                     description = FALSE)
                # Index by row/column name so this works whether ANOVA.test
                # is a matrix or a data frame
                test_result$ANOVA.test[1, "p-value"]
            },
            error = function(e) {
                # One failing metabolite must not abort the whole run
                warning("ld.f1 failed for ", met, " (", plat, "): ",
                        conditionMessage(e), call. = FALSE)
                NA_real_
            }
        )

        data.table(
          challenge = challenge_name,
          metabolite = met,
          platform_name = plat,
          super_pathway = unique_metabolites$super_pathway[i],
          sub_pathway = unique_metabolites$sub_pathway[i],
          p_value = p_value
        )
    }

    results <- lapply(seq_len(nrow(unique_metabolites)), test_one)

    # Combine results into one table, dropping skipped combinations
    anova_results <- rbindlist(Filter(Negate(is.null), results), fill = TRUE)

    # Nothing was tested: return an empty table with the expected columns
    if (nrow(anova_results) == 0L) {
        return(data.table(
          challenge = character(),
          metabolite = character(),
          platform_name = character(),
          super_pathway = character(),
          sub_pathway = character(),
          p_value = numeric(),
          significant = logical()
        ))
    }

    # Identify significant time effects
    anova_results[, significant := p_value < alpha]

    return(anova_results)
}

# Run ANOVA-like test for each challenge
anova_results_fasting <- run_anova_like_test(metabolite_data_fasting, "Fasting")
anova_results_pat <- run_anova_like_test(metabolite_data_pat, "Physical Activity")
anova_results_oltt <- run_anova_like_test(metabolite_data_oltt, "OLTT")

# Optional: combine all results into one table (disabled)
#final_anova_results <- rbind(anova_results_fasting, anova_results_pat, anova_results_oltt, fill = TRUE)
#final_anova_results

#### Significant effect of time on metabolite levels during at least one challenge

In [None]:
# Get all unique metabolites from the updated dataset (excluding insulin)
all_metabolites <- unique(met_data_filtered[, .(metabolite, platform_name, super_pathway, sub_pathway)])

# Sort metabolites first by super_pathway, then sub_pathway, then metabolite name
all_metabolites <- all_metabolites[order(super_pathway, sub_pathway, tolower(metabolite))]

# Initialize the column as FALSE for all metabolites
all_metabolites[, significant_any_challenge := FALSE]

# Extract **only** significant metabolites (ensuring metabolite-platform pairs match)
significant_fasting <- anova_results_fasting[significant == TRUE, .(metabolite, platform_name)]
significant_pat <- anova_results_pat[significant == TRUE, .(metabolite, platform_name)]
significant_oltt <- anova_results_oltt[significant == TRUE, .(metabolite, platform_name)]

# Function to update significance status **only for matching metabolite + platform pairs**
update_significance <- function(met_data_filtered, sig_data) {
    if (nrow(sig_data) > 0) {  # Only run if there's data
        met_data_filtered[sig_data, on = .(metabolite,platform_name), significant_any_challenge := TRUE]
    }
}

# Update based on **corrected** significance lists
update_significance(all_metabolites, significant_fasting)
update_significance(all_metabolites, significant_pat)
update_significance(all_metabolites, significant_oltt)

# Save the output file
#fwrite(all_metabolites, "../results/anova_results_significant_in_at_least_one_challenge.csv")

# Display output
head(all_metabolites)

#### Significant effect of time on metabolite levels during all challenges

In [None]:
# Initialize columns for each category as FALSE
all_metabolites[, `:=`(
  significant_fasting = FALSE,
  significant_pat = FALSE,
  significant_oltt = FALSE,
  significant_fasting_pat = FALSE,
  significant_fasting_oltt = FALSE,
  significant_pat_oltt = FALSE,
  significant_fasting_pat_oltt = FALSE
)]

# Set the flag column `column_name` to TRUE for every row of `met_data` whose
# (metabolite, platform_name) pair occurs in `sig_data`.
#
# Args:
#   met_data:    data.table to update; it is modified by reference.
#   sig_data:    data.table with columns metabolite and platform_name.
#   column_name: name of the flag column. Defaults to
#                "significant_any_challenge" so the earlier two-argument calls
#                still work after this redefinition.
# Returns:
#   `met_data`, invisibly.
update_significance <- function(met_data, sig_data, column_name = "significant_any_challenge") {
    if (nrow(sig_data) > 0) {
        met_data[sig_data, on = .(metabolite, platform_name), (column_name) := TRUE]
    }
    invisible(met_data)
}

# Update individual significance columns
update_significance(all_metabolites, significant_fasting, "significant_fasting")
update_significance(all_metabolites, significant_pat, "significant_pat")
update_significance(all_metabolites, significant_oltt, "significant_oltt")

# Debugging: Check individual significance counts
print(paste("fasting:", sum(all_metabolites$significant_fasting)))
print(paste("pat:", sum(all_metabolites$significant_pat)))
print(paste("oltt:", sum(all_metabolites$significant_oltt)))

# Calculate overlaps
all_metabolites[, `:=`(
  significant_fasting_pat = significant_fasting & significant_pat & !significant_oltt,
  significant_fasting_oltt = significant_fasting & significant_oltt & !significant_pat,
  significant_pat_oltt = significant_pat & significant_oltt & !significant_fasting,
  significant_fasting_pat_oltt = significant_fasting & significant_pat & significant_oltt
)]

# Debugging: Check overlap counts
print(paste("fasting ∩ pat:", sum(all_metabolites$significant_fasting_pat)))
print(paste("fasting ∩ oltt:", sum(all_metabolites$significant_fasting_oltt)))
print(paste("pat ∩ oltt:", sum(all_metabolites$significant_pat_oltt)))
print(paste("fasting ∩ pat ∩ oltt:", sum(all_metabolites$significant_fasting_pat_oltt)))

# Save the updated table
#fwrite(all_metabolites, "results/all_metabolites_with_significance.csv")

# Calculate final counts
counts <- list(
  fasting = sum(all_metabolites$significant_fasting),
  pat = sum(all_metabolites$significant_pat),
  oltt = sum(all_metabolites$significant_oltt),
  fasting_pat = sum(all_metabolites$significant_fasting_pat),
  fasting_oltt = sum(all_metabolites$significant_fasting_oltt),
  pat_oltt = sum(all_metabolites$significant_pat_oltt),
  fasting_pat_oltt = sum(all_metabolites$significant_fasting_pat_oltt)
)

# Print final counts
print(counts)

## T-Test

In [None]:
# Baseline (first) time point of the two challenges being compared
baseline_fasting <- met_data[challenge == "Fasting" & time == "1"]  # Baseline for fasting
baseline_pat  <- met_data[challenge == "Physical Activity" & time == "33"]  # Baseline for physical activity

# Ensure only subjects measured in both conditions are used
common_subjects <- intersect(baseline_fasting$subject, baseline_pat$subject)
baseline_fasting <- baseline_fasting[subject %in% common_subjects]
baseline_pat <- baseline_pat[subject %in% common_subjects]

# Unique metabolite names (kept in case a later cell relies on it)
metabolites <- unique(met_data$metabolite)

# The ANOVA step keys its results by metabolite AND platform_name. Do the same
# here so measurements from different platforms that share a (cleaned) name
# are never pooled into one test.
metabolite_pairs <- unique(met_data[, .(metabolite, platform_name)])

# Perform one paired t-test per metabolite-platform pair
results <- lapply(seq_len(nrow(metabolite_pairs)), function(i) {
  met <- metabolite_pairs$metabolite[i]
  plat <- metabolite_pairs$platform_name[i]

  fasting_rows <- baseline_fasting[metabolite == met & platform_name == plat,
                                   .(subject, fasting = response)]
  pat_rows <- baseline_pat[metabolite == met & platform_name == plat,
                           .(subject, pat = response)]

  # Pair the two measurements explicitly by subject instead of by row order
  paired <- merge(fasting_rows, pat_rows, by = "subject")

  # A paired test needs at least two complete pairs
  if (nrow(paired) <= 1) {
    return(NULL)
  }

  # Mean difference (physical activity minus fasting)
  mean_diff_pat <- mean(paired$pat, na.rm = TRUE) - mean(paired$fasting, na.rm = TRUE)

  # Paired t-test; failures (e.g. constant differences) yield NA
  p_val_pat <- tryCatch(
    t.test(paired$pat, paired$fasting, paired = TRUE)$p.value,
    error = function(e) NA_real_
  )

  # Pathway annotation and ANOVA flag for exactly this metabolite-platform pair
  pathway_info <- all_metabolites[
    metabolite == met & platform_name == plat,
    .(super_pathway, sub_pathway, significant_fasting_pat_oltt)
  ]
  if (nrow(pathway_info) == 0L) {
    # Not part of the ANOVA table (e.g. in-house measurements)
    pathway_info <- data.table(
      super_pathway = NA_character_,
      sub_pathway = NA_character_,
      significant_fasting_pat_oltt = NA
    )
  }

  data.table(
    metabolite = met,
    platform_name = plat,
    super_pathway = pathway_info$super_pathway,
    sub_pathway = pathway_info$sub_pathway,
    mean_diff_PAT_FASTING = mean_diff_pat,
    pvalue_PAT_FASTING = p_val_pat,
    significant_response = pathway_info$significant_fasting_pat_oltt
  )
})

# Remove NULL results safely
results <- rbindlist(Filter(Negate(is.null), results), fill = TRUE)

# Keep only rows whose `significant_response` flag is TRUE. The flag is copied
# from `significant_fasting_pat_oltt` in all_metabolites, i.e. it reflects the
# ANOVA result, not the t-test p-value.
filtered_results <- results[significant_response == TRUE]

# Save filtered results to a CSV file
fwrite(filtered_results, "../results/paired_ttest_results.csv")
message("Filtered T-Test completed! Filtered results saved in: ../results/paired_ttest_results.csv")

# Summary: share of tested metabolites that passed the filter above
num_significant <- nrow(filtered_results)
total_tests <- nrow(results)
percentage <- (num_significant / total_tests) * 100

cat("Number of significant metabolites (filtered):", num_significant, "\n")
cat("Percentage of significant results (filtered):", round(percentage, 2), "%\n")

In [None]:
# Attach the individual measurements (response, subject, challenge, time) from
# met_data_filtered to every row of filtered_results. The join uses metabolite
# AND platform_name so a metabolite name shared by two platforms does not pick
# up the other platform's measurements.
clustering_input <- merge(
  filtered_results,
  met_data_filtered[, .(metabolite, platform_name, response, subject, challenge, time)],
  by = c("metabolite", "platform_name"),
  all.x = TRUE
)

# Save the merged table
fwrite(clustering_input, "../results/clustering_input.csv")

# Display the first few rows of the merged table
head(clustering_input)

# Clustering

In [None]:
# Soft-cluster metabolites with Mfuzz and write the membership table to disk.
#
# Args:
#   c_num:       number of clusters passed to mfuzz().
#   m:           fuzzification parameter passed to mfuzz().
#   seed:        RNG seed set before any computation, for reproducibility.
#   data:        long-format table with the columns metabolite, platform_name,
#                super_pathway, sub_pathway, response, subject and challenge;
#                defaults to the global `clustering_input`.
#   output_file: CSV file the cluster assignments are written to.
# Side effects:
#   Writes `output_file` and prints a completion message.
run_clustering <- function(c_num = 8, m = 1.25, seed = 123,
                           data = clustering_input,
                           output_file = "../results/mfuzz_results.csv") {

  # Set the seed for reproducibility
  set.seed(seed)

  # 1. Load and preprocess the data (plain data.frame for the base-R steps below)
  df <- as.data.frame(data)

  # Make sure every column used below is present
  required_columns <- c("metabolite", "platform_name", "super_pathway", "sub_pathway",
                        "response", "subject", "challenge")
  if (!all(required_columns %in% colnames(df))) {
    stop("Not all required columns are present in the dataset!")
  }

  # The challenge column is not used for clustering
  df$challenge <- NULL

  # Replace missing responses by the overall mean response
  df$response[is.na(df$response)] <- mean(df$response, na.rm = TRUE)

  # Z-score normalisation per metabolite
  df$response <- ave(df$response, df$metabolite, FUN = function(x) scale(x, center = TRUE, scale = TRUE))

  # Keep the pathway annotation for the output table
  df_meta <- unique(df[, c("metabolite", "super_pathway", "sub_pathway")])

  # 2. Prepare the data for clustering
  # Mean response per metabolite and platform.
  # NOTE(review): responses were just z-scored per metabolite, so these
  # per-platform means are (near) zero for a metabolite measured on a single
  # platform. Confirm that platform_name, rather than e.g. time, is the
  # intended clustering axis.
  df_agg <- aggregate(response ~ metabolite + platform_name, data = df, FUN = mean)

  # Build every metabolite x platform combination
  all_combinations <- expand.grid(metabolite = unique(df_agg$metabolite),
                                   platform_name = unique(df_agg$platform_name))

  # Join the aggregated values onto the full grid
  df_agg <- merge(all_combinations, df_agg, by = c("metabolite", "platform_name"), all.x = TRUE)

  # Combinations without data get a response of 0
  df_agg$response[is.na(df_agg$response)] <- 0

  # Wide layout: one row per metabolite, one column per platform
  response_matrix <- reshape(df_agg, idvar = "metabolite", timevar = "platform_name", direction = "wide")
  rownames(response_matrix) <- response_matrix$metabolite
  # Drop the metabolite column; drop = FALSE keeps the row names and the
  # two-dimensional shape even when only one platform column remains
  response_matrix <- response_matrix[, -1, drop = FALSE]
  response_matrix <- as.matrix(response_matrix)

  # Wrap the matrix in an ExpressionSet as required by mfuzz()
  expr_set <- new("ExpressionSet", exprs = response_matrix)

  # 3. Run the Mfuzz clustering
  cl <- mfuzz(expr_set, c = c_num, m = m)

  # 4. Format the results
  cluster_assignments <- data.frame(
    metabolite = rownames(response_matrix),
    cl$membership
  )

  # Cluster with the highest membership value for each metabolite
  cluster_assignments$Assigned_Cluster <- apply(cl$membership, 1, which.max)

  # Add super pathway and sub pathway
  cluster_assignments <- merge(cluster_assignments, df_meta, by = "metabolite", all.x = TRUE)

  # Column names follow the actual number of clusters instead of assuming 8
  colnames(cluster_assignments) <- c(
    "Metabolite",
    paste0("Cluster_", seq_len(ncol(cl$membership))),
    "Assigned_Cluster", "Super_Pathway", "Sub_Pathway"
  )

  # Save the results
  fwrite(cluster_assignments, output_file)
  print("Clustering abgeschlossen! Ergebnisse gespeichert.")
}

# Example call
run_clustering(
  c_num = 8,  # Number of clusters
  m = 1.25,   # Fuzzification parameter
  seed = 42  # Seed for reproducibility
)