In [None]:
library(data.table)
library(dplyr)
library(nparLD)
library(missForest)
library(parallel)
library(doParallel)

# HuMet DATASET

In this notebook, we analyze a subset of the HuMet dataset, focusing on **plasma** samples across three platforms:

- Metabolon HD4 (nt-ms)
- Biocrates p150 (t-ms)
- In-house biochemistry (chem.)

## Loading the Data

In [None]:
# Read the raw HuMet measurements and the metabolite annotation table
met_data <- fread(
  "../input/raw/humet_data_raw_none_subjects15_tp57.csv",
  sep = ",",
  fill = TRUE
)
info_data <- fread(
  "../input/humet_info.csv",
  sep = ",",
  fill = TRUE
)

# Preview the first rows of both tables
head(met_data)
head(info_data)

## Data Analysis

In [None]:
# Every column other than the two identifiers holds one metabolite
num_metabolite_columns <- length(setdiff(names(met_data), c("time", "subject")))

# Distinct time points and subjects present in the table
unique_time_values <- unique(met_data[["time"]])
unique_subject_values <- unique(met_data[["subject"]])

# Report the three summaries
print(sprintf("Number of metabolites: %s", num_metabolite_columns))
print(paste("Unique time values:", toString(unique_time_values)))
print(paste("Unique subject values:", toString(unique_subject_values)))

In [None]:
# Tabulate how many annotation rows belong to each platform
platform_counts <- table(info_data[["platform_name"]])
print(platform_counts)

- Targeted: 132
- Non-Targeted: 502
- Insulin (Hormone): 1

## Preprocessing

## Adding Challenge information

**Relevant time intervals for our analysis**:

Since the original dataset lacked challenge information, we assigned it based on the time column:

- **Fasting**: Time points 1–9
- **Physical Activity**: Time points 33–39
- **Oral Lipid Tolerance Test (OLTT)**: Time points 40–49

In [None]:
# Label every row with the challenge its time point belongs to; time points
# outside the three intervals are kept and labelled "Other"
met_data <- mutate(
  met_data,
  challenge = case_when(
    1 <= time & time <= 9 ~ "Fasting",
    33 <= time & time <= 39 ~ "Physical Activity",
    40 <= time & time <= 49 ~ "OLTT",
    TRUE ~ "Other"
  )
)

# Preview the labelled table
#tail(met_data_all)
head(met_data)

In [None]:
# Tabulate how many rows fall into each challenge label
challenge_counts <- table(met_data[["challenge"]])
print(challenge_counts)

## Removing Metabolites with > 30% missing values

In [12]:
# Drop metabolite columns whose fraction of missing values exceeds `threshold`.
#
# Args:
#   met_data:    data.table or data.frame. The columns "time", "subject" and
#                "challenge" (when present) are treated as identifiers; every
#                other column is treated as a metabolite.
#   threshold:   single number; a metabolite is removed when its fraction of
#                NA values is strictly greater than this value.
#   output_file: path of a text file receiving the removed metabolite names,
#                one per line. It is only written when something is removed.
#
# Returns:
#   `met_data` without the removed columns; the remaining columns keep their
#   original order.
remove_high_na_metabolites <- function(met_data, threshold = 0.3, output_file = "removed_metabolites.txt") {
  stopifnot(is.numeric(threshold), length(threshold) == 1L, !is.na(threshold))

  # Identify metabolite columns (excluding time, subject, and challenge)
  metabolite_columns <- setdiff(colnames(met_data), c("time", "subject", "challenge"))

  # Fraction of missing values per metabolite; `[[` works for data.table and
  # data.frame alike
  na_fraction <- vapply(
    metabolite_columns,
    function(col) mean(is.na(met_data[[col]])),
    numeric(1)
  )

  # which() discards NA/NaN comparisons (e.g. a table without rows), so no NA
  # column name can end up in the removal list
  high_na_metabolites <- metabolite_columns[which(na_fraction > threshold)]

  # Record the removed metabolite names
  if (length(high_na_metabolites) > 0) {
    writeLines(high_na_metabolites, output_file)
  }

  # Select the columns to keep by name instead of negating the removal list,
  # which stays well-defined when nothing has to be removed
  kept_columns <- setdiff(colnames(met_data), high_na_metabolites)
  if (inherits(met_data, "data.table")) {
    met_data[, kept_columns, with = FALSE]
  } else {
    met_data[, kept_columns, drop = FALSE]
  }
}

# Apply function to clean met_data
met_data <- remove_high_na_metabolites(met_data)

In [None]:
# Number of columns left after dropping the high-NA metabolites
num_columns <- ncol(met_data)

# Report it
cat(sprintf("The number of columns in met_data is: %s \n", num_columns))

#### Leaving Only Relevant Time Points

In [13]:
# Keep only rows that belong to one of the three challenges of interest
met_data <- filter(met_data, challenge != "Other")

## Splitting the dataset based on platform

In [14]:
# Measurement columns: everything except the three identifier columns
metabolite_columns <- setdiff(colnames(met_data), c("time", "subject", "challenge"))

# Regular expressions matching the platform tag embedded in the column names
platforms <- list(
  metabolon = "\\[P, nt-ms\\]",
  biocrates = "\\[P, t-ms\\]"
)

# Return the identifier columns plus every metabolite column whose name
# matches `pattern` (reads the globals `met_data` and `metabolite_columns`)
filter_metabolites <- function(pattern) {
  matching_cols <- grep(pattern, metabolite_columns, value = TRUE)
  keep_cols <- c("time", "subject", "challenge", matching_cols)
  met_data[, keep_cols, with = FALSE]
}

# Return the identifier columns plus every metabolite column that matches
# none of the patterns in `platforms` (i.e. the in-house measurements)
filter_inhouse_metabolites <- function() {
  matches_per_platform <- lapply(platforms, grepl, x = metabolite_columns)
  is_platform_col <- Reduce(`|`, matches_per_platform)
  keep_cols <- c("time", "subject", "challenge", metabolite_columns[!is_platform_col])
  met_data[, keep_cols, with = FALSE]
}

# One table per platform
met_data_metabolon <- filter_metabolites(platforms[["metabolon"]])
met_data_biocrates <- filter_metabolites(platforms[["biocrates"]])
met_data_inhouse <- filter_inhouse_metabolites()  # whatever is left over

# Inspect results
#list(met_data_metabolon, met_data_biocrates, met_data_inhouse)


## Handle Missing Values - missForest

In [None]:
# Show the first ten rows of the Metabolon table before imputation
head(met_data_metabolon, n = 10)

The dataset was downloaded with the **"Concentrations and relative abundances"** transformation applied.  
According to the HuMet documentation, the following preprocessing steps were already performed:

#### 1. **Manual Data Curation**
- Data points exceeding **4 times the standard deviation** at a given time point were flagged.
- If these outliers were **not within the first 30 minutes** of a challenge, they were considered for exclusion.
- After **manual inspection**, **92 data points** were removed.

#### 2. **Missing Data**
- The dataset **does not contain manually excluded data points**.

Since these steps were applied in the repository, **additional outlier removal and manual curation are not necessary.**

In [None]:
# Count missing values in each platform-specific dataset before imputation.
# The objects previously referenced here (data_fasting, data_exercise,
# data_oltt) are not created anywhere in this notebook; the datasets that
# exist at this point are the three platform tables built above.
sum(is.na(met_data_metabolon))
sum(is.na(met_data_biocrates))
sum(is.na(met_data_inhouse))

In [None]:
# Turn the identifier columns (challenge, time, subject) into factors and then
# convert any remaining character column to a factor as well
convert_to_factors <- function(data) {
  with_id_factors <- mutate(
    data,
    challenge = as.factor(challenge),
    time = as.factor(time),
    subject = as.factor(subject)
  )
  mutate(with_id_factors, across(where(is.character), as.factor))
}

# Impute the missing values of one dataset with missForest, parallelising over
# variables when more than one worker is usable.
#
# Args:
#   data_subset: table to impute; passed to missForest() unchanged.
#   ntree_val:   number of trees grown per forest (missForest `ntree`).
#
# Returns:
#   The imputed dataset (the `ximp` element of the missForest result).
perform_missForest <- function(data_subset, ntree_val = 10) {
  num_vars <- ncol(data_subset)  # Get the number of variables

  # detectCores() can return NA; fall back to a single core in that case
  available_cores <- detectCores()
  if (is.na(available_cores)) {
    available_cores <- 1L
  }

  # Leave one core free, never use more workers than variables, and never
  # drop below one
  num_cores <- max(1L, min(available_cores - 1L, num_vars))

  # Parallelise over variables only when there is more than one worker
  parallel_option <- if (num_cores > 1) "variables" else "no"

  if (parallel_option != "no") {
    # FORK clusters are not available on Windows; use PSOCK there
    cluster_type <- if (.Platform$OS.type == "unix") "FORK" else "PSOCK"
    cl <- makeCluster(num_cores, type = cluster_type)
    # Shut the workers down even if missForest() fails
    on.exit(stopCluster(cl), add = TRUE)
    registerDoParallel(cl)
  }

  # Seeds the RNG of this R process.
  # NOTE(review): with parallelize = "variables" the forests are presumably
  # grown on the workers, whose RNG streams this call may not control --
  # confirm that results are reproducible across runs.
  set.seed(42)
  imputed_data <- missForest(
    data_subset,
    ntree = ntree_val,
    parallelize = parallel_option,
    verbose = TRUE
  )

  imputed_data$ximp  # Extract imputed dataset
}

# Prepare and impute a list of metabolite datasets.
#
# Args:
#   metabolite_datasets: list of tables, each with the columns "challenge",
#                        "time" and "subject" plus metabolite columns.
#   ntree_val:           number of trees handed to perform_missForest() for
#                        every dataset. Defaults to 400, the value that used
#                        to be hard-coded here.
#
# Returns:
#   A list with the same names as `metabolite_datasets` holding the imputed
#   tables.
data_pipeline <- function(metabolite_datasets, ntree_val = 400) {
  # Convert categorical variables to factors
  prepared_datasets <- lapply(metabolite_datasets, convert_to_factors)

  # Impute each dataset; perform_missForest() decides on parallelisation
  lapply(prepared_datasets, perform_missForest, ntree_val = ntree_val)
}

# Bundle the three platform tables under their platform keys
metabolite_datasets <- setNames(
  list(met_data_metabolon, met_data_biocrates, met_data_inhouse),
  c("metabolon", "biocrates", "inhouse")
)

# Convert and impute every platform table
imputed_metabolite_data <- data_pipeline(metabolite_datasets)

In [None]:
# Show the first ten rows of the imputed Metabolon table
head(imputed_metabolite_data[["metabolon"]], n = 10)

In [None]:
### AFTER REMOVING OTHER
# Notes from trying different numbers of trees after the "Other" time points
# were dropped, written as "<ntree> trees => <value>".
# NOTE(review): the values are presumably a missForest error estimate -- the
# metric and the dataset they refer to are not recorded here; TODO confirm.

# 50 trees => 0.603862
# 68 trees => 0.6200632
# 69 trees => 0.594013
# 70 trees => 0.5919029
# 75 trees => 0.5951333
# 76 trees => 0.5837592
# 77 trees => 0.5951333
# 78 trees => 0.5852654
# 80 trees => 0.6201631
# 100 trees => 0.5938395
# 120 trees => 0.6130575
# 400 trees => 0.5863569


# Preview the imputed Metabolon table
head(imputed_metabolite_data$metabolon)


## Reformatting the Table

In [None]:
# Reshape one wide platform table into long format and tag it with its
# platform.
#
# Args:
#   data:          wide table (data.table or data.frame) with the identifier
#                  columns "time", "subject" and "challenge"; every other
#                  column is treated as a metabolite.
#   platform_name: label written to the `platform_name` column of the result.
#
# Returns:
#   A long data.table with the columns time, subject, challenge, metabolite,
#   response and platform_name. NA responses are kept.
reshape_long <- function(data, platform_name) {
  # Work on a private data.table copy: the `:=` below modifies its table by
  # reference and would otherwise change the caller's object. setDT() also
  # makes a plain data.frame input usable.
  wide_data <- setDT(copy(data))

  # Identify metabolite columns (exclude time, subject, challenge)
  metabolite_columns <- setdiff(names(wide_data), c("time", "subject", "challenge"))

  # Convert all metabolite columns to numeric (preserves NA values)
  wide_data[, (metabolite_columns) := lapply(.SD, as.numeric), .SDcols = metabolite_columns]

  # Reshape into long format
  long_data <- melt(wide_data,
                    id.vars = c("time", "subject", "challenge"),  # Keep these columns unchanged
                    measure.vars = metabolite_columns,  # Only reshape metabolite columns
                    variable.name = "metabolite",
                    value.name = "response",
                    na.rm = FALSE)  # Keep NA values instead of removing them

  # Add platform column. The label is copied to a differently named local so
  # the right-hand side cannot be confused with the column being created.
  platform_label <- platform_name
  long_data[, platform_name := platform_label]

  long_data
}

# Melt each imputed platform table, tag it with its platform label, and stack
# the three long tables into one
met_data <- rbindlist(
  Map(
    reshape_long,
    list(
      imputed_metabolite_data$metabolon,
      imputed_metabolite_data$biocrates,
      imputed_metabolite_data$inhouse
    ),
    c(
      "Metabolon HD4 [nt-ms]",
      "Biocrates p150 [t-ms]",
      "In-house biochemistry [chem.]"
    )
  ),
  use.names = TRUE,
  fill = TRUE
)

# Preview the combined long table
head(met_data)

Clean up the metabolite names

In [None]:
# Normalise metabolite names in one pass: strip every "[...]" tag, trim the
# surrounding whitespace, then lower-case the result
met_data[
  ,
  metabolite := tolower(trimws(gsub("\\[.*?\\]", "", metabolite)))
]

# Preview the cleaned names
head(met_data)


Some information is missing from our met_data dataset (e.g. super_pathway and sub_pathway), so we need to add it for further analysis. For this we use info_data.

Cleaning up info_data:

In [None]:
# Restrict the annotation table to plasma entries
info_data <- info_data[fluid == "plasma"]

# Normalise the annotation names so they can be matched against met_data:
# drop asterisks and zero-width spaces, trim whitespace, lower-case
info_data[
  ,
  metabolite := tolower(
    trimws(
      gsub(
        "[[:space:]]+$", "",
        gsub("\u200B", "", gsub("[*]", "", metabolite))
      )
    )
  )
]

head(info_data)

Merging the data 

In [None]:
# Left-join the pathway annotation onto the measurements, matching on
# metabolite name and platform; measurements without annotation are kept.
# NOTE(review): if info_data holds several rows per metabolite/platform pair,
# this join duplicates measurement rows -- verify the keys are unique.
met_data <- merge(
  x = met_data,
  y = info_data[, c("metabolite", "platform_name", "super_pathway", "sub_pathway"), with = FALSE],
  by = c("metabolite", "platform_name"),
  all.x = TRUE
)

# Preview the merged table
head(met_data)

In [None]:
# Distinct platform labels present after the merge
unique_platforms <- unique(met_data[["platform_name"]])

# Show them
print(unique_platforms)


## Hypothesis Testing

In [None]:
# Working copy for the hypothesis tests: every platform except the in-house
# biochemistry measurements
met_data_filtered <- met_data[platform_name != "In-house biochemistry [chem.]"]

# Preview the filtered table
head(met_data_filtered)

In [None]:
# Challenge labels present in the merged table
filtered <- unique(met_data[["challenge"]])
print(filtered)

## Anova-Test

In [None]:
# Bonferroni-corrected significance threshold.
# The divisor used to be the hard-coded value 634, which had to be adjusted by
# hand whenever the set of tested metabolites changed. It is now derived from
# the data: one test per metabolite/platform pair in met_data_filtered.
# NOTE(review): this corrects for the number of metabolites only, not for
# metabolites x challenges -- confirm that this is the intended correction.
n_tested_metabolites <- nrow(unique(met_data_filtered[, .(metabolite, platform_name)]))
p_threshold <- 0.05 / n_tested_metabolites

# Treat time and subject as categorical variables
met_data_filtered[, time := as.factor(time)]
met_data_filtered[, subject := as.factor(subject)]

# Subset data by challenge
metabolite_data_fasting <- met_data_filtered[challenge == "Fasting"]
metabolite_data_pat <- met_data_filtered[challenge == "Physical Activity"]
metabolite_data_oltt <- met_data_filtered[challenge == "OLTT"]

# Run the nparLD one-factor test (ld.f1) for the effect of time on every
# metabolite/platform pair of one challenge.
#
# Args:
#   metabolite_data: long data.table with the columns metabolite,
#                    platform_name, super_pathway, sub_pathway, response,
#                    time and subject.
#   challenge_name:  label stored in the `challenge` column of the result.
#
# Returns:
#   A data.table with one row per tested pair and the columns challenge,
#   metabolite, platform_name, super_pathway, sub_pathway, p_value and
#   significant (p_value below the global `p_threshold`). When no pair has
#   enough rows to be tested, an empty table with these columns is returned.
run_anova_like_test <- function(metabolite_data, challenge_name) {

    # Pull the p-value out of the ANOVA-type result regardless of whether the
    # package returns it as a data.frame/list, a named vector or a matrix
    extract_p_value <- function(anova_table) {
        if (is.list(anova_table) || is.null(dim(anova_table))) {
            return(as.numeric(anova_table[["p-value"]])[1])
        }
        as.numeric(anova_table[1, "p-value"])
    }

    # Unique metabolite-platform combinations (with their pathway annotation)
    unique_metabolites <- unique(metabolite_data[, .(metabolite, platform_name, super_pathway, sub_pathway)])

    results <- lapply(seq_len(nrow(unique_metabolites)), function(i) {

        met <- unique_metabolites$metabolite[i]
        plat <- unique_metabolites$platform_name[i]

        # Subset data for this metabolite and platform
        subset_data <- metabolite_data[metabolite == met & platform_name == plat]

        # Skip pairs without sufficient data points for analysis
        if (nrow(subset_data) <= 2) {
            return(NULL)
        }

        # Run the ld.f1 test
        test_result <- ld.f1(y = subset_data$response,
                             time = subset_data$time,
                             subject = subset_data$subject,
                             description = FALSE)

        data.table(
          challenge = challenge_name,
          metabolite = met,
          platform_name = plat,
          super_pathway = unique_metabolites$super_pathway[i],
          sub_pathway = unique_metabolites$sub_pathway[i],
          p_value = extract_p_value(test_result$ANOVA.test)
        )
    })

    # Drop the skipped pairs
    results <- Filter(Negate(is.null), results)

    # Nothing was tested: return an empty table with the expected columns
    # instead of failing on the missing p_value column below
    if (length(results) == 0) {
        return(data.table(
          challenge = character(),
          metabolite = character(),
          platform_name = character(),
          super_pathway = character(),
          sub_pathway = character(),
          p_value = numeric(),
          significant = logical()
        ))
    }

    # Combine results into one table
    anova_results <- rbindlist(results, fill = TRUE)

    # Identify significant time effects
    anova_results[, significant := p_value < p_threshold]

    anova_results
}

# Test the time effect separately within each challenge
anova_results_fasting <- run_anova_like_test(
  metabolite_data = metabolite_data_fasting,
  challenge_name = "Fasting"
)
anova_results_pat <- run_anova_like_test(
  metabolite_data = metabolite_data_pat,
  challenge_name = "Physical Activity"
)
anova_results_oltt <- run_anova_like_test(
  metabolite_data = metabolite_data_oltt,
  challenge_name = "OLTT"
)

# Combining the per-challenge tables is currently disabled
#final_anova_results <- rbind(anova_results_ogtt, anova_results_oltt, anova_results_sld, fill = TRUE)
#final_anova_results

#### Significant effect of time on metabolite levels during at least one challenge

In [None]:
# Catalogue of every metabolite/platform pair in the filtered dataset,
# together with its pathway annotation
all_metabolites <- unique(
  met_data_filtered[, c("metabolite", "platform_name", "super_pathway", "sub_pathway"), with = FALSE]
)

# Order by super_pathway, then sub_pathway, then (case-insensitive) name
all_metabolites <- all_metabolites[order(super_pathway, sub_pathway, tolower(metabolite))]

# Start with every pair flagged as not significant
all_metabolites[, significant_any_challenge := FALSE]

# Metabolite/platform pairs with a significant time effect, per challenge
significant_fasting <- anova_results_fasting[significant == TRUE, .(metabolite, platform_name)]
significant_pat <- anova_results_pat[significant == TRUE, .(metabolite, platform_name)]
significant_oltt <- anova_results_oltt[significant == TRUE, .(metabolite, platform_name)]

# Flag (by reference) the rows of `met_data_filtered` whose metabolite and
# platform both match a row of `sig_data`; does nothing for an empty `sig_data`
update_significance <- function(met_data_filtered, sig_data) {
    if (nrow(sig_data) == 0) {
        return(invisible(NULL))
    }
    met_data_filtered[sig_data, on = .(metabolite, platform_name), significant_any_challenge := TRUE]
}

# Apply the three per-challenge lists in turn
invisible(lapply(
  list(significant_fasting, significant_pat, significant_oltt),
  function(sig_pairs) update_significance(all_metabolites, sig_pairs)
))

# Write the flagged catalogue to disk
fwrite(all_metabolites, "../results/anova_results_significant_in_at_least_one_challenge.csv")

# Preview the result
head(all_metabolites)

#### Significant effect of time on metabolite levels during all challenges

In [None]:
# Add one flag column per challenge and per overlap category, all FALSE
all_metabolites[
  ,
  c(
    "significant_fasting",
    "significant_pat",
    "significant_oltt",
    "significant_fasting_pat",
    "significant_fasting_oltt",
    "significant_pat_oltt",
    "significant_fasting_pat_oltt"
  ) := FALSE
]

# Set `column_name` to TRUE (by reference) on the rows of `met_data` whose
# metabolite and platform both match a row of `sig_data`; does nothing for an
# empty `sig_data`
update_significance <- function(met_data, sig_data, column_name) {
    if (nrow(sig_data) == 0) {
        return(invisible(NULL))
    }
    met_data[sig_data, on = .(metabolite, platform_name), (column_name) := TRUE]
}

# Fill the three per-challenge flags
invisible(Map(
  function(sig_pairs, flag_column) {
    update_significance(all_metabolites, sig_pairs, flag_column)
  },
  list(significant_fasting, significant_pat, significant_oltt),
  c("significant_fasting", "significant_pat", "significant_oltt")
))

# Debugging: per-challenge counts
print(paste("fasting:", sum(all_metabolites[["significant_fasting"]])))
print(paste("pat:", sum(all_metabolites[["significant_pat"]])))
print(paste("oltt:", sum(all_metabolites[["significant_oltt"]])))

# Overlap flags: each pairwise flag excludes the third challenge, so the four
# overlap categories are mutually exclusive
all_metabolites[, significant_fasting_pat := significant_fasting & significant_pat & !significant_oltt]
all_metabolites[, significant_fasting_oltt := significant_fasting & significant_oltt & !significant_pat]
all_metabolites[, significant_pat_oltt := significant_pat & significant_oltt & !significant_fasting]
all_metabolites[, significant_fasting_pat_oltt := significant_fasting & significant_pat & significant_oltt]

# Debugging: overlap counts
print(paste("fasting ∩ pat:", sum(all_metabolites[["significant_fasting_pat"]])))
print(paste("fasting ∩ oltt:", sum(all_metabolites[["significant_fasting_oltt"]])))
print(paste("pat ∩ oltt:", sum(all_metabolites[["significant_pat_oltt"]])))
print(paste("fasting ∩ pat ∩ oltt:", sum(all_metabolites[["significant_fasting_pat_oltt"]])))

# Saving the updated table is currently disabled
#fwrite(all_metabolites, "results/all_metabolites_with_significance.csv")

# Number of flagged pairs per category, as a named list
counts <- lapply(
  c(
    fasting = "significant_fasting",
    pat = "significant_pat",
    oltt = "significant_oltt",
    fasting_pat = "significant_fasting_pat",
    fasting_oltt = "significant_fasting_oltt",
    pat_oltt = "significant_pat_oltt",
    fasting_pat_oltt = "significant_fasting_pat_oltt"
  ),
  function(flag_column) sum(all_metabolites[[flag_column]])
)

# Print final counts
print(counts)

## T-Test

In [None]:
### Needs to be adjusted
# NOTE(review): this cell does not match the data prepared in this notebook.
# It filters on a `challenge_time` column and on the lower-case labels
# "fasting", "ogtt" and "oltt", whereas the preprocessing above only creates a
# `time` column and the labels "Fasting", "Physical Activity" and "OLTT".
# The baseline definitions below must be adapted before the cell is run.

# Select baseline data for different challenges
baseline_ogtt <- met_data[challenge == "fasting" & challenge_time == "0"]  # Baseline for OGTT
baseline_sld  <- met_data[challenge == "ogtt" & challenge_time == "240"]  # Baseline for SLD (240 min after OGTT)
baseline_oltt <- met_data[challenge == "oltt" & challenge_time == "0"]  # Baseline for OLTT (240 min)

# Ensure only common subjects are used across all three conditions
common_subjects <- Reduce(intersect, list(baseline_ogtt$subject, baseline_sld$subject, baseline_oltt$subject))
baseline_ogtt <- baseline_ogtt[subject %in% common_subjects]
baseline_sld <- baseline_sld[subject %in% common_subjects]
baseline_oltt <- baseline_oltt[subject %in% common_subjects]

# Get the list of unique metabolites
# NOTE(review): names are taken without the platform, so a metabolite measured
# on several platforms is handled as a single entry below.
metabolites <- unique(met_data$metabolite)

# Perform paired t-tests for all metabolites.
# Each element of `results` is a data.table for one metabolite, or NULL
# when one of the three baselines has fewer than two values.
results <- lapply(metabolites, function(met) {
  # Subset data for the current metabolite
  # NOTE(review): the paired tests assume the three vectors list the same
  # subjects in the same order; nothing here sorts by subject -- verify.
  ogtt_values <- baseline_ogtt[metabolite == met, response]
  sld_values <- baseline_sld[metabolite == met, response]
  oltt_values <- baseline_oltt[metabolite == met, response]
  
  # Ensure there are valid values for all comparisons
  # NOTE(review): `&` is the element-wise operator; `&&` is the scalar form
  # normally used inside if().
  if (length(ogtt_values) > 1 & length(sld_values) > 1 & length(oltt_values) > 1) {
    # Calculate mean differences
    mean_diff_sld <- mean(sld_values, na.rm = TRUE) - mean(ogtt_values, na.rm = TRUE)
    mean_diff_oltt <- mean(oltt_values, na.rm = TRUE) - mean(ogtt_values, na.rm = TRUE)
    
    # Perform paired t-tests; a failing test yields NA instead of an error
    p_val_sld <- tryCatch(
      t.test(sld_values, ogtt_values, paired = TRUE, var.equal = FALSE)$p.value,
      error = function(e) NA
    )
    p_val_oltt <- tryCatch(
      t.test(oltt_values, ogtt_values, paired = TRUE, var.equal = FALSE)$p.value,
      error = function(e) NA
    )
    
    # Look up the ANOVA-based flag computed earlier (not a t-test result).
    # NOTE(review): the lookup is by name only and may return several values
    # (one per platform) or none -- verify how that should be handled.
    anova_significance <- all_metabolites[metabolite == met, significant_any_challenge]
    
    # Return results as a data.table
    return(data.table(
      metabolite = met,
      mean_diff_SLD_OGTT = mean_diff_sld,
      pvalue_SLD_OGTT = p_val_sld,
      mean_diff_OLTT_OGTT = mean_diff_oltt,
      pvalue_OLTT_OGTT = p_val_oltt,
      significant_response = anova_significance
    ))
  }
})

# Remove NULL results safely
results <- rbindlist(Filter(Negate(is.null), results), fill = TRUE)

# Save results to a CSV file
fwrite(results, "../results/paired_ttest_results.csv")
message("T-Test completed! Results saved in: ../results/paired_ttest_results.csv")

# Print summary of the results.
# `significant_response` is the ANOVA-based flag, so these figures count rows
# flagged by the ANOVA-type test, not significant t-tests.
num_significant <- sum(results$significant_response, na.rm = TRUE)
total_tests <- nrow(results)
percentage <- (num_significant / total_tests) * 100

cat("Number of significant metabolites:", num_significant, "\n")
cat("Percentage of significant results:", round(percentage, 2), "%\n")

# Clustering

## Z-Score Calculation

In [26]:
# Read the z-scored and the raw export of the dataset for the clustering part
met_data_z_score <- fread(
  "../input/raw/humet_data_zscore_none_subjects15_tp57.csv",
  sep = ",",
  fill = TRUE
)
met_data_raw <- fread(
  "../input/raw/humet_data_raw_none_subjects15_tp57.csv",
  sep = ",",
  fill = TRUE
)

In [None]:
# Metabolite columns: everything except "time", "subject" and "challenge"
metabolite_columns <- setdiff(names(met_data_raw), c("time", "subject", "challenge"))

# Replace every metabolite column by its z-score (column mean and standard
# deviation, both computed without NA values)
met_data_raw[
  ,
  (metabolite_columns) := lapply(.SD, function(values) {
    (values - mean(values, na.rm = TRUE)) / sd(values, na.rm = TRUE)
  }),
  .SDcols = metabolite_columns
]

# Preview the transformed table
head(met_data_raw)


In [None]:
# Apply the z-score transformation within each metabolite/platform pair.
# Grouping by metabolite name alone would pool the measurements of a
# metabolite that is reported by more than one platform; the rest of the
# notebook treats metabolite + platform as the unit of analysis.
met_data[
  ,
  response_z := (response - mean(response, na.rm = TRUE)) / sd(response, na.rm = TRUE),
  by = .(metabolite, platform_name)
]

# Print the first rows to verify
head(met_data)
