# Correlate variables to remove redundant metrics

In [None]:
suppressPackageStartupMessages({
    library(here)
    library(dplyr)
    library(SummarizedExperiment)
})

## Functions

In [None]:
# Load the phenotype table for a single brain region.
#
# Reads `input/phenotypes/_m/phenotypes.csv` (resolved with `here()`), keeps
# the rows whose `Region` column equals `region`, collapses every list column
# to one number per row by summing each element, and finally passes every
# numeric column through `scales::rescale()` with its default settings.
#
# Args:
#   region: value to match against the `Region` column (e.g. "Caudate").
#
# Returns:
#   The filtered table with list columns summed and numeric columns rescaled.
load_phenotypes <- function(region){
    pheno_file <- here("input/phenotypes/_m/phenotypes.csv")
    pheno <- data.table::fread(pheno_file) |>
        # `.env$region` pins the comparison to the function argument; a bare
        # `region` would silently resolve to a CSV column of that name first.
        filter(Region == .env$region) |>
        # vapply() always yields a numeric vector, also for zero rows, where
        # sapply() would hand back an empty list and leave the column a list.
        mutate(across(where(is.list), ~vapply(.x, sum, numeric(1)))) |>
        # Separate step so the columns summed above are rescaled as well.
        mutate(across(where(is.numeric), scales::rescale))
    pheno
}

# Screen the numeric, non-constant columns of `df` with
# `cytominer::correlation_threshold()` at a cutoff of 0.95 and hand back
# whatever that call returns.
check_dup <- function(df){
    numeric_cols <- select_if(df, is.numeric)
    # Columns with zero standard deviation are left out of the screen.
    varying_cols <- Filter(function(col) sd(col) != 0, numeric_cols)
    cytominer::correlation_threshold(
        names(varying_cols), varying_cols, cutoff=0.95
    )
}

# List the numeric variables that are redundant because they are strongly
# correlated with another variable that is kept.
#
# Only numeric columns with a non-zero standard deviation are considered;
# columns whose standard deviation is NA (e.g. because they contain missing
# values) are left out as well. Every pair of remaining columns whose absolute
# Pearson correlation exceeds `cutoff` is collected once, oriented so that the
# column appearing first in `df` is the "x" member and the later one the "y"
# member. Variables that only ever appear as "x" are kept as representatives;
# every other variable involved in a pair is reported.
#
# Args:
#   df:     data frame (or data.table / tibble) of phenotypes.
#   cutoff: absolute correlation above which two variables count as
#           redundant. Defaults to 0.95.
#
# Returns:
#   Character vector of column names to drop; `character(0)` when no pair
#   exceeds the cutoff or fewer than two usable columns exist.
check_corr <- function(df, cutoff = 0.95){
    # Plain data.frame so that `[` below always selects columns.
    df      <- as.data.frame(df)
    numeric <- df[vapply(df, is.numeric, logical(1))]
    usable  <- vapply(
        numeric,
        function(col) isTRUE(stats::sd(col) != 0),
        logical(1)
    )
    varying <- numeric[usable]
    if (ncol(varying) < 2) {
        return(character(0))
    }

    r_mat <- stats::cor(varying, use = "pairwise.complete.obs")
    # The upper triangle lists each unordered pair exactly once. De-duplicating
    # on the correlation value instead would merge distinct pairs that happen
    # to share the same coefficient (e.g. several perfectly correlated pairs).
    hits <- which(
        upper.tri(r_mat) & !is.na(r_mat) & abs(r_mat) > cutoff,
        arr.ind = TRUE
    )
    hits <- hits[order(hits[, "row"], hits[, "col"]), , drop = FALSE]
    xs   <- colnames(r_mat)[hits[, "row"]]
    ys   <- colnames(r_mat)[hits[, "col"]]

    # setdiff() rather than `xs[-which(xs %in% ys)]`: negative indexing with an
    # empty `which()` result returns an empty vector, which would discard every
    # representative and report both members of each pair.
    keep <- setdiff(xs, ys)
    setdiff(unique(c(xs, ys)), keep)
}

# Drop the columns that `check_corr()` reports as redundant.
#
# Args:
#   pheno_df: phenotype table to prune.
#
# Returns:
#   `pheno_df` without the reported columns; unchanged when nothing is
#   reported.
remove_variables <- function(pheno_df){
    # Evaluate the correlation screen once and reuse the result.
    redundant <- check_corr(pheno_df)
    if (length(redundant) > 0) {
        # all_of() marks `redundant` as a vector of column names rather than
        # a column of `pheno_df` that happens to carry that name.
        pheno_df <- pheno_df |> select(-all_of(redundant))
    }
    pheno_df
}

## Main

### Load phenotypes

In [None]:
# One phenotype table per brain region
caudate <- load_phenotypes(region = "Caudate")
dlpfc   <- load_phenotypes(region = "DLPFC")
hippo   <- load_phenotypes(region = "HIPPO")

In [None]:
# Table dimensions before correlated variables are removed
dim(caudate)
dim(dlpfc)
dim(hippo)

### Drop correlated

In [None]:
# Evaluate check_corr() on each regional table; these are the same values
# remove_variables() uses to decide which columns to drop.
check_corr(caudate)
check_corr(dlpfc)
check_corr(hippo)

In [None]:
# Replace each regional table with its pruned version
caudate <- caudate |> remove_variables()
dlpfc   <- dlpfc |> remove_variables()
hippo   <- hippo |> remove_variables()

In [None]:
# Table dimensions after correlated variables are removed
dim(caudate)
dim(dlpfc)
dim(hippo)

### Common variables

In [None]:
# Column names present in all three regional tables
vars <- Reduce(
    intersect,
    list(colnames(caudate), colnames(dlpfc), colnames(hippo))
)
vars

In [None]:
# Number of shared variables
length(x = vars)

In [None]:
# Save the shared variable names as a one-column, tab-separated file
data.table::fwrite(
    data.frame(Variables = vars),
    file = "shared_variables.tsv",
    sep = "\t"
)

## Reproducibility

In [None]:
# Record when the notebook was run and how long the R process has been going
Sys.time()
proc.time()
# Wider console so the session tables below are less likely to wrap
options(width = 120)
# Platform details followed by the package versions in use
sessioninfo::session_info()$platform
sessioninfo::session_info()$packages