# Correlate variables to remove redundant metrics

In [1]:
suppressPackageStartupMessages({
    library(here)
    library(dplyr)
    library(SummarizedExperiment)
})

## Functions

In [2]:
# Read the phenotype table and prepare the rows for a single region.
#
# Arguments:
#   region: value matched against the `Region` column of the phenotype file.
# Returns:
#   The rows for `region`, with list-columns collapsed to one summed value
#   per row and every numeric column passed through scales::rescale()
#   (called with its defaults).
load_phenotypes <- function(region){
    all_pheno <- data.table::fread(here("input/phenotypes/_m/phenotypes.csv"))
    by_region <- filter(all_pheno, Region == region)
    # Collapse list-columns first so they are numeric by the time we rescale.
    collapsed <- mutate_if(by_region, is.list, function(col) sapply(col, sum))
    mutate_if(collapsed, is.numeric, scales::rescale)
}

# Flag highly correlated numeric variables using cytominer.
#
# Arguments:
#   df: data frame; only its numeric columns are considered.
# Returns:
#   Whatever cytominer::correlation_threshold() returns for the non-constant
#   numeric columns at a cutoff of 0.95.
check_dup <- function(df){
    has_variance <- function(col) sd(col) != 0
    numeric_cols <- select_if(df, is.numeric)
    # Constant columns have no defined correlation, so leave them out.
    varying      <- Filter(has_variance, numeric_cols)
    cytominer::correlation_threshold(names(varying), varying, cutoff=0.95)
}

# Identify redundant numeric variables from highly correlated pairs.
#
# Every pair of non-constant numeric columns with |r| > 0.95 is collected.
# Variables that only ever appear on the `x` side of a pair are kept as the
# representatives; every other variable involved in a pair is reported.
#
# Arguments:
#   df: data frame; only its numeric columns are considered.
# Returns:
#   Character vector of variable names to drop (empty when no pair exceeds
#   the threshold).
check_corr <- function(df){
    sample <- df |> select_if(is.numeric)
    # Constant columns have no defined correlation, so leave them out.
    sample <- Filter(function(x) sd(x) != 0, sample)
    # distinct(r) keeps the first row per correlation value, which removes the
    # mirrored (y, x) copy of each pair.
    # NOTE(review): unrelated pairs that share an identical r are collapsed
    # too - confirm this is acceptable for exactly duplicated columns.
    dt     <- sample |> corrr::correlate() |>
        corrr::stretch() |> tidyr::drop_na() |>
        filter(abs(r) > 0.95) |>
        distinct(r, .keep_all=TRUE)
    x_vars <- unique(dt[["x"]])
    y_vars <- unique(dt[["y"]])
    # setdiff() instead of `x[-which(...)]`: negative indexing with an empty
    # which() result drops every element, which would flag both members of
    # each pair for removal whenever no x variable also occurs as a y.
    keep   <- setdiff(x_vars, y_vars)
    setdiff(union(x_vars, y_vars), keep)
}

# Drop the variables flagged by check_corr() from a phenotype table.
#
# Arguments:
#   pheno_df: phenotype data frame.
# Returns:
#   `pheno_df` without the flagged columns; returned unchanged when nothing
#   is flagged.
remove_variables <- function(pheno_df){
    # Compute once: each check_corr() call reruns the correlation analysis.
    redundant <- check_corr(pheno_df)
    if(length(redundant) == 0){
        return(pheno_df)
    }
    # all_of() marks `redundant` as an external character vector of names.
    pheno_df |> select(-all_of(redundant))
}

## Main

### Load phenotypes

In [3]:
# One phenotype table per region, selected via the `Region` column.
caudate <- load_phenotypes("Caudate")
dlpfc   <- load_phenotypes("DLPFC")
hippo   <- load_phenotypes("HIPPO")

In [4]:
# Rows x columns of each table before any variables are dropped.
dim(caudate)
dim(dlpfc)
dim(hippo)

### Drop correlated

In [5]:
# Show which variables check_corr() flags in each region (nothing is removed here).
check_corr(caudate)
check_corr(dlpfc)
check_corr(hippo)

Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'


Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'


Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'


In [6]:
# Replace each table with the version that has the flagged variables dropped.
caudate <- remove_variables(caudate)
dlpfc   <- remove_variables(dlpfc)
hippo   <- remove_variables(hippo)

Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'


In [7]:
# Rows x columns of each table after remove_variables() has been applied.
dim(caudate)
dim(dlpfc)
dim(hippo)

### Common variables

In [8]:
# Column names present in all three region tables.
vars <- Reduce(
    intersect,
    list(colnames(caudate), colnames(dlpfc), colnames(hippo))
)
vars

In [9]:
# Number of column names shared by all three tables.
length(vars)

In [10]:
# Save the shared names as a one-column, tab-separated file in the working directory.
data.table::fwrite(data.frame("Variables"=vars), "shared_variables.tsv", sep='\t')

## Reproducibility

In [11]:
# Record when the notebook ran, how long the session took, and the software environment.
Sys.time()
proc.time()
# Widen the console so the session tables below are less likely to wrap.
options(width = 120)
sessioninfo::session_info()$platform
sessioninfo::session_info()$packages

[1] "2023-02-13 18:45:00 EST"

   user  system elapsed 
  5.183   0.195   7.075 

Unnamed: 0_level_0,package,ondiskversion,loadedversion,path,loadedpath,attached,is_base,date,source,md5ok,library
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<lgl>,<fct>
base64enc,base64enc,0.1.3,0.1-3,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/base64enc,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/base64enc,False,False,2015-07-28,CRAN (R 4.2.1),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
Biobase,Biobase,2.58.0,2.58.0,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/Biobase,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/Biobase,True,False,2022-11-01,Bioconductor,,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
BiocGenerics,BiocGenerics,0.44.0,0.44.0,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/BiocGenerics,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/BiocGenerics,True,False,2022-11-01,Bioconductor,,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
bitops,bitops,1.0.7,1.0-7,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/bitops,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/bitops,False,False,2021-04-24,CRAN (R 4.2.2),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
cli,cli,3.6.0,3.6.0,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/cli,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/cli,False,False,2023-01-09,CRAN (R 4.2.2),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
colorspace,colorspace,2.1.0,2.1-0,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/colorspace,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/colorspace,False,False,2023-01-23,CRAN (R 4.2.2),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
corrr,corrr,0.4.4,0.4.4,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/corrr,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/corrr,False,False,2022-08-16,CRAN (R 4.2.2),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
crayon,crayon,1.5.2,1.5.2,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/crayon,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/crayon,False,False,2022-09-29,CRAN (R 4.2.2),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
data.table,data.table,1.14.6,1.14.6,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/data.table,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/data.table,False,False,2022-11-16,CRAN (R 4.2.2),,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
DelayedArray,DelayedArray,0.24.0,0.24.0,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/DelayedArray,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2/DelayedArray,False,False,2022-11-01,Bioconductor,,/home/kynon/R/x86_64-pc-linux-gnu-library/4.2
