In [None]:
library(tidyverse)
library(factoextra)
source("scripts/r/baypass_utils.R")
source("scripts/r/functions_objects.R")

## Compare Omega matrices within and between seeds

- Compare Omega matrices estimated on different SNP subsets within each random seed using the Förstner and Moonen Distance (FMD)
- Plot FMD values across 3 different random seeds

In [None]:
# Load Omega matrices for all BayPass runs
# Results in DF with `seed`, `split`, and `mat` columns
# `mat` is a list-column holding the Omega matrix read from `path`
#
# path: path to a single Omega matrix file. The seed and split IDs are parsed
#       from the `seed<digits>_` and `split<digits>_` parts of the path.
# Returns a one-row tibble. `seed` and `split` are character strings
# (NA when the corresponding pattern is not present in `path`).
load_omega_mat <- function(path){
    # Use `\\d+` for the seed as well as the split so multi-digit seeds
    # (e.g. `seed42_`) are captured instead of silently becoming NA
    seed <- str_extract(path, "(?<=seed)\\d+(?=_)")
    split <- str_extract(path, "(?<=split)\\d+(?=_)")
    # File has no header row; column-spec messages are suppressed
    mat <- suppressMessages(read_table(path, col_names = FALSE))
    tibble(seed = seed, split = split, mat = list(mat))
}

# Read every Omega matrix listed under the Snakemake `omega_mat` input and
# stack the one-row results into a single data frame
mat_df <- snakemake@input[["omega_mat"]] %>%
    purrr::map(load_omega_mat) %>%
    bind_rows()

In [None]:
# Estimate FMD between all pairwise SNP subsets within seeds
# Return DF in long format with pairwise FMD values
#
# data_df: rows for a single seed, with `seed`, `split` and `mat` columns
#          (`split` must be coercible to integer)
# Returns a data frame with one row per unordered pair of splits and columns
# `n1`, `n2` (integer split IDs), `fmd` and `seed`. When fewer than two
# distinct splits are present there is nothing to compare and a zero-row
# data frame is returned.
calculate_fmd_within_seeds <- function(data_df){

    seed <- data_df %>% pull(seed) %>% unique()

    # Build the pairs from the split IDs that are actually present rather than
    # assuming they are labelled 0..(n - 1)
    split_ids <- as.integer(data_df$split)
    unique_ids <- sort(unique(split_ids))

    # combn() errors when asked for pairs from fewer than two elements
    if (length(unique_ids) < 2) {
        # NOTE(review): `fmd` is typed as double here on the assumption that
        # fmd.dist() returns a numeric scalar; confirm against baypass_utils.R
        empty_df <- tibble(n1 = integer(), n2 = integer(),
                           fmd = double(), seed = seed[0])
        return(empty_df)
    }

    fmd_df <- utils::combn(unique_ids, 2) %>% 
        t() %>% 
        as.data.frame() %>% 
        rename("n1" = "V1", "n2" = "V2")
    
    # Compute the FMD for one (n1, n2) pair; `fmd_df` is a one-row data frame
    calculate_fmd <- function(fmd_df, data_df){
        n1 <- fmd_df %>% pull(n1)
        n2 <- fmd_df %>% pull(n2)
        
        # First row whose split ID matches, converted to a plain matrix
        mat1 <- data_df$mat[[match(n1, split_ids)]] %>% as.matrix()
        mat2 <- data_df$mat[[match(n2, split_ids)]] %>% as.matrix()

        fmd <- fmd.dist(mat1, mat2)
        df_mod <- fmd_df %>% 
            mutate(fmd = fmd)
        return(df_mod)
    }
    
    # One call per pair, then stack and tag with the seed
    fmd_df <- fmd_df %>% 
        group_split(n1, n2) %>% 
        purrr::map_dfr(., calculate_fmd, data_df = data_df) %>% 
        mutate(seed = seed)

    return(fmd_df)
}

# Run the within-seed FMD calculation on each seed's rows and stack the results
fmd_within_seeds <- mat_df %>%
    group_split(seed) %>%
    purrr::map(calculate_fmd_within_seeds) %>%
    bind_rows()

In [None]:
# Per-seed summary of the pairwise FMD values:
# mean, standard deviation, median, minimum and maximum
fmd_within_seeds %>%
    group_by(seed) %>%
    summarise(
        across(fmd,
               list(mean = mean, sd = sd, median = median, min = min, max = max),
               .names = "{.fn}"),
        .groups = "drop"
    )

In [None]:
# Distribution of pairwise FMD values, one box per random seed
fmd_box <- ggplot(fmd_within_seeds, aes(x = seed, y = fmd)) +
    geom_boxplot() +
    labs(x = "Random seed", y = "Förstner and Moonen Distance (FMD)") +
    my_theme
fmd_box
# Write the figure to the Snakemake `fmd_box` output as an 8 x 8 inch PDF
ggsave(
    filename = snakemake@output[["fmd_box"]],
    plot = fmd_box,
    device = "pdf",
    width = 8, height = 8, units = "in",
    dpi = 600
)

## Plot population structure of random run

- Plot SVD and PCA of random run, colored by habitat or continent

In [None]:
# Fix the RNG so the same run is drawn on every execution
set.seed(42)
# Draw one run at random among those from seed 1 and extract its Omega matrix
random_run <- sample_n(filter(mat_df, seed == 1), 1)
random_mat <- random_run[["mat"]][[1]]

In [None]:
# Sample metadata from the tab-delimited sheet named in the Snakemake config,
# reduced to unique continent / habitat / city / range combinations
# (`site` is renamed to `habitat`)
samples <- read_delim(snakemake@config[["samples"]], delim = "\t") %>%
    dplyr::select(continent, habitat = site, city, range) %>%
    distinct()
head(samples)

In [None]:
# PCA of the randomly selected Omega matrix (prcomp defaults)
rand_omega_pca <- stats::prcomp(x = random_mat)

In [None]:
# Percent of variance explained by the first two PCs, used in the axis labels
pca_eig <- get_eigenvalue(rand_omega_pca)
pc1_var <- round(pca_eig["Dim.1", "variance.percent"], 2)
pc2_var <- round(pca_eig["Dim.2", "variance.percent"], 2)

# PC1 vs PC2 scatter plot: points colored by continent and shaped by habitat,
# with a line joining the points that share a city
# NOTE(review): bind_cols() pairs rows by position, so this assumes the rows of
# `samples` are in the same order as the rows of the Omega matrix — confirm
pca_plot <- get_pca_ind(rand_omega_pca)$coord %>% 
    bind_cols(., samples) %>% 
    ggplot(aes(x = Dim.1, y = Dim.2)) +
        geom_line(aes(group = city)) +
        geom_point(aes(color = continent, shape = habitat), size = 5) +
        scale_color_manual(values = c("#f2cd00", "#ef6a00", "#cd001a", "#538d22", "#1961ae", "#61007d")) +
        # y-axis shows PC2 (it was previously mislabelled as "PC1")
        xlab(paste0("PC1 (", pc1_var, "%)")) + ylab(paste0("PC2 (", pc2_var, "%)")) +
        my_theme
pca_plot
# Write the figure to the Snakemake `pca` output as an 8 x 8 inch PDF
ggsave(filename = snakemake@output[["pca"]], plot = pca_plot, 
       height = 8, width = 8, device = "pdf", dpi = 600, units = "in")