In [None]:
library(tidyverse)
library(factoextra)
source("scripts/r/baypass_utils.R")
source("scripts/r/functions_objects.R")

## Compare Omega matrices within and between seeds

- Compare Omega matrices estimated on different SNP subsets within random seeds using the Förstner and Moonen Distance (FMD)
- Plot FMD values across 3 different random seeds

In [None]:
# Read a single BayPass Omega matrix file into a nested data frame
# The seed and split IDs are the digits that follow "seed" / "split" and precede "_" in `path`
# Result has `seed`, `split`, and `mat` columns, where `mat` is a nested
# column holding the table read from `path`
load_omega_mat <- function(path){
    omega <- suppressMessages(read_table(path, col_names = FALSE))
    tibble(
        seed = str_extract(path, "(?<=seed)(\\d+)(?=_)"),
        split = str_extract(path, "(?<=split)(\\d+)(?=_)"),
        mat = omega
    ) %>%
        nest_by(seed, split, .key = "mat")
}

# Load every observed Omega matrix listed in the Snakemake input and stack
# them into one ungrouped tibble (one row per file)
mat_df <- as_tibble(ungroup(
    purrr::map_dfr(snakemake@input[["obs_omega_mat"]], load_omega_mat)
))

In [None]:
# Estimate FMD between all pairwise SNP subsets within a single seed
#
# Args:
#   data_df: data frame for one seed with `seed`, `split`, and nested `mat`
#            columns (one row per split), as produced by `load_omega_mat()`
# Returns:
#   Tibble in long format with one row per unordered pair of splits:
#   `n1` and `n2` (integer split IDs), `fmd` (distance between the two
#   matrices), and `seed`. Has zero rows when fewer than two splits are present.
calculate_fmd_within_seeds <- function(data_df){

    seed <- data_df %>% pull(seed) %>% unique()

    # Use the split IDs that are actually present instead of assuming they
    # run from 0 to n - 1; compare as integers so no implicit
    # string/number coercion is involved
    split_ids <- sort(unique(as.integer(data_df$split)))

    # With fewer than two splits there is nothing to compare
    if (length(split_ids) < 2) {
        return(tibble(n1 = integer(), n2 = integer(), fmd = double(), seed = seed[0]))
    }

    # Pull out the matrix for one split as a plain matrix
    get_split_matrix <- function(split_id){
        data_df %>%
            filter(as.integer(split) == split_id) %>%
            unnest(mat) %>%
            dplyr::select(-seed, -split) %>%
            as.matrix()
    }

    # Build each matrix once rather than once per pair
    mats <- purrr::map(split_ids, get_split_matrix)

    # All pairwise combinations of positions in `split_ids`
    # Ignores self-comparisons and permutations
    # (combn over positions avoids combn's special handling of a scalar `x`)
    pair_idx <- utils::combn(seq_along(split_ids), 2)

    # assumes fmd.dist() returns a single numeric value per pair
    fmd_df <- tibble(n1 = split_ids[pair_idx[1, ]],
                     n2 = split_ids[pair_idx[2, ]]) %>%
        mutate(fmd = purrr::map2_dbl(pair_idx[1, ], pair_idx[2, ],
                                     function(i, j) fmd.dist(mats[[i]], mats[[j]])),
               seed = seed)

    return(fmd_df)
}

# Run the pairwise FMD calculation separately for each seed and stack the results
fmd_within_seeds <- purrr::map_dfr(
    group_split(mat_df, seed),
    calculate_fmd_within_seeds
)

In [None]:
# Per-seed summary statistics of the pairwise FMD values,
# written as a tab-delimited file
write_delim(
    reframe(
        group_by(fmd_within_seeds, seed),
        mean = mean(fmd),
        sd = sd(fmd),
        median = median(fmd),
        min = min(fmd),
        max = max(fmd)
    ),
    snakemake@output[["fmd_sum"]],
    delim = "\t"
)

In [None]:
# Distribution of pairwise FMD values within each random seed, as a boxplot
# The y-axis view is fixed to 0.52-0.63 with breaks every 0.02
fmd_box <- ggplot(fmd_within_seeds, aes(x = seed, y = fmd)) +
    geom_boxplot() +
    labs(x = "Random seed", y = "Förstner and Moonen Distance (FMD)") +
    scale_y_continuous(breaks = seq(0.52, 0.62, 0.02)) +
    coord_cartesian(ylim = c(0.52, 0.63)) +
    my_theme
fmd_box
ggsave(filename = snakemake@output[["fmd_box"]], plot = fmd_box,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

## Compare alphas and betas within and between seeds

In [None]:
# Read one BayPass beta-parameter summary table (with header) and tag every
# row with the seed and split IDs parsed from `path`
load_beta_params <- function(path){
    seed <- str_extract(path, "(?<=seed)(\\d+)(?=_)")
    split <- str_extract(path, "(?<=split)(\\d+)(?=_)")
    raw_betas <- suppressMessages(read_table(path, col_names = TRUE))
    mutate(raw_betas, seed = seed, split = split)
}

# Load and row-bind the beta-parameter summaries of all observed runs
betas_df <- purrr::map_dfr(snakemake@input[["obs_beta_sum"]], load_beta_params)

In [None]:
# Boxplot of the `Mean` values of `a_beta_pi` for each random seed
# (y-axis limits and breaks are left at the ggplot defaults)
alpha_box <- ggplot(filter(betas_df, PARAM == "a_beta_pi"),
                    aes(x = seed, y = Mean)) +
    geom_boxplot() +
    labs(x = "Random seed", y = "a_beta_pi") +
    my_theme
alpha_box
ggsave(filename = snakemake@output[["alpha_box"]], plot = alpha_box,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Boxplot of the `Mean` values of `b_beta_pi` for each random seed
# (y-axis limits and breaks are left at the ggplot defaults)
beta_box <- ggplot(filter(betas_df, PARAM == "b_beta_pi"),
                   aes(x = seed, y = Mean)) +
    geom_boxplot() +
    labs(x = "Random seed", y = "b_beta_pi") +
    my_theme
beta_box
ggsave(filename = snakemake@output[["beta_box"]], plot = beta_box,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

## Compare observed vs. simulated omega and betas

- Compare omega matrix from simulated `BayPass` run to omega matrix from observed data used to simulate the input allele counts
- Do the same as above for the posterior means of $a_{\pi}$ and $b_{\pi}$

In [None]:
# Matrix from the observed run with seed 3700 and split 37
obs_mat <- mat_df %>%
    filter(seed == 3700, split == 37) %>%
    unnest(mat) %>%
    dplyr::select(-c(seed, split)) %>%
    as.matrix()
# Matrix from the simulated run
sim_mat <- as.matrix(read_table(snakemake@input[["sim_omega_mat"]], col_names = FALSE))
# FMD between the observed and simulated matrices
obs_sim_fmd <- fmd.dist(obs_mat, sim_mat)

In [None]:
# `Mean` of a_beta_pi in the observed run (seed 3700, split 37) and in the simulated run
obs_alpha <- betas_df %>%
    filter(seed == 3700, split == 37, PARAM == "a_beta_pi") %>%
    pull(Mean)
sim_betas <- read_table(snakemake@input[["sim_beta_sum"]])
sim_alpha <- pull(filter(sim_betas, PARAM == "a_beta_pi"), Mean)
print(obs_alpha)
print(sim_alpha)

In [None]:
# `Mean` of b_beta_pi in the observed run (seed 3700, split 37) and in the simulated run
obs_beta <- betas_df %>%
    filter(seed == 3700, split == 37, PARAM == "b_beta_pi") %>%
    pull(Mean)
sim_beta <- pull(filter(sim_betas, PARAM == "b_beta_pi"), Mean)
print(obs_beta)
print(sim_beta)

In [None]:
# Write the observed-vs-simulated comparison values as a two-column
# tab-delimited table (`comparison`, `value`)
write_delim(
    data.frame(
        "comparison" = c("FMD", "obs_alpha", "obs_beta", "sim_alpha", "sim_beta"),
        "value" = c(obs_sim_fmd, obs_alpha, obs_beta, sim_alpha, sim_beta)
    ),
    snakemake@output[["obs_sim_stats"]],
    delim = "\t"
)

## Plot population structure of random run

- Plot SVD and PCA of random run, colored by habitat or continent

In [None]:
# Select the run with seed 3700 and split 37 and take its nested table as a matrix
random_run <- filter(mat_df, seed == 3700, split == 37)
random_mat <- as.matrix(random_run$mat[[1]])

In [None]:
# Unique continent / habitat / city / range combinations from the sample sheet
# (`site` is renamed to `habitat`)
samples <- distinct(
    rename(
        dplyr::select(
            read_delim(snakemake@config[["samples"]], delim = "\t"),
            continent, site, city, range
        ),
        habitat = site
    )
)
head(samples)

In [None]:
# PCA of the matrix from the selected observed run
rand_omega_pca <- stats::prcomp(random_mat)

In [None]:
# Percent of variance explained by the first two PCs, used in the axis labels
pc1_var <- get_eigenvalue(rand_omega_pca)["Dim.1", "variance.percent"] %>% round(2)
pc2_var <- get_eigenvalue(rand_omega_pca)["Dim.2", "variance.percent"] %>% round(2)

# PC1 vs. PC2 of the observed run: points colored by continent and shaped
# by habitat, with a line joining the points that share a city
# NOTE(review): bind_cols() pairs PCA rows with `samples` rows by position;
# confirm the two are in the same order
rand_pca_plot <- get_pca_ind(rand_omega_pca)$coord %>%
    bind_cols(samples) %>%
    ggplot(aes(x = Dim.1, y = Dim.2)) +
    geom_line(aes(group = city)) +
    geom_point(aes(color = continent, shape = habitat), size = 5) +
    scale_color_manual(values = c("#f2cd00", "#ef6a00", "#cd001a", "#538d22", "#1961ae", "#61007d")) +
    labs(x = paste0("PC1 (", pc1_var, "%)"),
         y = paste0("PC2 (", pc2_var, "%)")) +
    my_theme
rand_pca_plot
ggsave(filename = snakemake@output[["obs_rand_pca"]], plot = rand_pca_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# PCA of the matrix from the simulated run
sim_omega_pca <- stats::prcomp(sim_mat)

In [None]:
# Percent of variance explained by the first two PCs, used in the axis labels
pc1_var <- get_eigenvalue(sim_omega_pca)["Dim.1", "variance.percent"] %>% round(2)
pc2_var <- get_eigenvalue(sim_omega_pca)["Dim.2", "variance.percent"] %>% round(2)

# PC1 vs. PC2 of the simulated run: points colored by continent and shaped
# by habitat, with a line joining the points that share a city
# NOTE(review): bind_cols() pairs PCA rows with `samples` rows by position;
# confirm the two are in the same order
sim_pca_plot <- get_pca_ind(sim_omega_pca)$coord %>%
    bind_cols(samples) %>%
    ggplot(aes(x = Dim.1, y = Dim.2)) +
    geom_line(aes(group = city)) +
    geom_point(aes(color = continent, shape = habitat), size = 5) +
    scale_color_manual(values = c("#f2cd00", "#ef6a00", "#cd001a", "#538d22", "#1961ae", "#61007d")) +
    labs(x = paste0("PC1 (", pc1_var, "%)"),
         y = paste0("PC2 (", pc2_var, "%)")) +
    my_theme
sim_pca_plot
ggsave(filename = snakemake@output[["sim_pca"]], plot = sim_pca_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)