In [None]:
library(tidyverse)
library(factoextra)
source("scripts/r/baypass_utils.R")
source("scripts/r/functions_objects.R")

In [None]:
# Sample metadata from the Snakemake config, reduced to the columns used below
sample_sheet <- dplyr::select(
    read_delim(snakemake@config[["samples"]], delim = "\t", show_col_types = FALSE),
    sample, continent, city, site, range
)

# One BAM path per row (no header). The sample name is the text between
# "final/" or "toronto_bams/" and "_merged" in each path.
bam_paths <- read_table(snakemake@input[["bams"]], col_names = FALSE, show_col_types = FALSE)

# Sample names in BAM-list order, with the metadata columns joined on
sample_order <- bam_paths %>%
    mutate(sample = str_extract(X1, pattern = "(?<=(final|toronto_bams)\\/).*(?=_merged)")) %>%
    dplyr::select(sample) %>%
    left_join(sample_sheet, by = "sample")

head(sample_order)

In [None]:
# Covariance matrix: space-delimited text with no header row
covMat <- read_delim(
    snakemake@input[["mat"]],
    delim = ' ',
    col_names = FALSE,
    show_col_types = FALSE
)
head(covMat)

In [None]:
# Run PCA on the covariance matrix using prcomp()'s default settings
pca <- prcomp(x = covMat)

In [None]:
# Show the first rows of the eigenvalue / variance table for the PCA
pca %>%
    get_eig() %>%
    head()

In [None]:
# Scree plot with per-axis labels; the y axis is limited to 0-25
scree <- fviz_screeplot(pca, addlabels = TRUE, ylim = c(0, 25)) +
    labs(x = "PC axis", y = "% Variance explained") +
    my_theme

# Save the scree plot to the Snakemake "scree" output as an 8 x 8 inch PDF
ggsave(
    filename = snakemake@output[["scree"]],
    plot = scree,
    device = 'pdf',
    width = 8,
    height = 8,
    units = 'in',
    dpi = 600
)

In [None]:
# Per-row PCA coordinates as a plain data frame
ind_coords <- as.data.frame(get_pca_ind(pca)$coord)

# Keep columns Dim.1 through Dim.6 and append the sample metadata column-wise.
# bind_cols() pairs rows by position, so this relies on sample_order being in
# the same row order as the PCA input.
pca_df <- bind_cols(
    dplyr::select(ind_coords, "Dim.1":"Dim.6"),
    sample_order
)
head(pca_df)

In [None]:
# Percent variance for the first two axes, rounded for use in axis titles
eig_table <- get_eigenvalue(pca)
pc1_var <- round(eig_table["Dim.1", "variance.percent"], 2)
pc2_var <- round(eig_table["Dim.2", "variance.percent"], 2)

# Manual colour values for the continent legend
continent_colours <- c("#f2cd00", "#ef6a00", "#cd001a", "#538d22", "#1961ae", "#61007d")

# Dim.1 vs. Dim.2 for every row of pca_df, coloured by continent
pca_plot <- ggplot(pca_df, aes(x = Dim.1, y = Dim.2)) +
    geom_point(aes(color = continent), size = 3, alpha = 0.5) +
    scale_color_manual(values = continent_colours) +
    coord_cartesian(xlim = c(-1.6, 1.6), ylim = c(-1.6, 0.7)) +
    scale_x_continuous(breaks = seq(-1.5, 1.5, 0.5)) +
    scale_y_continuous(breaks = seq(-1.5, 0.5, 0.5)) +
    labs(
        x = paste0("PC1 (", pc1_var, "%)"),
        y = paste0("PC2 (", pc2_var, "%)")
    ) +
    my_theme
pca_plot

# Save the plot to the Snakemake "all_pca" output as an 8 x 8 inch PDF
ggsave(
    filename = snakemake@output[["all_pca"]],
    plot = pca_plot,
    device = 'pdf',
    width = 8,
    height = 8,
    units = 'in',
    dpi = 600
)

In [None]:
# Plot Dim.1 against Dim.2 with the rows of one continent highlighted.
#
# Arguments:
#   df   - data frame with columns Dim.1, Dim.2, continent, city and site
#   cont - value compared against df$continent to pick the highlighted rows
#
# Rows whose continent differs from `cont` are drawn as faint black points.
# Rows matching `cont` are drawn on top at full opacity, coloured by city and
# shaped by site. The axis titles use pc1_var and pc2_var, and the theme is
# my_theme; all three are looked up outside the function rather than passed in.
#
# Returns the ggplot object.
plot_continental_pca <- function(df, cont){
    background <- dplyr::filter(df, continent != cont)
    highlighted <- dplyr::filter(df, continent == cont)
    city_colours <- c("#f2cd00", "#ef6a00", "#cd001a", "#538d22", "#1961ae", "#61007d")

    ggplot(background, aes(x = Dim.1, y = Dim.2)) +
        geom_point(color = "black", size = 3, alpha = 0.1) +
        geom_point(data = highlighted, aes(color = city, shape = site), size = 3, alpha = 1) +
        scale_color_manual(values = city_colours) +
        coord_cartesian(xlim = c(-1.6, 1.6), ylim = c(-1.6, 0.7)) +
        scale_x_continuous(breaks = seq(-1.5, 1.5, 0.5)) +
        scale_y_continuous(breaks = seq(-1.5, 0.5, 0.5)) +
        labs(
            x = paste0("PC1 (", pc1_var, "%)"),
            y = paste0("PC2 (", pc2_var, "%)")
        ) +
        my_theme
}

In [None]:
# One highlighted PCA plot per continent code
nam_pca <- plot_continental_pca(pca_df, cont = "NAM")
oce_pca <- plot_continental_pca(pca_df, cont = "OCE")
sam_pca <- plot_continental_pca(pca_df, cont = "SAM")
afr_pca <- plot_continental_pca(pca_df, cont = "AFR")
asi_pca <- plot_continental_pca(pca_df, cont = "ASI")
eu_pca <- plot_continental_pca(pca_df, cont = "EU")

# Plots keyed by the name of the Snakemake output each one is written to
continental_plots <- list(
    nam_pca = nam_pca,
    oce_pca = oce_pca,
    sam_pca = sam_pca,
    afr_pca = afr_pca,
    asi_pca = asi_pca,
    eu_pca = eu_pca
)

# Save every plot as an 8 x 8 inch PDF, in the order listed above
for (output_key in names(continental_plots)) {
    ggsave(
        filename = snakemake@output[[output_key]],
        plot = continental_plots[[output_key]],
        device = 'pdf',
        width = 8,
        height = 8,
        units = 'in',
        dpi = 600
    )
}

In [None]:
# Write columns Dim.1 through Dim.6 to the Snakemake "cov_file" output as a
# space-delimited file with no header row
pc_scores <- dplyr::select(pca_df, Dim.1:Dim.6)
pc_scores %>%
    write_delim(snakemake@output[["cov_file"]], delim = ' ', col_names = FALSE)