## Setup

In [None]:
# Load required packages
library(tidyverse)

In [None]:
# Read one tab-delimited Fst table and tag it with the window size encoded in
# its file name (the digits between "win" and ".txt").
#
# path: path to a single Fst text file.
# Returns the table read from `path` with a `win_size` column added via mutate().
load_fst_data <- function(path){
    # Window size is taken from the file name only, not the directory part
    win_size <- str_extract(basename(path), "(?<=win)\\d+(?=\\.txt)")
    # NOTE(review): inside mutate() `win_size` resolves to a column of that name
    # if the file already has one, otherwise to the local value above — confirm
    # the input files do not carry their own `win_size` column.
    mutate(read_delim(path, delim = "\t"), win_size = win_size)
}

# Paths of all Fst tables supplied by Snakemake under the "all_fsts" input
fst_paths <- snakemake@input[["all_fsts"]]

# Row-bind every table into one data frame; `win_size` is stored as character
all_fst_df <- purrr::map_dfr(fst_paths, load_fst_data) %>% 
    mutate(win_size = as.character(win_size))

In [None]:
# Filter the combined Fst table.
# `win_size` is a character column, so it is compared against string literals
# instead of relying on implicit numeric-to-character coercion.
# Note: filter() also drops rows for which the condition evaluates to NA.
all_fst_df_filtered <- all_fst_df %>% 
    # Drop per-site rows (win_size "1") whose ARG site Fst is below -1
    filter(!(arg_site_fst < -1 & win_size == "1")) %>% 
    # Drop 10000-bp window rows whose `no_snps` is below 20
    filter(!(no_snps < 20 & win_size == "10000"))
# Display the filtered table in the notebook
all_fst_df_filtered

In [None]:
# Notebook display size for the two-panel figures (inches)
options(repr.plot.width = 14, repr.plot.height = 6)

# Facet strip labels keyed by the `win_size` value
labels <- c(`1` = "Per-site", `10000` = '10 kb windows')

# Histogram of ARG site-based Fst for the UNFILTERED data, one panel per
# window size with independent axes
site_fst_winSize_hist <- all_fst_df %>% 
    ggplot(aes(x = arg_site_fst)) +
        geom_histogram(bins = 100, color = "black", fill = "red") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        ylab("Number of sites") + xlab("Site-based Fst from tskit (i.e. ARGs)") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
site_fst_winSize_hist

# Save the PDF with the same 14 x 6 in (width x height) geometry used for
# display, so the side-by-side panels are not squeezed into a tall page
ggsave(filename = snakemake@output[["site_fst_winSize_hist"]], plot = site_fst_winSize_hist,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Histogram of ARG site-based Fst for the filtered data, one panel per
# window size with independent axes
site_fst_winSize_hist_filt <- all_fst_df_filtered %>% 
    ggplot(aes(x = arg_site_fst)) +
        geom_histogram(bins = 100, color = "black", fill = "red") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        ylab("Number of sites") + xlab("Site-based Fst from tskit (i.e. ARGs)") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
site_fst_winSize_hist_filt

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["site_fst_winSize_hist_filt"]], plot = site_fst_winSize_hist_filt,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Histogram of ARG branch-based Fst for the filtered data, one panel per
# window size with independent axes
branch_fst_winSize_hist_filt <- all_fst_df_filtered %>% 
    ggplot(aes(x = arg_branch_fst)) +
        geom_histogram(bins = 100, color = "black", fill = "red") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        ylab("Number of sites") + xlab("Branch-based Fst from tskit (i.e. ARGs)") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
branch_fst_winSize_hist_filt

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["branch_fst_winSize_hist_filt"]], plot = branch_fst_winSize_hist_filt,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Notebook display size for the along-genome figures (inches)
options(repr.plot.width = 20, repr.plot.height = 12)

# Fst along the genome (x = window_pos_1) for the genotype-based estimate and
# the ARG site-based estimate, one facet row per window size
site_fst_by_method_winSize_manhat <- all_fst_df_filtered %>% 
    ggplot() +
        # Constant strings inside aes() create the legend entries that
        # scale_color_manual() maps to colours below
        geom_line(aes(x = window_pos_1, y = gt_hudson_fst, color = "Genotype (VCF)"), linewidth = 0.5, alpha = 0.5) +
        geom_line(aes(x = window_pos_1, y = arg_site_fst, color = "ARG-site"), linewidth = 0.5, alpha = 0.8) +
        facet_grid(win_size~., scales = "free", labeller = as_labeller(labels)) +
        ylab("Fst") + xlab("Genomic position") +
        theme_classic() +
        scale_color_manual(name = "Method", values = c("Genotype (VCF)" = "blue", "ARG-site" = "red")) +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              # facet_grid(win_size ~ .) draws row strips, which are styled by
              # strip.text.y (strip.text.x would have no effect here)
              strip.text.y = element_text(size = 13),
              legend.position = "top",
              legend.direction = "horizontal",
              legend.key.size = unit(1, 'cm'),
              legend.title = element_text(size=15),
              legend.text = element_text(size=13))
site_fst_by_method_winSize_manhat

# Save the PDF with the same 20 x 12 in (width x height) geometry used for
# display, so the genomic-position axis gets the long dimension
ggsave(filename = snakemake@output[["site_fst_by_method_winSize_manhat"]], plot = site_fst_by_method_winSize_manhat,
       device = "pdf", width = 20, height = 12, units = "in", dpi = 300)

In [None]:
# Fst along the genome (x = window_pos_1) for the genotype-based estimate and
# the ARG branch-based estimate, one facet row per window size
branch_fst_by_method_winSize_manhat <- all_fst_df_filtered %>% 
    ggplot() +
        # Constant strings inside aes() create the legend entries that
        # scale_color_manual() maps to colours below
        geom_line(aes(x = window_pos_1, y = gt_hudson_fst, color = "Genotype (VCF)"), linewidth = 0.5, alpha = 0.5) +
        geom_line(aes(x = window_pos_1, y = arg_branch_fst, color = "ARG-branch"), linewidth = 0.5) +
        facet_grid(win_size~., scales = "free", labeller = as_labeller(labels)) +
        ylab("Fst") + xlab("Genomic position") +
        theme_classic() +
        scale_color_manual(name = "Method", values = c("Genotype (VCF)" = "blue", "ARG-branch" = "red")) +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              # facet_grid(win_size ~ .) draws row strips, which are styled by
              # strip.text.y (strip.text.x would have no effect here)
              strip.text.y = element_text(size = 13),
              legend.position = "top",
              legend.direction = "horizontal",
              legend.key.size = unit(1, 'cm'),
              legend.title = element_text(size=15),
              legend.text = element_text(size=13))
branch_fst_by_method_winSize_manhat

# Save as 20 in wide x 12 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for the along-genome figures
ggsave(filename = snakemake@output[["branch_fst_by_method_winSize_manhat"]], plot = branch_fst_by_method_winSize_manhat,
       device = "pdf", width = 20, height = 12, units = "in", dpi = 300)

In [None]:
# Notebook display size for the two-panel figures (inches)
options(repr.plot.width = 14, repr.plot.height = 6)

# Scatter of genotype-based Hudson's Fst against ARG site-based Fst with a
# linear-model fit, one panel per window size with independent axes
site_gt_fst_cor_by_winSize <- all_fst_df_filtered %>% 
    ggplot(aes(x = arg_site_fst, y = gt_hudson_fst)) +
        geom_point(size = 2, alpha = 0.4) +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        geom_smooth(method = "lm", linewidth = 1, color = 'blue') +
        ylab("Hudson's Fst from Pixy (i.e. VCFs)") + xlab("Site-based Fst from tskit (i.e. ARGs)") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
site_gt_fst_cor_by_winSize

# Save the PDF with the same 14 x 6 in (width x height) geometry used for
# display, so the side-by-side panels are not squeezed into a tall page
ggsave(filename = snakemake@output[["site_gt_fst_cor_by_winSize"]], plot = site_gt_fst_cor_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Scatter of genotype-based Hudson's Fst against ARG branch-based Fst with a
# linear-model fit, one panel per window size with independent axes
branch_gt_fst_cor_by_winSize <- ggplot(
        all_fst_df_filtered,
        aes(x = arg_branch_fst, y = gt_hudson_fst)
    ) +
    geom_point(size = 2, alpha = 0.4) +
    geom_smooth(method = "lm", linewidth = 1, color = 'blue') +
    facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
    labs(
        x = "Branch-based Fst from tskit (i.e. ARGs)",
        y = "Hudson's Fst from Pixy (i.e. VCFs)"
    ) +
    theme_classic() +
    theme(
        axis.text = element_text(size = 13),
        axis.title = element_text(size = 15),
        strip.text.x = element_text(size = 13)
    )
branch_gt_fst_cor_by_winSize

# NOTE(review): this figure is saved as 6 x 6 in, unlike the other two-panel
# plots in this notebook — confirm the square size is intended.
ggsave(
    filename = snakemake@output[["branch_gt_fst_cor_by_winSize"]],
    plot = branch_gt_fst_cor_by_winSize,
    device = "pdf", height = 6, width = 6, units = "in", dpi = 300
)

In [None]:
# Scatter of SFS-based Hudson's Fst against ARG site-based Fst with a
# linear-model fit, one panel per window size with independent axes
site_sfs_fst_cor_by_winSize <- all_fst_df_filtered %>% 
    ggplot(aes(x = arg_site_fst, y = sfs_hudson_fst)) +
        geom_point(size = 2, alpha = 0.4) +
        geom_smooth(method = "lm", linewidth = 1, color = 'blue') +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        # NOTE(review): the y label says "(i.e. VCFs)" for the ANGSD estimate —
        # confirm the wording is intended.
        ylab("Hudson's Fst from ANGSD (i.e. VCFs)") + xlab("Site-based Fst from tskit (i.e. ARGs)") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
site_sfs_fst_cor_by_winSize

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["site_sfs_fst_cor_by_winSize"]], plot = site_sfs_fst_cor_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Scatter of SFS-based Hudson's Fst against ARG branch-based Fst with a
# linear-model fit, one panel per window size with independent axes
branch_sfs_fst_cor_by_winSize <- all_fst_df_filtered %>% 
    ggplot(aes(x = arg_branch_fst, y = sfs_hudson_fst)) +
        geom_point(size = 2, alpha = 0.4) +
        geom_smooth(method = "lm", linewidth = 1, color = 'blue') +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        # NOTE(review): the y label says "(i.e. VCFs)" for the ANGSD estimate —
        # confirm the wording is intended.
        ylab("Hudson's Fst from ANGSD (i.e. VCFs)") + xlab("Branch-based Fst from tskit (i.e. ARGs)") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
branch_sfs_fst_cor_by_winSize

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["branch_sfs_fst_cor_by_winSize"]], plot = branch_sfs_fst_cor_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Create dataframe with Fst correlation coefficients:
# one row per (win_size, regionID) holding the correlation between each
# ARG-based Fst (branch, site) and each reference Fst (genotype, SFS).
# Rows with a missing value in either variable are excluded from each
# correlation (use = "complete.obs").
all_fst_cors <- all_fst_df_filtered %>% 
    group_by(win_size, regionID) %>% 
    summarize(branch_gt_cor = cor(arg_branch_fst, gt_hudson_fst, use = "complete.obs"),
              site_gt_cor = cor(arg_site_fst, gt_hudson_fst, use = "complete.obs"),
              branch_sfs_cor = cor(arg_branch_fst, sfs_hudson_fst, use = "complete.obs"),
              site_sfs_cor = cor(arg_site_fst, sfs_hudson_fst, use = "complete.obs"),
              # Return an ungrouped tibble instead of one still grouped by
              # win_size (also silences the regrouping message)
              .groups = "drop")
# Display the correlation table in the notebook
all_fst_cors

In [None]:
# Histogram of the per-region correlation between ARG branch Fst and the
# genotype-based Fst, one panel per window size with independent axes
branch_gt_cor_hist_by_winSize <- all_fst_cors %>% 
    ggplot(aes(x = branch_gt_cor)) +
        geom_histogram(bins = 30, color = "black", fill = "red") +
        ylab("Number of 1Mb regions") + xlab("ARG branch Fst vs. Pixy GT Fst correlation") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
branch_gt_cor_hist_by_winSize

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["branch_gt_cor_hist_by_winSize"]], plot = branch_gt_cor_hist_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Histogram of the per-region correlation between ARG site Fst and the
# genotype-based Fst, one panel per window size with independent axes
site_gt_cor_hist_by_winSize <- all_fst_cors %>% 
    ggplot(aes(x = site_gt_cor)) +
        geom_histogram(bins = 30, color = "black", fill = "red") +
        ylab("Number of 1Mb regions") + xlab("ARG site Fst vs. Pixy GT Fst correlation") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
site_gt_cor_hist_by_winSize

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["site_gt_cor_hist_by_winSize"]], plot = site_gt_cor_hist_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Histogram of the per-region correlation between ARG branch Fst and the
# SFS-based Fst, one panel per window size with independent axes
branch_sfs_cor_hist_by_winSize <- all_fst_cors %>% 
    ggplot(aes(x = branch_sfs_cor)) +
        geom_histogram(bins = 30, color = "black", fill = "red") +
        ylab("Number of 1Mb regions") + xlab("ARG branch Fst vs. ANGSD SFS Fst correlation") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
branch_sfs_cor_hist_by_winSize

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["branch_sfs_cor_hist_by_winSize"]], plot = branch_sfs_cor_hist_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Histogram of the per-region correlation between ARG site Fst and the
# SFS-based Fst, one panel per window size with independent axes
site_sfs_cor_hist_by_winSize <- all_fst_cors %>% 
    ggplot(aes(x = site_sfs_cor)) +
        geom_histogram(bins = 30, color = "black", fill = "red") +
        # x label names the SITE-based statistic, matching the plotted
        # `site_sfs_cor` column
        ylab("Number of 1Mb regions") + xlab("ARG site Fst vs. ANGSD SFS Fst correlation") +
        facet_wrap(~win_size, scales = "free", labeller = as_labeller(labels)) +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13))
site_sfs_cor_hist_by_winSize

# Save as 14 in wide x 6 in tall, matching the notebook display size
# (repr.plot.width / repr.plot.height) set for these two-panel figures
ggsave(filename = snakemake@output[["site_sfs_cor_hist_by_winSize"]], plot = site_sfs_cor_hist_by_winSize,
       device = "pdf", width = 14, height = 6, units = "in", dpi = 300)

In [None]:
# Keep only the 10000-bp window rows; `win_size` is a character column, so it
# is compared against a string literal
windowed_fst_df_filtered <- all_fst_df_filtered %>% 
    filter(win_size == "10000")

# 99th percentile of each windowed Fst estimate. na.rm = TRUE is required
# because quantile() raises an error when its input contains missing values.
arg_fst_quant <- quantile(windowed_fst_df_filtered %>% pull(arg_branch_fst), probs = 0.99, na.rm = TRUE)
arg_fst_quant
gt_fst_quant <- quantile(windowed_fst_df_filtered %>% pull(gt_hudson_fst), probs = 0.99, na.rm = TRUE)
gt_fst_quant
sfs_fst_quant <- quantile(windowed_fst_df_filtered %>% pull(sfs_hudson_fst), probs = 0.99, na.rm = TRUE)
sfs_fst_quant