## Setup

In [None]:
# Load required packages
library(tidyverse)

In [None]:
# Load all data

# ARG-based Fst: every file listed under the "arg_fst" input is read as CSV
# and the resulting tables are stacked row-wise.
arg_fst_df <- suppressMessages(
    purrr::map_dfr(snakemake@input[["arg_fst"]], read_csv)
)

# Genotype-based Fst: tab-delimited table, restricted to the rows where
# pop1 is Urban and pop2 is Rural.
gt_fst_df <- suppressMessages(
    filter(
        read_delim(snakemake@input[["gt_fsts"]], delim = "\t"),
        pop1 == "Urban",
        pop2 == "Rural"
    )
)

# SFS-based Fst: tab-delimited table without a header row, so the column
# names are supplied explicitly.
cols <- c("chrom", "pos", "fst_num", "fst_denom")
sfs_fst_df <- suppressMessages(
    read_delim(snakemake@input[["sfs_fsts"]], delim = "\t", col_names = cols)
)

# Read one tab-delimited region file and tag every row with its region ID.
#
# The region ID is the 4th "."-separated field of the file's basename.
#
# @param path Path to a header-less, tab-delimited file; its columns are
#   named chrom, start and end on load.
# @return The table read from `path` with an extra `region_id` column
#   (character) holding the ID parsed from the filename.
load_region_files <- function(path){
    name_fields <- str_split(basename(path), "\\.", simplify = TRUE)
    # Fail with a readable message instead of "subscript out of bounds" when
    # the filename does not follow the expected naming scheme.
    if (ncol(name_fields) < 4) {
        stop("Cannot extract region ID (4th '.'-separated field) from file name: ",
             path, call. = FALSE)
    }
    region_id <- name_fields[1, 4]
    dat <- suppressMessages(
        read_delim(path, delim = "\t", col_names = c("chrom", "start", "end")) %>%
        mutate(region_id = region_id)
    )
    return(dat)
}
# Read every file listed under the "regions" input and stack the per-file
# tables into a single data frame.
regions_df <- purrr::map_df(snakemake@input[["regions"]], load_region_files)

In [None]:
# Combine ARG-based and genotype-based Fst windows for a single region.
#
# Reads the globals `regions_df`, `gt_fst_df` and `snakemake`.
#
# @param df Rows of the ARG Fst table for one region; must contain the
#   columns regionID, window_index, arg_branch_fst and arg_site_fst.
# @return The four ARG columns of `df` bound column-wise (by row position) to
#   the genotype-based Fst windows that fall inside the region, with one row
#   per window; windows absent from `gt_fst_df` are present with NA values.
merge_fst_vals <- function(df){

    region <- df %>% pull(regionID) %>% unique()
    region_df <- regions_df %>% filter(region_id == region)
    # NOTE(review): the +1 presumably converts a 0-based region start to a
    # 1-based window start -- confirm against the region file format.
    start <- region_df$start + 1
    end <- region_df$end

    # The parameter key was historically misspelled "window_sizw". Prefer the
    # correct spelling but keep accepting the old one so existing workflows
    # continue to run.
    window_size <- snakemake@params[["window_size"]]
    if (is.null(window_size)) {
        window_size <- snakemake@params[["window_sizw"]]
    }
    if (is.null(window_size)) {
        stop("No 'window_size' entry found in snakemake@params", call. = FALSE)
    }

    starts <- seq(start, end - 1, window_size)
    # Regions shorter than 1e6 positions lose their last window start.
    # NOTE(review): presumably because that window would be truncated -- confirm.
    if(end < (start - 1) + 1e6){
        # head(x, -1) drops the last element explicitly; the previous
        # `starts[1:length(starts) - 1]` parsed as `(1:n) - 1` and only worked
        # because R silently ignores a 0 index.
        starts <- head(starts, -1)
    }
    ends <- (starts - 1) + window_size

    all_intervals <- data.frame(window_pos_1 = starts, window_pos_2 = ends)

    arg_fsts <- df %>%
        dplyr::select(regionID, window_index, arg_branch_fst, arg_site_fst)
    # Keep the genotype-based windows lying inside the region, add any expected
    # window that is missing (filled with NA by the full join), and order by
    # window start.
    # NOTE(review): windows are matched on position only, not on chromosome --
    # verify this is safe if regions span several chromosomes.
    gt_fsts <- gt_fst_df %>%
        filter(window_pos_1 >= start & window_pos_2 <= end) %>%
        full_join(., all_intervals, by = c("window_pos_1", "window_pos_2")) %>%
        arrange(window_pos_1)

    # bind_cols() pairs rows by position and requires equal row counts.
    # NOTE(review): this assumes `df` is already ordered by window along the
    # region -- confirm against the upstream ARG Fst output.
    all_fsts <- bind_cols(arg_fsts, gt_fsts)
    return(all_fsts)
}

# Split the ARG Fst table by region, merge each piece with its
# genotype-based windows, and stack the per-region results.
all_fsts_df <- purrr::map_dfr(
    group_split(arg_fst_df, regionID),
    merge_fst_vals
)

In [None]:
# Fst along region 10: ARG site-based Fst (blue) and Pixy Hudson's Fst (red)
# plotted against the window start position.
options(repr.plot.width = 14, repr.plot.height = 6)
ggplot(
    filter(all_fsts_df, regionID == 10),
    aes(x = window_pos_1)
) +
    geom_line(aes(y = arg_site_fst), color = "blue", linewidth = 1, alpha = 1) +
    geom_line(aes(y = avg_hudson_fst), color = "red", linewidth = 1, alpha = 1) +
    labs(title = "region 10", x = "Genomic position", y = "Fst") +
    theme_classic()

In [None]:
# Site-based ARG Fst vs. Pixy Hudson's Fst for windows with at least 20 SNPs,
# with a linear fit overlaid.
options(repr.plot.width = 6, repr.plot.height = 6)
ggplot(
    filter(all_fsts_df, no_snps >= 20),
    aes(x = arg_site_fst, y = avg_hudson_fst)
) +
    geom_point(size = 2) +
    geom_smooth(method = "lm", linewidth = 1, color = "blue") +
    labs(
        x = "Site-based Fst from tskit (i.e. ARGs)",
        y = "Hudson's Fst from Pixy (i.e. VCFs)"
    ) +
    theme_classic() +
    theme(
        axis.text = element_text(size = 13),
        axis.title = element_text(size = 15)
    )

In [None]:
# Branch-based ARG Fst vs. Pixy Hudson's Fst for windows with at least 20 SNPs,
# with a linear fit overlaid.
ggplot(
    filter(all_fsts_df, no_snps >= 20),
    aes(x = arg_branch_fst, y = avg_hudson_fst)
) +
    geom_point(size = 2) +
    geom_smooth(method = "lm", linewidth = 1, color = "blue") +
    labs(
        x = "Branch-based Fst from tskit (i.e. ARGs)",
        y = "Hudson's Fst from Pixy (i.e. VCFs)"
    ) +
    theme_classic() +
    theme(
        axis.text = element_text(size = 13),
        axis.title = element_text(size = 15)
    )

In [None]:
# Reduce the window-level table to one row per region holding the correlation
# of each ARG-based Fst with Pixy's Hudson's Fst (windows with >= 20 SNPs only).
# NOTE: this overwrites all_fsts_df, so the window-level columns are no longer
# available after this cell has run.
all_fsts_df <- all_fsts_df %>%
    filter(no_snps >= 20) %>%
    group_by(regionID) %>%
    summarize(
        branch_cor = cor(arg_branch_fst, avg_hudson_fst, use = "complete.obs"),
        site_cor = cor(arg_site_fst, avg_hudson_fst, use = "complete.obs")
    )

# Histogram of the per-region branch-based correlations.
ggplot(all_fsts_df, aes(x = branch_cor)) +
    geom_histogram(bins = 30, color = "black", fill = "red") +
    labs(
        x = "ARG branch Fst vs. Pixy GT Fst correlation",
        y = "Number of 1Mb regions"
    ) +
    theme_classic() +
    theme(
        axis.text = element_text(size = 13),
        axis.title = element_text(size = 15)
    )

In [None]:
# Histogram of the per-region correlation between ARG site-based Fst and
# Pixy genotype-based Fst (the `site_cor` column).
all_fsts_df %>% 
    ggplot(aes(x = site_cor)) +
        geom_histogram(bins = 30, color = "black", fill = "red") +
        # The x axis shows site_cor, so label it as the site-based correlation
        # (it was previously mislabelled as the branch-based one).
        ylab("Number of 1Mb regions") + xlab("ARG site Fst vs. Pixy GT Fst correlation") +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15))

In [None]:
# Per-region site-based correlation (x) against branch-based correlation (y),
# with a linear fit overlaid.
ggplot(all_fsts_df, aes(x = site_cor, y = branch_cor)) +
    geom_point(size = 2) +
    geom_smooth(method = "lm", linewidth = 1, color = "blue") +
    labs(
        x = "ARG site Fst vs. Pixy GT Fst correlation",
        y = "ARG branch Fst vs. Pixy GT Fst correlation"
    ) +
    theme_classic() +
    theme(
        axis.text = element_text(size = 13),
        axis.title = element_text(size = 15)
    )

In [None]:
# List the regions whose site-based correlation is at most 0.9 or whose
# branch-based correlation is at most 0.6.
filter(all_fsts_df, site_cor <= 0.9 | branch_cor <= 0.6)