In [None]:
# Load required Packages
library(tidyverse)

In [None]:
# Read the tab-delimited table of selected and unselected XP-nSL regions
# supplied as the Snakemake "regions" input, then preview its first rows.
regions <- snakemake@input[["regions"]] %>%
    read_delim(delim = "\t")
head(regions)

In [None]:
# Load ARG-based estimates of Fst across entire regions: every tab-delimited
# file listed under the Snakemake "arg_fst" input is read and the results are
# stacked row-wise into one data frame.
# I.e. mean Fst across region, averaged across all posterior ARG samples
arg_fst_df <- purrr::map_dfr(
    snakemake@input[["arg_fst"]],
    read_delim,
    delim = "\t"
)
head(arg_fst_df)

In [None]:
# Average the ARG-based Fst per region and estimate type, spread the types into
# one column each (prefixed "fst_", giving e.g. the fst_site and fst_branch
# columns plotted later), and attach the region annotations by regionID.
mean_arg_fst_df <- arg_fst_df %>%
    group_by(regionID, type) %>%
    # `.groups = "drop"` returns an ungrouped tibble. Without it the result
    # stays grouped by regionID and that grouping is carried through the pivot
    # and the join into every later cell that uses this data frame.
    summarise(mean = mean(fst), .groups = "drop") %>%
    pivot_wider(names_from = "type", values_from = "mean", names_prefix = "fst_") %>%
    left_join(regions, by = "regionID")
head(mean_arg_fst_df)

In [None]:
# Scatter of branch-based against site-based ARG Fst (one point per row of
# mean_arg_fst_df) with a linear-model fit overlaid.
mean_arg_fst_df %>%
    ggplot(aes(x = fst_site, y = fst_branch)) +
    geom_point(size = 3, color = "black", fill = "black") +
    geom_smooth(method = "lm", color = "blue") +
    labs(x = "Site Fst", y = "Branch Fst") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 18)
    )

In [None]:
# Read the Fst estimated from genotypes in the VCF (tab-delimited Snakemake
# "gt_fst" input) and join it onto the regions data frame by regionID.
# Note: mean_arg_fst_df is overwritten in place, so re-running this cell joins
# the genotype columns a second time.
gt_fst <- snakemake@input[["gt_fst"]] %>%
    read_delim(delim = "\t")
mean_arg_fst_df <- left_join(mean_arg_fst_df, gt_fst, by = "regionID")

In [None]:
# Genotype-based (VCFtools) Fst against ARG branch-based Fst, points coloured
# by the `direction` column, with a linear-model fit overlaid.
mean_arg_fst_df %>%
    ggplot(aes(x = fst_branch, y = gt_fst_weighted)) +
    geom_point(aes(color = direction), size = 3) +
    geom_smooth(method = "lm", color = "blue") +
    labs(x = "ARG Branch Fst", y = "VCFtools GT Fst") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 18)
    )

In [None]:
# Pearson correlation test between genotype-based and ARG branch-based Fst
# (Pearson is the cor.test default, spelled out here for clarity).
cor.test(
    mean_arg_fst_df$gt_fst_weighted,
    mean_arg_fst_df$fst_branch,
    method = "pearson"
)

In [None]:
# Genotype-based Fst against ARG branch-based Fst, leaving out the regions
# that fall in putative inversions.
# regionIDs treated as lying inside putative inversions:
inversion_regions <- c(83:94, 97:111, 130:139)
mean_arg_fst_df %>%
    filter(!regionID %in% inversion_regions) %>%
    ggplot(aes(x = fst_branch, y = gt_fst_weighted)) +
    geom_point(aes(color = direction), size = 3) +
    geom_smooth(method = "lm", color = "blue") +
    labs(x = "ARG Branch Fst", y = "VCFtools GT Fst") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 18)
    )

In [None]:
# Correlation between genotype-based and ARG branch-based Fst, excluding
# regions in putative inversions.
# The inversion filter is applied once and both columns are taken from that
# single filtered table, so the two vectors cannot drift apart and the work is
# not repeated. The "data:" line of the printed test shows the column names.
mean_arg_fst_df %>%
    filter(!regionID %in% inversion_regions) %>%
    with(cor.test(gt_fst_weighted, fst_branch))

In [None]:
# Calculate mean windowed Fst from SFS in same regions as above ARG based estimates
#
# Args:
#   path: Tab-delimited file without a header row; its four columns are read as
#       Chr, Pos, num and denom (Fst numerator and denominator per position).
#   mean_arg_fst_df: Data frame of regions; the columns regionID, Chr, start
#       and end are used.
#
# Returns:
#   A list holding one single-row tibble per region located on a chromosome
#   present in the file, with columns num_sum, denom_sum, fst and regionID.
#   fst is NaN for a region that contains no positions.
#
# Side effect: prints the chromosome name(s) found in the file.
calculate_mean_sfs_fst <- function(path, mean_arg_fst_df){
    # Ratio-of-averages Fst for one region (arg_region_df is that region's row)
    get_region_fst <- function(arg_region_df, sfs_fst_df){
        region <- arg_region_df[["regionID"]]
        region_chr <- arg_region_df[["Chr"]]
        region_start <- arg_region_df[["start"]]
        region_end <- arg_region_df[["end"]]
        sfs_fst_df %>%
            # `.env` pins the bounds to the local variables so a column of the
            # same name could never shadow them. Matching on Chr as well keeps
            # positions from other chromosomes out if the file has several.
            filter(Chr == .env$region_chr,
                   Pos >= .env$region_start,
                   Pos <= .env$region_end) %>%

            # Cap numerators at 0 if negative
            # https://github.com/ANGSD/angsd/issues/309
            mutate(num = pmax(num, 0)) %>%

            # Estimate weighted Fst as ratio of averages
            # https://github.com/ANGSD/angsd/issues/61
            summarise(num_sum = sum(num),
                      denom_sum = sum(denom),
                      fst = num_sum / denom_sum) %>%
            mutate(regionID = region)
    }

    sfs_fst_df <- read_delim(path, delim = "\t",
                             col_names = c("Chr", "Pos", "num", "denom"))
    sfs_chrs <- unique(sfs_fst_df[["Chr"]])
    print(sfs_chrs)
    flush.console()
    mean_arg_fst_df %>%
        ungroup() %>%
        # %in% rather than ==: with more than one chromosome in the file, ==
        # would compare by silent vector recycling (and fail on an empty file)
        filter(Chr %in% .env$sfs_chrs) %>%
        group_split(regionID) %>%
        purrr::map(get_region_fst, sfs_fst_df = sfs_fst_df)
}

# Run the per-file SFS Fst calculation on every Snakemake "sfs_fst" input and
# bind all per-region results into a single data frame.
sfs_fst_df <- purrr::map_dfr(
    snakemake@input[["sfs_fst"]],
    calculate_mean_sfs_fst,
    mean_arg_fst_df = mean_arg_fst_df
)

In [None]:
# ARG branch-based Fst (y) against SFS-based Fst (x) per region, coloured by
# `direction`, with a linear-model fit overlaid.
mean_arg_fst_df %>%
    left_join(sfs_fst_df, by = "regionID") %>%
    ggplot(aes(x = fst, y = fst_branch)) +
        geom_point(aes(color = direction, fill = direction), size = 3) +
        geom_smooth(method = "lm", color = "blue") +
        labs(x = "SFS Fst", y = "ARG Branch Fst") +
        theme_classic() +
        theme(
            axis.text = element_text(size = 15),
            axis.title = element_text(size = 18)
        )

In [None]:
# ARG branch-based Fst against SFS-based Fst, leaving out the regions listed in
# inversion_regions (putative inversions).
mean_arg_fst_df %>%
    filter(!regionID %in% inversion_regions) %>%
    left_join(sfs_fst_df, by = "regionID") %>%
    ggplot(aes(x = fst, y = fst_branch)) +
        geom_point(aes(color = direction, fill = direction), size = 3) +
        geom_smooth(method = "lm", color = "blue") +
        labs(x = "SFS Fst", y = "ARG Branch Fst") +
        theme_classic() +
        theme(
            axis.text = element_text(size = 15),
            axis.title = element_text(size = 18)
        )

In [None]:
# Genotype-based (VCFtools) Fst (y) against SFS-based Fst (x) per region,
# coloured by `direction`, with a linear-model fit overlaid.
mean_arg_fst_df %>%
    left_join(sfs_fst_df, by = "regionID") %>%
    ggplot(aes(x = fst, y = gt_fst_weighted)) +
        geom_point(aes(color = direction, fill = direction), size = 3) +
        geom_smooth(method = "lm", color = "blue") +
        labs(x = "SFS Fst", y = "VCFtools Fst") +
        theme_classic() +
        theme(
            axis.text = element_text(size = 15),
            axis.title = element_text(size = 18)
        )

In [None]:
# Read every CSV listed under the Snakemake "win_fst" input and stack them into
# one data frame. Later cells use its regionID, win_id, n_sites, gt_fst and
# arg_fst columns.
win_fst_df <- purrr::map_dfr(snakemake@input[["win_fst"]], read_csv)

In [None]:
# Read every CSV listed under the Snakemake "nsites" input and stack them into
# one data frame. Later cells use its regionID, num_trees and num_sites columns.
num_sites_trees_df <- purrr::map_dfr(snakemake@input[["nsites"]], read_csv)

In [None]:
# Histogram of the num_trees / num_sites ratio over the rows of region 1. The
# axis labels present this ratio as recombination rate over mutation rate and
# the rows as iterations.
num_sites_trees_df %>%
    filter(regionID == 1) %>%
    mutate(r_mu_ratio = num_trees / num_sites) %>%
    ggplot(aes(x = r_mu_ratio)) +
        geom_histogram(bins = 50, color = "black", fill = "red") +
        labs(x = "Recombination rate to mutation rate ratio",
             y = "Number of iterations") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13)
        )

In [None]:
# Per-region mean of the row-wise num_trees / num_sites ratio
# (one row per regionID, columns regionID and r_mu_ratio).
mean_r_mu_ratio_df <- num_sites_trees_df %>%
    group_by(regionID) %>%
    summarise(r_mu_ratio = mean(num_trees / num_sites))

In [None]:
# Histogram of the per-region mean num_trees / num_sites ratio.
mean_r_mu_ratio_df %>%
    ggplot(aes(x = r_mu_ratio)) +
        geom_histogram(bins = 50, color = "black", fill = "red") +
        labs(x = "Mean recombination rate to mutation rate ratio",
             y = "Number of regions") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13)
        )

In [None]:
# Within each region, correlate genotype-based and ARG-based Fst across the
# rows of win_fst_df (rows with missing values are dropped), then show the
# distribution of those per-region correlations.
win_fst_df %>%
    group_by(regionID) %>%
    summarise(cor = cor(gt_fst, arg_fst, use = "complete.obs")) %>%
    ggplot(aes(x = cor)) +
        geom_histogram(bins = 50, color = "black", fill = "red") +
        labs(x = "ARG vs. GT Fst correlation", y = "Number of regions") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13)
        )

In [None]:
# List the regions whose per-region GT vs. ARG Fst correlation is essentially
# zero (strictly between -0.01 and 0.01), highest correlation first.
win_fst_df %>%
    group_by(regionID) %>%
    summarise(cor = cor(gt_fst, arg_fst, use = "complete.obs")) %>%
    filter(abs(cor) < 0.01) %>%
    arrange(desc(cor))

In [None]:
# Per-region GT vs. ARG Fst correlation plotted against the region's mean
# num_trees / num_sites ratio, with a linear-model fit overlaid.
win_fst_df %>%
    group_by(regionID) %>%
    summarise(cor = cor(gt_fst, arg_fst, use = "complete.obs")) %>%
    left_join(mean_r_mu_ratio_df, by = "regionID") %>%
    ggplot(aes(x = r_mu_ratio, y = cor)) +
        geom_point(size = 3) +
        geom_smooth(method = "lm", color = "blue") +
        labs(x = "Mean recombination rate to mutation rate ratio",
             y = "ARG vs. GT Fst correlation") +
        theme_classic() +
        theme(
            axis.text = element_text(size = 15),
            axis.title = element_text(size = 18)
        )

In [None]:
# Distribution of the per-region GT vs. ARG Fst correlation, restricted to the
# regions listed in inversion_regions. The correlation is computed within each
# region, so restricting to those regions first yields the same values.
win_fst_df %>%
    filter(regionID %in% inversion_regions) %>%
    group_by(regionID) %>%
    summarise(cor = cor(gt_fst, arg_fst, use = "complete.obs")) %>%
    ggplot(aes(x = cor)) +
        geom_histogram(bins = 20, color = "black", fill = "red") +
        labs(x = "ARG vs. GT Fst correlation", y = "Number of regions") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13)
        )

In [None]:
# Set the notebook plot size/resolution (repr options), then trace Fst along
# window ID for region 357: every column left after dropping n_sites and
# regionID (other than win_id) becomes one coloured line.
options(repr.plot.width = 12, repr.plot.height = 5, repr.plot.res = 100)
win_fst_df %>%
    filter(regionID == 357) %>%
    dplyr::select(-c(n_sites, regionID)) %>%
    pivot_longer(cols = -win_id, names_to = "type", values_to = "fst") %>%
    ggplot(aes(x = win_id, y = fst)) +
        geom_line(aes(color = type), linewidth = 1.5) +
        labs(x = "Window ID", y = "Fst") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13),
            legend.title = element_text(size = 13),
            legend.text = element_text(size = 11)
        )

In [None]:
# Trace Fst along window ID for region 234: every column left after dropping
# n_sites and regionID (other than win_id) becomes one coloured line.
win_fst_df %>%
    filter(regionID == 234) %>%
    dplyr::select(-c(n_sites, regionID)) %>%
    pivot_longer(cols = -win_id, names_to = "type", values_to = "fst") %>%
    ggplot(aes(x = win_id, y = fst)) +
        geom_line(aes(color = type), linewidth = 1.5) +
        labs(x = "Window ID", y = "Fst") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13),
            legend.title = element_text(size = 13),
            legend.text = element_text(size = 11)
        )

In [None]:
# Trace Fst along window ID for region 184: every column left after dropping
# n_sites and regionID (other than win_id) becomes one coloured line.
win_fst_df %>%
    filter(regionID == 184) %>%
    dplyr::select(-c(n_sites, regionID)) %>%
    pivot_longer(cols = -win_id, names_to = "type", values_to = "fst") %>%
    ggplot(aes(x = win_id, y = fst)) +
        geom_line(aes(color = type), linewidth = 1.5) +
        labs(x = "Window ID", y = "Fst") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 15),
            axis.text = element_text(size = 13),
            legend.title = element_text(size = 13),
            legend.text = element_text(size = 11)
        )