In [None]:
library(tidyverse)
library(ggVennDiagram)

In [None]:
#' Read one tab-delimited Fst table and tag its rows with the window size.
#'
#' The window size is taken from the file name: the run of digits that sits
#' between "win" and ".txt". It is kept as a character string.
#'
#' @param path Path to a tab-delimited Fst file.
#' @return The parsed table with a `win_size` column set by `mutate()`.
load_fst_data <- function(path) {
    win_size <- str_extract(basename(path), "(?<=win)\\d+(?=\\.txt)")
    # Column-type messages from the reader are silenced on purpose.
    fst_table <- suppressMessages(read_delim(path, delim = "\t"))
    mutate(fst_table, win_size = win_size)
}

# Load every Fst file listed under the Snakemake input "all_fsts", stack the
# tables row-wise, and make sure the window-size label is a character column.
all_fst_df <- mutate(
    purrr::map_dfr(snakemake@input[["all_fsts"]], load_fst_data),
    win_size = as.character(win_size)
)

In [None]:
# Preview the first rows of the combined table
all_fst_df %>% head()

In [None]:
# Clamp negative Fst estimates to zero.
# Each statistic is clamped against its OWN values. The previous version tested
# `arg_branch_fst < 0` for every column; because mutate() evaluates its
# arguments in order, that column was already non-negative by then, so the
# other three statistics were never clamped (and were set to NA wherever
# arg_branch_fst was NA).
# pmax() leaves missing values missing.
all_fst_df_mod <- all_fst_df %>%
    mutate(arg_branch_fst = pmax(arg_branch_fst, 0),
           arg_site_fst = pmax(arg_site_fst, 0),
           gt_hudson_fst = pmax(gt_hudson_fst, 0),
           sfs_hudson_fst = pmax(sfs_hudson_fst, 0))

In [None]:
# Windowed subset: rows labelled with window size "10000" that contain at
# least 20 SNPs. Each window gets a "chromosome:start-end" id and a centre
# coordinate placed 5000 past the window start.
all_fst_df_mod_windowed <- all_fst_df_mod %>%
    filter(win_size == "10000", no_snps >= 20) %>%
    mutate(
        id = paste0(chromosome, ":", window_pos_1, "-", window_pos_2),
        win_center = window_pos_1 + 5000
    )

# Per-site subset: rows labelled with window size "1", dropping rows that
# have no chromosome assigned.
all_fst_df_mod_persite <- all_fst_df_mod %>%
    filter(win_size == "1", !is.na(chromosome))

In [None]:
# 99th-percentile cut-off of each windowed Fst statistic (missing values
# ignored); each threshold is echoed right after it is computed.
arg_fst_quant <- quantile(all_fst_df_mod_windowed[["arg_branch_fst"]],
                          probs = 0.99, na.rm = TRUE)
arg_fst_quant

gt_fst_quant <- quantile(all_fst_df_mod_windowed[["gt_hudson_fst"]],
                         probs = 0.99, na.rm = TRUE)
gt_fst_quant

sfs_fst_quant <- quantile(all_fst_df_mod_windowed[["sfs_hudson_fst"]],
                          probs = 0.99, na.rm = TRUE)
sfs_fst_quant

In [None]:
# Ids of the windows at or above the 99th-percentile threshold, one vector per
# Fst statistic. Windows with a missing statistic are dropped by filter().
arg_outliers <- pull(
    filter(all_fst_df_mod_windowed, arg_branch_fst >= arg_fst_quant),
    id
)
gt_outliers <- pull(
    filter(all_fst_df_mod_windowed, gt_hudson_fst >= gt_fst_quant),
    id
)
sfs_outliers <- pull(
    filter(all_fst_df_mod_windowed, sfs_hudson_fst >= sfs_fst_quant),
    id
)

In [None]:
# Venn diagram of the outlier windows shared between the three statistics
outliers_list <- list(
    ARG = arg_outliers,
    GT = gt_outliers,
    SFS = sfs_outliers
)
fst_outlier_venn <- ggVennDiagram(outliers_list)
fst_outlier_venn

# Write the diagram to the Snakemake "venn" output as an 8 x 8 inch PDF
ggsave(
    filename = snakemake@output[["venn"]],
    plot = fst_outlier_venn,
    device = "pdf",
    width = 8,
    height = 8,
    units = "in",
    dpi = 300
)

In [None]:
# Per-chromosome x-axis offset: the running total of the largest window centre
# of all preceding chromosomes (0 for the first one), so that chromosomes are
# laid out end to end on a single axis.
data_cum <- all_fst_df_mod_windowed %>%
    group_by(chromosome) %>%
    summarise(chrom_extent = max(win_center)) %>%
    mutate(WinCenter_add = lag(cumsum(chrom_extent), default = 0)) %>%
    dplyr::select(-chrom_extent)

# Attach the offset and derive each window's genome-wide x coordinate
all_fst_df_mod_windowed_mod <- inner_join(
    all_fst_df_mod_windowed, data_cum,
    by = "chromosome"
) %>%
    mutate(WinCenter_cum = WinCenter_add + win_center)

# Axis tick position for each chromosome: the mean cumulative coordinate of
# its windows
axis_set <- all_fst_df_mod_windowed_mod %>%
    group_by(chromosome) %>%
    summarise(center = mean(WinCenter_cum))

# Label each window by how many of the three outlier sets contain its id.
# The membership count (0-3) is used, shifted by one, as an index into the
# label vector.
all_fst_df_mod_windowed_mod <- all_fst_df_mod_windowed_mod %>%
    mutate(outlier_category = c(
        "Not outlier",
        "One-way outlier",
        "Two-way outlier",
        "Three-way outlier"
    )[1 + (id %in% arg_outliers) + (id %in% gt_outliers) + (id %in% sfs_outliers)])

# Windows flagged by all three statistics; highlighted separately in the plot
outliers <- filter(all_fst_df_mod_windowed_mod,
                   outlier_category == "Three-way outlier")

# Manhattan-style plot of windowed branch-based Fst along the genome.
# Background layer: every window that is not a three-way outlier, coloured by
# subgenome so chromosomes alternate between black and grey along the x-axis.
# Foreground layer: three-way outliers drawn in orange on top.
# The dashed line marks the 99th-percentile threshold of arg_branch_fst.
outlier_manhat <- all_fst_df_mod_windowed_mod %>%
    filter(outlier_category != "Three-way outlier") %>%
    # Colour class is derived from the chromosome-name suffix instead of
    # enumerating every chromosome; names with neither suffix stay NA.
    mutate(chrom_cat = case_when(
        str_detect(chromosome, "_Occ$") ~ "One",
        str_detect(chromosome, "_Pall$") ~ "Two"
    )) %>%
    ggplot(aes(x = WinCenter_cum, y = arg_branch_fst)) +
        geom_point(shape = 21, size = 3, alpha = 0.4,
                   aes(fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = outliers, shape = 21, alpha = 1, size = 3,
                   color = "#f77f00", fill = "#f77f00") +
        geom_hline(yintercept = arg_fst_quant, color = "grey40",
                   linetype = "dashed") +
        # `labels` spelled out in full rather than relying on partial matching
        scale_x_continuous(labels = axis_set$chromosome,
                           breaks = axis_set$center) +
        scale_y_continuous(expand = c(0, 0)) +
        # Zoom without dropping data: points above 0.04 are outside the view
        coord_cartesian(ylim = c(0, 0.04)) +
        scale_fill_manual(values = c("One" = "black",
                                     "Two" = "grey40")) +
        scale_color_manual(values = c("One" = "black",
                                      "Two" = "grey40")) +
        ylab('Branch-based Fst from tskit (i.e. ARGs)') +
        xlab('Genomic position') +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13),
              legend.position = "none",
              legend.key.size = unit(1, 'cm'),
              legend.title = element_blank(),
              legend.text = element_text(size = 13))

# Wide figure for the notebook display
options(repr.plot.width = 20, repr.plot.height = 6)
outlier_manhat

# Save the same figure as a 20 x 6 inch PDF to the Snakemake output
ggsave(filename = snakemake@output[["outlier_manhat_windowed_threeway"]],
       plot = outlier_manhat,
       device = "pdf", width = 20, height = 6, units = "in", dpi = 300)

In [None]:
# 99th-percentile cut-off of per-site branch-based Fst (missing values ignored)
arg_fst_quant_persite <- quantile(all_fst_df_mod_persite[["arg_branch_fst"]],
                                  probs = 0.99, na.rm = TRUE)
arg_fst_quant_persite

In [None]:
# Per-chromosome x-axis offset for the per-site data: the running total of the
# largest position of all preceding chromosomes (0 for the first one).
data_cum <- all_fst_df_mod_persite %>%
    group_by(chromosome) %>%
    summarise(chrom_extent = max(window_pos_1)) %>%
    mutate(WinCenter_add = lag(cumsum(chrom_extent), default = 0)) %>%
    dplyr::select(-chrom_extent)

# Attach the offset and derive each site's genome-wide x coordinate
all_fst_df_mod_persite_mod <- inner_join(
    all_fst_df_mod_persite, data_cum,
    by = "chromosome"
) %>%
    mutate(WinCenter_cum = WinCenter_add + window_pos_1)

# Axis tick position for each chromosome: the mean cumulative coordinate of
# its sites
axis_set <- all_fst_df_mod_persite_mod %>%
    group_by(chromosome) %>%
    summarise(center = mean(WinCenter_cum))

# Flag sites at or above the per-site 99th-percentile threshold. A missing Fst
# value makes the comparison NA, which is treated as "Not outlier".
all_fst_df_mod_persite_mod <- all_fst_df_mod_persite_mod %>%
    mutate(outlier_category = if_else(
        coalesce(arg_branch_fst >= arg_fst_quant_persite, FALSE),
        "Outlier",
        "Not outlier"
    ))

# Only two labels exist, so the outliers are exactly the "Outlier" rows
outliers <- filter(all_fst_df_mod_persite_mod, outlier_category == "Outlier")

# Manhattan-style plot of per-site branch-based Fst along the genome.
# Background layer: non-outlier sites, coloured by subgenome so chromosomes
# alternate between black and grey along the x-axis.
# Foreground layer: sites at or above the 99th-percentile threshold, in orange.
# The dashed line marks that threshold.
outlier_manhat <- all_fst_df_mod_persite_mod %>%
    filter(outlier_category == "Not outlier") %>%
    # Colour class is derived from the chromosome-name suffix instead of
    # enumerating every chromosome; names with neither suffix stay NA.
    mutate(chrom_cat = case_when(
        str_detect(chromosome, "_Occ$") ~ "One",
        str_detect(chromosome, "_Pall$") ~ "Two"
    )) %>%
    ggplot(aes(x = WinCenter_cum, y = arg_branch_fst)) +
        geom_point(shape = 21, size = 1, alpha = 0.2,
                   aes(fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = outliers, shape = 21, alpha = 0.1, size = 1,
                   color = "#f77f00", fill = "#f77f00") +
        geom_hline(yintercept = arg_fst_quant_persite, color = "grey40",
                   linetype = "dashed") +
        # `labels` spelled out in full rather than relying on partial matching
        scale_x_continuous(labels = axis_set$chromosome,
                           breaks = axis_set$center) +
        scale_y_continuous(expand = c(0, 0), breaks = seq(0, 0.09, 0.01)) +
        # Zoom without dropping data: points above 0.09 are outside the view
        coord_cartesian(ylim = c(0, 0.09)) +
        scale_fill_manual(values = c("One" = "black",
                                     "Two" = "grey40")) +
        scale_color_manual(values = c("One" = "black",
                                      "Two" = "grey40")) +
        ylab('Branch-based Fst from tskit (i.e. ARGs)') +
        xlab('Genomic position') +
        theme_classic() +
        theme(axis.text = element_text(size = 13),
              axis.title = element_text(size = 15),
              strip.text.x = element_text(size = 13),
              legend.position = "none",
              legend.key.size = unit(1, 'cm'),
              legend.title = element_blank(),
              legend.text = element_text(size = 13))

# Wide figure for the notebook display
options(repr.plot.width = 20, repr.plot.height = 6)
outlier_manhat

# Save the same figure as a 20 x 6 inch PDF to the Snakemake output
ggsave(filename = snakemake@output[["outlier_manhat_persite"]],
       plot = outlier_manhat,
       device = "pdf", width = 20, height = 6, units = "in", dpi = 300)