# Setup

In [None]:
# Load required packages
library(tidyverse)
library(bedtoolsr)

In [None]:
# Load dataframes with windowed sweep statistics
# win_sfs_df <- read_delim(snakemake@input[['fst']], delim = '\t')
# win_xpnsl_df <- read_delim(snakemake@input[['xpnsl']], delim = '\t')

# # Load dataframes with windowed percentiles from permuted distributions
# urb_perc <- read_delim(snakemake@input[['urb_perc']], delim = '\t')
# rur_perc <- read_delim(snakemake@input[['rur_perc']], delim = '\t')

# # Load per-site ARG Fst values
# load_arg_stats <- function(path){
#     base <- basename(path)
#     chrom <- str_extract(base, "(^.*)(?=_region)")
#     df <- suppressMessages(read_csv(path)) %>% 
#         mutate(chromosome = chrom)
#     return(df)
# }
# arg_stats_df <- snakemake@input[["arg_stats"]] %>% purrr::map_dfr(load_arg_stats)

# # Load GFF
# gff <- ape::read.gff(snakemake@input[['gff']], GFF3 = TRUE) %>% 
#     dplyr::select(seqid, start, end, everything())

# Analysis of selective sweeps 

## XP-nSL

- I'll start by looking at genome-wide XP-nSL values, estimated in 50 kb windows
- In each window, I estimated the mean XP-nSL score and the proportion of scores that are either above 2 or less than -2.
- Positive outlier windows are those that are in the top 1% of the genome-wide mean XP-nSL distribution AND the top 1% of distribution of scores > 2. These windows represent positive selection in urban habitats
- Negative outlier windows are those that are in the bottom 1% of the genome-wide mean XP-nSL distribution AND the top 1% of distribution of scores < -2. These windows represent positive selection in rural habitats.
- I'll look at the top 10 windows under selection for urban and rural habitats separately.
- I'll merge outlier windows within 50Kb of one another into larger outlier regions

In [None]:
# Read the observed windowed XP-nSL table handed over by Snakemake
# (the delimiter is left for read_delim to detect) and preview it
obs_xpnsl_df <- snakemake@input[["win_xpnsl"]] %>% read_delim()
head(obs_xpnsl_df)

In [None]:
# Minimum number of sites a window must contain to be kept downstream
xpnsl_nsites_thresh <- 50

# Histogram of the per-window site count; the dashed red line marks the threshold
xpnsl_nSites_hist <- ggplot(obs_xpnsl_df, aes(x = n)) +
    geom_histogram(bins = 50, color = "black", fill = "white") +
    labs(x = "Number of sites", y = "Number of windows") +
    geom_vline(xintercept = xpnsl_nsites_thresh, color = "red", linetype = "dashed", linewidth = 1) +
    theme_classic() +
    theme(axis.title = element_text(size = 17),
          axis.text = element_text(size = 15))

xpnsl_nSites_hist
ggsave(filename = snakemake@output[["xpnsl_nSites_hist"]], plot = xpnsl_nSites_hist,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

### Filter and identify outlier windows

In [None]:
# Function to assign outliers to XP-nSL windows
#
# Args:
#   df:     Windowed XP-nSL data frame. The columns read here are `n`, `mean`,
#           `gt_frac` and `lt_frac`.
#   thresh: Windows with `n` below this value are dropped before any quantile
#           is computed. Defaults to the notebook-level `xpnsl_nsites_thresh`,
#           so existing one-argument calls behave as before.
#
# Returns:
#   The filtered data frame with five added columns:
#     xpnsl_score_outlier  - 1 if `mean` is at/below the 1st or at/above the
#                            99th percentile of the retained windows, else 0
#     xpnsl_gtprop_outlier - 1 if `gt_frac` is at/above its 99th percentile
#     xpnsl_ltprop_outlier - 1 if `lt_frac` is at/above its 99th percentile
#     direction            - 'Urban', 'Rural' or 'None'
#     prop_outlier         - `gt_frac` for 'Urban', `lt_frac` for 'Rural', NA otherwise
assign_xpnsl_outliers <- function(df, thresh = xpnsl_nsites_thresh){

    df_filt <- df %>%
        filter(n >= thresh)

    # Get critical values for mean XP-nSL score and proportions greater or lesser than 2 and -2, respectively
    xpnsl_score_quant <- quantile(df_filt %>% pull(mean), probs = c(0.01, 0.99))
    xpnsl_gtprop_quant <- quantile(df_filt %>% pull(gt_frac), probs = 0.99)
    xpnsl_ltprop_quant <- quantile(df_filt %>% pull(lt_frac), probs = 0.99)

    # Identify outliers and add as categorical variable to windows dataframe.
    # A window only gets a direction when it is a score outlier AND an outlier
    # for the matching proportion (gt_frac for positive means, lt_frac for negative).
    df_filt <- df_filt %>%
        mutate(xpnsl_score_outlier = ifelse(mean <= xpnsl_score_quant[1] | mean >= xpnsl_score_quant[2], 1, 0),
               xpnsl_gtprop_outlier = ifelse(gt_frac >= xpnsl_gtprop_quant, 1, 0),
               xpnsl_ltprop_outlier = ifelse(lt_frac >= xpnsl_ltprop_quant, 1, 0),
               direction = case_when(xpnsl_score_outlier == 1 & mean > 0 & xpnsl_gtprop_outlier == 1 ~ 'Urban',
                                     xpnsl_score_outlier == 1 & mean < 0 & xpnsl_ltprop_outlier == 1 ~ 'Rural',
                                     TRUE ~ 'None')) %>% 
        # NA_real_ keeps every case_when branch numeric, which older dplyr
        # releases with strict type checking require
        mutate(prop_outlier = case_when(direction == 'Urban' ~ gt_frac,
                                        direction == 'Rural' ~ lt_frac,
                                        TRUE ~ NA_real_))

    return(df_filt)
}

# Filter the observed windows and flag outliers
obs_xpnsl_df_filt <- obs_xpnsl_df %>% assign_xpnsl_outliers()

# Report how many windows survived the site-count filter
sprintf("%s of %s XP-nSL windows remaining after removing those with less than %s sites",
        nrow(obs_xpnsl_df_filt),
        nrow(obs_xpnsl_df),
        xpnsl_nsites_thresh)

head(obs_xpnsl_df_filt)

### Generate Manhattan plot

In [None]:
# Cumulative genome-wide x-axis: each chromosome is shifted by the summed
# maximum window centres of the chromosomes that precede it
data_cum <- obs_xpnsl_df_filt %>% 
    group_by(Chr) %>% 
    summarise(max_winCenter = max(winCenter)) %>% 
    transmute(Chr, winCenter_add = lag(cumsum(max_winCenter), default = 0))

obs_xpnsl_df_filt_mod <- inner_join(obs_xpnsl_df_filt, data_cum, by = "Chr") %>% 
    mutate(winCenter_cum = winCenter + winCenter_add)

# One x-axis tick per chromosome, placed at its mean cumulative window centre
axis_set <- obs_xpnsl_df_filt_mod %>% 
    group_by(Chr) %>% 
    summarize(center = mean(winCenter_cum))

# 1st and 99th percentiles of the windowed mean, drawn as dashed guides
xpnsl_score_quant <- quantile(obs_xpnsl_df_filt_mod$mean, probs = c(0.01, 0.99))

# Split windows into plotting layers: urban outliers, rural outliers, background
urban_xpnsl_outliers <- filter(obs_xpnsl_df_filt_mod, direction == 'Urban')
rural_xpnsl_outliers <- filter(obs_xpnsl_df_filt_mod, direction == 'Rural')

# Background windows get a two-level category so the two chromosome sets
# (*_Occ vs. *_Pall) are drawn in alternating greys; any other name stays NA
xpnsl_not_outlier <- filter(obs_xpnsl_df_filt_mod, direction == 'None') %>%
    mutate(chrom_cat = case_when(Chr %in% sprintf('Chr%02d_Occ', 1:8) ~ 'One',
                                 Chr %in% sprintf('Chr%02d_Pall', 1:8) ~ 'Two'))

xpnsl_manhat <- ggplot() +
    geom_point(data = xpnsl_not_outlier,
               mapping = aes(x = winCenter_cum, y = mean, fill = chrom_cat, color = chrom_cat),
               shape = 21, alpha = 0.4, size = 3) +
    geom_point(data = urban_xpnsl_outliers,
               mapping = aes(x = winCenter_cum, y = mean),
               shape = 21, alpha = 1, size = 3, color = '#003876', fill = '#003876') +
    geom_point(data = rural_xpnsl_outliers,
               mapping = aes(x = winCenter_cum, y = mean),
               shape = 21, alpha = 1, size = 3, color = '#007243', fill = '#007243') +
    geom_hline(yintercept = xpnsl_score_quant, color = "grey40", linetype = "dashed") +
    scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
    scale_y_continuous(expand = c(0,0)) +
    coord_cartesian(ylim = c(-6, 5)) +
    scale_fill_manual(values = c("black", "grey40")) + 
    scale_color_manual(values = c("black", "grey40")) + 
    labs(x = 'Chromosomes', y = 'Normalized XP-nSL', title = 'Urban-Rural XP-nSL') +
    theme_classic() +
    theme(
        legend.position = "none",
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20),
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 20, face = "bold")
    )

# Wide aspect ratio for the inline notebook rendering
options(repr.plot.width = 20, repr.plot.height = 6)
xpnsl_manhat
ggsave(filename = snakemake@output[["xpnsl_manhat"]], plot = xpnsl_manhat,
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Load one normalized per-site haplotype-statistic file and tag it with the
# chromosome (and, for single-population statistics, the habitat) parsed from
# the file name.
#
# Args:
#   path: Path to a tab-delimited file. The chromosome is taken from the first
#         match of "Chr<digits>_Occ" / "Chr<digits>_Pall" in the base name, the
#         habitat from the first match of "Urban" or "Rural".
#   stat: One of "ihh12", "xpnsl", "ihs" or "nsl".
#           - "ihh12", "xpnsl": the file's own header row is used
#           - "ihs", "nsl":     the file is read as header-less and the fixed
#                               column names below are applied
#
# Returns:
#   The file contents with a `Chr` column added, plus a `habitat` column for
#   every statistic except "xpnsl".
#
# Raises:
#   An error naming the accepted values when `stat` is not recognised
#   (previously this surfaced as "object 'df' not found").
load_hapstat_norm <- function(path, stat){
    known_stats <- c("ihh12", "xpnsl", "ihs", "nsl")
    if (!is.character(stat) || length(stat) != 1 || !(stat %in% known_stats)) {
        stop(sprintf("Unknown stat '%s'; expected one of: %s",
                     paste(format(stat), collapse = ", "),
                     paste(known_stats, collapse = ", ")),
             call. = FALSE)
    }

    chrom_name <- str_extract(basename(path), pattern = "Chr\\d+_((Occ)|(Pall))")
    habitat <- str_extract(basename(path), pattern = "(Urban|Rural)")

    # col_names = TRUE (read_delim's default) takes names from the header row
    cols <- switch(stat,
                   ihs = c("id", "pos", "1_freq", "ihh1", "ihh0", "iHs", "normihs", "crit"),
                   nsl = c("id", "pos", "1_freq", "sl1", "sl0", "nSL", "normnsl", "crit"),
                   TRUE)

    df <- suppressMessages(read_delim(path, delim = "\t", col_names = cols)) %>% 
        mutate(Chr = chrom_name)

    # XP-nSL is a cross-population statistic, so no habitat label is attached
    if (stat != "xpnsl") {
        df <- df %>% mutate(habitat = habitat)
    }
    return(df)
}

In [None]:
# Stack the per-chromosome normalized XP-nSL files, rename the score column
# (it arrives as `normxpehh`) and flag sites whose score lies outside [-2, 2]
xpnsl_persite <- purrr::map_dfr(snakemake@input[["norm_xpnsl"]], load_hapstat_norm, stat = "xpnsl") %>% 
    rename(normxpnsl = normxpehh) %>% 
    mutate(crit = ifelse(abs(normxpnsl) > 2, 1, 0))

In [None]:
# Cumulative genome-wide x-axis: each chromosome is shifted by the summed
# maximum positions of the chromosomes that precede it
data_cum <- xpnsl_persite %>% 
    group_by(Chr) %>% 
    summarise(max_pos = max(pos)) %>% 
    transmute(Chr, pos_add = lag(cumsum(max_pos), default = 0))

xpnsl_persite_mod <- inner_join(xpnsl_persite, data_cum, by = "Chr") %>% 
    mutate(pos_cum = pos + pos_add)

# One x-axis tick per chromosome, placed at its mean cumulative position
axis_set <- xpnsl_persite_mod %>% 
    group_by(Chr) %>% 
    summarize(center = mean(pos_cum))

# Split sites into plotting layers: critical sites with positive scores,
# critical sites with negative scores, and everything else
urban_xpnsl_outliers <- filter(xpnsl_persite_mod, crit == 1, normxpnsl > 0)
rural_xpnsl_outliers <- filter(xpnsl_persite_mod, crit == 1, normxpnsl < 0)

# Background sites get a two-level category so the two chromosome sets
# (*_Occ vs. *_Pall) are drawn in alternating greys; any other name stays NA
xpnsl_not_outlier <- filter(xpnsl_persite_mod, crit == 0) %>%
    mutate(chrom_cat = case_when(Chr %in% sprintf('Chr%02d_Occ', 1:8) ~ 'One',
                                 Chr %in% sprintf('Chr%02d_Pall', 1:8) ~ 'Two'))

persite_xpnsl_manhat <- ggplot() +
    geom_point(data = xpnsl_not_outlier,
               mapping = aes(x = pos_cum, y = normxpnsl, fill = chrom_cat, color = chrom_cat),
               shape = 21, alpha = 0.1, size = 0.25) +
    geom_point(data = urban_xpnsl_outliers,
               mapping = aes(x = pos_cum, y = normxpnsl),
               shape = 21, alpha = 0.1, size = 0.25, color = '#003876', fill = '#003876') +
    geom_point(data = rural_xpnsl_outliers,
               mapping = aes(x = pos_cum, y = normxpnsl),
               shape = 21, alpha = 0.1, size = 0.25, color = '#007243', fill = '#007243') +
    # Dashed guides at the +/-2 cut-offs used to define `crit`
    geom_hline(yintercept = 2, color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -2, color = "grey40", linetype = "dashed") +
    scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
    scale_y_continuous(expand = c(0,0), breaks = seq(-8, 6, 2)) +
    coord_cartesian(ylim = c(-9, 7)) +
    scale_fill_manual(values = c("black", "grey40")) + 
    scale_color_manual(values = c("black", "grey40")) + 
    labs(x = 'Chromosomes', y = 'Normalized XP-nSL', title = 'Per-Site Urban-Rural XP-nSL') +
    theme_classic() +
    theme(
        legend.position = "none",
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20),
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 20, face = "bold")
    )

# Wide aspect ratio for the inline notebook rendering
options(repr.plot.width = 20, repr.plot.height = 6)
persite_xpnsl_manhat
ggsave(filename = snakemake@output[["xpnsl_manhat_persite"]], plot = persite_xpnsl_manhat,
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

### Add ranks and percentiles from permuted distributions

#### Add ranks

In [None]:
# Function to add ranks to XP-nSL windows based on the proportion of outlier scores
#
# Args:
#   df: Data frame with a `prop_outlier` column.
#
# Returns:
#   `df` sorted by decreasing `prop_outlier` with an integer `xpnsl_rank`
#   column (1 = largest proportion). Rows with NA `prop_outlier` sort last.
add_xpnsl_ranks <- function(df){
        
    df_out <- df %>% 
        arrange(desc(prop_outlier)) %>% 
        # row_number() instead of 1:n(): the latter yields c(1, 0) on a
        # zero-row input and makes mutate() fail
        mutate(xpnsl_rank = row_number())
    return(df_out)
}

# Rank windows separately within each direction, then blank the ranks of
# windows that were not assigned to a direction
obs_xpnsl_df_filt <- purrr::map_dfr(group_split(obs_xpnsl_df_filt, direction), add_xpnsl_ranks) %>% 
    mutate(xpnsl_rank = ifelse(direction == 'None', NA, xpnsl_rank))

#### Correlation between observed and permuted XP-nSL across iterations

In [None]:
# Read every permuted windowed XP-nSL file, then filter and flag outliers in
# each file independently before stacking them
perm_xpnsl_df <- snakemake@input[["win_xpnsl_perm"]] %>% 
    purrr::map(~ suppressMessages(read_delim(.x, delim = '\t'))) %>% 
    purrr::map_dfr(assign_xpnsl_outliers)
head(perm_xpnsl_df)

In [None]:
# Pearson correlation between the windowed mean XP-nSL of one permutation
# iteration and the observed windowed means.
#
# Windows are paired by genomic coordinates rather than by row position:
# `obs_xpnsl_df_filt` has been re-sorted (by direction and `prop_outlier`) by
# the time this runs, so a positional comparison would pair unrelated windows.
#
# Args:
#   df:  Windows from a single permutation iteration; must contain `mean`.
#   obs: Observed windows to compare against; defaults to `obs_xpnsl_df_filt`.
#   NOTE(review): pairing assumes both tables carry `Chr`, `start` and `end`
#   columns describing the same windows - confirm for the permuted files.
#
# Returns:
#   One-row data.frame with a single column `cor`.
calc_cor <- function(df, obs = obs_xpnsl_df_filt){
    win_keys <- c("Chr", "start", "end")

    perm_means <- df %>% dplyr::select(all_of(win_keys), perm_mean = mean)
    obs_means <- obs %>% dplyr::select(all_of(win_keys), obs_mean = mean)

    # Keep only windows present in both tables so the vectors line up
    paired <- inner_join(perm_means, obs_means, by = win_keys)

    val <- cor(paired$perm_mean, paired$obs_mean, method = "pearson")
    return(data.frame(cor = val))
}
# Histogram of the per-iteration correlation between permuted and observed
# windowed XP-nSL (one value per level of `iter`)
cor_plot <- ggplot(data = purrr::map_dfr(group_split(perm_xpnsl_df, iter), calc_cor),
                   mapping = aes(x = cor)) +
    geom_histogram(bins = 50, color = "black", fill = "white") +
    labs(x = "Permuted vs. observed XP-nSL Pearson's correlation coefficient",
         y = "Number of iterations") +
    theme_classic() +
    theme(axis.text = element_text(size = 15),
          axis.title = element_text(size = 17))

options(repr.plot.width = 8, repr.plot.height = 8)
cor_plot
ggsave(filename = snakemake@output[["cor_plot"]], plot = cor_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

#### Permutations and observed density plots

In [None]:
# Windows assigned the 'Urban' direction in the permuted and observed data
urban_sel_permuted <- filter(perm_xpnsl_df, direction == "Urban")
urban_sel_observed <- filter(obs_xpnsl_df_filt, direction == "Urban")

# Overlaid densities of the windowed mean score: permuted vs. observed
urb_mean_plot <- ggplot() +
    geom_density(data = urban_sel_permuted, alpha = 0.5,
                 mapping = aes(x = mean, fill = "Permuted", color = "Permuted")) + 
    geom_density(data = urban_sel_observed, alpha = 0.5,
                 mapping = aes(x = mean, fill = "Observed", color = "Observed")) +
    labs(x = "Mean windowed XP-nSL score", y = "Density") +
    scale_fill_manual(name = "Distribution", values = c("Permuted" = "#0070ec", "Observed" = "#003876")) +
    scale_color_manual(name = "Distribution", values = c("Permuted" = "#0070ec", "Observed" = "#003876")) +
    theme_classic() +
    theme(axis.text = element_text(size = 15),
          axis.title = element_text(size = 17))

urb_mean_plot
ggsave(filename = snakemake@output[["urb_mean_plot"]], plot = urb_mean_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Overlaid densities of the per-window outlier proportion for 'Urban'
# windows: permuted vs. observed
urb_prop_plot <- ggplot() +
    geom_density(data = urban_sel_permuted, alpha = 0.5,
                 mapping = aes(x = prop_outlier, fill = "Permuted", color = "Permuted")) + 
    geom_density(data = urban_sel_observed, alpha = 0.5,
                 mapping = aes(x = prop_outlier, fill = "Observed", color = "Observed")) +
    labs(x = "Proportion of XP-nSL scores in window > 2", y = "Density") +
    scale_fill_manual(name = "Distribution", values = c("Permuted" = "#0070ec", "Observed" = "#003876")) +
    scale_color_manual(name = "Distribution", values = c("Permuted" = "#0070ec", "Observed" = "#003876")) +
    theme_classic() +
    theme(axis.text = element_text(size = 15),
          axis.title = element_text(size = 17))

urb_prop_plot
ggsave(filename = snakemake@output[["urb_prop_plot"]], plot = urb_prop_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Dataframe with percentile in permuted distribution of each observed XP-nSL window
#   mean_perm_perc: fraction of permuted 'Urban' windows whose mean is <= the observed mean
#   prop_perm_perc: fraction of permuted 'Urban' windows whose prop_outlier is <= the observed one
urban_percentile <- urban_sel_observed %>% 
    rowwise() %>% 
    mutate(mean_perm_perc = sum(mean >= urban_sel_permuted$mean) / nrow(urban_sel_permuted),
           prop_perm_perc = sum(prop_outlier >= urban_sel_permuted$prop_outlier) / nrow(urban_sel_permuted)) %>% 
    # Drop the row-wise grouping so the result is a plain table again and
    # later verbs do not silently operate one row at a time
    ungroup() %>% 
    arrange(desc(mean_perm_perc)) %>% 
    dplyr::select(Chr, winID, start, end, winCenter, mean_perm_perc, prop_perm_perc)

In [None]:
# Windows assigned the 'Rural' direction in the permuted and observed data
rural_sel_permuted <- filter(perm_xpnsl_df, direction == "Rural")
rural_sel_observed <- filter(obs_xpnsl_df_filt, direction == "Rural")

# Overlaid densities of the windowed mean score: permuted vs. observed
rur_mean_plot <- ggplot() +
    geom_density(data = rural_sel_permuted, alpha = 0.5,
                 mapping = aes(x = mean, fill = "Permuted", color = "Permuted")) + 
    geom_density(data = rural_sel_observed, alpha = 0.5,
                 mapping = aes(x = mean, fill = "Observed", color = "Observed")) +
    labs(x = "Mean windowed XP-nSL score", y = "Density") +
    scale_fill_manual(name = "Distribution", values = c("Permuted" = "#00e989", "Observed" = "#007243")) +
    scale_color_manual(name = "Distribution", values = c("Permuted" = "#00e989", "Observed" = "#007243")) +
    theme_classic() +
    theme(axis.text = element_text(size = 15),
          axis.title = element_text(size = 17))

rur_mean_plot
ggsave(filename = snakemake@output[["rur_mean_plot"]], plot = rur_mean_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Overlaid densities of the per-window outlier proportion for 'Rural'
# windows: permuted vs. observed
rur_prop_plot <- ggplot() +
    geom_density(data = rural_sel_permuted, alpha = 0.5,
                 mapping = aes(x = prop_outlier, fill = "Permuted", color = "Permuted")) + 
    geom_density(data = rural_sel_observed, alpha = 0.5,
                 mapping = aes(x = prop_outlier, fill = "Observed", color = "Observed")) +
    labs(x = "Proportion of XP-nSL scores < -2", y = "Density") +
    scale_fill_manual(name = "Distribution", values = c("Permuted" = "#00e989", "Observed" = "#007243")) +
    scale_color_manual(name = "Distribution", values = c("Permuted" = "#00e989", "Observed" = "#007243")) +
    theme_classic() +
    theme(axis.text = element_text(size = 15),
          axis.title = element_text(size = 17))

rur_prop_plot
ggsave(filename = snakemake@output[["rur_prop_plot"]], plot = rur_prop_plot,
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Dataframe with percentile in permuted distribution of each observed XP-nSL window
#   mean_perm_perc: fraction of permuted 'Rural' windows whose mean is >= the observed mean
#                   (the comparison is flipped relative to 'Urban' because rural means are negative)
#   prop_perm_perc: fraction of permuted 'Rural' windows whose prop_outlier is <= the observed one
rural_percentile <- rural_sel_observed %>% 
    rowwise() %>% 
    mutate(mean_perm_perc = sum(mean <= rural_sel_permuted$mean) / nrow(rural_sel_permuted),
           prop_perm_perc = sum(prop_outlier >= rural_sel_permuted$prop_outlier) / nrow(rural_sel_permuted)) %>% 
    # Drop the row-wise grouping so the result is a plain table again and
    # later verbs do not silently operate one row at a time
    ungroup() %>% 
    arrange(desc(mean_perm_perc)) %>% 
    dplyr::select(Chr, winID, start, end, winCenter, mean_perm_perc, prop_perm_perc)

In [None]:
# Attach the permutation percentiles to the observed windows; windows
# without a direction have no match and receive NA
obs_xpnsl_df_filt <- left_join(
    obs_xpnsl_df_filt,
    bind_rows(urban_percentile, rural_percentile),
    by = c("Chr", "winID", "start", "end", "winCenter")
)
head(obs_xpnsl_df_filt)

# Save the annotated windows as a tab-delimited table
obs_xpnsl_df_filt %>% write_delim(snakemake@output[["xpnsl_df"]], delim = '\t')

### Merge XP-nSL outlier windows and get top 10

In [None]:
# Function to merge consecutive outlier windows
#
# Args:
#   df:       Outlier windows from exactly one direction ('Urban' or 'Rural').
#             The aggregation below addresses columns by position (after Chr,
#             start, end are moved to the front), so the column order of `df`
#             must match what the earlier steps of this notebook produce.
#   win_dist: Maximum distance between windows for them to be merged, passed
#             to bt.merge as `d`.
#
# Returns:
#   One row per merged region with the summary columns named in `col_names`
#   plus `win_size` (end - start). Also prints the window counts before and
#   after merging.
merge_xpnsl_windows <- function(df, win_dist = 0){

    sel_dir <- df %>% pull(direction) %>% unique()
    # The column/operation lists below are direction-specific, so mixed or
    # unknown directions cannot be merged meaningfully
    if (length(sel_dir) != 1 || !(sel_dir %in% c('Urban', 'Rural'))) {
        stop("merge_xpnsl_windows() expects windows from exactly one direction ('Urban' or 'Rural')",
             call. = FALSE)
    }

    df_sorted <- df %>% 
        dplyr::select(Chr, start, end, everything()) %>% 
        arrange(Chr, start) %>% 
        # Round so the collapsed (comma-separated) values stay readable.
        # prop_perm_perc is rounded from itself; it was previously overwritten
        # with the rounded prop_outlier.
        mutate(prop_outlier = round(prop_outlier, 3),
               mean_perm_perc = round(mean_perm_perc, 3),
               prop_perm_perc = round(prop_perm_perc, 3))

    col_names <- c('Chr', 'start', 'end', 'mean_xpnsl', 'min_max', 'direction', 'mean_prop_outlier', 'all_prop_outlier', 'mean_percentile_permuted', 'prop_percentile_permuted','min_xpnsl_rank', 'all_xpnsl_ranks')
    # The two directions differ only in the second aggregate: column 7 with
    # `max` for Urban versus column 8 with `min` for Rural
    if(sel_dir == 'Urban'){
        cols <- c('6,7,17,18,18,20,21,19,19')
        operation <- c('mean,max,distinct,mean,collapse,collapse,collapse,min,collapse')
    }else{
        cols <- c('6,8,17,18,18,20,21,19,19')
        operation <- c('mean,min,distinct,mean,collapse,collapse,collapse,min,collapse')
    }
    
    df_merged  <- bt.merge(i = df_sorted, c = cols, o = operation, d = win_dist)
    names(df_merged) <- col_names
    df_merged <- df_merged %>% 
        mutate(win_size = end - start)

    print(sprintf('%s: There were %s XP-nSL outlier windows prior to merging. There are %s outlier regions after merging consecutive outlier windows', sel_dir, nrow(df_sorted), nrow(df_merged)))
    return(df_merged)
}

# Merge outlier windows separately for each direction and stack the results
win_xpnsl_df_filt_merged <- purrr::map_dfr(
    group_split(filter(obs_xpnsl_df_filt, direction != 'None'), direction),
    merge_xpnsl_windows
)

In [None]:
# Merged regions containing at least one window ranked in the top 10 of its
# direction, ordered by best rank within each direction
xpnsl_top10_urban_rural <- win_xpnsl_df_filt_merged %>% 
    group_by(direction) %>% 
    filter(min_xpnsl_rank <= 10) %>% 
    arrange(min_xpnsl_rank, .by_group = TRUE) %>% 
    ungroup() %>% 
    dplyr::select(Chr, start, end, win_size, direction,
                  mean_xpnsl, min_max,
                  all_prop_outlier, mean_prop_outlier,
                  mean_percentile_permuted, prop_percentile_permuted,
                  all_xpnsl_ranks, min_xpnsl_rank)
xpnsl_top10_urban_rural

## nSL

- I estimated nSL in the same 50 Kb windows as above, separately in urban and rural habitats
- I'll do the same thing as I did above, minus the permutations

In [None]:
# Read the tab-delimited windowed nSL table and preview it
nsl_df <- snakemake@input[["win_nsl"]] %>% read_delim(delim = '\t')
head(nsl_df)

In [None]:
# Minimum number of sites a single-population window must contain to be kept
nsl_nsites_thresh <- 25

# Histogram of the per-window site count; the dashed red line marks the threshold
nsl_nsites_hist <- ggplot(nsl_df, aes(x = n)) +
    geom_histogram(bins = 50, color = "black", fill = "white") +
    labs(x = "Number of sites", y = "Number of windows") +
    geom_vline(xintercept = nsl_nsites_thresh, color = "red", linetype = "dashed", linewidth = 1) +
    theme_classic() +
    theme(axis.title = element_text(size = 17),
          axis.text = element_text(size = 15))

nsl_nsites_hist

In [None]:
# Flag outlier windows for a single-population windowed statistic
# (used below for both nSL and iHS).
#
# Args:
#   df:     Windowed data frame; the columns read here are `n`, `mean` and `gt_frac`.
#   thresh: Windows with `n` below this value are dropped first.
#
# Returns:
#   The retained windows with three 0/1 columns added: `score_outlier`
#   (`mean` at/above its 99th percentile), `gtprop_outlier` (`gt_frac`
#   at/above its 99th percentile) and `is_outlier` (both of the above).
assign_nsl_outliers <- function(df, thresh = nsl_nsites_thresh){

    kept <- filter(df, n >= thresh)

    # 99th-percentile cut-offs, computed on the retained windows only
    mean_cutoff <- quantile(kept[["mean"]], probs = 0.99)
    frac_cutoff <- quantile(kept[["gt_frac"]], probs = 0.99)

    kept <- kept %>%
        mutate(score_outlier = ifelse(mean >= mean_cutoff, 1, 0),
               gtprop_outlier = ifelse(gt_frac >= frac_cutoff, 1, 0)) %>%
        mutate(is_outlier = ifelse(score_outlier == 1 & gtprop_outlier == 1, 1, 0))

    return(kept)
}

# Flag outliers within each habitat separately, then stack the habitats
nsl_df_filt <- purrr::map_dfr(group_split(nsl_df, habitat),
                              assign_nsl_outliers,
                              thresh = nsl_nsites_thresh)

# Report how many windows survived the site-count filter
sprintf("%s of %s nSL windows remaining after removing those with less than %s sites",
        nrow(nsl_df_filt),
        nrow(nsl_df),
        nsl_nsites_thresh)

head(nsl_df_filt)

In [None]:
# Manhattan plot of a windowed single-population haplotype statistic.
#
# Args:
#   df:   Windowed data frame with `habitat`, `Chr`, `winCenter`, `mean` and
#         `is_outlier` columns.
#   hab:  Habitat to plot; rows are kept where `habitat == hab`. 'Urban' is
#         drawn in blue, any other value in green.
#   stat: 'nSL' or 'iHS' select their own y-axis label, limits and breaks;
#         any other value uses the iHH12 settings. Also used in the title.
#
# Returns:
#   A ggplot object. Outlier windows are drawn in the habitat colour, the
#   remaining windows in alternating greys, and a dashed line marks the 99th
#   percentile of |mean| among the habitat's windows.
plot_single_population_hapstat_win_manhat <- function(df, hab, stat){

    hab_df <- filter(df, habitat == hab)

    # Offset per chromosome so windows lay out end-to-end on a single axis
    chrom_offsets <- hab_df %>% 
        group_by(Chr) %>% 
        summarise(max_winCenter = max(winCenter)) %>% 
        transmute(Chr, winCenter_add = lag(cumsum(max_winCenter), default = 0))

    hab_df_cum <- inner_join(hab_df, chrom_offsets, by = "Chr") %>% 
        mutate(winCenter_cum = winCenter + winCenter_add)

    # One tick per chromosome at its mean cumulative window centre
    chrom_ticks <- hab_df_cum %>% 
        group_by(Chr) %>% 
        summarize(center = mean(winCenter_cum))

    # Outlier layer, background layer and genome-wide critical value
    outlier_wins <- filter(hab_df_cum, is_outlier == 1)
    background_wins <- filter(hab_df_cum, is_outlier != 1) %>% 
        mutate(chrom_cat = case_when(Chr %in% sprintf('Chr%02d_Occ', 1:8) ~ 'One',
                                     Chr %in% sprintf('Chr%02d_Pall', 1:8) ~ 'Two'))
    crit_val <- quantile(abs(hab_df[["mean"]]), probs = c(0.99))

    point_col <- if (hab == 'Urban') '#003876' else '#007243'

    # Statistic-specific y-axis settings
    y_cfg <- if (stat == 'nSL') {
        list(label = '| nSL |', limits = c(0, 4), ticks = seq(0, 4, 1))
    } else if (stat == 'iHS') {
        list(label = '| iHS |', limits = c(0, 3.25), ticks = seq(0, 3, 1))
    } else {
        list(label = 'iHH12', limits = c(-0.5, 17), ticks = seq(0, 16, 2))
    }

    # Generate Manhattan plot
    manhat_plot <- ggplot(background_wins, aes(x = winCenter_cum, y = mean)) +
        geom_point(aes(fill = chrom_cat, color = chrom_cat), shape = 21, alpha = 0.4, size = 3) +
        geom_point(data = outlier_wins, shape = 21, alpha = 1, size = 3, color = point_col, fill = point_col) +
        geom_hline(yintercept = crit_val, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = chrom_ticks$Chr, breaks = chrom_ticks$center) +
        scale_y_continuous(expand = c(0,0), breaks = y_cfg$ticks) +
        coord_cartesian(ylim = y_cfg$limits) +
        scale_fill_manual(values = c("black", "grey40")) + 
        scale_color_manual(values = c("black", "grey40")) + 
        labs(x = '', y = y_cfg$label) +
        ggtitle(sprintf("%s Windowed %s", hab, stat)) +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size = 16),
            axis.title = element_text(size = 20),
            axis.text.x = element_text(angle = 45, hjust = 1),
            plot.title = element_text(size = 20, face = "bold")
        )
    return(manhat_plot)
}

In [None]:
# Windowed nSL Manhattan plot for rural populations (wide notebook rendering)
options(repr.plot.width = 20, repr.plot.height = 6)
rural_nsl_manhat <- nsl_df_filt %>% 
    plot_single_population_hapstat_win_manhat(hab = "Rural", stat = 'nSL')
rural_nsl_manhat
ggsave(filename = snakemake@output[["rur_nsl_manhat"]], plot = rural_nsl_manhat,
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Stack the per-file normalized nSL scores and flag sites whose score lies
# outside [-2, 2] (this replaces the `crit` column read from the files)
nsl_persite <- purrr::map_dfr(snakemake@input[["norm_nsl"]], load_hapstat_norm, stat = "nsl") %>% 
    mutate(crit = ifelse(abs(normnsl) > 2, 1, 0))

In [None]:
# Manhattan plot of a per-site single-population haplotype statistic.
#
# Args:
#   df:   Per-site data frame with `habitat`, `Chr`, `pos`, `crit` and the
#         normalized score column selected by `stat`.
#   hab:  Habitat to plot; rows are kept where `habitat == hab`. 'Urban' is
#         drawn in blue, any other value in green.
#   stat: 'nSL' plots `normnsl`, 'iHS' plots `normihs`, any other value plots
#         `normihh12`; each has its own y-axis limits and breaks. Also used
#         in the axis label and title.
#
# Returns:
#   A ggplot object. Sites with `crit == 1` are drawn in the habitat colour,
#   sites with `crit == 0` in alternating greys, with dashed guides at +/-2.
plot_single_population_hapstat_persite_manhat <- function(df, hab, stat){

    hab_df <- filter(df, habitat == hab)

    # Offset per chromosome so sites lay out end-to-end on a single axis
    chrom_offsets <- hab_df %>% 
        group_by(Chr) %>% 
        summarise(max_pos = max(pos)) %>% 
        transmute(Chr, pos_add = lag(cumsum(max_pos), default = 0))

    hab_df_cum <- inner_join(hab_df, chrom_offsets, by = "Chr") %>% 
        mutate(pos_cum = pos + pos_add)

    # One tick per chromosome at its mean cumulative position
    chrom_ticks <- hab_df_cum %>% 
        group_by(Chr) %>% 
        summarize(center = mean(pos_cum))

    # Critical sites form the coloured layer, the rest the grey background
    crit_sites <- filter(hab_df_cum, crit == 1)
    background_sites <- filter(hab_df_cum, crit == 0) %>% 
        mutate(chrom_cat = case_when(Chr %in% sprintf('Chr%02d_Occ', 1:8) ~ 'One',
                                     Chr %in% sprintf('Chr%02d_Pall', 1:8) ~ 'Two'))

    point_col <- if (hab == 'Urban') '#003876' else '#007243'

    # Statistic-specific score column and y-axis settings
    y_cfg <- if (stat == 'nSL') {
        list(column = "normnsl", limits = c(-6, 6), ticks = seq(-6, 6, 2))
    } else if (stat == 'iHS') {
        list(column = "normihs", limits = c(-8, 8), ticks = seq(-8, 8, 2))
    } else {
        list(column = "normihh12", limits = c(-0.5, 120), ticks = seq(0, 120, 20))
    }
    y_col <- y_cfg$column

    # Generate Manhattan plot
    manhat_plot <- ggplot(background_sites, aes(x = pos_cum, y = !!sym(y_col))) +
        geom_point(aes(fill = chrom_cat, color = chrom_cat), shape = 21, alpha = 0.1, size = 0.25) +
        geom_point(data = crit_sites, shape = 21, alpha = 0.1, size = 0.25, color = point_col, fill = point_col) +
        geom_hline(yintercept = 2, color = "grey40", linetype = "dashed") +
        geom_hline(yintercept = -2, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = chrom_ticks$Chr, breaks = chrom_ticks$center) +
        scale_y_continuous(expand = c(0,0), breaks = y_cfg$ticks) +
        coord_cartesian(ylim = y_cfg$limits) +
        scale_fill_manual(values = c("black", "grey40")) + 
        scale_color_manual(values = c("black", "grey40")) + 
        labs(x = '', y = sprintf('Normalized %s', stat)) +
        ggtitle(sprintf("%s Per-site %s", hab, stat)) +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size = 16),
            axis.title = element_text(size = 20),
            axis.text.x = element_text(angle = 45, hjust = 1),
            plot.title = element_text(size = 20, face = "bold")
        )
    return(manhat_plot)
}

In [None]:
# Per-site nSL Manhattan plot for rural populations (displayed only; the
# export below is intentionally disabled)
rural_nsl_persite_manhat <- nsl_persite %>% 
    plot_single_population_hapstat_persite_manhat(hab = "Rural", stat = "nSL")
rural_nsl_persite_manhat
# ggsave(filename = snakemake@output[["rur_nsl_manhat_persite"]], plot = rural_nsl_persite_manhat, 
#        height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Windowed nSL Manhattan plot for urban populations
urban_nsl_manhat <- nsl_df_filt %>% 
    plot_single_population_hapstat_win_manhat(hab = "Urban", stat = 'nSL')
urban_nsl_manhat
ggsave(filename = snakemake@output[["urb_nsl_manhat"]], plot = urban_nsl_manhat,
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Per-site nSL Manhattan plot for urban populations (displayed only; the
# export below is intentionally disabled)
urban_nsl_persite_manhat <- nsl_persite %>% 
    plot_single_population_hapstat_persite_manhat(hab = "Urban", stat = "nSL")
urban_nsl_persite_manhat
# ggsave(filename = snakemake@output[["urb_nsl_manhat_persite"]], plot = urban_nsl_persite_manhat, 
#        height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Export the nSL outlier windows as a tab-delimited table.
# write_delim() is used because write_csv() has no `delim` argument and
# rejects it as an unused argument.
nsl_df_filt %>% 
    filter(is_outlier == 1) %>% 
    write_delim(snakemake@output[["nsl_df"]], delim = "\t")

## iHS

In [None]:
# Read the tab-delimited windowed iHS table and preview the first rows
ihs_df <- read_delim(
    snakemake@input[["win_ihs"]],
    delim = "\t"
)
head(ihs_df)

In [None]:
# Histogram of the per-window site count (column `n`) for iHS windows.
# The dashed red line is drawn at nsl_nsites_thresh.
ihs_nsites_hist <- ggplot(ihs_df, aes(x = n)) +
    geom_histogram(bins = 50, color = "black", fill = "white") +
    labs(x = "Number of sites", y = "Number of windows") +
    geom_vline(
        xintercept = nsl_nsites_thresh,
        color = "red",
        linetype = "dashed",
        linewidth = 1
    ) +
    theme_classic() +
    theme(
        axis.title = element_text(size = 17),
        axis.text = element_text(size = 15)
    )

# Square canvas for the notebook display
options(repr.plot.width = 8, repr.plot.height = 8)
ihs_nsites_hist

In [None]:
# Apply assign_nsl_outliers() to each habitat separately and row-bind the
# results. NOTE(review): the helper presumably drops windows with fewer than
# `thresh` sites and adds the outlier flag -- confirm against its definition.
ihs_df_filt <- purrr::map_dfr(
    group_split(ihs_df, habitat),
    assign_nsl_outliers,
    thresh = nsl_nsites_thresh
)

# Report how many windows remain after filtering
sprintf(
    "%s of %s iHS windows remaining after removing those with less than %s sites",
    nrow(ihs_df_filt),
    nrow(ihs_df),
    nsl_nsites_thresh
)

head(ihs_df_filt)

In [None]:
# Wide canvas for genome-wide plots in the notebook
options(repr.plot.width = 20, repr.plot.height = 6)

# Windowed iHS plot for the "Rural" habitat: build, display, then save as PDF
rural_ihs_manhat <- plot_single_population_hapstat_win_manhat(
    ihs_df_filt,
    "Rural",
    stat = "iHS"
)
rural_ihs_manhat
ggsave(
    filename = snakemake@output[["rur_ihs_manhat"]],
    plot = rural_ihs_manhat,
    device = "pdf",
    width = 20,
    height = 8,
    units = "in",
    dpi = 600
)

In [None]:
# Windowed iHS plot for the "Urban" habitat: build, display, then save as PDF
urban_ihs_manhat <- plot_single_population_hapstat_win_manhat(
    ihs_df_filt,
    "Urban",
    stat = "iHS"
)
urban_ihs_manhat
ggsave(
    filename = snakemake@output[["urb_ihs_manhat"]],
    plot = urban_ihs_manhat,
    device = "pdf",
    width = 20,
    height = 8,
    units = "in",
    dpi = 600
)

In [None]:
# Load every normalized per-site iHS file with load_hapstat_norm() and row-bind
# them. `crit` is 1 where the normalized score lies outside [-2, 2], else 0.
ihs_persite <- purrr::map_dfr(
    snakemake@input[["norm_ihs"]],
    load_hapstat_norm,
    stat = "ihs"
) %>% 
    mutate(crit = ifelse(abs(normihs) > 2, 1, 0))

In [None]:
# Per-site iHS plot for the "Urban" habitat: build, display, then save as PDF
urban_ihs_persite_manhat <- plot_single_population_hapstat_persite_manhat(
    ihs_persite,
    "Urban",
    stat = "iHS"
)
urban_ihs_persite_manhat
ggsave(
    filename = snakemake@output[["urb_ihs_manhat_persite"]],
    plot = urban_ihs_persite_manhat,
    device = "pdf",
    width = 20,
    height = 8,
    units = "in",
    dpi = 600
)

In [None]:
# Per-site iHS plot for the "Rural" habitat: build, display, then save as PDF
rural_ihs_persite_manhat <- plot_single_population_hapstat_persite_manhat(
    ihs_persite,
    "Rural",
    stat = "iHS"
)
rural_ihs_persite_manhat
ggsave(
    filename = snakemake@output[["rur_ihs_manhat_persite"]],
    plot = rural_ihs_persite_manhat,
    device = "pdf",
    width = 20,
    height = 8,
    units = "in",
    dpi = 600
)

In [None]:
# Export the iHS windows flagged as outliers (is_outlier == 1) as a
# tab-delimited file. write_csv() has no `delim` argument, so write_delim()
# is required to get tab-separated output.
ihs_df_filt %>% 
    filter(is_outlier == 1) %>% 
    write_delim(snakemake@output[["ihs_df"]], delim = "\t")

## iHH12

In [None]:
# Read the tab-delimited windowed iHH12 table and preview the first rows
ihh12_df <- read_delim(
    snakemake@input[["win_ihh12"]],
    delim = "\t"
)
head(ihh12_df)

In [None]:
# Histogram of the per-window site count (column `n`) for iHH12 windows.
# The dashed red line is drawn at nsl_nsites_thresh.
ihh12_nsites_hist <- ggplot(ihh12_df, aes(x = n)) +
    geom_histogram(bins = 50, color = "black", fill = "white") +
    labs(x = "Number of sites", y = "Number of windows") +
    geom_vline(
        xintercept = nsl_nsites_thresh,
        color = "red",
        linetype = "dashed",
        linewidth = 1
    ) +
    theme_classic() +
    theme(
        axis.title = element_text(size = 17),
        axis.text = element_text(size = 15)
    )

# Square canvas for the notebook display
options(repr.plot.width = 8, repr.plot.height = 8)
ihh12_nsites_hist

In [None]:
# Apply assign_nsl_outliers() to each habitat separately and row-bind the
# results. NOTE(review): the helper presumably drops windows with fewer than
# `thresh` sites and adds the outlier flag -- confirm against its definition.
ihh12_df_filt <- purrr::map_dfr(
    group_split(ihh12_df, habitat),
    assign_nsl_outliers,
    thresh = nsl_nsites_thresh
)

# Report how many windows remain after filtering
sprintf(
    "%s of %s iHH12 windows remaining after removing those with less than %s sites",
    nrow(ihh12_df_filt),
    nrow(ihh12_df),
    nsl_nsites_thresh
)

head(ihh12_df_filt)

In [None]:
# Wide canvas for genome-wide plots in the notebook
options(repr.plot.width = 20, repr.plot.height = 6)

# Windowed iHH12 plot for the "Urban" habitat: build, display, then save as PDF
urban_ihh12_manhat <- plot_single_population_hapstat_win_manhat(
    ihh12_df_filt,
    "Urban",
    stat = "iHH12"
)
urban_ihh12_manhat
ggsave(
    filename = snakemake@output[["urb_ihh12_manhat"]],
    plot = urban_ihh12_manhat,
    device = "pdf",
    width = 20,
    height = 8,
    units = "in",
    dpi = 600
)

In [None]:
# Windowed iHH12 plot for the "Rural" habitat: build, display, then save as PDF
rural_ihh12_manhat <- plot_single_population_hapstat_win_manhat(
    ihh12_df_filt,
    "Rural",
    stat = "iHH12"
)
rural_ihh12_manhat
ggsave(
    filename = snakemake@output[["rur_ihh12_manhat"]],
    plot = rural_ihh12_manhat,
    device = "pdf",
    width = 20,
    height = 8,
    units = "in",
    dpi = 600
)

In [None]:
# Load every normalized per-site iHH12 file with load_hapstat_norm() and
# row-bind them. `crit` is 1 where the normalized score lies outside [-2, 2],
# else 0.
ihh12_persite <- purrr::map_dfr(
    snakemake@input[["norm_ihh12"]],
    load_hapstat_norm,
    stat = "ihh12"
) %>% 
    mutate(crit = ifelse(abs(normihh12) > 2, 1, 0))

In [None]:
# Per-site iHH12 plot for the "Urban" habitat; shown inline only
urban_ihh12_persite_manhat <- plot_single_population_hapstat_persite_manhat(
    ihh12_persite,
    "Urban",
    stat = "iHH12"
)
urban_ihh12_persite_manhat
# Writing the figure to disk is currently disabled:
# ggsave(filename = snakemake@output[["urb_ihh12_manhat_persite"]], plot = urban_ihh12_persite_manhat, 
#        height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Per-site iHH12 plot for the "Rural" habitat; shown inline only
rural_ihh12_persite_manhat <- plot_single_population_hapstat_persite_manhat(
    ihh12_persite,
    "Rural",
    stat = "iHH12"
)
rural_ihh12_persite_manhat
# Writing the figure to disk is currently disabled:
# ggsave(filename = snakemake@output[["rur_ihh12_manhat_persite"]], plot = rural_ihh12_persite_manhat, 
#        height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Export the iHH12 windows flagged as outliers (is_outlier == 1) as a
# tab-delimited file. write_csv() has no `delim` argument, so write_delim()
# is required to get tab-separated output.
ihh12_df_filt %>% 
    filter(is_outlier == 1) %>% 
    write_delim(snakemake@output[["ihh12_df"]], delim = "\t")

## Fst

- I estimated Fst and the urban-rural difference in pi and Tajima's D in 50 kb windows across the genome
- I'll consider windows in the top 1% of the genome-wide empirical Fst distribution as outliers, and treat an outlier as better supported if the same window is also in the top 1% of the genome-wide distributions of the urban-rural difference in pi and in Tajima's D
- I'll look at overlap between the Fst and XP-nSL outlier windows

In [None]:
# Number of windows for every combination of the three outlier flags
summarise(
    group_by(win_sfs_df, fst_outlier, tp_outlier, td_outlier),
    n = n()
)

In [None]:
# Number of windows with all_outlier == 1, split by direction
win_sfs_df %>% 
    filter(all_outlier == 1) %>% 
    group_by(direction) %>% 
    summarise(n = n())

In [None]:
# Display the windowed SFS-statistics table for inspection
win_sfs_df

In [None]:
# Reduce a table to its genomic coordinate columns.
#
# df: a data frame with at least the columns Chr, start and end.
# Returns `df` restricted to Chr, start, end (in that order).
get_positions <- function(df){
    dplyr::select(df, Chr, start, end)
}

# Coordinates of the top XP-nSL regions
xpnsl_top10_regions_pos <- xpnsl_top10_urban_rural %>% 
    get_positions()

# Coordinates of windows flagged fst_outlier == 1
fst_outliers_windows_pos <- win_sfs_df %>% 
    filter(fst_outlier == 1) %>% 
    get_positions()

# Coordinates of windows flagged all_outlier == 1
fst_pi_td_outliers_windows_pos <- win_sfs_df %>% 
    filter(all_outlier == 1) %>% 
    get_positions()

In [None]:
# Count Fst outlier windows overlapping each XP-nSL region.
#
# xpnsl_pos: three-column table (Chr, start, end) of XP-nSL regions.
# fst_pos:   three-column table (Chr, start, end) of Fst outlier windows.
# Returns one row per XP-nSL region with columns Chr, xpnsl_win_start,
# xpnsl_win_end and num_fst_outliers.
get_overlapping_fst_windows <- function(xpnsl_pos, fst_pos){
    
    # `c = TRUE` requests a per-region overlap count, appended as the last
    # column. Spelled out as TRUE because `T` is an ordinary variable that
    # can be reassigned.
    df_out <- bt.intersect(xpnsl_pos, fst_pos, c = TRUE)
    names(df_out) <- c('Chr', 'xpnsl_win_start', 'xpnsl_win_end', 'num_fst_outliers')
    return(df_out)
} 

get_overlapping_fst_windows(xpnsl_top10_regions_pos, fst_outliers_windows_pos)

In [None]:
# For each top XP-nSL region, count the overlapping windows with
# fst_outlier == 1. `c = TRUE` appends the count as a fourth column; TRUE is
# spelled out because `T` can be reassigned.
xpnsl_fst_overlap <- bt.intersect(xpnsl_top10_urban_rural %>% dplyr::select(Chr, start, end), 
                                  win_sfs_df %>% dplyr::select(-chrom_pos) %>% filter(fst_outlier == 1), 
                                  c = TRUE)
names(xpnsl_fst_overlap) <- c('Chr', 'start', 'end', 'num_fst_outliers')

# Attach the counts to the region table by coordinates
xpnsl_top10_urban_rural <- left_join(xpnsl_top10_urban_rural, xpnsl_fst_overlap, by = c('Chr', 'start', 'end'))
xpnsl_top10_urban_rural

In [None]:
# For each top XP-nSL region, count the overlapping windows that have
# all_outlier == 1 and the same `direction` as the region. Urban and rural
# regions are handled separately so only same-direction windows are counted.
# `c = TRUE` appends the count column; TRUE is spelled out because `T` can be
# reassigned.
urban_tmp <- bt.intersect(xpnsl_top10_urban_rural %>% filter(direction == 'Urban sel') %>% dplyr::select(Chr, start, end), 
                                        win_sfs_df %>% dplyr::select(-chrom_pos) %>% filter(all_outlier == 1 & direction == 'Urban sel'), 
                                        c = TRUE)
rural_tmp <- bt.intersect(xpnsl_top10_urban_rural %>% filter(direction == 'Rural sel') %>% dplyr::select(Chr, start, end), 
                                        win_sfs_df %>% dplyr::select(-chrom_pos) %>% filter(all_outlier == 1 & direction == 'Rural sel'), 
                                        c = TRUE)
xpnsl_fst_pi_td_overlap <- bind_rows(urban_tmp, rural_tmp)
names(xpnsl_fst_pi_td_overlap) <- c('Chr', 'start', 'end', 'num_fst_pi_td_outliers')

# Attach the counts to the region table and order it for display
xpnsl_top10_urban_rural <- left_join(xpnsl_top10_urban_rural, xpnsl_fst_pi_td_overlap, by = c('Chr', 'start', 'end')) %>% 
    arrange(direction, Chr, start)
xpnsl_top10_urban_rural

## ARG Fst

- Here, I'll look at the top genome-wide per-site Fst values estimated from the ARGs
- I'll check whether any of the signals overlap any of the windows above. Having ARGs in these regions could open up some interesting analyses

In [None]:
# ARG sites with arg_branch_fst >= 0.05, built once so that the table passed
# to bedtools and the column names assigned afterwards cannot drift apart.
# `id` is "<chromosome>_region<regionID>".
arg_fst_outlier_sites <- arg_stats_df %>% 
    filter(arg_branch_fst >= 0.05) %>% 
    mutate(id = paste0(chromosome, "_region", regionID)) %>% 
    dplyr::select(chromosome, arg_win_start, arg_win_end, regionID, id, contains("branch")) %>% 
    rename("arg_chr" = "chromosome", "arg_start" = "arg_win_start", "arg_end" = "arg_win_end")
arg_df_names <- names(arg_fst_outlier_sites)

# Merged XP-nSL outlier regions, likewise built once
xpnsl_outlier_regions <- win_xpnsl_df_outliers_with_ranks_merged %>% 
    dplyr::select(Chr, start, end, direction, all_xpnsl_ranks)
xpnsl_df_names <- xpnsl_outlier_regions %>% 
    rename("xpnsl_chr" = "Chr", "xpnsl_start" = "start", "xpnsl_end" = "end") %>% 
    names()

# bt.intersect() returns generic V1..Vn headers; with wa/wb the XP-nSL columns
# come first, followed by the ARG columns.
all_names <- c(xpnsl_df_names, arg_df_names)

# The intersection depends on column order only, not on column names.
# TRUE is spelled out because `T` can be reassigned.
xpnsl_with_arg_outlier <- bt.intersect(xpnsl_outlier_regions, 
                                       arg_fst_outlier_sites, 
                                       wa = TRUE, wb = TRUE)
names(xpnsl_with_arg_outlier) <- all_names
xpnsl_with_arg_outlier

In [None]:
# Save the XP-nSL regions that contain ARG Fst outlier sites as CSV
xpnsl_with_arg_outlier %>% 
    write_csv(snakemake@output[["xpnsl_arg_out"]])

# Create output tables

## Dataframe with gene ID in top 10 selected regions

In [None]:
# Get gene names dataframe
gene_names <- gff %>% 
    filter(type == 'gene') %>% 
    mutate(gene = str_extract(attributes, pattern = '(?<=gene=)\\w+(?=;)'),
           gene_id = str_extract(attributes, pattern = '(?<=ID\\=)ACLI19_g\\d+(?=;)')) %>% 
    dplyr::select(seqid, start, end, gene_id, gene)

# Get gene products and GO annotations dataframe
prod_go_annot <- gff %>% 
    filter(type == 'mRNA') %>% 
    mutate(id = str_extract(attributes, pattern = '(?<=ID\\=)ACLI19_g\\d+\\.t\\d+(?=;)'),
           func = str_extract(attributes, pattern = '(?<=product=)[^;]*'),
           go = str_extract_all(attributes, pattern = 'GO:\\d+(?=(,|;))')) %>% 
    separate(id, into = c('gene_id', 'trans'), sep = '\\.') %>% 
    filter(trans == 't1') %>% 
    dplyr::select(seqid, gene_id, func, go)

# Combine genes, functions, and GO annotations into single dataframe
genes_prods_go_df <- left_join(gene_names, prod_go_annot, by = c('seqid', 'gene_id')) %>% 
    rename('Chr' = 'seqid')

In [None]:
# Display the top XP-nSL regions table before annotating it with genes
xpnsl_top10_urban_rural

In [None]:
# Intersect the top XP-nSL regions with the gene annotations, giving one row
# per region-gene overlap. Output columns V1-V4 are the region columns (Chr,
# start, end, direction); V5-V7 are the gene coordinates (dropped); V8-V10 are
# gene_id, gene symbol and product.
# TRUE is spelled out because `T` can be reassigned.
xpnsl_top10_urban_rural_with_genes_long <- bt.intersect(xpnsl_top10_urban_rural %>% 
                                                            dplyr::select(Chr, start, end, direction), 
                                                        genes_prods_go_df %>% 
                                                            dplyr::select(-go),
                                                        wa = TRUE, wb = TRUE) %>% 
    dplyr::select(V1, V2, V3, V4, V8, V9, V10)
names(xpnsl_top10_urban_rural_with_genes_long) <- c('Chr', 'start', 'end', 'direction', 'gene_id', 'gene_symbol', 'product')
xpnsl_top10_urban_rural_with_genes_long

In [None]:
# Save the long region-by-gene table for the top ten selected urban and rural
# regions as a tab-delimited file
xpnsl_top10_urban_rural_with_genes_long %>% 
    write_delim(snakemake@output[['top_ten_genes']], delim = '\t')

## Table with gene symbols, products, and Fst for top selected regions

In [None]:
# Concatenate gene symbols for top 10 selected regions
symbols <- xpnsl_top10_urban_rural_with_genes_long %>% 
    dplyr::select(-gene_id, -product) %>% 
    filter(!is.na(gene_symbol)) %>% 
    group_by(Chr, start, end, direction) %>% 
    summarise(gene_symbols = toString(gene_symbol))

# Concatenate products for top 10 selected regions
prods <- xpnsl_top10_urban_rural_with_genes_long %>% 
    dplyr::select(-gene_id, -gene_symbol) %>% 
    filter(product != 'hypothetical protein') %>% 
    group_by(Chr, start, end, direction) %>% 
    summarise(products = toString(product))

# Add symbols and products to table
xpnsl_top10_urban_rural_with_genes <- xpnsl_top10_urban_rural %>% 
    left_join(., symbols) %>% 
    left_join(., prods)

xpnsl_top10_urban_rural_with_genes

In [None]:
# Write table with gene symbols, products, and Fst overlaps for the top ten
# selected urban and rural regions (tab-delimited)
xpnsl_top10_urban_rural_with_genes %>% 
    write_delim(snakemake@output[['top_ten_tbl']], delim = '\t')

## Dataframe with gene IDs in all selected regions based on XP-nSL

In [None]:
# All XP-nSL windows classified as outliers in either direction
xpnsl_outliers <- win_xpnsl_df %>% 
    filter(direction != 'Not outlier') %>% 
    dplyr::select(Chr, start, end, direction)

# Intersect outlier windows with genes. V1-V4 are the window columns, V5-V7
# the gene coordinates (dropped), and V8 the gene_id.
# TRUE is spelled out because `T` can be reassigned.
xpnsl_outliers_with_genes <- bt.intersect(xpnsl_outliers, gene_names, wa = TRUE, wb = TRUE) %>% 
    dplyr::select(V1, V2, V3, V4, V8)
names(xpnsl_outliers_with_genes) <- c('Chr', 'start', 'end', 'direction', 'gene_id')
xpnsl_outliers_with_genes

In [None]:
# Save the gene IDs found in all XP-nSL outlier windows (tab-delimited)
xpnsl_outliers_with_genes %>% 
    write_delim(snakemake@output[['all_xpnsl_sel']], delim = '\t')

## Dataframe with genes overlapping ARG Fst outliers

In [None]:
# Genes overlapping the XP-nSL regions that contain ARG Fst outliers (display
# only). The first five columns of xpnsl_with_arg_outlier are the XP-nSL
# region columns. TRUE is spelled out because `T` can be reassigned.
bt.intersect(xpnsl_with_arg_outlier %>% dplyr::select(1:5),
             genes_prods_go_df %>% dplyr::select(-go),
             wa = TRUE, wb = TRUE)

In [None]:
# arg_stats_df %>% 
#     filter(arg_branch_fst >= 0.05) %>% 
#     mutate(id = paste0(chromosome, "_region", regionID)) %>% 
#     dplyr::select(chromosome, arg_win_start, arg_win_end, regionID, id, contains("branch")) %>% 
#     rename("Chr" = "chromosome", "start" = "arg_win_start", "end" = "arg_win_end") %>% 
#     group_by(Chr, start, end) %>% 
#     summarise(gene_symbols = toString(id))

# Genes that overlap at least one ARG site with arg_branch_fst >= 0.05.
# Column layout of the intersect output:
#   V1-V6  : gene table without `go` (Chr, start, end, gene_id, gene, func)
#   V7-V10 : ARG chromosome, window start, window end, regionID
#   V11    : ARG site id ("<chromosome>_region<regionID>:<arg_win_start>")
# All overlapping ARG site ids are collapsed into one comma-separated string
# per gene. TRUE is spelled out because `T` can be reassigned.
genes_with_arg_outliers <- bt.intersect(genes_prods_go_df %>% dplyr::select(-go),
             arg_stats_df %>% 
                 filter(arg_branch_fst >= 0.05) %>% 
                 mutate(id = paste0(chromosome, "_region", regionID, ":", arg_win_start)) %>% 
                 dplyr::select(chromosome, arg_win_start, arg_win_end, regionID, id, contains("branch")),
             wa = TRUE, wb = TRUE)  %>% 
    dplyr::select(V1, V2, V3, V4, V5, V6, V11) %>% 
    group_by(V1, V2, V3, V4, V5, V6) %>% 
    summarise(arg_sites = toString(V11), .groups = "drop")

# Seven columns need seven names; "gene_id" (V4) was previously missing, which
# shifted every later label by one.
names(genes_with_arg_outliers) <- c("Chr", "start", "end", "gene_id", "gene_symbol", "product", "arg_sites")
genes_with_arg_outliers

In [None]:
# Save the genes overlapping ARG Fst outlier sites as CSV
genes_with_arg_outliers %>% 
    write_csv(snakemake@output[["arg_gene"]])