# Setup

In [None]:
# Load required packages
library(tidyverse)
library(bedtoolsr)
library(vcfR)
library(GenotypePlot)
library(factoextra)

In [None]:
# ggplot theme that makes panel, plot and legend backgrounds transparent and removes grid lines
transp_theme <- local({
    # Panel and plot backgrounds: transparent fill with no outline colour
    no_outline <- element_rect(fill = "transparent", colour = NA_character_)
    # Legend components: transparent fill only
    see_through <- element_rect(fill = "transparent")
    theme(panel.background = no_outline,
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          plot.background = no_outline,
          legend.background = see_through,
          legend.box.background = see_through,
          legend.key = see_through)
})

# Analysis of selective sweeps 

## XP-nSL

In [None]:
# Recode the `Chr` column from subgenome-specific names to numeric codes 1-16.
# Codes follow the order Chr01_Occ = 1, Chr01_Pall = 2, ..., Chr08_Occ = 15, Chr08_Pall = 16.
# Any value of `Chr` that is not one of these 16 names becomes NA.
remap_chr_names <- function(df){
    # The position of a name in this vector is the numeric code it is mapped to
    chr_names <- sprintf("Chr%02d_%s", rep(1:8, each = 2), rep(c("Occ", "Pall"), times = 8))
    df %>% 
        mutate(Chr = as.numeric(match(Chr, chr_names)))
}

In [None]:
# Load observed windowed XP-nSL data for each comparison and recode chromosome names
# ur = Urban vs. Rural, ur_outRem = Urban vs. Rural (outlier removed),
# sr = Suburban vs. Rural, us = Urban vs. Suburban
obs_xpnsl_ur_df <- remap_chr_names(read_delim(snakemake@input[["win_xpnsl_ur"]]))
obs_xpnsl_ur_outRem_df <- remap_chr_names(read_delim(snakemake@input[["win_xpnsl_ur_outRem"]]))
obs_xpnsl_sr_df <- remap_chr_names(read_delim(snakemake@input[["win_xpnsl_sr"]]))
obs_xpnsl_us_df <- remap_chr_names(read_delim(snakemake@input[["win_xpnsl_us"]]))

In [None]:
# Distribution of the number of sites per window
# Windows with fewer sites than this threshold are removed before outliers are assigned
xpnsl_nsites_thresh <- 50
xpnsl_nSites_hist <- ggplot(obs_xpnsl_ur_df, aes(x = n)) +
    geom_histogram(bins = 50, color = "black", fill = "grey") +
    # Dashed red line marks the filtering threshold
    geom_vline(xintercept = xpnsl_nsites_thresh, color = "red", linetype = "dashed", linewidth = 1) +
    labs(x = "Number of sites", y = "Number of windows") +
    theme_classic() +
    theme(axis.title = element_text(size = 17),
          axis.text = element_text(size = 15))

xpnsl_nSites_hist
ggsave(filename = snakemake@output[["xpnsl_nSites_hist"]], plot = xpnsl_nSites_hist, 
       device = "pdf", width = 8, height = 8, units = "in", dpi = 600)

### Filter and identify outlier windows

In [None]:
# Function to assign outliers to XP-nSL windows
#
# Arguments:
#   df: windowed XP-nSL dataframe with columns `n` (number of sites in the window),
#       `mean` (mean XP-nSL score), and `gt_frac` / `lt_frac` (proportions of scores
#       greater than 2 and less than -2, respectively)
#   comp: comparison, one of "Urban_Rural", "Suburban_Rural" or "Urban_Suburban".
#       The first habitat labels positive-score outliers, the second labels negative ones
#   nsites_thresh: minimum number of sites required to keep a window. Defaults to the
#       global `xpnsl_nsites_thresh`, so existing calls behave as before
#
# Returns the windows with `n >= nsites_thresh` plus the columns `xpnsl_score_outlier`,
# `xpnsl_gtprop_outlier`, `xpnsl_ltprop_outlier` (1/0 flags), `direction` (habitat name
# or 'None') and `prop_outlier` (gt_frac or lt_frac for outliers, NA otherwise).
assign_xpnsl_outliers <- function(df, comp, nsites_thresh = xpnsl_nsites_thresh){

    # Habitat labelling positive (first) and negative (second) scores for each comparison
    comp_dirs <- list(Urban_Rural = c("Urban", "Rural"),
                      Suburban_Rural = c("Suburban", "Rural"),
                      Urban_Suburban = c("Urban", "Suburban"))
    # Fail early with a clear message instead of a later "object 'pos_dir' not found"
    if(!is.character(comp) || length(comp) != 1 || !(comp %in% names(comp_dirs))){
        stop(sprintf("Unknown comparison '%s'. `comp` must be one of: %s",
                     paste(comp, collapse = ", "),
                     paste(names(comp_dirs), collapse = ", ")),
             call. = FALSE)
    }
    pos_dir <- comp_dirs[[comp]][1]
    neg_dir <- comp_dirs[[comp]][2]

    df_filt <- df %>%
        filter(n >= nsites_thresh)

    # Get critical values for mean XP-nSL score and proportions greater or lesser than 2 and -2, respectively
    xpnsl_score_quant <- quantile(df_filt %>% pull(mean), probs = c(0.01, 0.99))
    xpnsl_gtprop_quant <- quantile(df_filt %>% pull(gt_frac), probs = 0.99)
    xpnsl_ltprop_quant <- quantile(df_filt %>% pull(lt_frac), probs = 0.99)

    # Identify outliers and add as categorical variable to windows dataframe.
    # A window gets a direction only if it is an outlier for BOTH the mean score and
    # the matching proportion (gt_frac for positive means, lt_frac for negative means)
    df_filt <- df_filt %>%
        mutate(xpnsl_score_outlier = ifelse(mean <= xpnsl_score_quant[1] | mean >= xpnsl_score_quant[2], 1, 0),
               xpnsl_gtprop_outlier = ifelse(gt_frac >= xpnsl_gtprop_quant, 1, 0),
               xpnsl_ltprop_outlier = ifelse(lt_frac >= xpnsl_ltprop_quant, 1, 0),
               direction = case_when(xpnsl_score_outlier == 1 & mean > 0 & xpnsl_gtprop_outlier == 1 ~ pos_dir,
                                     xpnsl_score_outlier == 1 & mean < 0 & xpnsl_ltprop_outlier == 1 ~ neg_dir,
                                     TRUE ~ 'None')) %>% 
        # NA_real_ keeps every branch numeric; a bare logical NA is rejected by older dplyr
        mutate(prop_outlier = case_when(direction == pos_dir ~ gt_frac,
                                        direction == neg_dir ~ lt_frac,
                                        TRUE ~ NA_real_))

    return(df_filt)
}

# Filter low-coverage windows and flag outliers for every comparison
obs_xpnsl_ur_df_filt <- obs_xpnsl_ur_df %>% 
    assign_xpnsl_outliers(comp = "Urban_Rural")
obs_xpnsl_ur_outRem_df_filt <- obs_xpnsl_ur_outRem_df %>% 
    assign_xpnsl_outliers(comp = "Urban_Rural")
obs_xpnsl_sr_df_filt <- obs_xpnsl_sr_df %>% 
    assign_xpnsl_outliers(comp = "Suburban_Rural")
obs_xpnsl_us_df_filt <- obs_xpnsl_us_df %>% 
    assign_xpnsl_outliers(comp = "Urban_Suburban")

# Report how many windows remain after the minimum-sites filter in each comparison
sprintf("%s of %s XP-nSL windows remaining after removing those with less than %s sites for Urban-Rural comparison",
        nrow(obs_xpnsl_ur_df_filt),
        nrow(obs_xpnsl_ur_df),
        xpnsl_nsites_thresh)
sprintf("%s of %s XP-nSL windows remaining after removing those with less than %s sites for Urban-Rural (outlier removed) comparison",
        nrow(obs_xpnsl_ur_outRem_df_filt),
        nrow(obs_xpnsl_ur_outRem_df),
        xpnsl_nsites_thresh)
sprintf("%s of %s XP-nSL windows remaining after removing those with less than %s sites for Suburban-Rural comparison",
        nrow(obs_xpnsl_sr_df_filt),
        nrow(obs_xpnsl_sr_df),
        xpnsl_nsites_thresh)
sprintf("%s of %s XP-nSL windows remaining after removing those with less than %s sites for Urban-Suburban comparison",
        nrow(obs_xpnsl_us_df_filt),
        nrow(obs_xpnsl_us_df),
        xpnsl_nsites_thresh)

head(obs_xpnsl_ur_df_filt)

### Add ranks and percentiles from permuted distributions

#### Correlation between observed and permuted XP-nSL across iterations

In [None]:
# Read every permuted XP-nSL file, assign outliers within each file separately
# (so critical values are computed per permutation), then row-bind and recode chromosomes
perm_xpnsl_df <- purrr::map_dfr(snakemake@input[["win_xpnsl_perm"]], function(perm_path){
        perm_windows <- suppressMessages(read_delim(perm_path, delim ='\t'))
        assign_xpnsl_outliers(perm_windows, comp = "Urban_Rural")
    }) %>% 
    remap_chr_names()
head(perm_xpnsl_df)

In [None]:
# Pearson correlation between the mean XP-nSL scores in `df` and the observed
# urban-rural windows (global `obs_xpnsl_ur_df_filt`). Values are paired by row
# position, so `df` must hold the same windows in the same order as the observed data.
# Returns a one-row data.frame with a single column `cor`.
calc_cor <- function(df){
    data.frame(cor = cor(x = df$mean, y = obs_xpnsl_ur_df_filt$mean, method = "pearson"))
}
# Histogram of the permuted-vs-observed correlation, one value per permutation iteration
cor_plot <- purrr::map_dfr(group_split(perm_xpnsl_df, iter), calc_cor) %>% 
    ggplot(aes(x = cor)) +
        geom_histogram(bins = 50, color = "black", fill = "grey") +
        labs(x = "Permuted vs. observed XP-nSL Pearson's correlation coefficient",
             y = "Number of iterations") +
        theme_classic() +
        theme(axis.text = element_text(size = 15),
              axis.title = element_text(size = 17))

# Square display size for the notebook output
options(repr.plot.width = 8, repr.plot.height = 8)
cor_plot
ggsave(filename = snakemake@output[["cor_plot"]], plot = cor_plot, device = "pdf",
       width = 8, height = 8, units = "in", dpi = 600)

#### Permutations and observed density plots

In [None]:
# Windows flagged as outliers in the urban direction: permuted and observed data
urban_sel_permuted <- filter(perm_xpnsl_df, direction == "Urban")
urban_sel_observed <- filter(obs_xpnsl_ur_df_filt, direction == "Urban")

# Overlaid densities of the mean windowed XP-nSL score for permuted and observed urban outliers
urb_mean_plot <- local({
    # Same palette is used for the fill and the outline of each distribution
    dist_cols <- c("Permuted" = "#0070ec", "Observed" = "#003876")
    ggplot() +
        geom_density(data = urban_sel_permuted, alpha = 0.5,
                     aes(x = mean, fill = "Permuted", color = "Permuted")) + 
        geom_density(data = urban_sel_observed, alpha = 0.5,
                     aes(x = mean, fill = "Observed", color = "Observed")) +
        labs(x = "Mean windowed XP-nSL score", y = "Density") +
        scale_fill_manual(name = "Distribution", values = dist_cols) +
        scale_color_manual(name = "Distribution", values = dist_cols) +
        theme_classic() + 
        theme(axis.text = element_text(size = 15),
              axis.title = element_text(size = 17)) +
        transp_theme
})

urb_mean_plot
ggsave(filename = snakemake@output[["urb_mean_plot"]], plot = urb_mean_plot, device = "pdf",
       width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Overlaid densities of the outlier-score proportion for permuted and observed urban outliers
urb_prop_plot <- local({
    # Same palette is used for the fill and the outline of each distribution
    dist_cols <- c("Permuted" = "#0070ec", "Observed" = "#003876")
    ggplot() +
        geom_density(data = urban_sel_permuted, alpha = 0.5,
                     aes(x = prop_outlier, fill = "Permuted", color = "Permuted")) + 
        geom_density(data = urban_sel_observed, alpha = 0.5,
                     aes(x = prop_outlier, fill = "Observed", color = "Observed")) +
        labs(x = "Proportion of XP-nSL scores in window > 2", y = "Density") +
        scale_fill_manual(name = "Distribution", values = dist_cols) +
        scale_color_manual(name = "Distribution", values = dist_cols) +
        theme_classic() +
        theme(axis.text = element_text(size = 15),
              axis.title = element_text(size = 17)) +
        transp_theme
})

urb_prop_plot
ggsave(filename = snakemake@output[["urb_prop_plot"]], plot = urb_prop_plot, device = "pdf",
       width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Dataframe with percentile in permuted distribution of each observed XP-nSL window
# For each observed urban outlier window:
#   mean_perm_perc = fraction of permuted urban outlier windows with mean <= observed mean
#   prop_perm_perc = fraction of permuted urban outlier windows with prop_outlier <= observed
urban_percentile <- urban_sel_observed %>% 
    rowwise() %>% 
    mutate(mean_perm_perc = sum(mean >= urban_sel_permuted$mean) / nrow(urban_sel_permuted),
           prop_perm_perc = sum(prop_outlier >= urban_sel_permuted$prop_outlier) / nrow(urban_sel_permuted)) %>% 
    # Drop the rowwise grouping so it does not leak into later steps
    ungroup() %>% 
    arrange(desc(mean_perm_perc)) %>% 
    dplyr::select(Chr, winID, start, end, winCenter, mean_perm_perc, prop_perm_perc)

In [None]:
# Windows flagged as outliers in the rural direction: permuted and observed data
rural_sel_permuted <- filter(perm_xpnsl_df, direction == "Rural")
rural_sel_observed <- filter(obs_xpnsl_ur_df_filt, direction == "Rural")

# Overlaid densities of the mean windowed XP-nSL score for permuted and observed rural outliers
rur_mean_plot <- local({
    # Same palette is used for the fill and the outline of each distribution
    dist_cols <- c("Permuted" = "#00e989", "Observed" = "#007243")
    ggplot() +
        geom_density(data = rural_sel_permuted, alpha = 0.5,
                     aes(x = mean, fill = "Permuted", color = "Permuted")) + 
        geom_density(data = rural_sel_observed, alpha = 0.5,
                     aes(x = mean, fill = "Observed", color = "Observed")) +
        labs(x = "Mean windowed XP-nSL score", y = "Density") +
        scale_fill_manual(name = "Distribution", values = dist_cols) +
        scale_color_manual(name = "Distribution", values = dist_cols) +
        theme_classic() +
        theme(axis.text = element_text(size = 15),
              axis.title = element_text(size = 17)) +
        transp_theme
})

rur_mean_plot
ggsave(filename = snakemake@output[["rur_mean_plot"]], plot = rur_mean_plot, device = "pdf",
       width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Overlaid densities of the outlier-score proportion for permuted and observed rural outliers
rur_prop_plot <- local({
    # Same palette is used for the fill and the outline of each distribution
    dist_cols <- c("Permuted" = "#00e989", "Observed" = "#007243")
    ggplot() +
        geom_density(data = rural_sel_permuted, alpha = 0.5,
                     aes(x = prop_outlier, fill = "Permuted", color = "Permuted")) + 
        geom_density(data = rural_sel_observed, alpha = 0.5,
                     aes(x = prop_outlier, fill = "Observed", color = "Observed")) +
        labs(x = "Proportion of XP-nSL scores < -2", y = "Density") +
        scale_fill_manual(name = "Distribution", values = dist_cols) +
        scale_color_manual(name = "Distribution", values = dist_cols) +
        theme_classic() +
        theme(axis.text = element_text(size = 15),
              axis.title = element_text(size = 17)) +
        transp_theme
})

rur_prop_plot
ggsave(filename = snakemake@output[["rur_prop_plot"]], plot = rur_prop_plot, device = "pdf",
       width = 8, height = 8, units = "in", dpi = 600)

In [None]:
# Dataframe with percentile in permuted distribution of each observed XP-nSL window
# For each observed rural outlier window:
#   mean_perm_perc = fraction of permuted rural outlier windows with mean >= observed mean
#                    (rural outliers have negative means, so more negative = higher percentile)
#   prop_perm_perc = fraction of permuted rural outlier windows with prop_outlier <= observed
rural_percentile <- rural_sel_observed %>% 
    rowwise() %>% 
    mutate(mean_perm_perc = sum(mean <= rural_sel_permuted$mean) / nrow(rural_sel_permuted),
           prop_perm_perc = sum(prop_outlier >= rural_sel_permuted$prop_outlier) / nrow(rural_sel_permuted)) %>% 
    # Drop the rowwise grouping so it does not leak into later steps
    ungroup() %>% 
    arrange(desc(mean_perm_perc)) %>% 
    dplyr::select(Chr, winID, start, end, winCenter, mean_perm_perc, prop_perm_perc)

In [None]:
# Attach the permutation percentiles to the observed urban-rural windows.
# Windows with direction 'None' have no percentile and get NA from the left join
obs_xpnsl_ur_df_filt <- left_join(obs_xpnsl_ur_df_filt,
                                  bind_rows(urban_percentile, rural_percentile),
                                  by = c("Chr", "winID", "start", "end", "winCenter"))
head(obs_xpnsl_ur_df_filt)

# Save the annotated windows as a tab-delimited file
write_delim(obs_xpnsl_ur_df_filt, snakemake@output[["xpnsl_df"]], delim='\t')

#### Add ranks

In [None]:
# Function to add ranks to XP-nSL windows.
# Windows are sorted by decreasing `prop_perm_perc` (percentile of the window's outlier
# proportion in the permuted distribution) and numbered from 1 in `xpnsl_rank`.
# Rows with NA `prop_perm_perc` sort last; ties keep their incoming order.
# Returns the sorted dataframe with the added integer column `xpnsl_rank`.
add_xpnsl_ranks <- function(df){

    df_out <- df %>% 
        arrange(desc(prop_perm_perc)) %>% 
        # row_number() is safe for zero-row input, unlike 1:n() which yields c(1, 0)
        mutate(xpnsl_rank = row_number())
    return(df_out)
}

# Rank windows separately within each direction, then blank out ranks of non-outlier windows
obs_xpnsl_ur_df_filt <- purrr::map_dfr(group_split(obs_xpnsl_ur_df_filt, direction),
                                       add_xpnsl_ranks) %>% 
    mutate(xpnsl_rank = ifelse(direction != 'None', xpnsl_rank, NA))

### Generate Manhattan plot

#### Functions

In [None]:
# Manhattan plot of windowed XP-nSL scores for the urban vs. rural comparison.
#
# Arguments:
#   df: filtered XP-nSL windows with columns `Chr` (numeric code 1-16), `winCenter`,
#       `mean`, `direction` ('Urban', 'Rural' or 'None'), `mean_perm_perc` and `prop_perm_perc`
#   comp: not used by this function; kept so existing calls that pass it keep working
#
# Returns a ggplot object in which:
#   - non-outlier windows are semi-transparent and alternate black/grey by chromosome
#   - outlier windows are coloured circles (urban = blue, rural = green)
#   - outlier windows at or above the 95th percentile of BOTH permuted distributions
#     are drawn as larger diamonds
#   - dashed lines mark the 1% and 99% quantiles of the mean score
# The y-axis is clipped to [-6, 5].
create_xpnsl_manhat_ur <- function(df, comp){

    pos_dir <- "Urban"
    neg_dir <- "Rural"
    pos_col <- "#003876"
    neg_col <- "#007243"

    # Setting up cumulative genome-wide x-axis
    data_cum <- df %>% 
        group_by(Chr) %>% 
        summarise(max_winCenter = max(winCenter)) %>% 
        mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
        dplyr::select(Chr, winCenter_add)
    
    df_mod <- df %>%
        inner_join(data_cum, by = "Chr") %>% 
        mutate(winCenter_cum = winCenter + winCenter_add,
               # TRUE only when both percentiles are >= 0.95; missing percentiles count as FALSE
               is_topPerm = (mean_perm_perc >= 0.95 & prop_perm_perc >= 0.95) %in% TRUE)
    
    axis_set <- df_mod %>% 
        group_by(Chr) %>% 
        summarize(center = mean(winCenter_cum))
    
    # Get XP-nSL outliers for plotting as separate layers.
    # Regular outliers are the exact complement of the top-permutation windows, so a
    # window with only one percentile >= 0.95 is still drawn rather than silently dropped
    urban_xpnsl_topPerm <- df_mod %>% filter(direction == pos_dir & is_topPerm)
    rural_xpnsl_topPerm <- df_mod %>% filter(direction == neg_dir & is_topPerm)
    urban_xpnsl_outliers <- df_mod %>% filter(direction == pos_dir & !is_topPerm)
    rural_xpnsl_outliers <- df_mod %>% filter(direction == neg_dir & !is_topPerm)
    
    # Get XP-nSL score quantile for plotting
    xpnsl_score_quant <- quantile(df_mod %>% pull(mean), probs = c(0.01, 0.99))
    
    # Alternate the colour category between consecutive chromosomes (odd = 'One', even = 'Two');
    # chromosome codes outside 1-16 get NA
    xpnsl_not_outlier <- df_mod %>% 
        filter(direction == 'None') %>%
        mutate(chrom_cat = rep(c('One', 'Two'), times = 8)[match(Chr, 1:16)])
    
    manhat <- ggplot() +
        geom_point(data = xpnsl_not_outlier, shape = 21, alpha = 0.4, size = 3, 
                   aes(x = winCenter_cum, y = mean, fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = urban_xpnsl_outliers, shape = 21, alpha = 1, size = 3, color = pos_col, fill = pos_col,
                   aes(x = winCenter_cum, y = mean)) +
        geom_point(data = rural_xpnsl_outliers, shape = 21, alpha = 1, size = 3, color = neg_col, fill = neg_col,
                   aes(x = winCenter_cum, y = mean)) +
        geom_point(data = urban_xpnsl_topPerm, shape = 23, alpha = 1, size = 4.5, color = pos_col, fill = pos_col,
                   aes(x = winCenter_cum, y = mean)) +
        geom_point(data = rural_xpnsl_topPerm, shape = 23, alpha = 1, size = 4.5, color = neg_col, fill = neg_col,
                   aes(x = winCenter_cum, y = mean)) +
        geom_hline(yintercept = xpnsl_score_quant, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
        scale_y_continuous(expand = c(0,0)) +
        coord_cartesian(ylim = c(-6, 5)) +
        scale_fill_manual(values = c("black", "grey40")) + 
        scale_color_manual(values = c("black", "grey40")) + 
        ylab('Urban-rural normalized XP-nSL') + xlab('Chromosomes') +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size = 20)
        ) +
        transp_theme
    
    return(manhat)
}


In [None]:
# Manhattan plot of windowed XP-nSL scores for an urban vs. rural comparison, without
# highlighting of permutation-based top windows.
#
# Arguments:
#   df: filtered XP-nSL windows with columns `Chr` (numeric code 1-16), `winCenter`,
#       `mean` and `direction` ('Urban', 'Rural' or 'None')
#   comp: not used by this function
#
# Returns a ggplot object: non-outlier windows alternate black/grey by chromosome,
# urban outliers are blue, rural outliers are green, and dashed lines mark the
# 1% and 99% quantiles of the mean score. The y-axis is clipped to [-6, 5].
create_xpnsl_manhat_ur_outRem <- function(df, comp){

    urban_label <- "Urban"
    rural_label <- "Rural"
    urban_colour <- "#003876"
    rural_colour <- "#007243"

    # Offset added to each chromosome's window centres to build a cumulative genome-wide x-axis
    chr_offsets <- df %>% 
        group_by(Chr) %>% 
        summarise(max_winCenter = max(winCenter)) %>% 
        mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
        dplyr::select(Chr, winCenter_add)

    windows_cum <- inner_join(df, chr_offsets, by = "Chr") %>% 
        mutate(winCenter_cum = winCenter + winCenter_add)

    # Tick position for each chromosome: mean cumulative window centre
    chr_axis <- windows_cum %>% 
        group_by(Chr) %>% 
        summarize(center = mean(winCenter_cum))

    # Dashed reference lines at the 1% and 99% quantiles of the mean score
    score_cutoffs <- quantile(pull(windows_cum, mean), probs = c(0.01, 0.99))

    # One dataframe per plotted layer
    urban_windows <- filter(windows_cum, direction == urban_label)
    rural_windows <- filter(windows_cum, direction == rural_label)
    # Background windows alternate colour category by chromosome (odd = 'One', even = 'Two')
    background_windows <- filter(windows_cum, direction == 'None') %>% 
        mutate(chrom_cat = rep(c('One', 'Two'), times = 8)[match(Chr, 1:16)])

    manhat <- ggplot() +
        geom_point(data = background_windows, shape = 21, alpha = 0.4, size = 3, 
                   aes(x = winCenter_cum, y = mean, fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = urban_windows, shape = 21, alpha = 1, size = 3,
                   color = urban_colour, fill = urban_colour,
                   aes(x = winCenter_cum, y = mean)) +
        geom_point(data = rural_windows, shape = 21, alpha = 1, size = 3,
                   color = rural_colour, fill = rural_colour,
                   aes(x = winCenter_cum, y = mean)) +
        geom_hline(yintercept = score_cutoffs, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = chr_axis$Chr, breaks = chr_axis$center) +
        scale_y_continuous(expand = c(0,0)) +
        coord_cartesian(ylim = c(-6, 5)) +
        scale_fill_manual(values = c("black", "grey40")) + 
        scale_color_manual(values = c("black", "grey40")) + 
        labs(x = 'Chromosomes', y = 'Urban-rural normalized XP-nSL') +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size = 20)
        ) +
        transp_theme

    manhat
}


In [None]:
# Manhattan plot of windowed XP-nSL scores for the suburban-rural or urban-suburban comparison.
#
# Arguments:
#   df: filtered XP-nSL windows with columns `Chr` (numeric code 1-16), `winCenter`,
#       `mean` and `direction`
#   comp: either "Suburban_Rural" or "Urban_Suburban"; selects the direction labels,
#       point colours and y-axis title. Any other value is an error
#
# Returns a ggplot object: non-outlier windows alternate black/grey by chromosome,
# outliers are coloured by direction, and dashed lines mark the 1% and 99% quantiles
# of the mean score. The y-axis is clipped to [-6, 5].
create_xpnsl_manhat_sr_us <- function(df, comp){

    if(identical(comp, "Suburban_Rural")){
        pos_dir <- "Suburban"
        neg_dir <- "Rural"
        pos_col <- "#914205"
        neg_col <- "#007243"
        y_label <- "Suburban-rural normalized XP-nSL"
    } else if (identical(comp, "Urban_Suburban")){
        pos_dir <- "Urban"
        neg_dir <- "Suburban"  
        pos_col <- "#003876"
        neg_col <- "#914205"
        y_label <- "Urban-suburban normalized XP-nSL"
    } else {
        # Fail early with a clear message instead of a later "object 'pos_dir' not found"
        stop(sprintf("Unknown comparison '%s'. `comp` must be 'Suburban_Rural' or 'Urban_Suburban'",
                     paste(comp, collapse = ", ")),
             call. = FALSE)
    }
    
    # Setting up cumulative genome-wide x-axis
    data_cum <- df %>% 
        group_by(Chr) %>% 
        summarise(max_winCenter = max(winCenter)) %>% 
        mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
        dplyr::select(Chr, winCenter_add)
    
    df_mod <- df %>%
        inner_join(data_cum, by = "Chr") %>% 
        mutate(winCenter_cum = winCenter + winCenter_add) 
    
    axis_set <- df_mod %>% 
        group_by(Chr) %>% 
        summarize(center = mean(winCenter_cum))
    
    # Get XP-nSL outliers for plotting as separate layers
    pos_xpnsl_outliers <- df_mod %>% filter(direction == pos_dir)
    neg_xpnsl_outliers <- df_mod %>% filter(direction == neg_dir)
    
    # Get XP-nSL score quantile for plotting
    xpnsl_score_quant <- quantile(df_mod %>% pull(mean), probs = c(0.01, 0.99))
    
    # Alternate the colour category between consecutive chromosomes (odd = 'One', even = 'Two');
    # chromosome codes outside 1-16 get NA
    xpnsl_not_outlier <- df_mod %>% 
        filter(direction == 'None') %>%
        mutate(chrom_cat = rep(c('One', 'Two'), times = 8)[match(Chr, 1:16)])
    
    manhat <- ggplot() +
        geom_point(data = xpnsl_not_outlier, shape = 21, alpha = 0.4, size = 3, 
                   aes(x = winCenter_cum, y = mean, fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = pos_xpnsl_outliers, shape = 21, alpha = 1, size = 3, color = pos_col, fill = pos_col,
                   aes(x = winCenter_cum, y = mean)) +
        geom_point(data = neg_xpnsl_outliers, shape = 21, alpha = 1, size = 3, color = neg_col, fill = neg_col,
                   aes(x = winCenter_cum, y = mean)) +
        geom_hline(yintercept = xpnsl_score_quant, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
        scale_y_continuous(expand = c(0,0)) +
        coord_cartesian(ylim = c(-6, 5)) +
        scale_fill_manual(values = c("black", "grey40")) + 
        scale_color_manual(values = c("black", "grey40")) + 
        ylab(y_label) + xlab('Chromosomes') +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size = 20)
        ) +
        transp_theme
    
    return(manhat)
}


#### Generate plots

In [None]:
# Wide notebook display to fit the genome-wide x-axis
options(repr.plot.height = 6, repr.plot.width = 20)
# Urban vs. rural Manhattan plot (with permutation-based top windows highlighted)
xpnsl_manhat_ur <- obs_xpnsl_ur_df_filt %>% 
    create_xpnsl_manhat_ur(comp = "Urban_Rural")
xpnsl_manhat_ur
ggsave(filename = snakemake@output[["xpnsl_manhat_ur"]], plot = xpnsl_manhat_ur, 
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Urban vs. rural Manhattan plot for the outlier-removed dataset
xpnsl_manhat_ur_outRem <- obs_xpnsl_ur_outRem_df_filt %>% 
    create_xpnsl_manhat_ur_outRem(comp = "Urban_Rural")
xpnsl_manhat_ur_outRem
ggsave(filename = snakemake@output[["xpnsl_manhat_ur_outRem"]], plot = xpnsl_manhat_ur_outRem, 
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Suburban vs. rural Manhattan plot
xpnsl_manhat_sr <- obs_xpnsl_sr_df_filt %>% 
    create_xpnsl_manhat_sr_us(comp = "Suburban_Rural")
xpnsl_manhat_sr
ggsave(filename = snakemake@output[["xpnsl_manhat_sr"]], plot = xpnsl_manhat_sr, 
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Urban vs. suburban Manhattan plot
xpnsl_manhat_us <- obs_xpnsl_us_df_filt %>% 
    create_xpnsl_manhat_sr_us(comp = "Urban_Suburban")
xpnsl_manhat_us
ggsave(filename = snakemake@output[["xpnsl_manhat_us"]], plot = xpnsl_manhat_us, 
       device = "pdf", width = 20, height = 8, units = "in", dpi = 600)

In [None]:
# Suburban outliers shared with Urban vs. Rural
# Windows are identified by '<Chr>_<winCenter>'
ur_urban_ids <- with(filter(obs_xpnsl_ur_df_filt, direction == "Urban"),
                     paste0(Chr, "_", winCenter))
ur_rural_ids <- with(filter(obs_xpnsl_ur_df_filt, direction == "Rural"),
                     paste0(Chr, "_", winCenter))
sr_suburban_ids <- with(filter(obs_xpnsl_sr_df_filt, direction == "Suburban"),
                        paste0(Chr, "_", winCenter))

# Fraction of urban-rural outliers (urban, then rural) that are also suburban outliers
sum(ur_urban_ids %in% sr_suburban_ids) / length(ur_urban_ids)
sum(ur_rural_ids %in% sr_suburban_ids) / length(ur_rural_ids)

In [None]:
# Overlap in urban and rural outliers with and without removal of population structure outlier populations
# Windows are identified by '<Chr>_<winCenter>'
ur_urban_ids <- with(filter(obs_xpnsl_ur_df_filt, direction == "Urban"),
                     paste0(Chr, "_", winCenter))
# Urban outliers at or above the 95th percentile of both permuted distributions
ur_urban_top_ids <- with(filter(obs_xpnsl_ur_df_filt,
                                direction == "Urban" & mean_perm_perc >= 0.95 & prop_perm_perc >= 0.95),
                         paste0(Chr, "_", winCenter))
ur_urban_outRem_ids <- with(filter(obs_xpnsl_ur_outRem_df_filt, direction == "Urban"),
                            paste0(Chr, "_", winCenter))


# Fraction of all / top urban outliers that are still outliers in the outlier-removed dataset
sum(ur_urban_ids %in% ur_urban_outRem_ids) / length(ur_urban_ids)
sum(ur_urban_top_ids %in% ur_urban_outRem_ids) / length(ur_urban_top_ids)

In [None]:
# Rural outlier windows, identified by '<Chr>_<winCenter>'
ur_rural_ids <- with(filter(obs_xpnsl_ur_df_filt, direction == "Rural"),
                     paste0(Chr, "_", winCenter))
# Rural outliers at or above the 95th percentile of both permuted distributions
ur_rural_top_ids <- with(filter(obs_xpnsl_ur_df_filt,
                                direction == "Rural" & mean_perm_perc >= 0.95 & prop_perm_perc >= 0.95),
                         paste0(Chr, "_", winCenter))
ur_rural_outRem_ids <- with(filter(obs_xpnsl_ur_outRem_df_filt, direction == "Rural"),
                            paste0(Chr, "_", winCenter))


# Fraction of all / top rural outliers that are still outliers in the outlier-removed dataset
sum(ur_rural_ids %in% ur_rural_outRem_ids) / length(ur_rural_ids)
sum(ur_rural_top_ids %in% ur_rural_outRem_ids) / length(ur_rural_top_ids)

### Merge XP-nSL outlier windows and get top 10

In [None]:
# Function to merge consecutive outlier windows with bedtools merge
#
# Arguments:
#   df: outlier windows for a single direction ('Urban' or 'Rural')
#   win_dist: passed to bedtools merge as `d`
#
# Returns one row per merged region with the summary columns named in `merged_names`
# plus `win_size` (end - start). Prints the window counts before and after merging.
# NOTE(review): the column numbers passed to bedtools refer to positions in the sorted
# dataframe (after moving Chr/start/end first and appending `is_topPerm` last) and so
# depend on the column order of the input file -- confirm if the schema changes.
merge_xpnsl_windows <- function(df, win_dist = 0){

    dir <- unique(pull(df, direction))

    # BED-style column order and sorting; round values that are collapsed into strings
    df_sorted <- df %>% 
        dplyr::select(Chr, start, end, everything()) %>% 
        arrange(Chr, start) %>% 
        mutate(prop_outlier = round(prop_outlier, 3),
               mean_perm_perc = round(mean_perm_perc, 3),
               prop_perm_perc = round(prop_perm_perc, 3),
               # 1 when the window is at or above the 95th percentile of both permuted distributions
               is_topPerm = ifelse(mean_perm_perc >= 0.95 & prop_perm_perc >= 0.95, 1, 0))

    # Columns to summarise and the operation applied to each; the two directions differ
    # only in the second entry (column 7 with max for Urban, column 8 with min for Rural)
    if(dir == 'Urban'){
        cols <- c('6,7,17,18,18,19,19,20,20,21,21,22')
        operation <- c('mean,max,distinct,mean,collapse,mean,collapse,mean,collapse,min,collapse,sum')
    }else if(dir == 'Rural'){
        cols <- c('6,8,17,18,18,19,19,20,20,21,21,22')
        operation <- c('mean,min,distinct,mean,collapse,mean,collapse,mean,collapse,min,collapse,sum')
    }

    merged_names <- c('Chr', 'start', 'end', 'mean_xpnsl', 'min_max', 'direction', 'mean_prop_outlier', 'all_prop_outlier', 'mean_percentile_permuted', 'all_percentile_permuted','mean_prop_percentile_permuted','all_prop_percentile_permuted', 'min_xpnsl_rank', 'all_xpnsl_ranks', 'num_topPerm')

    df_merged <- bt.merge(i = df_sorted, c = cols, o = operation, d = win_dist)
    names(df_merged) <- merged_names
    df_merged <- mutate(df_merged, win_size = end - start)

    print(sprintf('%s: There were %s XP-nSL outlier windows prior to merging. There are %s outlier regions after merging consecutive outlier windows', dir, nrow(df_sorted), nrow(df_merged)))
    return(df_merged)
}

# Merge outlier windows separately for each direction and stack the results
win_xpnsl_ur_df_filt_merged <- purrr::map_dfr(
    group_split(filter(obs_xpnsl_ur_df_filt, direction != 'None'), direction),
    merge_xpnsl_windows
)

In [None]:
# Merged outlier regions with num_topPerm > 0, sorted by direction and then by the
# lowest window rank in the region
xpnsl_topHits_urban_rural <- win_xpnsl_ur_df_filt_merged %>% 
    filter(num_topPerm > 0) %>% 
    arrange(direction, min_xpnsl_rank) %>% 
    dplyr::select(Chr, start, end, win_size, direction,
                  mean_xpnsl, min_max,
                  all_prop_outlier, mean_prop_outlier,
                  mean_percentile_permuted, all_percentile_permuted,
                  mean_prop_percentile_permuted, all_prop_percentile_permuted,
                  all_xpnsl_ranks, min_xpnsl_rank, num_topPerm)
xpnsl_topHits_urban_rural

## saltiLassi

In [None]:
# Load one saltiLassi output file and prepare it for plotting.
#
# Arguments:
#   path: tab-delimited saltiLassi results file
#   habitat_label: habitat name ('Urban' or 'Rural'); columns prefixed with
#       '<habitat_label>_' have that prefix removed
#
# Returns the table with `chr` renamed to `Chr` (recoded to 1-16), `L` renamed to `mean`,
# and added columns `habitat`, `is_outlier` (1 if `mean` is at or above its 99% quantile,
# else 0) and `winCenter` (window midpoint).
load_lassi_df <- function(path, habitat_label){
    prefix <- paste0(habitat_label, "_")
    read_delim(path, delim='\t') %>% 
        rename_with(.cols = starts_with(prefix), .fn = ~str_replace(., prefix, "")) %>% 
        rename("Chr" = "chr", "mean" = "L") %>% 
        mutate(habitat = habitat_label,
               # `probs` spelled out in full rather than relying on partial argument matching
               is_outlier = ifelse(mean >= quantile(mean, probs = 0.99), 1, 0),
               winCenter = start + ((end - start) / 2)) %>% 
        remap_chr_names()
}

# NOTE(review): relies on the `lassip` inputs being ordered Urban first, Rural second -- confirm in the Snakefile
urban_lassi_df <- load_lassi_df(snakemake@input[["lassip"]][1], "Urban")
rural_lassi_df <- load_lassi_df(snakemake@input[["lassip"]][2], "Rural")

write_delim(bind_rows(urban_lassi_df, rural_lassi_df), snakemake@output[["salti_df"]], delim='\t')

In [None]:
# Manhattan plot of a windowed single-population haplotype statistic.
#
# Arguments:
#   df: windows with columns `habitat`, `Chr` (numeric code 1-16), `winCenter`,
#       `mean` (the statistic) and `is_outlier` (1/0)
#   hab: habitat to plot; 'Urban' outliers are blue, any other habitat is green
#   stat: statistic name; 'nSL', 'iHS' and 'iHH12' get their own y-axis settings,
#       anything else is labelled 'Lambda'. Also used in the plot title
#
# Returns a ggplot object: non-outlier windows alternate black/grey by chromosome,
# outlier windows are coloured by habitat, and a dashed line marks the 99% quantile
# of the absolute statistic.
plot_single_population_hapstat_win_manhat <- function(df, hab, stat){

    hab_df <- filter(df, habitat == hab)

    # Offset added to each chromosome's window centres to build a cumulative genome-wide x-axis
    chr_offsets <- hab_df %>% 
        group_by(Chr) %>% 
        summarise(max_winCenter = max(winCenter)) %>% 
        mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
        dplyr::select(Chr, winCenter_add)

    windows_cum <- inner_join(hab_df, chr_offsets, by = "Chr") %>% 
        mutate(winCenter_cum = winCenter + winCenter_add)

    # Tick position for each chromosome: mean cumulative window centre
    chr_axis <- windows_cum %>% 
        group_by(Chr) %>% 
        summarize(center = mean(winCenter_cum))

    # Outlier windows and the genome-wide critical value of the statistic
    outlier_windows <- filter(windows_cum, is_outlier == 1)
    crit_value <- quantile(abs(pull(hab_df, mean)), probs = 0.99)

    # Outlier colour by habitat
    outlier_col <- if(hab == 'Urban') '#003876' else '#007243'

    # y-axis title, limits and breaks for each statistic
    if(stat == 'nSL'){
        y_title <- '| nSL |'
        y_limits <- c(0, 4)
        y_breaks <- seq(0, 4, 1)
    } else if (stat == 'iHS') {
        y_title <- '| iHS |'
        y_limits <- c(0, 3.25)
        y_breaks <- seq(0, 3, 1)
    } else if (stat == "iHH12") {
        y_title <- 'iHH12'
        y_limits <- c(-0.5, 17)
        y_breaks <- seq(0, 16, 2)
    } else {
        y_title <- 'Lambda'
        y_limits <- c(-0.5, 140)
        y_breaks <- seq(0, 140, 20)
    }

    # Background windows alternate colour category by chromosome (odd = 'One', even = 'Two')
    background_windows <- filter(windows_cum, is_outlier != 1) %>% 
        mutate(chrom_cat = rep(c('One', 'Two'), times = 8)[match(Chr, 1:16)])

    # Generate Manhattan plot
    manhat_plot <- ggplot(background_windows, aes(x = winCenter_cum, y = mean)) +
        geom_point(shape = 21, alpha = 0.4, size = 3, aes(fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = outlier_windows, shape = 21, alpha = 1, size = 3,
                   color = outlier_col, fill = outlier_col) +
        geom_hline(yintercept = crit_value, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = chr_axis$Chr, breaks = chr_axis$center) +
        scale_y_continuous(expand = c(0,0), breaks = y_breaks) +
        coord_cartesian(ylim = y_limits) +
        scale_fill_manual(values = c("black", "grey40")) + 
        scale_color_manual(values = c("black", "grey40")) + 
        labs(x = '', y = y_title, title = sprintf("%s Windowed %s", hab, stat)) +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size=20),
            plot.title = element_text(size = 20, face = "bold")
        )
    manhat_plot
}

In [None]:
# Urban saltiLassi Manhattan plot: size the notebook figure, build the plot and
# save it as a PDF
options(repr.plot.width = 20, repr.plot.height = 6)
urban_salti_manhat <- plot_single_population_hapstat_win_manhat(
    df = urban_lassi_df,
    hab = "Urban",
    stat = "saltiLassi"
)
ggsave(
    filename = snakemake@output[["urban_salti_manhat"]],
    plot = urban_salti_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Rural saltiLassi Manhattan plot, saved as a PDF
rural_salti_manhat <- plot_single_population_hapstat_win_manhat(
    df = rural_lassi_df,
    hab = "Rural",
    stat = "saltiLassi"
)
ggsave(
    filename = snakemake@output[["rural_salti_manhat"]],
    plot = rural_salti_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Columns carried through the interval intersections; the first three are the
# interval coordinates. bt.intersect() output is renamed with this vector.
cols <- c("Chr", "start", "end", "winCenter", "is_outlier", "m", "A", "mean", "habitat")

# Urban outlier windows that overlap a rural window that is NOT an outlier.
# `wa = TRUE` reports the original urban entry for each overlap, so distinct()
# collapses windows that overlap more than one rural window.
urban_lassi_only <- bt.intersect(
    urban_lassi_df %>% dplyr::select(all_of(cols)) %>% filter(is_outlier == 1),
    rural_lassi_df %>% dplyr::select(all_of(cols)) %>% filter(is_outlier == 0),
    wa = TRUE
) %>% distinct()
names(urban_lassi_only) <- cols

# Same in the other direction: rural outlier windows overlapping a rural-side
# entry's urban counterpart that is not an outlier
rural_lassi_only <- bt.intersect(
    rural_lassi_df %>% dplyr::select(all_of(cols)) %>% filter(is_outlier == 1),
    urban_lassi_df %>% dplyr::select(all_of(cols)) %>% filter(is_outlier == 0),
    wa = TRUE
) %>% distinct()
names(rural_lassi_only) <- cols

In [None]:
# Habitat-specific sweep windows that also overlap an XP-nSL window assigned to
# the same habitat (`direction`). `wa = TRUE` keeps only the columns of the
# first input, which are renamed with `cols` afterwards.
urban_lassi_xpnsl <- bt.intersect(
    urban_lassi_only %>% dplyr::select(all_of(cols)),
    obs_xpnsl_ur_df_filt %>% filter(direction == "Urban"),
    wa = TRUE
) %>% distinct()
names(urban_lassi_xpnsl) <- cols

rural_lassi_xpnsl <- bt.intersect(
    rural_lassi_only %>% dplyr::select(all_of(cols)),
    obs_xpnsl_ur_df_filt %>% filter(direction == "Rural"),
    wa = TRUE
) %>% distinct()
names(rural_lassi_xpnsl) <- cols

# Stack both habitats into a single table
lassi_xpnsl <- bind_rows(urban_lassi_xpnsl, rural_lassi_xpnsl) %>% 
    distinct()

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)

# Number of windows per habitat for each number of sweeping haplotypes (m);
# habitat/m combinations that never occur are shown with a count of zero
salti_m_hist <- lassi_xpnsl %>% 
    group_by(habitat, m) %>% 
    summarize(count = n(), .groups = "drop") %>% 
    complete(habitat, m, fill = list(count = 0)) %>% 
    ggplot(aes(x = m, y = count, color = habitat, fill = habitat)) +
        geom_bar(stat = "identity", position = "dodge") +
        scale_color_manual(values = c("#007243", "#003876")) +
        scale_fill_manual(values = c("#007243", "#003876")) +
        scale_x_continuous(breaks = seq(1, 8, 1)) +
        labs(x = "Number of sweeping haplotypes", y = "Number of windows") +
        theme_classic() +
        theme(
            axis.text = element_text(size = 15),
            axis.title = element_text(size = 17),
            legend.text = element_text(size = 13),
            legend.title = element_text(size = 15)
        ) +
        transp_theme
salti_m_hist
ggsave(
    filename = snakemake@output[["salti_m_hist"]],
    plot = salti_m_hist,
    device = "pdf",
    width = 8, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Number of hard (m = 1) versus soft (m > 1) sweep windows in each habitat
salti_sr_hist <- lassi_xpnsl %>% 
    mutate(sweep_type = ifelse(m > 1, "Soft (m > 1)", "Hard (m = 1)")) %>% 
    group_by(habitat, sweep_type) %>% 
    summarize(count = n(), .groups = "drop") %>% 
    ggplot(aes(x = sweep_type, y = count, color = habitat, fill = habitat)) +
        geom_bar(stat = "identity", position = "dodge") +
        scale_color_manual(values = c("#007243", "#003876")) +
        scale_fill_manual(values = c("#007243", "#003876")) +
        labs(x = "Sweep architecture", y = "Number of windows") +
        theme_classic() +
        theme(
            axis.text = element_text(size = 15),
            axis.title = element_text(size = 17),
            legend.text = element_text(size = 13),
            legend.title = element_text(size = 15)
        ) +
        transp_theme
salti_sr_hist
ggsave(
    filename = snakemake@output[["salti_sr_hist"]],
    plot = salti_sr_hist,
    device = "pdf",
    width = 8, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Habitat x sweep-type contingency table (rows = habitat, columns = Hard/Soft)
# used as input to the chi-squared test
chisq_table <- lassi_xpnsl %>% 
    mutate(sweep_type = ifelse(m > 1, "Soft", "Hard")) %>% 
    group_by(habitat, sweep_type) %>% 
    summarize(count = n(), .groups = "drop") %>% 
    # A habitat with no windows of one sweep type must contribute a zero cell,
    # not NA, otherwise the table is not a valid contingency table
    pivot_wider(names_from = "sweep_type", values_from = "count", values_fill = 0L) %>% 
    column_to_rownames(var = "habitat") %>% 
    as.matrix()
chisq_table

In [None]:
# Chi-squared test on the habitat x sweep-type contingency table, then display
# the test result
chisq_sweep_arch <- chisq.test(chisq_table)
chisq_sweep_arch

In [None]:
# Observed urban - rural difference in mean log10(A)
urban_A <- mean(log10(lassi_xpnsl %>% filter(habitat == "Urban") %>% pull(A)))
rural_A <- mean(log10(lassi_xpnsl %>% filter(habitat == "Rural") %>% pull(A)))
A_diff_obs <- urban_A - rural_A

# Set seed for reproducible results
set.seed(49)

# Number of permutations
nreps <- 1000

# Row positions of each habitat. The habitat labels stay fixed and only the A
# column is shuffled; which() drops rows with a missing habitat.
urban_rows <- which(lassi_xpnsl$habitat == "Urban")
rural_rows <- which(lassi_xpnsl$habitat == "Rural")

# For every replicate, permute A across all windows (without replacement) and
# recompute the urban - rural difference in mean log10(A). The result is a
# numeric vector with one simulated difference per replicate.
simulated_diffs <- vapply(seq_len(nreps), function(i) {
    A_shuffled <- sample(lassi_xpnsl$A, size = nrow(lassi_xpnsl), replace = FALSE)
    mean(log10(A_shuffled[urban_rows])) - mean(log10(A_shuffled[rural_rows]))
}, numeric(1))

In [None]:
# Histogram of the permuted urban - rural differences; the dashed vertical line
# marks the observed difference
logA_perm_plot <- ggplot() +
    geom_histogram(
        aes(x = simulated_diffs),
        bins = 30,
        fill = "grey", colour = "black", alpha = 0.4
    ) +
    geom_vline(
        xintercept = A_diff_obs,
        colour = "black", linetype = "dashed", linewidth = 1
    ) + 
    labs(
        x = "Simulated urban-rural difference in log(A) between habitats",
        y = "Number of iterations"
    ) +
    theme_classic() +
    theme(
        axis.title = element_text(size = 17),
        axis.text = element_text(size = 15)
    ) +
    transp_theme
logA_perm_plot

ggsave(
    filename = snakemake@output[["logA_perm_plot"]],
    plot = logA_perm_plot,
    device = "pdf",
    width = 8, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Density of log10(A) for each habitat
# NOTE(review): the group_by() before ggplot() looks redundant because the
# habitats are already separated by the colour/fill mapping; confirm and drop
logA_ur_density <- lassi_xpnsl %>% 
    group_by(habitat) %>% 
    ggplot(aes(x = log10(A), color = habitat, fill = habitat)) +
        geom_density(alpha = 0.7) +
        scale_color_manual(values = c("#007243", "#003876")) +
        scale_fill_manual(values = c("#007243", "#003876")) +
        labs(x = "Width of selective sweep [log(A)]", y = "Density") +
        theme_classic() +
        theme(
            axis.title = element_text(size = 17),
            axis.text = element_text(size = 15)
        ) +
        transp_theme
logA_ur_density

ggsave(
    filename = snakemake@output[["logA_ur_density"]],
    plot = logA_ur_density,
    device = "pdf",
    width = 8, height = 8, units = "in",
    dpi = 600
)

## nSL

- I estimated nSL in the same 50 Kb windows as above, separately in urban and rural habitats
- I'll do the same thing as I did above, minus the permutations

In [None]:
# Load the windowed nSL table (tab-delimited) and recode chromosome names to
# numeric indices
nsl_df <- remap_chr_names(
    read_delim(snakemake@input[["win_nsl"]], delim = '\t')
)
head(nsl_df)

In [None]:
# Distribution of the number of sites per nSL window; the dashed red line is
# drawn at 25 sites
ggplot(nsl_df, aes(x = n)) +
    geom_histogram(bins = 50, color = "black", fill = "white") +
    geom_vline(xintercept = 25, color = "red", linetype = "dashed", linewidth = 1) +
    labs(x = "Number of sites", y = "Number of windows") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 17)
    )

In [None]:
# Flag outlier windows for a windowed single-population statistic.
#
# Args:
#   df:     Data frame of windows with columns `n` (number of sites), `mean`
#           (mean window score) and `gt_frac` (presumably the fraction of
#           sites with extreme scores; confirm against the upstream script).
#   thresh: Minimum number of sites a window needs to be kept. Defaults to the
#           global `nsl_nsites_thresh`.
#
# Returns:
#   The windows with at least `thresh` sites, plus three 0/1 columns:
#   `score_outlier`, `gtprop_outlier` and `is_outlier` (1 only when both of
#   the other two are 1).
assign_nsl_outliers <- function(df, thresh = nsl_nsites_thresh){

    # Drop windows with too few sites
    kept <- filter(df, n >= thresh)

    # 99th percentiles of the mean score and of gt_frac among kept windows
    score_cutoff <- quantile(pull(kept, mean), probs = 0.99)
    gtprop_cutoff <- quantile(pull(kept, gt_frac), probs = 0.99)

    # A window is an outlier only if it reaches both cutoffs
    flagged <- mutate(
        kept,
        score_outlier = ifelse(mean >= score_cutoff, 1, 0),
        gtprop_outlier = ifelse(gt_frac >= gtprop_cutoff, 1, 0),
        is_outlier = ifelse(score_outlier == 1 & gtprop_outlier == 1, 1, 0)
    )

    return(flagged)
}

# Minimum number of sites per window
nsl_nsites_thresh <- 25

# Flag nSL outliers separately within each habitat, then stack the results
nsl_df_filt <- purrr::map_dfr(
    group_split(nsl_df, habitat),
    assign_nsl_outliers,
    thresh = nsl_nsites_thresh
)

sprintf("%s of %s nSL windows remaining after removing those with less than %s sites", nrow(nsl_df_filt), nrow(nsl_df), nsl_nsites_thresh)

head(nsl_df_filt)

In [None]:
# Draw a genome-wide Manhattan plot of a windowed haplotype statistic for a
# single habitat.
#
# Args:
#   df:   Data frame of windows with columns `habitat`, `Chr` (chromosome
#         index), `winCenter`, `mean` (value plotted on the y-axis) and
#         `is_outlier` (1 = outlier window).
#   hab:  Habitat to plot; only rows with `habitat == hab` are used. Outliers
#         are drawn in '#003876' for 'Urban' and in '#007243' otherwise.
#   stat: Statistic name, used in the title and to choose the y-axis label,
#         limits and breaks: 'nSL', 'iHS' or 'iHH12'; any other value (e.g.
#         "saltiLassi") gets the 'Lambda' axis, so that re-running earlier
#         cells after this definition still produces the same plots.
#
# Returns:
#   A ggplot object.
plot_single_population_hapstat_win_manhat <- function(df, hab, stat){

    df <- df %>% filter(habitat == hab)

    # Setting up cumulative genome-wide x-axis: every chromosome is shifted by
    # the summed maximum window centers of the chromosomes that precede it
    data_cum <- df %>% 
        group_by(Chr) %>% 
        summarise(max_winCenter = max(winCenter)) %>% 
        mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
        dplyr::select(Chr, winCenter_add)

    df_mod <- df %>%
        inner_join(data_cum, by = "Chr") %>% 
        mutate(winCenter_cum = winCenter + winCenter_add) 

    # One x-axis tick per chromosome, placed at the mean cumulative position
    axis_set <- df_mod %>% 
        group_by(Chr) %>% 
        summarize(center = mean(winCenter_cum))

    # Outlier windows (drawn as their own layer) and the 99th percentile of
    # |mean| across this habitat's windows (drawn as a dashed line)
    outliers <- df_mod %>% filter(is_outlier == 1) 
    quant <- quantile(abs(df %>% pull(mean)), probs = c(0.99))

    outlier_col <- if (hab == 'Urban') '#003876' else '#007243'

    # Statistic-specific y-axis label, limits and breaks
    y_axis <- switch(stat,
        nSL = list(label = '| nSL |', limits = c(0, 4), breaks = seq(0, 4, 1)),
        iHS = list(label = '| iHS |', limits = c(0, 3.25), breaks = seq(0, 3, 1)),
        iHH12 = list(label = 'iHH12', limits = c(-0.5, 17), breaks = seq(0, 16, 2)),
        list(label = 'Lambda', limits = c(-0.5, 140), breaks = seq(0, 140, 20)))

    # Generate Manhattan plot. Non-outlier windows alternate between two greys
    # by chromosome (odd = 'One', even = 'Two')
    manhat_plot <- df_mod %>%
        filter(is_outlier != 1) %>% 
        mutate(chrom_cat = case_when(Chr %in% seq(1, 15, 2) ~ 'One',
                                     Chr %in% seq(2, 16, 2) ~ 'Two')) %>% 
            ggplot(aes(x = winCenter_cum, y = mean)) +
            geom_point(shape = 21, alpha = 0.4, size = 3, aes(fill = chrom_cat, color = chrom_cat)) +
            geom_point(data = outliers, shape = 21, alpha = 1, size = 3, 
                       color = outlier_col, fill = outlier_col) +
            geom_hline(yintercept = quant, color = "grey40", linetype = "dashed") +
            scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
            scale_y_continuous(expand = c(0,0), breaks = y_axis$breaks) +
            coord_cartesian(ylim = y_axis$limits) +
            # Named values keep the colours tied to the category even when only
            # one category is present in the data
            scale_fill_manual(values = c(One = "black", Two = "grey40")) + 
            scale_color_manual(values = c(One = "black", Two = "grey40")) + 
            ylab(y_axis$label) + xlab('') +
            ggtitle(sprintf("%s Windowed %s", hab, stat)) +
            theme_classic() +
            theme(
                legend.position = "none",
                panel.border = element_blank(),
                panel.grid.major.x = element_blank(),
                panel.grid.minor.x = element_blank(),
                axis.text = element_text(size=16),
                axis.title = element_text(size=20),
                plot.title = element_text(size = 20, face = "bold")
              )
    return(manhat_plot)
}

In [None]:
# Rural nSL Manhattan plot: size the notebook figure, display the plot and
# save it as a PDF
options(repr.plot.width = 20, repr.plot.height = 6)
rural_nsl_manhat <- plot_single_population_hapstat_win_manhat(
    df = nsl_df_filt,
    hab = "Rural",
    stat = 'nSL'
)
rural_nsl_manhat
ggsave(
    filename = snakemake@output[["rur_nsl_manhat"]],
    plot = rural_nsl_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Urban nSL Manhattan plot, displayed and saved as a PDF
urban_nsl_manhat <- plot_single_population_hapstat_win_manhat(
    df = nsl_df_filt,
    hab = "Urban",
    stat = 'nSL'
)
urban_nsl_manhat
ggsave(
    filename = snakemake@output[["urb_nsl_manhat"]],
    plot = urban_nsl_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Write the nSL outlier windows to a tab-delimited file
write_delim(
    filter(nsl_df_filt, is_outlier == 1),
    snakemake@output[["nsl_df"]],
    delim = "\t"
)

## iHS

In [None]:
# Load the windowed iHS table (tab-delimited) and recode chromosome names to
# numeric indices
ihs_df <- remap_chr_names(
    read_delim(snakemake@input[["win_ihs"]], delim = '\t')
)
head(ihs_df)

In [None]:
# Flag iHS outliers separately within each habitat, reusing the nSL outlier
# function and its minimum-sites threshold
ihs_df_filt <- purrr::map_dfr(
    group_split(ihs_df, habitat),
    assign_nsl_outliers,
    thresh = nsl_nsites_thresh
)

sprintf("%s of %s iHS windows remaining after removing those with less than %s sites", nrow(ihs_df_filt), nrow(ihs_df), nsl_nsites_thresh)

head(ihs_df_filt)

In [None]:
# Rural iHS Manhattan plot: size the notebook figure, display the plot and
# save it as a PDF
options(repr.plot.width = 20, repr.plot.height = 6)
rural_ihs_manhat <- plot_single_population_hapstat_win_manhat(
    df = ihs_df_filt,
    hab = "Rural",
    stat = 'iHS'
)
rural_ihs_manhat
ggsave(
    filename = snakemake@output[["rur_ihs_manhat"]],
    plot = rural_ihs_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Urban iHS Manhattan plot, displayed and saved as a PDF
urban_ihs_manhat <- plot_single_population_hapstat_win_manhat(
    df = ihs_df_filt,
    hab = "Urban",
    stat = 'iHS'
)
urban_ihs_manhat
ggsave(
    filename = snakemake@output[["urb_ihs_manhat"]],
    plot = urban_ihs_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Write the iHS outlier windows to a tab-delimited file
write_delim(
    filter(ihs_df_filt, is_outlier == 1),
    snakemake@output[["ihs_df"]],
    delim = "\t"
)

## iHH12

In [None]:
# Load the windowed iHH12 table (tab-delimited) and recode chromosome names to
# numeric indices
ihh12_df <- remap_chr_names(
    read_delim(snakemake@input[["win_ihh12"]], delim = '\t')
)
head(ihh12_df)

In [None]:
# Flag iHH12 outliers separately within each habitat, reusing the nSL outlier
# function and its minimum-sites threshold
ihh12_df_filt <- purrr::map_dfr(
    group_split(ihh12_df, habitat),
    assign_nsl_outliers,
    thresh = nsl_nsites_thresh
)

sprintf("%s of %s iHH12 windows remaining after removing those with less than %s sites", nrow(ihh12_df_filt), nrow(ihh12_df), nsl_nsites_thresh)

head(ihh12_df_filt)

In [None]:
# Urban iHH12 Manhattan plot: size the notebook figure, display the plot and
# save it as a PDF
options(repr.plot.width = 20, repr.plot.height = 6)
urban_ihh12_manhat <- plot_single_population_hapstat_win_manhat(
    df = ihh12_df_filt,
    hab = "Urban",
    stat = 'iHH12'
)
urban_ihh12_manhat
ggsave(
    filename = snakemake@output[["urb_ihh12_manhat"]],
    plot = urban_ihh12_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Rural iHH12 Manhattan plot, displayed and saved as a PDF
rural_ihh12_manhat <- plot_single_population_hapstat_win_manhat(
    df = ihh12_df_filt,
    hab = "Rural",
    stat = 'iHH12'
)
rural_ihh12_manhat
ggsave(
    filename = snakemake@output[["rur_ihh12_manhat"]],
    plot = rural_ihh12_manhat,
    device = "pdf",
    width = 20, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Write the iHH12 outlier windows to a tab-delimited file
write_delim(
    filter(ihh12_df_filt, is_outlier == 1),
    snakemake@output[["ihh12_df"]],
    delim = "\t"
)

## Fst

In [None]:
# Load windowed Fst, keep the Urban-vs-Rural comparison, rename the coordinate
# columns to the names used elsewhere in this notebook and recode chromosomes.
# winCenter is the window start minus 1 plus 25000 (presumably half of a 50 Kb
# window; confirm against the windowing step).
gt_fst_df <- read_delim(snakemake@input[["gt_win_fst"]]) %>% 
    filter(pop1 == "Urban", pop2 == "Rural") %>%
    rename(
        Chr = chromosome,
        start = window_pos_1,
        end = window_pos_2
    ) %>% 
    mutate(winCenter = (start - 1) + 25000) %>% 
    remap_chr_names()
head(gt_fst_df)

In [None]:
# Keep windows with enough SNPs and set negative Fst estimates to zero
gt_fst_df_filt <- mutate(
    filter(gt_fst_df, no_snps >= xpnsl_nsites_thresh),
    avg_hudson_fst = ifelse(avg_hudson_fst < 0, 0, avg_hudson_fst)
)

# Genome-wide 99th percentile of Fst, used as the outlier cutoff
gt_fst_quant <- quantile(gt_fst_df_filt$avg_hudson_fst, probs = 0.99)

# Flag windows at or above the cutoff (1 = outlier, 0 = not)
gt_fst_df_filt <- mutate(
    gt_fst_df_filt,
    is_outlier = ifelse(avg_hudson_fst >= gt_fst_quant, 1, 0)
)

In [None]:
# Minimum and maximum windowed Fst after filtering
range(gt_fst_df_filt$avg_hudson_fst)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 4)

# Setting up cumulative genome-wide x-axis: every chromosome is shifted by the
# summed maximum window centers of the chromosomes that precede it
data_cum <- gt_fst_df_filt %>% 
    group_by(Chr) %>% 
    summarise(max_winCenter = max(winCenter)) %>% 
    mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
    dplyr::select(Chr, winCenter_add)

gt_fst_df_filt_mod <- gt_fst_df_filt %>%
    inner_join(data_cum, by = "Chr") %>% 
    mutate(winCenter_cum = winCenter + winCenter_add) 

# One x-axis tick per chromosome, placed at the mean cumulative position
axis_set <- gt_fst_df_filt_mod %>% 
    group_by(Chr) %>% 
    summarize(center = mean(winCenter_cum))

# Get Fst outliers for plotting as separate layers. Non-outlier windows
# alternate between two greys by chromosome (odd = 'One', even = 'Two').
gt_fst_not_outlier <- gt_fst_df_filt_mod %>% 
    filter(is_outlier == 0) %>%
    mutate(chrom_cat = case_when(Chr %in% seq(1, 15, 2) ~ 'One',
                                 Chr %in% seq(2, 16, 2) ~ 'Two'))
gt_fst_outliers <- gt_fst_df_filt_mod %>% filter(is_outlier == 1)

gt_fst_manhat <- ggplot() +
        geom_point(data = gt_fst_not_outlier, shape = 21, alpha = 0.4, size = 3, 
                   aes(x = winCenter_cum, y = avg_hudson_fst, fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = gt_fst_outliers, shape = 21, alpha = 1, size = 3, color = 'red', fill = 'red',
                   aes(x = winCenter_cum, y = avg_hudson_fst)) +
        # Dashed line at the genome-wide outlier cutoff
        geom_hline(yintercept = gt_fst_quant, color = "grey40", linetype = "dashed") +
        scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
        scale_y_continuous(expand = c(0,0), breaks = seq(0, 0.06, 0.02)) +
        coord_cartesian(ylim = c(0, 0.06)) +
        # Named values keep the colours tied to the category even when only
        # one category is present in the data
        scale_fill_manual(values = c(One = "black", Two = "grey40")) + 
        scale_color_manual(values = c(One = "black", Two = "grey40")) + 
        ylab("Urban-Rural Hudson's Fst") + xlab('Chromosomes') +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size=20)) +
        transp_theme

options(repr.plot.width = 20, repr.plot.height = 6)
gt_fst_manhat
ggsave(filename = snakemake@output[["gt_fst_manhat"]], plot = gt_fst_manhat, 
       height = 4, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Write the Fst outlier windows to a tab-delimited file
write_delim(
    filter(gt_fst_df_filt, is_outlier == 1),
    snakemake@output[["gt_fst_df"]],
    delim = "\t"
)

## Tajima's D

In [None]:
# Load the windowed SFS-based statistics, recode chromosomes and keep only the
# coordinate, site-count and Tajima's D columns (WinCenter is renamed to
# winCenter to match the other tables)
win_sfs_stats <- read_delim(snakemake@input[["win_sfs_fst"]], delim = "\t") %>% 
    remap_chr_names() %>%
    dplyr::select(
        Chr, start, end,
        winCenter = WinCenter,
        nSites_Urban, nSites_Rural,
        Tajima_Urban, Tajima_Rural,
        delta_td_ur
    )

In [None]:
# 1st and 99th percentiles of the urban - rural difference in Tajima's D
td_diff_quant <- quantile(win_sfs_stats$delta_td_ur, probs = c(0.01, 0.99))

# Windows in either tail are outliers (1); everything else, including windows
# where the comparison is missing, is 0
win_sfs_stats_filt <- mutate(
    win_sfs_stats,
    is_outlier = if_else(
        delta_td_ur <= td_diff_quant[1] | delta_td_ur >= td_diff_quant[2],
        1, 0,
        missing = 0
    )
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 4)

# Setting up cumulative genome-wide x-axis: every chromosome is shifted by the
# summed maximum window centers of the chromosomes that precede it
data_cum <- win_sfs_stats_filt %>% 
    group_by(Chr) %>% 
    summarise(max_winCenter = max(winCenter)) %>% 
    mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
    dplyr::select(Chr, winCenter_add)

win_sfs_stats_mod <- win_sfs_stats_filt %>%
    inner_join(data_cum, by = "Chr") %>% 
    mutate(winCenter_cum = winCenter + winCenter_add) 

# One x-axis tick per chromosome, placed at the mean cumulative position
axis_set <- win_sfs_stats_mod %>% 
    group_by(Chr) %>% 
    summarize(center = mean(winCenter_cum))

# Outliers and non-outliers are plotted as separate layers. Non-outlier windows
# alternate between two greys by chromosome (odd = 'One', even = 'Two').
td_not_outlier <- win_sfs_stats_mod %>% 
    filter(is_outlier == 0) %>%
    mutate(chrom_cat = case_when(Chr %in% seq(1, 15, 2) ~ 'One',
                                 Chr %in% seq(2, 16, 2) ~ 'Two'))
td_outliers <- win_sfs_stats_mod %>% filter(is_outlier == 1)

tajima_manhat <- win_sfs_stats_mod %>% 
    ggplot(aes(x = winCenter_cum, y = delta_td_ur)) +
        geom_point(data = td_not_outlier, shape = 21, alpha = 0.4, size = 3, 
                   aes(x = winCenter_cum, y = delta_td_ur, fill = chrom_cat, color = chrom_cat)) +
        geom_point(data = td_outliers, shape = 21, alpha = 1, size = 3, color = 'red', fill = 'red',
                   aes(x = winCenter_cum, y = delta_td_ur)) +
        # geom_smooth(aes(group = Chr), method = "loess", span = 0.1, se = FALSE, color = "yellow",
        #             linewidth = 0.75) +
        scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
        # Named values keep the colours tied to the category even when only
        # one category is present in the data
        scale_color_manual(values = c(One = "black", Two = "grey40")) + 
        scale_fill_manual(values = c(One = "black", Two = "grey40")) + 
        ylab(bquote(Tajima[Urban]~-~Tajima[Rural])) + xlab('Chromosomes') +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size=20)) +
        transp_theme
tajima_manhat

ggsave(filename = snakemake@output[["tajima_manhat"]], plot = tajima_manhat, 
       height = 4, width = 20, device = "pdf", dpi = 600, units = "in")

## LD

In [None]:
# Load windowed LD (tab-delimited); the `mean` column is renamed to `ld` and
# chromosome names are recoded to numeric indices
win_ld_df <- remap_chr_names(
    rename(
        read_delim(snakemake@input[["win_ld"]], delim = "\t"),
        ld = mean
    )
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 4)

# Setting up cumulative genome-wide x-axis: every chromosome is shifted by the
# summed maximum window centers of the chromosomes that precede it
data_cum <- win_ld_df %>% 
    group_by(Chr) %>% 
    summarise(max_winCenter = max(winCenter)) %>% 
    mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
    dplyr::select(Chr, winCenter_add)

win_ld_df_mod <- win_ld_df %>%
    inner_join(data_cum, by = "Chr") %>% 
    mutate(winCenter_cum = winCenter + winCenter_add) 

# One x-axis tick per chromosome, placed at the mean cumulative position
axis_set <- win_ld_df_mod %>% 
    group_by(Chr) %>% 
    summarize(center = mean(winCenter_cum))

# Odd chromosomes are 'One', even chromosomes are 'Two'
win_ld_df_mod <- win_ld_df_mod %>%
        mutate(chrom_cat = case_when(Chr %in% seq(1, 15, 2) ~ 'One',
                                     Chr %in% seq(2, 16, 2) ~ 'Two'))

# One loess curve per chromosome and habitat. Urban is drawn in '#003876' and
# Rural in '#007243', matching the habitat colours used by the
# single-population Manhattan plots in this notebook.
ld_manhat <- win_ld_df_mod %>% 
    ggplot(aes(x = winCenter_cum, y = ld)) +
        scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
        geom_smooth(data = . %>% filter(habitat == "Urban"), aes(group = Chr), 
                    method = "loess", span = 0.1, se = FALSE, color = "#003876",
                    linewidth = 0.75) +
        geom_smooth(data = . %>% filter(habitat == "Rural"), aes(group = Chr), 
                    method = "loess", span = 0.1, se = FALSE, color = "#007243",
                    linewidth = 0.75) +
        ylab(bquote(LD~(r^2))) + xlab('Chromosomes') +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size=20)) +
        transp_theme
ld_manhat
ggsave(filename = snakemake@output[["ld_manhat"]], plot = ld_manhat, 
       height = 4, width = 20, device = "pdf", dpi = 600, units = "in")

## Recombination rate

In [None]:
# Load the genetic map (tab-delimited), rename `chrom` to `Chr` and recode
# chromosome names to numeric indices
gen_map <- remap_chr_names(
    rename(
        read_delim(snakemake@input[["gen_map"]], delim = "\t"),
        Chr = chrom
    )
)

In [None]:
# Recombination rate between consecutive map markers, computed within each
# chromosome. The first marker of a chromosome has a zero distance to itself,
# so its rate is NA. The result stays grouped by Chr.
gen_map_with_rates <- gen_map %>% 
    group_by(Chr) %>% 
    # Physical (Mbp) and genetic (cM) distance to the previous marker
    mutate(
        pos_diff_Mbp = (pos - lag(pos, default = first(pos))) / 1e6,
        cM_diff = cM - lag(cM, default = first(cM))
    ) %>% 
    # Rate is only defined when the physical distance is positive
    mutate(rate_cM_Mbp = ifelse(pos_diff_Mbp > 0, cM_diff / pos_diff_Mbp, NA))

head(gen_map_with_rates)

In [None]:
# Mean recombination rate in genomic windows along a single chromosome.
#
# Args:
#   df:          Markers of ONE chromosome with columns `Chr`, `pos` and
#                `rate_cM_Mbp`.
#   window_size: Width of each window, in the same units as `pos`.
#   step:        Distance between consecutive window starts. When it equals
#                `window_size` the windows tile the chromosome without overlap.
#
# Returns:
#   A data frame with one row per window and numeric columns `Chr`, `winID`,
#   `start`, `end`, `winCenter` and `mean` (mean of the non-missing rates of
#   markers with start <= pos < end; NaN when a window has no usable marker).
calculate_windowed_recomb <- function(df, window_size, step){
    chrom <- unique(df$Chr)
    stopifnot(length(chrom) == 1)

    # Windows start at 0 and continue one window past the last marker
    win_starts <- seq(from = 0, to = max(df$pos) + window_size, by = step)
    win_ends <- win_starts + window_size

    win_means <- vapply(seq_along(win_starts), function(i) {
        # which() drops markers with a missing position
        in_win <- which(df$pos >= win_starts[i] & df$pos < win_ends[i])
        suppressWarnings(mean(df$rate_cM_Mbp[in_win], na.rm = TRUE))
    }, numeric(1))

    stats_df <- data.frame(
        Chr = chrom,
        winID = as.numeric(seq_along(win_starts)),
        start = win_starts,
        end = win_ends,
        winCenter = win_starts + (window_size / 2),
        mean = win_means
    )
    return(stats_df)
}

# Windowed recombination rate for every chromosome (50 Kb windows and 50 Kb
# step), stacked into one table
win_recomb <- purrr::map_dfr(
    group_split(ungroup(gen_map_with_rates), Chr),
    calculate_windowed_recomb,
    window_size = 50000,
    step = 50000
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 4)

# Setting up cumulative genome-wide x-axis: every chromosome is shifted by the
# summed maximum window centers of the chromosomes that precede it
data_cum <- win_recomb %>% 
    group_by(Chr) %>% 
    summarise(max_winCenter = max(winCenter)) %>% 
    mutate(winCenter_add = lag(cumsum(max_winCenter), default = 0)) %>% 
    dplyr::select(Chr, winCenter_add)

win_recomb_mod <- win_recomb %>%
    inner_join(data_cum, by = "Chr") %>% 
    mutate(winCenter_cum = winCenter + winCenter_add) 

# One x-axis tick per chromosome, placed at the mean cumulative position
axis_set <- win_recomb_mod %>% 
    group_by(Chr) %>% 
    summarize(center = mean(winCenter_cum))

# Odd chromosomes are 'One', even chromosomes are 'Two'
win_recomb_mod <- win_recomb_mod %>%
        mutate(chrom_cat = case_when(Chr %in% seq(1, 15, 2) ~ 'One',
                                     Chr %in% seq(2, 16, 2) ~ 'Two'))

recomb_manhat <- win_recomb_mod %>% 
    ggplot(aes(x = winCenter_cum, y = mean)) +
        scale_x_continuous(labels = axis_set$Chr, breaks = axis_set$center) +
        # group = Chr draws each chromosome as its own line; colouring by
        # chrom_cat alone would join all chromosomes sharing a category into
        # one line running across the chromosomes in between
        geom_line(linewidth = 1.5, aes(color = chrom_cat, group = Chr)) +
        ylab("Recombination rate (cM/Mb)") + xlab('Chromosomes') +
        scale_color_manual(values = c(One = "black", Two = "grey40")) + 
        coord_cartesian(ylim = c(0, 145)) +
        scale_y_continuous(breaks = seq(0, 140, 20)) +
        theme_classic() +
        theme(
            legend.position = "none",
            panel.border = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            axis.text = element_text(size=16),
            axis.title = element_text(size=20)) +
        transp_theme
recomb_manhat
ggsave(filename = snakemake@output[["recomb_manhat"]], plot = recomb_manhat, 
       height = 4, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# XP-nSL windows assigned to a habitat (direction other than "None"), matched
# by chromosome and window center to the windowed recombination rate. The
# XP-nSL `mean` is renamed to `xpnsl`, so `mean` in the result is the
# recombination rate.
xpnsl_recomb_df <- obs_xpnsl_ur_df_filt %>% 
    filter(direction != "None") %>% 
    dplyr::select(Chr, winCenter, xpnsl = mean) %>% 
    left_join(win_recomb, by = c("Chr", "winCenter"))

In [None]:
# Linear model of the XP-nSL window score on the windowed recombination rate
# (`mean`), restricted to windows with a rate below 100.
# NOTE(review): this fits the signed `xpnsl`, while the scatterplot of the same
# data plots abs(xpnsl); confirm which response is intended.
summary(lm(xpnsl ~ mean, data = xpnsl_recomb_df %>% filter(mean < 100)))

In [None]:
# Show the windows with a recombination rate above 100, i.e. those left out of
# the regression and scatterplot
xpnsl_recomb_df %>% 
    filter(mean > 100)

In [None]:
options(repr.plot.width = 8, repr.plot.height = 8)

# Absolute XP-nSL score against recombination rate with a linear fit, for
# windows with a recombination rate below 100
xpnsl_vs_recomb_plot <- ggplot(
        filter(xpnsl_recomb_df, mean < 100),
        aes(x = mean, y = abs(xpnsl))
    ) +
    geom_point(size = 4) +
    geom_smooth(method = "lm", color = "black", linewidth = 1.5) +
    labs(x = "Recombination rate (cM/Mb)", y = "Normalized XP-nSL") +
    theme_classic() +
    theme(
        legend.position = "none",
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)
    ) +
    transp_theme
xpnsl_vs_recomb_plot
ggsave(
    filename = snakemake@output[["xpnsl_vs_recomb_plot"]],
    plot = xpnsl_vs_recomb_plot,
    device = "pdf",
    width = 8, height = 8, units = "in",
    dpi = 600
)

In [None]:
# Preview the first rows of the Fst window table used below
head(gt_fst_df_filt)

In [None]:
# Pair every Fst outlier window with the entry of `win_recomb` that has the
# same chromosome and window centre
fst_recomb_df <- left_join(
    gt_fst_df_filt %>% 
        filter(is_outlier == 1) %>% 
        dplyr::select(Chr, winCenter, avg_hudson_fst),
    win_recomb,
    by = c("Chr", "winCenter")
)

In [None]:
# Linear model of Hudson's Fst on `mean` (the joined recombination column)
# across all Fst outlier windows
summary(lm(avg_hudson_fst ~ mean, data = fst_recomb_df))

In [None]:
# Hudson's Fst of outlier windows against recombination rate with a linear fit
fst_vs_recomb_plot <- ggplot(fst_recomb_df, aes(x = mean, y = avg_hudson_fst)) +
    geom_point(size = 4) +
    geom_smooth(method = "lm", color = "black", linewidth = 1.5) +
    labs(x = "Recombination rate (cM/Mb)", y = "Hudson's Fst") +
    theme_classic() +
    theme(
        legend.position = "none",
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)
    ) +
    transp_theme
fst_vs_recomb_plot
ggsave(
    filename = snakemake@output[["fst_vs_recomb_plot"]],
    plot = fst_vs_recomb_plot,
    device = "pdf", width = 8, height = 8, units = "in", dpi = 600
)

# Create output tables

## Dataframe with gene ID in top selected regions

In [None]:
# Outlier windows per statistic. Each table gets an `id` column of the form
# <Chr>__<winCenter>. The haplotype statistics are restricted to the urban
# habitat; Fst and the SFS-based table are filtered on `is_outlier` only.
xpnsl_urban_outliers <- obs_xpnsl_ur_df_filt %>% 
    filter(direction == "Urban") %>% 
    mutate(id = paste0(Chr, '__', winCenter))
nsl_urban_outlier <- nsl_df_filt %>% 
    filter(habitat == "Urban", is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))
ihs_urban_outlier <- ihs_df_filt %>% 
    filter(habitat == "Urban", is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))
ihh12_urban_outlier <- ihh12_df_filt %>% 
    filter(habitat == "Urban", is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))
gt_fst_outlier <- gt_fst_df_filt %>% 
    filter(is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))
td_outlier <- win_sfs_stats_filt %>% 
    filter(is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))

In [None]:
# Rural counterparts of the urban outlier tables, with the same
# <Chr>__<winCenter> `id` column
xpnsl_rural_outliers <- obs_xpnsl_ur_df_filt %>% 
    filter(direction == "Rural") %>% 
    mutate(id = paste0(Chr, '__', winCenter))
nsl_rural_outlier <- nsl_df_filt %>% 
    filter(habitat == "Rural", is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))
ihs_rural_outlier <- ihs_df_filt %>% 
    filter(habitat == "Rural", is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))
ihh12_rural_outlier <- ihh12_df_filt %>% 
    filter(habitat == "Rural", is_outlier == 1) %>% 
    mutate(id = paste0(Chr, '__', winCenter))

In [None]:
# Reduce a window table to its three coordinate columns, in the order
# Chr, start, end.
#
# df: data frame containing at least the columns Chr, start and end.
# Returns `df` restricted to those three columns.
get_positions <- function(df){
    dplyr::select(df, Chr, start, end)
}

In [None]:
# Coordinate-only (Chr, start, end) versions of the top-hit and outlier tables
xpnsl_topHits_positions <- xpnsl_topHits_urban_rural %>% get_positions()
nsl_urban_positions <- nsl_urban_outlier %>% get_positions()
nsl_rural_positions <- nsl_rural_outlier %>% get_positions()
ihs_urban_positions <- ihs_urban_outlier %>% get_positions()
ihs_rural_positions <- ihs_rural_outlier %>% get_positions()

In [None]:
# Display the table of top XP-nSL regions
xpnsl_topHits_urban_rural

In [None]:
# Values from the `td_outlier` windows that overlap each top XP-nSL region.
# bt.intersect returns unnamed columns: V1-V3 are the region coordinates
# (from the first input); V10 and V11 come from `td_outlier`.
# NOTE(review): V10/V11 are assumed to be the urban and rural Tajima's D
# columns; this depends on the column order of `td_outlier` -- confirm.
tajima_vals <- bt.intersect(xpnsl_topHits_positions, td_outlier, wa = TRUE, wb = TRUE) %>% 
    dplyr::select(V1, V2, V3, V10, V11)
names(tajima_vals) <- c("Chr", "start", "end", "Tajima_Urban", "Tajima_Rural")

# One row per region: average over all overlapping windows. `.groups = "drop"`
# returns an ungrouped table instead of one still grouped by Chr and start.
tajima_vals <- tajima_vals %>% 
    group_by(Chr, start, end) %>% 
    summarise(Tajima_Urban = mean(Tajima_Urban),
              Tajima_Rural = mean(Tajima_Rural),
              .groups = "drop")
tajima_vals

In [None]:
# Count, for every interval in `pos1`, how many intervals in `pos2` overlap it.
#
# pos1: data frame of query intervals. It must have exactly three columns
#       (Chr, start, end) because the count is read from the fourth output
#       column (V4) of `bt.intersect(..., c = TRUE)`.
# pos2: data frame of intervals to count against `pos1`.
# Returns the vector of counts, one element per output row.
# NOTE(review): callers assign the result back onto the rows of `pos1`, which
# assumes the output keeps the row order of `pos1` -- confirm.
count_overlapping_outlier_windows <- function(pos1, pos2){
    
    # `TRUE` instead of the reassignable shorthand `T`
    num_outliers <- bt.intersect(pos1, pos2, c = TRUE) %>% pull(V4)
    return(num_outliers)
} 

In [None]:
# Coordinates of the top XP-nSL regions, extracted once and reused for every count
top_hits_pos <- get_positions(xpnsl_topHits_urban_rural)

# Number of windows in `outliers` overlapping each top XP-nSL region
n_overlaps <- function(outliers){
    count_overlapping_outlier_windows(top_hits_pos, get_positions(outliers))
}

# Annotate each top region with the overlap counts for every other statistic,
# then attach the per-region Tajima's D averages
xpnsl_topHits_urban_rural_with_other_stats <- xpnsl_topHits_urban_rural %>% 
    mutate(
        num_nsl_urban = n_overlaps(nsl_urban_outlier),
        num_nsl_rural = n_overlaps(nsl_rural_outlier),
        num_ihs_urban = n_overlaps(ihs_urban_outlier),
        num_ihs_rural = n_overlaps(ihs_rural_outlier),
        num_ihh12_urban = n_overlaps(ihh12_urban_outlier),
        num_ihh12_rural = n_overlaps(ihh12_rural_outlier),
        num_fst = n_overlaps(gt_fst_outlier),
        num_td = n_overlaps(td_outlier)
    ) %>% 
    left_join(tajima_vals, by = c("Chr", "start", "end"))

In [None]:
# Read the GFF3 annotation and move the coordinate columns to the front
gff <- ape::read.gff(snakemake@input[['gff']], GFF3 = TRUE)
gff <- dplyr::select(gff, seqid, start, end, everything())

# Gene table: gene symbol and gene ID are parsed out of the `attributes`
# string of every 'gene' feature; chromosome names are remapped to numbers
gene_names <- gff %>% 
    filter(type == 'gene') %>% 
    mutate(
        gene = str_extract(attributes, pattern = '(?<=gene=)\\w+(?=;)'),
        gene_id = str_extract(attributes, pattern = '(?<=ID\\=)ACLI19_g\\d+(?=;)')
    ) %>% 
    dplyr::select(Chr = seqid, start, end, gene_id, gene) %>% 
    remap_chr_names()

# Product and GO terms per gene, taken from 'mRNA' features. The transcript
# ID is split into gene ID and transcript suffix, and only the 't1'
# transcript of each gene is kept. `go` is a list column (one character
# vector of GO IDs per row).
prod_go_annot <- gff %>% 
    filter(type == 'mRNA') %>% 
    mutate(
        id = str_extract(attributes, pattern = '(?<=ID\\=)ACLI19_g\\d+\\.t\\d+(?=;)'),
        func = str_extract(attributes, pattern = '(?<=product=)[^;]*'),
        go = str_extract_all(attributes, pattern = 'GO:\\d+(?=(,|;))')
    ) %>% 
    separate(id, into = c('gene_id', 'trans'), sep = '\\.') %>% 
    filter(trans == 't1') %>% 
    dplyr::select(Chr = seqid, gene_id, func, go) %>% 
    remap_chr_names()

# One table with coordinates, symbol, product and GO terms for every gene
genes_prods_go_df <- left_join(gene_names, prod_go_annot, by = c('Chr', 'gene_id'))

In [None]:
# Genes overlapping each top XP-nSL region, one row per region/gene pair.
# Input A has four columns (V1-V4: Chr, start, end, direction); input B is the
# gene table without the `go` list column, so V8-V10 are gene_id, gene and
# func respectively.
xpnsl_topHits_urban_rural_with_genes_long <- bt.intersect(
        xpnsl_topHits_urban_rural %>% 
            dplyr::select(Chr, start, end, direction),
        genes_prods_go_df %>% 
            dplyr::select(-go),
        wa = TRUE, wb = TRUE  # `TRUE` instead of the reassignable shorthand `T`
    ) %>% 
    dplyr::select(V1, V2, V3, V4, V8, V9, V10)
names(xpnsl_topHits_urban_rural_with_genes_long) <- c('Chr', 'start', 'end', 'direction', 'gene_id', 'gene_symbol', 'product')
head(xpnsl_topHits_urban_rural_with_genes_long)

# Write dataframe with gene IDs for top ten selected urban and rural regions
write_delim(xpnsl_topHits_urban_rural_with_genes_long, snakemake@output[['top_hits_genes']], delim = '\t')

## Table with gene symbols, products, and Fst for top selected regions

In [None]:
# Columns identifying one top XP-nSL region; used both for grouping and as
# explicit join keys below
region_keys <- c("Chr", "start", "end", "direction")

# Concatenate gene symbols for top 10 selected regions (genes without a
# symbol are skipped); one comma-separated string per region
symbols <- xpnsl_topHits_urban_rural_with_genes_long %>% 
    dplyr::select(-gene_id, -product) %>% 
    filter(!is.na(gene_symbol)) %>% 
    group_by(Chr, start, end, direction) %>% 
    summarise(gene_symbols = toString(gene_symbol), .groups = "drop")

# Concatenate products for top 10 selected regions. The `!=` comparison drops
# 'hypothetical protein' entries and also rows whose product is NA.
prods <- xpnsl_topHits_urban_rural_with_genes_long %>% 
    dplyr::select(-gene_id, -gene_symbol) %>% 
    filter(product != 'hypothetical protein') %>% 
    group_by(Chr, start, end, direction) %>% 
    summarise(products = toString(product), .groups = "drop")

# Add symbols and products to table. The join keys are given explicitly so
# the join cannot silently pick up other columns the tables share.
xpnsl_topHits_urban_rural_with_genes <- xpnsl_topHits_urban_rural_with_other_stats %>% 
    left_join(symbols, by = region_keys) %>% 
    left_join(prods, by = region_keys)

xpnsl_topHits_urban_rural_with_genes

In [None]:
# Write table with gene symbols, products, and Fst overlaps for top ten selected urban and rural regions
write_delim(xpnsl_topHits_urban_rural_with_genes, snakemake@output[['top_hits_tbl']], delim = '\t')

## Dataframe with gene IDs in selected regions based on XP-nSL

In [None]:
# All selected regions: every XP-nSL window flagged in either direction
xpnsl_outliers <- obs_xpnsl_ur_df_filt %>% 
    filter(direction != 'None') %>% 
    dplyr::select(Chr, start, end, direction)

# Genes overlapping each selected window. Input A has four columns (V1-V4) and
# `gene_names` follows as V5-V9, so V8 is its fourth column (gene_id).
# `TRUE` replaces the reassignable shorthand `T`.
xpnsl_outliers_with_genes <- bt.intersect(xpnsl_outliers, gene_names, wa = TRUE, wb = TRUE) %>% 
    dplyr::select(V1, V2, V3, V4, V8)
names(xpnsl_outliers_with_genes) <- c('Chr', 'start', 'end', 'direction', 'gene_id')
xpnsl_outliers_with_genes

# Write dataframe with gene IDs for all selected urban and rural regions
write_delim(xpnsl_outliers_with_genes, snakemake@output[['xpnsl_out_genes']], delim = '\t')

## Polygenic selection

In [None]:
# Read one tab-delimited per-site XP-nSL file and tag it with its chromosome.
#
# path: file path; the chromosome name is the part of the file name that
#       precedes "_Urban".
# Returns the file contents with a `Chr` column added and remapped by
# remap_chr_names().
load_persite_xpnsl <- function(path){
    chrom <- str_extract(string = basename(path), pattern = ".*(?=_Urban)")
    # Column-type messages from read_delim are silenced
    persite <- suppressMessages(read_delim(path, delim="\t"))
    remap_chr_names(mutate(persite, Chr = chrom))
}
# Stack the per-site files of all chromosomes and flag outlier sites:
# crit is 1 when |normalized XP-nSL| > 2; the sign of the score then gives the
# direction ("Urban" for positive, "Rural" for negative, "None" otherwise)
xpnsl_persite <- purrr::map_dfr(snakemake@input[["norm_xpnsl"]], load_persite_xpnsl) %>% 
    rename("normxpnsl" = "normxpehh") %>% 
    mutate(
        crit = ifelse(abs(normxpnsl) > 2, 1, 0),
        direction = case_when(
            crit == 1 & normxpnsl < 0 ~ "Rural",
            crit == 1 & normxpnsl > 0 ~ "Urban",
            TRUE ~ "None"
        )
    )

In [None]:
# Number of annotated genes per chromosome
num_genes_df <- summarise(group_by(gene_names, Chr), num_genes = n())

# Chromosome sizes (two unnamed columns in the file: name, size)
chr_lengths <- remap_chr_names(
    read_delim(snakemake@input[["chr_lengths"]], delim="\t", 
               col_names = c("Chr", "size"))
)

# Number of per-site XP-nSL outliers per chromosome, by direction
num_xpnsl_outliers_by_chr_urban <- summarise(
    group_by(filter(xpnsl_persite, direction == "Urban"), Chr),
    Urban = n()
)
num_xpnsl_outliers_by_chr_rural <- summarise(
    group_by(filter(xpnsl_persite, direction == "Rural"), Chr),
    Rural = n()
)

# One row per chromosome and habitat. Sequences whose name was not remapped
# (Chr is NA) are dropped.
chr_size_outliers_df <- chr_lengths %>% 
    left_join(num_genes_df, by = "Chr") %>% 
    left_join(num_xpnsl_outliers_by_chr_urban, by = "Chr") %>% 
    left_join(num_xpnsl_outliers_by_chr_rural, by = "Chr") %>% 
    filter(!is.na(Chr)) %>% 
    pivot_longer(Urban:Rural, names_to = "habitat", values_to = "num_outliers")

head(chr_size_outliers_df)

In [None]:
# Linear model of the per-chromosome outlier count on gene count, habitat and
# their interaction
summary(lm(num_outliers ~ num_genes*habitat, data = chr_size_outliers_df))

In [None]:
# Notebook display size
options(repr.plot.width = 8, repr.plot.height = 8)

# Per-chromosome number of XP-nSL outliers against number of genes, with a
# separate linear fit per habitat. The manual colours are unnamed, so they are
# assigned to the habitat values in their sorted order.
num_xpnsl_by_num_genes <- ggplot(chr_size_outliers_df, aes(x = num_genes, y = num_outliers)) + 
    geom_point(aes(color = habitat, fill = habitat), size = 4) +
    geom_smooth(aes(color = habitat), method = "lm", linewidth = 1.5) +
    labs(x = "Number of genes", y = "Number of XP-nSL outliers") +
    scale_color_manual(values = c("#007243", "#003876")) +
    scale_fill_manual(values = c("#007243", "#003876")) +
    theme_classic() +
    theme(
        legend.position = "none",
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)
    ) +
    transp_theme
num_xpnsl_by_num_genes

ggsave(
    filename = snakemake@output[["num_xpnsl_by_num_genes"]],
    plot = num_xpnsl_by_num_genes,
    device = "pdf", width = 8, height = 8, units = "in", dpi = 600
)

## Zoomed-in Manhattan and Haplotype plots

### Positive selection in urban habitats

In [None]:
# Top urban regions with their size, rank and the overlap counts of the other
# statistics (columns num_nsl_urban through num_fst)
xpnsl_topHits_urban_rural_with_other_stats %>% 
    filter(direction == 'Urban') %>% 
    dplyr::select(Chr, start, end, win_size, min_xpnsl_rank, num_nsl_urban:num_fst)

In [None]:
# Site(s) carrying the largest normalized XP-nSL score across all chromosomes
max_xpnsl <- xpnsl_persite %>% 
    filter(normxpnsl == max(normxpnsl))
max_xpnsl

In [None]:
# Turn the top-scoring site into a single-position interval (start == end)
# and look up the gene(s) it falls in.
# NOTE(review): the variable name `max` shadows base::max. Calls such as
# `max(x)` still resolve to the function, but a rename would be safer; the
# name is kept here in case other cells refer to it.
max <- max_xpnsl %>% 
    dplyr::select(Chr, pos) %>% 
    rename('start' = 'pos') %>% 
    mutate(end = start) %>% 
    dplyr::select(Chr, start, end)
genes <- genes_prods_go_df %>% dplyr::select(Chr, start, end, gene, func)
# `TRUE` instead of the reassignable shorthand `T`
bt.intersect(max, genes, wb = TRUE)

#### Chromosome 7

In [None]:
# Axis-label formatter: divides positions by one million and prints them with
# exactly two decimals.
#
# x: numeric vector.
# Returns a character vector of the same length as `x`.
formatter1e6 <- function(x){ 
    sprintf('%#.2f', x / 1e6)
}

In [None]:
# Top urban XP-nSL region(s) on chromosome 7
region_df <- xpnsl_topHits_urban_rural %>% 
    filter(direction == 'Urban') %>% 
    filter(Chr == 7) %>%
    dplyr::select(Chr, start, end)

chrom <- region_df %>% pull(Chr) %>% unique()
min_pos <- region_df %>% pull(start) %>% min()
max_pos <- region_df %>% pull(end) %>% max()
buffer <- 500000  # flank (bp) shown on each side of the top region(s)

# Urban XP-nSL outlier windows lying entirely inside the buffered interval
all_outlier_regions <- xpnsl_urban_outliers %>% 
    filter(Chr == chrom & start >= min_pos - buffer & end <= max_pos + buffer) %>% 
    dplyr::select(Chr, start, end)

# Per-site scores inside the buffered interval, split by outlier status
region_raw_df <- xpnsl_persite %>%
    filter(Chr == chrom & pos >= min_pos - buffer & pos <= max_pos + buffer)

xpnsl_not_outlier <- region_raw_df %>% filter(crit == 0)
urban_xpnsl_outliers <- region_raw_df %>% filter(crit == 1 & normxpnsl > 0)
rural_xpnsl_outliers <- region_raw_df %>% filter(crit == 1 & normxpnsl < 0)

# Fst and nSL outlier windows that overlap any of the XP-nSL outlier windows
fst_outliers <- gt_fst_df_filt %>% filter(is_outlier == 1) %>% dplyr::select(Chr, start, end)
fst_wins <- bt.intersect(fst_outliers, all_outlier_regions, wa = TRUE) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')
# NOTE(review): nSL outliers of both habitats are used here, whereas the
# chromosome 9 panel restricts them to habitat == "Urban" -- confirm which is
# intended.
nsl_outliers <- nsl_df_filt %>% filter(is_outlier == 1) %>% dplyr::select(Chr, start, end)
nsl_wins <- bt.intersect(nsl_outliers, all_outlier_regions, wa = TRUE) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')

# Zoomed-in Manhattan plot: dark band = top region, light bands = all urban
# outlier windows, coloured points = per-site outliers, and horizontal
# segments at the bottom = overlapping Fst / nSL outlier windows
manhat_plot <- ggplot(xpnsl_not_outlier, aes(x = pos, y = normxpnsl)) +
    geom_rect(data=region_df, aes(xmin=start, xmax=end), ymin=-Inf, ymax=Inf, fill="black", alpha=0.3, inherit.aes = FALSE) +
    geom_rect(data=all_outlier_regions, aes(xmin=start, xmax=end), ymin=-Inf, ymax=Inf, fill="black", alpha=0.1, inherit.aes = FALSE) +
    # The stray empty argument (`size = 1, , fill`) of the original call is removed
    geom_point(data = xpnsl_not_outlier, shape = 21, alpha = 0.3, size = 1, fill = "black", color = "black") +
    geom_point(data = urban_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#003876', fill = '#003876',
               aes(x = pos, y = normxpnsl)) +
    geom_point(data = rural_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#007243', fill = '#007243',
               aes(x = pos, y = normxpnsl)) +
    # Outlier thresholds used for `crit`
    geom_hline(yintercept = 2, color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -2, color = "grey40", linetype = "dashed") +
    xlab('Chromosome 7 (position in Mbp)') + ylab('Normalized XP-nSL') +
    coord_cartesian(ylim = c(-3.25, 6.5)) +
    geom_segment(data = fst_wins, aes(x = start, xend = end), y = -3, yend = -3, color = '#d62828', linewidth = 1) +
    geom_segment(data = nsl_wins, aes(x = start, xend = end), y = -3.25, yend = -3.25, color = '#ffbe0b', linewidth = 1) +
    scale_x_continuous(breaks = seq(min_pos - buffer, max_pos + buffer, 500000), labels = formatter1e6) +
    theme_classic() +
    theme(
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size=16),
        axis.title = element_text(size=20),
      ) +
    transp_theme

options(repr.plot.width = 8, repr.plot.height = 6)
manhat_plot
ggsave(filename = snakemake@output[["Chr04_Occ_urb_xpnsl"]], plot = manhat_plot, 
       height = 6, width = 8, device = "pdf", dpi = 600, units = "in")

In [None]:
# Seventh VCF of the input list.
# NOTE(review): presumably Chr04_Occ (chromosome 7 after remapping); this
# relies on the order of the `vcfs` input -- confirm.
chr04_Occ_vcf <- vcfR::read.vcfR(snakemake@input[["vcfs"]][7])

# Individual-to-population map without the suburban individuals, as a plain
# data.frame
popmap <- as.data.frame(
    filter(
        read_delim(snakemake@input[["popmap"]], delim = "\t", col_names = c("ind", "pop")),
        pop != "Suburban"
    )
)

In [None]:
# Keep the variants between the first and the last position that fall inside
# the buffered interval used for the Manhattan plot
chr04_Occ_positions <- vcfR::getPOS(chr04_Occ_vcf)
idx <- which(chr04_Occ_positions >= min_pos - buffer & chr04_Occ_positions <= max_pos + buffer)
start_idx <- head(idx, 1)
end_idx <- tail(idx, 1)
chr04_Occ_vcf_urb_sub <- chr04_Occ_vcf[start_idx:end_idx]

In [None]:
# Create genotype plot and cluster genotypes with PCA and hclust
# (`TRUE` instead of the reassignable shorthand `T`)
chr04_Occ_urb_clustered <- genotype_plot(vcf_object = chr04_Occ_vcf_urb_sub, popmap = popmap, 
                                         snp_label_size = 500000, cluster = TRUE)

In [None]:
# Popmap rows rearranged to follow the dendrogram label order of the
# clustered plot (visualization only)
popmap2 <- left_join(
    data.frame(ind = chr04_Occ_urb_clustered$dendro_labels),
    popmap,
    by = "ind"
)

In [None]:
# Create final haplotype plot with colors. Unclustered (individuals follow the
# order of `popmap2`) and split genotypes into phased haplotypes.
# `TRUE`/`FALSE` replace the reassignable shorthands `T`/`F`.
chr04_Occ_urb_final <- genotype_plot(vcf_object = chr04_Occ_vcf_urb_sub, 
                                     popmap = popmap2, snp_label_size = 500000, 
                                     cluster = FALSE, plot_phased = TRUE)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 6)

# Haplotype panel with the same shaded bands as the Manhattan plot.
# NOTE(review): ymax = 164.5 is hard-coded; presumably the number of plotted
# haplotype rows plus a half-row margin -- confirm if the sample set changes.
Chr04_Occ_urb_ur_haps <- chr04_Occ_urb_final$genotypes + 
    geom_rect(data=region_df, aes(xmin=start, xmax=end), ymin=0, ymax=164.5, fill="black", alpha=0.3, inherit.aes = FALSE) +
    geom_rect(data=all_outlier_regions, aes(xmin=start, xmax=end), ymin=0, ymax=164.5, fill="black", alpha=0.1, inherit.aes = FALSE) +
    # `limits` is spelled out in full rather than relying on partial matching
    # of `limit`
    scale_x_continuous(expand = c(0,0), limits = c(min_pos - buffer, max_pos + buffer), 
                       breaks = seq(min_pos - buffer, max_pos + buffer, 500000), labels = formatter1e6) +
    scale_fill_manual(values = c("#fcbf49", "#d62828"), name="Haplotype",
                      breaks=c("0","1"),labels=c("REF","ALT")) +
    theme(
        axis.text.x = element_text(size=16),
        axis.title = element_text(size=20),
        axis.text.y = element_text(angle = 90),
        axis.ticks.x = element_line(),
        legend.position = "top"
      ) +
    transp_theme
Chr04_Occ_urb_ur_haps
ggsave(filename = snakemake@output[["Chr04_Occ_urb_ur_haps"]], plot = Chr04_Occ_urb_ur_haps, 
       height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Subset a vcfR object to the variants spanned by a window.
#
# df:  data frame with `start` and `end` columns describing one window. If it
#      has several rows, the outermost coordinates are used so the rows are
#      not recycled against the position vector.
# vcf: vcfR object to subset.
# Returns the variants from the first to the last position inside
# [start, end], or a zero-variant object when no position falls inside.
subset_vcf <- function(df, vcf){
    win_start <- min(df %>% pull(start))
    win_end <- max(df %>% pull(end))
    positions <- vcfR::getPOS(vcf)
    idx <- which(positions >= win_start & positions <= win_end)
    # Without this guard an empty `idx` makes `start_idx:end_idx` fail with
    # "argument of length 0"
    if (length(idx) == 0) {
        return(vcf[integer(0)])
    }
    start_idx <- head(idx, n = 1)
    end_idx <- tail(idx, n = 1)
    vcf_sub <- vcf[start_idx:end_idx]
    return(vcf_sub)
}

# One VCF subset per outlier window (windows are split by their start position)
all_vcfs <- purrr::map(
    group_split(all_outlier_regions, start),
    subset_vcf,
    vcf = chr04_Occ_vcf_urb_sub
)

In [None]:
# Stack the per-window subsets into one VCF holding only variants inside
# outlier windows, then compute per-population allele frequencies
# (`TRUE` instead of the reassignable shorthand `T`)
combined_vcf <- do.call("rbind", all_vcfs)
chr04_Occ_urb_sel_AF <- genotype_plot(vcf_object = combined_vcf, popmap = popmap, snp_label_size = 500000, 
                                      plot_allele_frequency = TRUE)

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)

# Habitat colours, assigned to the `pop` values in sorted order. `popmap` has
# the suburban individuals removed, so only two colours are needed. The
# previous three-colour vector put "red" in second position, which coloured
# the second population red instead of "#003876", the colour used for urban
# elsewhere in this notebook.
cols_hab <- c("#007243", "#003876")

# Density of per-population allele frequencies at variants inside the
# outlier windows
Chr04_Occ_urb_ur_af <- chr04_Occ_urb_sel_AF$genotypes$data %>% 
    ggplot(aes(x = AF)) +
        geom_density(aes(color = pop, fill = pop), alpha = 0.7) +
        scale_color_manual(values = cols_hab) +
        scale_fill_manual(values = cols_hab) +
        ylab("Density") + 
        xlab("Frequency of REF allele") +
        theme_classic() +
        theme(axis.text = element_text(size = 18),
              axis.title = element_text(size = 20),
              legend.position = 'top',
              legend.title = element_text(size = 16),
              legend.text = element_text(size = 14)) +
        transp_theme
Chr04_Occ_urb_ur_af
ggsave(filename = snakemake@output[["Chr04_Occ_urb_ur_af"]], plot = Chr04_Occ_urb_ur_af, 
       height = 6, width = 8, device = "pdf", dpi = 600, units = "in")

In [None]:
# Cluster the phased haplotypes at variants inside the outlier windows; the
# PCA stored in the result is used for the PCA plot
# (`TRUE` instead of the reassignable shorthand `T`)
chr04_Occ_urb_sel_clust <- genotype_plot(vcf_object = combined_vcf, popmap = popmap, snp_label_size = 500000, 
                                         cluster = TRUE, plot_phased = TRUE)

In [None]:
# Percent variance explained by the first two axes, for the axis labels
pca_eigen <- get_eigenvalue(chr04_Occ_urb_sel_clust$cluster_pca)
var_PC1 <- round(pca_eigen["Dim.1", "variance.percent"], 1)
var_PC2 <- round(pca_eigen["Dim.2", "variance.percent"], 1)

# Haplotype scores on the first two axes. Row names are split into `ind`
# (everything but the last two characters) and `hap` (the last character),
# then the population of each individual is attached.
pca_scores <- chr04_Occ_urb_sel_clust$cluster_pca$l1 %>% 
    as.data.frame() %>% 
    rownames_to_column(var = "tmp") %>% 
    mutate(ind = str_sub(tmp, 1, -3),
           hap = str_sub(tmp, -1)) %>% 
    dplyr::select(ind, hap, RS1, RS2) %>% 
    left_join(popmap, by = "ind")

Chr04_Occ_urb_ur_pca <- ggplot(pca_scores, aes(x = RS1, y = RS2)) +
    geom_point(aes(color = pop, shape = pop), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    theme_classic() +
    labs(x = paste0('PC1 (', var_PC1, '%)'), y = paste0('PC2 (', var_PC2, '%)')) +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
Chr04_Occ_urb_ur_pca
ggsave(
    filename = snakemake@output[["Chr04_Occ_urb_ur_pca"]],
    plot = Chr04_Occ_urb_ur_pca,
    device = "pdf", width = 8, height = 6, units = "in", dpi = 600
)

#### Chromosome 9

In [None]:
# Top urban XP-nSL region(s) on chromosome 9
region_df <- xpnsl_topHits_urban_rural %>% 
    filter(direction == 'Urban') %>% 
    filter(Chr == 9) %>%
    dplyr::select(Chr, start, end)

chrom <- region_df %>% pull(Chr) %>% unique()
min_pos <- region_df %>% pull(start) %>% min()
max_pos <- region_df %>% pull(end) %>% max()
buffer <- 500000  # flank (bp) shown on each side of the top region(s)

# Urban XP-nSL outlier windows lying entirely inside the buffered interval
all_outlier_regions <- xpnsl_urban_outliers %>% 
    filter(Chr == chrom & start >= min_pos - buffer & end <= max_pos + buffer) %>% 
    dplyr::select(Chr, start, end)

# Per-site scores inside the buffered interval, split by outlier status
region_raw_df <- xpnsl_persite %>%
    filter(Chr == chrom & pos >= min_pos - buffer & pos <= max_pos + buffer)

xpnsl_not_outlier <- region_raw_df %>% filter(crit == 0)
urban_xpnsl_outliers <- region_raw_df %>% filter(crit == 1 & normxpnsl > 0)
rural_xpnsl_outliers <- region_raw_df %>% filter(crit == 1 & normxpnsl < 0)

# Axis-label formatter: positions in Mbp with two decimals
formatter1e6 <- function(x){ 
    x <- x / 1e6
    return(sprintf(x, fmt = '%#.2f'))
}

# Outlier windows of the other statistics (urban habitat for the haplotype
# statistics) that overlap any of the XP-nSL outlier windows
fst_outliers <- gt_fst_df_filt %>% filter(is_outlier == 1) %>% dplyr::select(Chr, start, end)
fst_wins <- bt.intersect(fst_outliers, all_outlier_regions, wa = TRUE) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')
nsl_outliers <- nsl_df_filt %>% filter(is_outlier == 1 & habitat == "Urban") %>% dplyr::select(Chr, start, end)
nsl_wins <- bt.intersect(nsl_outliers, all_outlier_regions, wa = TRUE) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')
ihs_outliers <- ihs_df_filt %>% filter(is_outlier == 1 & habitat == "Urban") %>% dplyr::select(Chr, start, end)
ihs_wins <- bt.intersect(ihs_outliers, all_outlier_regions, wa = TRUE) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')
ihh12_outliers <- ihh12_df_filt %>% filter(is_outlier == 1 & habitat == "Urban") %>% dplyr::select(Chr, start, end)
ihh12_wins <- bt.intersect(ihh12_outliers, all_outlier_regions, wa = TRUE) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')

# Zoomed-in Manhattan plot: dark band = top region, light bands = all urban
# outlier windows, coloured points = per-site outliers, yellow diamond = the
# site(s) in `max_xpnsl`, and horizontal segments at the bottom = overlapping
# outlier windows of the other statistics
manhat_plot <- ggplot(xpnsl_not_outlier, aes(x = pos, y = normxpnsl)) +
    geom_rect(data=region_df, aes(xmin=start, xmax=end), ymin=-Inf, ymax=Inf, fill="black", alpha=0.3, inherit.aes = FALSE) +
    geom_rect(data=all_outlier_regions, aes(xmin=start, xmax=end), ymin=-Inf, ymax=Inf, fill="black", alpha=0.1, inherit.aes = FALSE) +   
    # The stray empty argument (`size = 1, , fill`) of the original call is removed
    geom_point(shape = 21, alpha = 0.3, size = 1, fill = "black", color = "black") +
    geom_point(data = urban_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#003876', fill = '#003876',
               aes(x = pos, y = normxpnsl)) +
    geom_point(data = rural_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#007243', fill = '#007243',
               aes(x = pos, y = normxpnsl)) +
    # NOTE(review): `max_xpnsl` is the maximum over all chromosomes and is not
    # filtered to this chromosome -- presumably it lies in this region; confirm.
    geom_point(data = max_xpnsl, shape = 23, size = 3.5, fill = 'yellow', alpha = 1) +
    # Outlier thresholds used for `crit`
    geom_hline(yintercept = 2, color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -2, color = "grey40", linetype = "dashed") +
    xlab('Chromosome 9 (position in Mbp)') + ylab('Normalized XP-nSL') +
    coord_cartesian(ylim = c(-3.5, 6.5)) +
    geom_segment(data = fst_wins, aes(x = start, xend = end), y = -3, yend = -3, color = '#d62828', linewidth = 1) +
    geom_segment(data = nsl_wins, aes(x = start, xend = end), y = -3.25, yend = -3.25, color = '#ffbe0b', linewidth = 1) +
    geom_segment(data = ihs_wins, aes(x = start, xend = end), y = -3.5, yend = -3.5, color = '#8338ec', linewidth = 1) +
    geom_segment(data = ihh12_wins, aes(x = start, xend = end), y = -3.75, yend = -3.75, color = '#386641', linewidth = 1) +
    scale_x_continuous(breaks = seq(min_pos - buffer, max_pos + buffer, 200000), labels = formatter1e6) +
    # scale_y_continuous(breaks = seq(-2, 6, 2)) +
    theme_classic() +
    theme(
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size=16),
        axis.title = element_text(size=20),
      ) +
    transp_theme

options(repr.plot.width = 8, repr.plot.height = 6)
manhat_plot
ggsave(filename = snakemake@output[["Chr05_Occ_urb_xpnsl"]], plot = manhat_plot, 
       height = 6, width = 8, device = "pdf", dpi = 600, units = "in")

In [None]:
# Ninth VCF of the input list.
# NOTE(review): presumably Chr05_Occ (chromosome 9 after remapping); this
# relies on the order of the `vcfs` input -- confirm.
chr05_Occ_vcf <- vcfR::read.vcfR(snakemake@input[["vcfs"]][9])

In [None]:
# Keep the variants between the first and the last position that fall inside
# the buffered interval used for the Manhattan plot
chr05_Occ_positions <- vcfR::getPOS(chr05_Occ_vcf)
idx <- which(chr05_Occ_positions >= min_pos - buffer & chr05_Occ_positions <= max_pos + buffer)
start_idx <- head(idx, 1)
end_idx <- tail(idx, 1)
chr05_Occ_vcf_urb_sub <- chr05_Occ_vcf[start_idx:end_idx]

In [None]:
# Create genotype plot and cluster genotypes with PCA and hclust
# (`TRUE` instead of the reassignable shorthand `T`)
chr05_Occ_urb_clustered <- genotype_plot(vcf_object = chr05_Occ_vcf_urb_sub, popmap = popmap, 
                                         snp_label_size = 200000, cluster = TRUE)

In [None]:
# Popmap rows rearranged to follow the dendrogram label order of the
# clustered plot (visualization only)
popmap2 <- left_join(
    data.frame(ind = chr05_Occ_urb_clustered$dendro_labels),
    popmap,
    by = "ind"
)

In [None]:
# Create final haplotype plot with colors. Unclustered (individuals follow the
# order of `popmap2`) and split genotypes into phased haplotypes.
# `TRUE`/`FALSE` replace the reassignable shorthands `T`/`F`.
chr05_Occ_urb_final <- genotype_plot(vcf_object = chr05_Occ_vcf_urb_sub, 
                                     popmap = popmap2, snp_label_size = 200000, 
                                     cluster = FALSE, plot_phased = TRUE)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 6)

# Haplotype panel with the same shaded bands as the Manhattan plot.
# NOTE(review): ymax = 164.5 is hard-coded; presumably the number of plotted
# haplotype rows plus a half-row margin -- confirm if the sample set changes.
Chr05_Occ_urb_ur_haps <- chr05_Occ_urb_final$genotypes + 
    geom_rect(data=region_df, aes(xmin=start, xmax=end), ymin=0, ymax=164.5, fill="black", alpha=0.3, inherit.aes = FALSE) +
    geom_rect(data=all_outlier_regions, aes(xmin=start, xmax=end), ymin=0, ymax=164.5, fill="black", alpha=0.1, inherit.aes = FALSE) +
    # `limits` is spelled out in full rather than relying on partial matching
    # of `limit`
    scale_x_continuous(expand = c(0,0), limits = c(min_pos - buffer, max_pos + buffer), 
                       breaks = seq(min_pos - buffer, max_pos + buffer, 200000), labels = formatter1e6) +
    scale_fill_manual(values = c("#fcbf49", "#d62828"), name="Haplotype",
                      breaks=c("0","1"),labels=c("REF","ALT")) +
    theme(
        axis.text.x = element_text(size=16),
        axis.title = element_text(size=20),
        axis.text.y = element_text(angle = 90),
        axis.ticks.x = element_line(),
        legend.position = "top"
      ) +
    transp_theme
Chr05_Occ_urb_ur_haps
ggsave(filename = snakemake@output[["Chr05_Occ_urb_ur_haps"]], plot = Chr05_Occ_urb_ur_haps, 
       height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# One VCF subset per outlier window (windows are split by their start position)
all_vcfs <- purrr::map(
    group_split(all_outlier_regions, start),
    subset_vcf,
    vcf = chr05_Occ_vcf_urb_sub
)

In [None]:
# Stack the per-window subsets into one VCF holding only variants inside
# outlier windows, then compute per-population allele frequencies
# (`TRUE` instead of the reassignable shorthand `T`)
combined_vcf <- do.call("rbind", all_vcfs)
chr05_Occ_urb_sel_AF <- genotype_plot(vcf_object = combined_vcf, popmap = popmap, snp_label_size = 200000, 
                                      plot_allele_frequency = TRUE)

In [None]:
# Notebook display size
options(repr.plot.width = 8, repr.plot.height = 6)

# Habitat colours, assigned to the `pop` values in sorted order
cols_hab <- c("#007243", "#003876")

# Density of per-population allele frequencies at variants inside the
# outlier windows
Chr05_Occ_urb_ur_af <- ggplot(chr05_Occ_urb_sel_AF$genotypes$data, aes(x = AF)) +
    geom_density(aes(color = pop, fill = pop), alpha = 0.7) +
    scale_color_manual(values = cols_hab) +
    scale_fill_manual(values = cols_hab) +
    labs(x = "Frequency of REF allele", y = "Density") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
Chr05_Occ_urb_ur_af
ggsave(
    filename = snakemake@output[["Chr05_Occ_urb_ur_af"]],
    plot = Chr05_Occ_urb_ur_af,
    device = "pdf", width = 8, height = 6, units = "in", dpi = 600
)

In [None]:
# Cluster the phased haplotypes at variants inside the outlier windows; the
# PCA stored in the result is used for the PCA plot
# (`TRUE` instead of the reassignable shorthand `T`).
# NOTE(review): snp_label_size is 500000 here but 200000 in the other
# chromosome 9 calls -- confirm whether this is intended.
chr05_Occ_urb_sel_clust <- genotype_plot(vcf_object = combined_vcf, popmap = popmap, snp_label_size = 500000, 
                                         cluster = TRUE, plot_phased = TRUE)

In [None]:
# Percent variance explained by the first two axes, for the axis labels
pca_eigen <- get_eigenvalue(chr05_Occ_urb_sel_clust$cluster_pca)
var_PC1 <- round(pca_eigen["Dim.1", "variance.percent"], 1)
var_PC2 <- round(pca_eigen["Dim.2", "variance.percent"], 1)

# Haplotype scores on the first two axes. Row names are split into `ind`
# (everything but the last two characters) and `hap` (the last character),
# then the population of each individual is attached.
pca_scores <- chr05_Occ_urb_sel_clust$cluster_pca$l1 %>% 
    as.data.frame() %>% 
    rownames_to_column(var = "tmp") %>% 
    mutate(ind = str_sub(tmp, 1, -3),
           hap = str_sub(tmp, -1)) %>% 
    dplyr::select(ind, hap, RS1, RS2) %>% 
    left_join(popmap, by = "ind")

Chr05_Occ_urb_ur_pca <- ggplot(pca_scores, aes(x = RS1, y = RS2)) +
    geom_point(aes(color = pop, shape = pop), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    theme_classic() +
    labs(x = paste0('PC1 (', var_PC1, '%)'), y = paste0('PC2 (', var_PC2, '%)')) +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
Chr05_Occ_urb_ur_pca
ggsave(
    filename = snakemake@output[["Chr05_Occ_urb_ur_pca"]],
    plot = Chr05_Occ_urb_ur_pca,
    device = "pdf", width = 8, height = 6, units = "in", dpi = 600
)

### Positive selection in rural habitats

In [None]:
# Top rural regions with their size, rank and the overlap counts of the other
# statistics (columns num_nsl_urban through num_fst)
xpnsl_topHits_urban_rural_with_other_stats %>% 
    filter(direction == 'Rural') %>% 
    dplyr::select(Chr, start, end, win_size, min_xpnsl_rank, num_nsl_urban:num_fst)

In [None]:
# Site(s) carrying the smallest (most negative) normalized XP-nSL score across
# all chromosomes
min_xpnsl <- xpnsl_persite %>% 
    filter(normxpnsl == min(normxpnsl))
min_xpnsl

In [None]:
# Turn the lowest-scoring site into a single-position interval (start == end)
# and look up the gene(s) it falls in; `genes` is the gene table built in the
# urban section.
# NOTE(review): the variable name `min` shadows base::min. Calls such as
# `min(x)` still resolve to the function, but a rename would be safer; the
# name is kept here in case other cells refer to it.
min <- min_xpnsl %>% 
    dplyr::select(Chr, pos) %>% 
    rename('start' = 'pos') %>% 
    mutate(end = start) %>% 
    dplyr::select(Chr, start, end)
# `TRUE` instead of the reassignable shorthand `T`
bt.intersect(min, genes, wb = TRUE)

#### Chromosome 16 (Chr08_Pall)

In [None]:
region_df <- xpnsl_topHits_urban_rural %>% 
    filter(direction == 'Rural') %>% 
    filter(Chr == 16) %>%
    dplyr::select(Chr, start, end)

chrom <- region_df %>% pull(Chr) %>% unique()
min_pos <- region_df %>% pull(start) %>% min()
max_pos <- region_df %>% pull(end) %>% max()
buffer <- 500000

all_outlier_regions <- xpnsl_rural_outliers %>% 
    filter(Chr == chrom & start >= min_pos - buffer & end <= max_pos + buffer) %>% 
    dplyr::select(Chr, start, end)

region_raw_df <- xpnsl_persite %>%
    filter(Chr == chrom & pos >= min_pos - buffer & pos <= max_pos + buffer)

xpnsl_not_outlier <- region_raw_df %>% filter(crit == 0)
urban_xpnsl_outliers <- region_raw_df %>% filter(crit == 1 & normxpnsl > 0)
rural_xpnsl_outliers <- region_raw_df %>% filter(crit == 1 & normxpnsl < 0)

formatter1e6 <- function(x){ 
    x <- x / 1e6
    return(sprintf(x, fmt = '%#.2f'))
}

fst_outliers <- gt_fst_df_filt %>% filter(is_outlier == 1) %>% dplyr::select(Chr, start, end)
fst_wins <- bt.intersect(fst_outliers, all_outlier_regions, wa = T) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')
nsl_outliers <- nsl_df_filt %>% filter(is_outlier == 1) %>% dplyr::select(Chr, start, end)
nsl_wins <- bt.intersect(nsl_outliers, all_outlier_regions, wa = T) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')
ihs_outliers <- ihs_df_filt %>% filter(is_outlier == 1) %>% dplyr::select(Chr, start, end)
ihs_wins <- bt.intersect(ihs_outliers, all_outlier_regions, wa = T) %>% 
    rename('Chr' = 'V1', 'start' = 'V2', 'end' = 'V3')

# Per-site normalized XP-nSL across the buffered region on chromosome 16.
# Layers in drawing order: shaded top-hit window(s) (darker) and all outlier
# windows (lighter); non-outlier sites (black); positive-score outliers (blue);
# negative-score outliers (green); the minimum-score site(s) (yellow diamond);
# dashed guides at +/-2; horizontal bars at y = -9 / -9.25 / -9.5 marking
# Fst / nSL / iHS outlier windows.
manhat_plot <- ggplot(xpnsl_not_outlier, aes(x = pos, y = normxpnsl)) +
    geom_rect(data = region_df, aes(xmin = start, xmax = end), ymin = -Inf, ymax = Inf,
              fill = "black", alpha = 0.3, inherit.aes = FALSE) +
    geom_rect(data = all_outlier_regions, aes(xmin = start, xmax = end), ymin = -Inf, ymax = Inf,
              fill = "black", alpha = 0.1, inherit.aes = FALSE) +
    # The stray empty argument (`size = 1, , fill = ...`) has been removed here
    geom_point(data = xpnsl_not_outlier, shape = 21, alpha = 0.3, size = 1, fill = "black", color = "black") +
    geom_point(data = urban_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#003876', fill = '#003876',
               aes(x = pos, y = normxpnsl)) +
    geom_point(data = rural_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#007243', fill = '#007243',
               aes(x = pos, y = normxpnsl)) +
    geom_point(data = min_xpnsl, shape = 23, size = 3.5, fill = 'yellow', alpha = 1) +
    geom_hline(yintercept = 2, color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -2, color = "grey40", linetype = "dashed") +
    xlab('Chromosome 16 (position in Mbp)') + ylab('Normalized XP-nSL') +
    # coord_cartesian zooms without dropping data outside the y-range
    coord_cartesian(ylim = c(-9.5, 3.5)) +
    geom_segment(data = fst_wins, aes(x = start, xend = end), y = -9, yend = -9, color = '#d62828', linewidth = 1) +
    geom_segment(data = nsl_wins, aes(x = start, xend = end), y = -9.25, yend = -9.25, color = '#ffbe0b', linewidth = 1) +
    geom_segment(data = ihs_wins, aes(x = start, xend = end), y = -9.5, yend = -9.5, color = '#8338ec', linewidth = 1) +
    # Tick every 200 kb, labelled in Mbp
    scale_x_continuous(breaks = seq(min_pos - buffer, max_pos + buffer, 200000), labels = formatter1e6) +
    scale_y_continuous(breaks = seq(-9, 3, 2)) +
    theme_classic() +
    # Trailing empty argument removed from theme()
    theme(
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)
    ) +
    transp_theme

# Show the plot in the notebook at 8 x 6 in, then write it to the PDF output
options(repr.plot.width = 8, repr.plot.height = 6)
manhat_plot
ggsave(filename = snakemake@output[["Chr08_Pall_rur_xpnsl"]], plot = manhat_plot, 
       height = 6, width = 8, device = "pdf", dpi = 600, units = "in")

In [None]:
# Load the Chr08_Pall VCF. The file is selected by chromosome name (the same
# grep-based lookup subset_vcf() uses) instead of by the fixed position 16, so
# a reordered `vcfs` input cannot silently load a different chromosome.
chr08_Pall_vcf <- vcfR::read.vcfR(
    snakemake@input[["vcfs"]][grep("Chr08_Pall", snakemake@input[["vcfs"]])]
)

In [None]:
# Indices of VCF records whose position falls inside the buffered region
idx <- local({
    vcf_pos <- vcfR::getPOS(chr08_Pall_vcf)
    which(vcf_pos >= min_pos - buffer & vcf_pos <= max_pos + buffer)
})
# Keep the contiguous run of records from the first to the last match
start_idx <- idx %>% head(n = 1)
end_idx <- idx %>% tail(n = 1)
chr08_Pall_vcf_rur_sub <- chr08_Pall_vcf[start_idx:end_idx]

In [None]:
# Genotype plot of the buffered region with clustering enabled (PCA + hclust);
# its dendrogram labels are used to order individuals in the popmap
chr08_Pall_rur_clustered <- genotype_plot(
    vcf_object = chr08_Pall_vcf_rur_sub,
    popmap = popmap,
    snp_label_size = 500000,
    cluster = TRUE
)

In [None]:
# Popmap re-sorted to follow the dendrogram label order (visualization only)
popmap2 <- left_join(
    data.frame(ind = chr08_Pall_rur_clustered$dendro_labels),
    popmap,
    by = "ind"
)

In [None]:
# Final haplotype plot: no clustering (individuals follow the reordered
# popmap) and genotypes split into their phased haplotypes
chr08_Pall_rur_final <- genotype_plot(
    vcf_object = chr08_Pall_vcf_rur_sub,
    popmap = popmap2,
    snp_label_size = 200000,
    cluster = FALSE,
    plot_phased = TRUE
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 6)
# Phased-haplotype panel with the top-hit window(s) (darker) and all outlier
# windows (lighter) shaded on top.
# NOTE(review): ymax = 164.5 is hard-coded; presumably the number of plotted
# haplotype rows + 0.5 -- confirm if the sample set changes.
chr08_Pall_rur_ur_haps <- chr08_Pall_rur_final$genotypes + 
    geom_rect(data = region_df, aes(xmin = start, xmax = end), ymin = 0, ymax = 164.5,
              fill = "black", alpha = 0.3, inherit.aes = FALSE) +
    geom_rect(data = all_outlier_regions, aes(xmin = start, xmax = end), ymin = 0, ymax = 164.5,
              fill = "black", alpha = 0.1, inherit.aes = FALSE) +
    # `limits` is spelled out in full; the previous `limit` only worked through
    # partial argument matching. Ticks every 200 kb, labelled in Mbp.
    scale_x_continuous(expand = c(0, 0), limits = c(min_pos - buffer, max_pos + buffer), 
                       breaks = seq(min_pos - buffer, max_pos + buffer, 200000), labels = formatter1e6) +
    scale_fill_manual(values = c("#fcbf49", "#d62828"), name = "Haplotype",
                      breaks = c("0", "1"), labels = c("REF", "ALT")) +
    theme(
        axis.text.x = element_text(size = 16),
        axis.title = element_text(size = 20),
        axis.text.y = element_text(angle = 90),
        axis.ticks.x = element_line(),
        legend.position = "top"
    ) +
    transp_theme
chr08_Pall_rur_ur_haps
ggsave(filename = snakemake@output[["Chr08_Pall_rur_ur_haps"]], plot = chr08_Pall_rur_ur_haps, 
       height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Apply subset_vcf to each outlier window (one group per distinct start),
# always against the already-subset regional VCF
all_vcfs <- purrr::map(
    group_split(all_outlier_regions, start),
    subset_vcf,
    vcf = chr08_Pall_vcf_rur_sub
)

In [None]:
# Stack the per-window VCF subsets into one object, then run genotype_plot in
# allele-frequency mode
combined_vcf <- do.call("rbind", all_vcfs)
chr08_Pall_vcf_sel_AF <- genotype_plot(
    vcf_object = combined_vcf,
    popmap = popmap,
    snp_label_size = 200000,
    plot_allele_frequency = TRUE
)

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)
# Two-colour palette applied to the `pop` levels
cols_hab <- c("#007243", "#003876")
# Density of the AF column from the allele-frequency plot data, one curve per pop
chr08_Pall_rur_ur_af <- ggplot(chr08_Pall_vcf_sel_AF$genotypes$data, aes(x = AF)) +
    geom_density(aes(color = pop, fill = pop), alpha = 0.7) +
    scale_color_manual(values = cols_hab) +
    scale_fill_manual(values = cols_hab) +
    labs(x = "Frequency of REF allele", y = "Density") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
chr08_Pall_rur_ur_af
ggsave(filename = snakemake@output[["Chr08_Pall_rur_ur_af"]], plot = chr08_Pall_rur_ur_af,
       device = "pdf", width = 8, height = 6, units = "in", dpi = 600)

In [None]:
# Cluster phased haplotypes using only the variants inside the outlier windows
chr08_Pall_rur_sel_clust <- genotype_plot(
    vcf_object = combined_vcf,
    popmap = popmap,
    snp_label_size = 500000,
    cluster = TRUE,
    plot_phased = TRUE
)

In [None]:
# Percent variance of the first two PCA dimensions, rounded to one decimal
var_PC1 <- round(get_eigenvalue(chr08_Pall_rur_sel_clust$cluster_pca)["Dim.1", "variance.percent"], 1)
var_PC2 <- round(get_eigenvalue(chr08_Pall_rur_sel_clust$cluster_pca)["Dim.2", "variance.percent"], 1)

# Scatter of the RS1/RS2 columns of the PCA's `l1` table. Row names are split
# into the individual ID (all but the last two characters) and the haplotype
# (last character); the ID is used to attach `pop` from the popmap.
chr08_Pall_rur_ur_pca <- chr08_Pall_rur_sel_clust$cluster_pca$l1 %>% 
    as.data.frame() %>% 
    rownames_to_column(var = "tmp") %>% 
    dplyr::transmute(ind = str_sub(tmp, 1, -3), hap = str_sub(tmp, -1), RS1, RS2) %>% 
    left_join(popmap, by = "ind") %>% 
    ggplot(aes(x = RS1, y = RS2)) +
    geom_point(aes(color = pop, shape = pop), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    theme_classic() +
    labs(x = paste0('PC1 (', var_PC1, '%)'), y = paste0('PC2 (', var_PC2, '%)')) +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
chr08_Pall_rur_ur_pca
ggsave(filename = snakemake@output[["Chr08_Pall_rur_ur_pca"]], plot = chr08_Pall_rur_ur_pca,
       device = "pdf", width = 8, height = 6, units = "in", dpi = 600)

#### Chr04_Occ

In [None]:
# Rural-direction top-hit window(s) on chromosome 7 (Chr04_Occ section)
region_df <- xpnsl_topHits_urban_rural %>% 
    filter(direction == 'Rural', Chr == 7) %>%
    dplyr::select(Chr, start, end)

# Chromosome ID and outer coordinates of the hit; `buffer` pads both sides
chrom <- unique(region_df$Chr)
min_pos <- min(region_df$start)
max_pos <- max(region_df$end)
buffer <- 500000

# Windows from xpnsl_rural_outliers that lie entirely inside the buffered region
all_outlier_regions <- xpnsl_rural_outliers %>% 
    filter(Chr == chrom, start >= min_pos - buffer, end <= max_pos + buffer) %>% 
    dplyr::select(Chr, start, end)

# Per-site XP-nSL scores inside the buffered region
region_raw_df <- xpnsl_persite %>%
    filter(Chr == chrom, pos >= min_pos - buffer, pos <= max_pos + buffer)

# Split sites by the `crit` flag and, for flagged sites, by the sign of the score
xpnsl_not_outlier <- filter(region_raw_df, crit == 0)
urban_xpnsl_outliers <- filter(region_raw_df, crit == 1, normxpnsl > 0)
rural_xpnsl_outliers <- filter(region_raw_df, crit == 1, normxpnsl < 0)

# Axis-label helper: divide positions by 1e6 and format with two decimals
formatter1e6 <- function(x){ 
    sprintf('%#.2f', x / 1e6)
}
# For each companion statistic: take its outlier windows, then keep (bedtools
# intersect, wa = TRUE reports the original window) those overlapping the
# outlier regions around the hit. bt.intersect returns V1..V3, renamed here.
# NOTE(review): nSL/iHS outliers are restricted to habitat == "Urban" in this
# rural-direction section, while the Chr08_Pall section applies no habitat
# filter -- confirm which is intended.
fst_outliers <- dplyr::select(filter(gt_fst_df_filt, is_outlier == 1), Chr, start, end)
fst_wins <- rename(bt.intersect(fst_outliers, all_outlier_regions, wa = TRUE),
                   Chr = V1, start = V2, end = V3)

nsl_outliers <- dplyr::select(filter(nsl_df_filt, is_outlier == 1, habitat == "Urban"), Chr, start, end)
nsl_wins <- rename(bt.intersect(nsl_outliers, all_outlier_regions, wa = TRUE),
                   Chr = V1, start = V2, end = V3)

ihs_outliers <- dplyr::select(filter(ihs_df_filt, is_outlier == 1, habitat == "Urban"), Chr, start, end)
ihs_wins <- rename(bt.intersect(ihs_outliers, all_outlier_regions, wa = TRUE),
                   Chr = V1, start = V2, end = V3)

# Per-site normalized XP-nSL across the buffered region on chromosome 7.
# Layers in drawing order: shaded top-hit window(s) (darker) and all outlier
# windows (lighter); non-outlier sites (black); positive-score outliers (blue);
# negative-score outliers (green); dashed guides at +/-2; horizontal bars at
# y = -6 / -6.25 / -6.5 marking Fst / nSL / iHS outlier windows.
manhat_plot <- ggplot(xpnsl_not_outlier, aes(x = pos, y = normxpnsl)) +
    geom_rect(data = region_df, aes(xmin = start, xmax = end), ymin = -Inf, ymax = Inf,
              fill = "black", alpha = 0.3, inherit.aes = FALSE) +
    geom_rect(data = all_outlier_regions, aes(xmin = start, xmax = end), ymin = -Inf, ymax = Inf,
              fill = "black", alpha = 0.1, inherit.aes = FALSE) +
    # The stray empty argument (`size = 1, , fill = ...`) has been removed here
    geom_point(data = xpnsl_not_outlier, shape = 21, alpha = 0.3, size = 1, fill = "black", color = "black") +
    geom_point(data = urban_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#003876', fill = '#003876',
               aes(x = pos, y = normxpnsl)) +
    geom_point(data = rural_xpnsl_outliers, shape = 21, alpha = 0.3, size = 1, color = '#007243', fill = '#007243',
               aes(x = pos, y = normxpnsl)) +
    geom_hline(yintercept = 2, color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -2, color = "grey40", linetype = "dashed") +
    xlab('Chromosome 7 (position in Mbp)') + ylab('Normalized XP-nSL') +
    # coord_cartesian zooms without dropping data outside the y-range
    coord_cartesian(ylim = c(-6.5, 3.5)) +
    # Tick every 200 kb, labelled in Mbp
    scale_x_continuous(breaks = seq(min_pos - buffer, max_pos + buffer, 200000), labels = formatter1e6) +
    geom_segment(data = fst_wins, aes(x = start, xend = end), y = -6, yend = -6, color = '#d62828', linewidth = 1) +
    geom_segment(data = nsl_wins, aes(x = start, xend = end), y = -6.25, yend = -6.25, color = '#ffbe0b', linewidth = 1) +
    geom_segment(data = ihs_wins, aes(x = start, xend = end), y = -6.5, yend = -6.5, color = '#8338ec', linewidth = 1) +
    scale_y_continuous(breaks = seq(-6, 4, 2)) +
    theme_classic() +
    # Trailing empty argument removed from theme()
    theme(
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)
    ) +
    transp_theme

# Show the plot in the notebook at 8 x 6 in, then write it to the PDF output
options(repr.plot.width = 8, repr.plot.height = 6)
manhat_plot
ggsave(filename = snakemake@output[["Chr04_Occ_rur_xpnsl"]], plot = manhat_plot, 
       height = 6, width = 8, device = "pdf", dpi = 600, units = "in")

In [None]:
# Restrict the Chr04_Occ VCF to the same buffered region used for the plot above:
# find the records whose position is in range ...
idx <- local({
    vcf_pos <- vcfR::getPOS(chr04_Occ_vcf)
    which(vcf_pos >= min_pos - buffer & vcf_pos <= max_pos + buffer)
})
# ... and keep the contiguous run from the first to the last match
start_idx <- idx %>% head(n = 1)
end_idx <- idx %>% tail(n = 1)
chr04_Occ_vcf_rur_sub <- chr04_Occ_vcf[start_idx:end_idx]

In [None]:
# Genotype plot of the buffered region with clustering enabled (PCA + hclust);
# its dendrogram labels are used to order individuals in the popmap
chr04_Occ_rur_clustered <- genotype_plot(
    vcf_object = chr04_Occ_vcf_rur_sub,
    popmap = popmap,
    snp_label_size = 200000,
    cluster = TRUE
)

In [None]:
# Popmap re-sorted to follow the dendrogram label order (visualization only)
popmap2 <- left_join(
    data.frame(ind = chr04_Occ_rur_clustered$dendro_labels),
    popmap,
    by = "ind"
)

In [None]:
# Final haplotype plot: no clustering (individuals follow the reordered
# popmap) and genotypes split into their phased haplotypes
chr04_Occ_rur_final <- genotype_plot(
    vcf_object = chr04_Occ_vcf_rur_sub,
    popmap = popmap2,
    snp_label_size = 200000,
    cluster = FALSE,
    plot_phased = TRUE
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 6)
# Phased-haplotype panel with the top-hit window(s) (darker) and all outlier
# windows (lighter) shaded on top.
# NOTE(review): ymax = 164.5 is hard-coded; presumably the number of plotted
# haplotype rows + 0.5 -- confirm if the sample set changes.
Chr04_Occ_rur_ur_haps <- chr04_Occ_rur_final$genotypes + 
    geom_rect(data = region_df, aes(xmin = start, xmax = end), ymin = 0, ymax = 164.5,
              fill = "black", alpha = 0.3, inherit.aes = FALSE) +
    geom_rect(data = all_outlier_regions, aes(xmin = start, xmax = end), ymin = 0, ymax = 164.5,
              fill = "black", alpha = 0.1, inherit.aes = FALSE) +
    # `limits` is spelled out in full; the previous `limit` only worked through
    # partial argument matching. Ticks every 200 kb, labelled in Mbp.
    scale_x_continuous(expand = c(0, 0), limits = c(min_pos - buffer, max_pos + buffer), 
                       breaks = seq(min_pos - buffer, max_pos + buffer, 200000), labels = formatter1e6) +
    scale_fill_manual(values = c("#fcbf49", "#d62828"), name = "Haplotype",
                      breaks = c("0", "1"), labels = c("REF", "ALT")) +
    theme(
        axis.text.x = element_text(size = 16),
        axis.title = element_text(size = 20),
        axis.text.y = element_text(angle = 90),
        axis.ticks.x = element_line(),
        legend.position = "top"
    ) +
    transp_theme
Chr04_Occ_rur_ur_haps
ggsave(filename = snakemake@output[["Chr04_Occ_rur_ur_haps"]], plot = Chr04_Occ_rur_ur_haps, 
       height = 8, width = 20, device = "pdf", dpi = 600, units = "in")

In [None]:
# Apply subset_vcf to each outlier window (one group per distinct start),
# always against the already-subset regional VCF
all_vcfs <- purrr::map(
    group_split(all_outlier_regions, start),
    subset_vcf,
    vcf = chr04_Occ_vcf_rur_sub
)

In [None]:
# Stack the per-window VCF subsets into one object, then run genotype_plot in
# allele-frequency mode
combined_vcf <- do.call("rbind", all_vcfs)
chr04_Occ_rur_sel_AF <- genotype_plot(
    vcf_object = combined_vcf,
    popmap = popmap,
    snp_label_size = 200000,
    plot_allele_frequency = TRUE
)

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)
# Two-colour palette applied to the `pop` levels
cols_hab <- c("#007243", "#003876")
# Density of the AF column from the allele-frequency plot data, one curve per pop
Chr04_Occ_rur_ur_af <- ggplot(chr04_Occ_rur_sel_AF$genotypes$data, aes(x = AF)) +
    geom_density(aes(color = pop, fill = pop), alpha = 0.7) +
    scale_color_manual(values = cols_hab) +
    scale_fill_manual(values = cols_hab) +
    labs(x = "Frequency of REF allele", y = "Density") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
Chr04_Occ_rur_ur_af
ggsave(filename = snakemake@output[["Chr04_Occ_rur_ur_af"]], plot = Chr04_Occ_rur_ur_af,
       device = "pdf", width = 8, height = 6, units = "in", dpi = 600)

In [None]:
# Cluster phased haplotypes using only the variants inside the outlier windows
chr04_Occ_rur_sel_clust <- genotype_plot(
    vcf_object = combined_vcf,
    popmap = popmap,
    snp_label_size = 200000,
    cluster = TRUE,
    plot_phased = TRUE
)

In [None]:
# Percent variance of the first two PCA dimensions, rounded to one decimal
var_PC1 <- round(get_eigenvalue(chr04_Occ_rur_sel_clust$cluster_pca)["Dim.1", "variance.percent"], 1)
var_PC2 <- round(get_eigenvalue(chr04_Occ_rur_sel_clust$cluster_pca)["Dim.2", "variance.percent"], 1)

# Scatter of the RS1/RS2 columns of the PCA's `l1` table. Row names are split
# into the individual ID (all but the last two characters) and the haplotype
# (last character); the ID is used to attach `pop` from the popmap.
Chr04_Occ_rur_ur_pca <- chr04_Occ_rur_sel_clust$cluster_pca$l1 %>% 
    as.data.frame() %>% 
    rownames_to_column(var = "tmp") %>% 
    dplyr::transmute(ind = str_sub(tmp, 1, -3), hap = str_sub(tmp, -1), RS1, RS2) %>% 
    left_join(popmap, by = "ind") %>% 
    ggplot(aes(x = RS1, y = RS2)) +
    geom_point(aes(color = pop, shape = pop), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    theme_classic() +
    labs(x = paste0('PC1 (', var_PC1, '%)'), y = paste0('PC2 (', var_PC2, '%)')) +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
Chr04_Occ_rur_ur_pca
ggsave(filename = snakemake@output[["Chr04_Occ_rur_ur_pca"]], plot = Chr04_Occ_rur_ur_pca,
       device = "pdf", width = 8, height = 6, units = "in", dpi = 600)

## Haplotype PCA and AF for random unselected regions

In [None]:
# Reproducibly draw 100 windows on chromosome 1 whose direction is "None",
# keep their coordinates only, and sort them by position
set.seed(42)
random_unsel_regions <- obs_xpnsl_ur_df_filt %>% 
    filter(direction == "None", Chr == 1) %>% 
    dplyr::select(Chr, start, end) %>% 
    sample_n(size = 100) %>% 
    arrange(Chr, start)

In [None]:
# Display the sampled windows (Chr, start, end; sorted by Chr then start)
random_unsel_regions

In [None]:
# Extract the variants falling inside a set of windows and stack them into a
# single VCF object.
#
# Arguments:
#   df    - data frame with `start` and `end` columns, one row per window.
#   chrom - chromosome name used to pick the VCF out of
#           snakemake@input[["vcfs"]] (default "Chr01_Occ", the value that was
#           previously hard-coded). Only used when `vcf` is NULL.
#   vcf   - optional, already-loaded vcfR object to subset instead of reading
#           a file.
#
# Returns the row-bound vcfR subsets, or NULL when no window contains a variant.
# Windows without any variant are skipped with a warning; previously they
# aborted the whole call with "argument of length 0".
subset_vcf <- function(df, chrom = "Chr01_Occ", vcf = NULL){
    if (is.null(vcf)) {
        vcf_idx <- grep(chrom, snakemake@input[["vcfs"]])
        if (length(vcf_idx) != 1) {
            stop("Expected exactly one VCF matching '", chrom, "', found ",
                 length(vcf_idx), call. = FALSE)
        }
        vcf <- vcfR::read.vcfR(snakemake@input[["vcfs"]][vcf_idx])
    }

    starts <- df$start
    ends <- df$end
    # Positions are loop-invariant, so extract them once
    positions <- vcfR::getPOS(vcf)

    # seq_along() is safe for zero windows, unlike 1:length()
    sub_vcfs <- lapply(seq_along(starts), function(i) {
        idx <- which(positions >= starts[i] & positions <= ends[i])
        if (length(idx) == 0) {
            return(NULL)
        }
        # Contiguous run of records from the first to the last match
        vcf[head(idx, n = 1):tail(idx, n = 1)]
    })

    is_empty <- vapply(sub_vcfs, is.null, logical(1))
    if (any(is_empty)) {
        warning(sum(is_empty), " window(s) contained no variants and were skipped",
                call. = FALSE)
    }
    sub_vcfs <- sub_vcfs[!is_empty]
    if (length(sub_vcfs) == 0) {
        return(NULL)
    }

    do.call(rbind, sub_vcfs)
}

# Run subset_vcf once per chromosome group, then stack the per-group results
random_unsel_regions_vcf_list <- purrr::map(
    group_split(random_unsel_regions, Chr),
    subset_vcf
)
random_unsel_regions_vcf <- do.call(rbind, random_unsel_regions_vcf_list)

In [None]:
# Display the combined VCF object built from the sampled windows
random_unsel_regions_vcf

In [None]:
# Run genotype_plot in allele-frequency mode on the sampled windows
random_unsel_regions_AF <- genotype_plot(
    vcf_object = random_unsel_regions_vcf,
    popmap = popmap,
    snp_label_size = 500000,
    plot_allele_frequency = TRUE
)

In [None]:
# Two-colour palette applied to the `pop` levels
cols_hab <- c("#007243", "#003876")

# Density of the AF column from the allele-frequency plot data, one curve per pop
random_unsel_regions_af <- ggplot(random_unsel_regions_AF$genotypes$data, aes(x = AF)) +
    geom_density(aes(color = pop, fill = pop), alpha = 0.7) +
    scale_color_manual(values = cols_hab) +
    scale_fill_manual(values = cols_hab) +
    labs(x = "Frequency of REF allele", y = "Density") +
    theme_classic() +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
random_unsel_regions_af

ggsave(filename = snakemake@output[["random_unsel_regions_af"]], plot = random_unsel_regions_af,
       device = "pdf", width = 8, height = 6, units = "in", dpi = 600)

In [None]:
# Cluster phased haplotypes using the variants from the sampled windows
random_unsel_regions_clust <- genotype_plot(
    vcf_object = random_unsel_regions_vcf,
    popmap = popmap,
    snp_label_size = 200000,
    cluster = TRUE,
    plot_phased = TRUE
)

In [None]:
# Percent variance of the first two PCA dimensions, rounded to one decimal
var_PC1 <- round(get_eigenvalue(random_unsel_regions_clust$cluster_pca)["Dim.1", "variance.percent"], 1)
var_PC2 <- round(get_eigenvalue(random_unsel_regions_clust$cluster_pca)["Dim.2", "variance.percent"], 1)

# Scatter of the RS1/RS2 columns of the PCA's `l1` table. Row names are split
# into the individual ID (all but the last two characters) and the haplotype
# (last character); the ID is used to attach `pop` from the popmap.
random_unsel_regions_pca <- random_unsel_regions_clust$cluster_pca$l1 %>% 
    as.data.frame() %>% 
    rownames_to_column(var = "tmp") %>% 
    dplyr::transmute(ind = str_sub(tmp, 1, -3), hap = str_sub(tmp, -1), RS1, RS2) %>% 
    left_join(popmap, by = "ind") %>% 
    ggplot(aes(x = RS1, y = RS2)) +
    geom_point(aes(color = pop, shape = pop), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    theme_classic() +
    labs(x = paste0('PC1 (', var_PC1, '%)'), y = paste0('PC2 (', var_PC2, '%)')) +
    theme(
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)
    ) +
    transp_theme
random_unsel_regions_pca

ggsave(filename = snakemake@output[["random_unsel_regions_pca"]], plot = random_unsel_regions_pca,
       device = "pdf", width = 8, height = 6, units = "in", dpi = 600)