# Setup

In [None]:
# Load required packages
library(tidyverse)
library(bedtoolsr)

In [None]:
# Load dataframes with windowed sweep statistics
# Read the two tab-delimited tables of windowed statistics supplied by Snakemake:
# the 'fst' input and the 'xpnsl' input
win_sfs_df <- read_delim(snakemake@input[["fst"]], delim = "\t")
win_xpnsl_df <- read_delim(snakemake@input[["xpnsl"]], delim = "\t")

In [None]:
# Preview the first six rows of the table read from the 'fst' input
head(win_sfs_df, n = 6L)

In [None]:
# Preview the first six rows of the table read from the 'xpnsl' input
head(win_xpnsl_df, n = 6L)

# Analysis of selective sweeps 

## XP-nSL

- I'll start by looking at genome-wide XP-nSL values, estimated in 50 kb windows
- In each window, I estimated the mean XP-nSL score and the proportion of scores that are either above 2 or less than -2.
- Positive outlier windows are those that are in the top 1% of the genome-wide mean XP-nSL distribution AND the top 1% of the distribution of the proportion of scores > 2. These windows represent positive selection in urban habitats.
- Negative outlier windows are those that are in the bottom 1% of the genome-wide mean XP-nSL distribution AND the top 1% of the distribution of the proportion of scores < -2. These windows represent positive selection in rural habitats.
- I'll look at the top 10 windows under selection for urban and rural habitats separately.
- I'll merge consecutive outlier windows into larger outlier regions

In [None]:
# Histogram of the per-window site count (column `n`), with a red reference
# line drawn at n = 10
ggplot(win_xpnsl_df, aes(x = n)) +
    geom_histogram(bins = 50, color = 'black', fill = 'white') +
    geom_vline(xintercept = 10, color = 'red') +
    theme_classic()

In [None]:
# Require at least this many sites in a window
nSites_thresh <- 10

# Coerce every column except `Chr` to numeric, then drop windows with too few sites
win_xpnsl_df_filt <- win_xpnsl_df %>%
    mutate(across(-Chr, as.numeric)) %>%
    filter(n >= nSites_thresh)

# Get critical values for mean XP-nSL score (1st and 99th percentiles) and for the
# proportions of scores greater than 2 / less than -2 (99th percentile each)
xpnsl_score_quant_filt <- quantile(win_xpnsl_df_filt %>% pull(mean), probs = c(0.01, 0.99))
xpnsl_gtprop_quant_filt <- quantile(win_xpnsl_df_filt %>% pull(gt_frac), probs = 0.99)
xpnsl_ltprop_quant_filt <- quantile(win_xpnsl_df_filt %>% pull(lt_frac), probs = 0.99)

# Identify outliers and add as categorical variable to windows dataframe
win_xpnsl_df_filt <- win_xpnsl_df_filt %>%
    mutate(
        # 1 if the window mean falls in either tail of the mean-score distribution
        xpnsl_score_outlier = ifelse(mean <= xpnsl_score_quant_filt[1] | mean >= xpnsl_score_quant_filt[2], 1, 0),
        # 1 if the window is in the upper tail of gt_frac / lt_frac, respectively
        xpnsl_gtprop_outlier = ifelse(gt_frac >= xpnsl_gtprop_quant_filt, 1, 0),
        xpnsl_ltprop_outlier = ifelse(lt_frac >= xpnsl_ltprop_quant_filt, 1, 0),
        # A window is only assigned a direction when the score outlier and the
        # matching proportion outlier agree in sign
        direction = case_when(
            xpnsl_score_outlier == 1 & mean > 0 & xpnsl_gtprop_outlier == 1 ~ 'Urban sel',
            xpnsl_score_outlier == 1 & mean < 0 & xpnsl_ltprop_outlier == 1 ~ 'Rural sel',
            TRUE ~ 'Not outlier'
        )
    ) %>%
    mutate(
        # Proportion relevant to the assigned direction. The fallback is a typed
        # NA_real_ so every case_when() branch is double, which also works on
        # dplyr versions that require identical branch types.
        prop_outlier = case_when(
            direction == 'Urban sel' ~ gt_frac,
            direction == 'Rural sel' ~ lt_frac,
            TRUE ~ NA_real_
        )
    )

In [None]:
# Function to add ranks to XP-nSL windows based on the proportion of outlier scores
#
# df: data frame containing a `prop_outlier` column.
# Returns `df` sorted by decreasing `prop_outlier` with an added integer column
# `xpnsl_rank`, where 1 is the window with the largest `prop_outlier`.
add_xpnsl_ranks <- function(df){

    df_out <- df %>%
        arrange(desc(prop_outlier)) %>%
        # row_number() yields a zero-length vector for zero-row input, whereas
        # 1:n() would yield c(1, 0) and make mutate() fail
        mutate(xpnsl_rank = row_number())
    return(df_out)
}

# Keep only windows assigned to a selection direction, split them by direction,
# rank each subset independently, and row-bind the ranked subsets
win_xpnsl_df_filt_outliers_with_ranks <- purrr::map_dfr(
    group_split(
        filter(win_xpnsl_df_filt, direction %in% c('Urban sel', 'Rural sel')),
        direction
    ),
    add_xpnsl_ranks
)

In [None]:
# List the column names of the ranked outlier table
colnames(win_xpnsl_df_filt_outliers_with_ranks)

In [None]:
# Function to merge consecutive outlier windows
#
# df: ranked XP-nSL outlier windows for a SINGLE selection direction
#     ('Urban sel' or 'Rural sel'); must contain Chr, start, end, direction,
#     prop_outlier and the columns addressed positionally below.
# Returns a data frame of merged regions with the columns listed in `col_names`
# plus `win_size` (end - start). Also prints a one-line before/after summary.
# Stops with an error if `df` does not contain exactly one supported direction.
merge_xpnsl_windows <- function(df){

    sel_dir <- df %>% pull(direction) %>% unique()

    # Fail early and clearly: the summary columns/operations below are only
    # defined for a single, known direction
    if(length(sel_dir) != 1 || !(sel_dir %in% c('Urban sel', 'Rural sel'))){
        stop(sprintf("merge_xpnsl_windows() expects windows from exactly one of 'Urban sel' or 'Rural sel'; got: %s",
                     paste(sel_dir, collapse = ', ')),
             call. = FALSE)
    }

    # Put the interval columns first and sort by position before merging
    df_sorted <- df %>%
        dplyr::select(Chr, start, end, everything()) %>%
        arrange(Chr, start) %>%
        mutate(prop_outlier = round(prop_outlier, 3))

    col_names <- c('Chr', 'start', 'end', 'mean_xpnsl', 'min_max', 'direction', 'mean_prop_outlier', 'all_prop_outlier', 'min_xpnsl_rank', 'all_xpnsl_ranks')

    # Columns are addressed by POSITION in df_sorted, paired one-to-one with the
    # operations. NOTE(review): judging by `col_names`, 6 is presumably the mean
    # score, 7/8 the per-window max/min score, and 17/18/19 direction,
    # prop_outlier and xpnsl_rank. Confirm against the input schema, since any
    # change in column order silently changes what is summarised.
    if(sel_dir == 'Urban sel'){
        cols <- c('6,7,17,18,18,19,19')
        operation <- c('mean,max,distinct,mean,collapse,min,collapse')
    }else{
        cols <- c('6,8,17,18,18,19,19')
        operation <- c('mean,min,distinct,mean,collapse,min,collapse')
    }

    df_merged  <- bt.merge(i = df_sorted, c = cols, o = operation)
    names(df_merged) <- col_names
    df_merged <- df_merged %>%
        mutate(win_size = end - start)

    print(sprintf('%s: There were %s XP-nSL outlier windows prior to merging. There are %s outlier regions after merging consecutive outlier windows', sel_dir, nrow(df_sorted), nrow(df_merged)))
    return(df_merged)
}

# Merge outlier windows separately within each selection direction and
# row-bind the merged regions
win_xpnsl_df_filt_outliers_with_ranks_merged <- purrr::map_dfr(
    group_split(win_xpnsl_df_filt_outliers_with_ranks, direction),
    merge_xpnsl_windows
)

In [None]:
# Regions whose best-ranked window is in the top 10, with columns reordered
# for display and rows ordered by rank within each direction
xpnsl_top10_urban_rural <- win_xpnsl_df_filt_outliers_with_ranks_merged %>%
    filter(min_xpnsl_rank <= 10) %>%
    dplyr::select(
        Chr, start, end, win_size, direction,
        mean_xpnsl, min_max,
        all_prop_outlier, mean_prop_outlier,
        all_xpnsl_ranks, min_xpnsl_rank
    ) %>%
    group_by(direction) %>%
    arrange(min_xpnsl_rank, .by_group = TRUE) %>%
    ungroup()
xpnsl_top10_urban_rural

## Fst

- I estimated Fst and the urban-rural difference in pi and Tajima's D in 50 kb windows across the genome
- I'll consider windows in the top 1% of the genome-wide empirical Fst distributions as outliers, and further support these outliers if these windows are also in the top 1% of the genome-wide difference in pi and Td distributions
- I'll look at overlap between the Fst and XP-nSL outlier windows

In [None]:
# Histogram of the `nSites_fst` column, with a red reference line drawn at 1500
ggplot(win_sfs_df, aes(x = nSites_fst)) +
    geom_histogram(bins = 50, color = 'black', fill = 'white') +
    geom_vline(xintercept = 1500, color = 'red') +
    scale_y_continuous(expand = c(0, 0)) +
    theme_classic()

In [None]:
# Identify outliers across genome

# Keep only windows in which EVERY `nSites*` column reaches this threshold
nSites_thresh <- 1500
win_sfs_df_filt <- win_sfs_df %>%
    filter(if_all(starts_with('nSites'), ~ .x >= nSites_thresh))

# Critical values from the filtered windows: upper 1% for Fst, both 1% tails
# for the difference columns delta_tp_ur and delta_td_ur
fst_quant_filt <- quantile(win_sfs_df_filt %>% pull(fst), probs = c(0.99))
tp_quant_filt <- quantile(win_sfs_df_filt %>% pull(delta_tp_ur), probs = c(0.01, 0.99))
td_quant_filt <- quantile(win_sfs_df_filt %>% pull(delta_td_ur), probs = c(0.01, 0.99))

# Flag outliers on the FILTERED windows. Starting from the unfiltered
# `win_sfs_df` here would silently discard the nSites filter and allow windows
# with too few sites to be called outliers.
win_sfs_df_filt <- win_sfs_df_filt %>%
    mutate(fst_outlier = ifelse(fst >= fst_quant_filt, 1, 0),
           tp_outlier = ifelse(delta_tp_ur <= tp_quant_filt[1] | delta_tp_ur >= tp_quant_filt[2], 1, 0),
           td_outlier = ifelse(delta_td_ur <= td_quant_filt[1] | delta_td_ur >= td_quant_filt[2], 1, 0),
           # 1 only when all three statistics flag the window
           all_outlier = ifelse(fst_outlier == 1 & tp_outlier == 1 & td_outlier == 1, 1, 0)) %>%
    dplyr::select(chrom_pos, Chr, start, end, WinCenter, fst, delta_tp_ur, delta_td_ur, contains('_outlier'))

In [None]:
# Number of windows for each combination of the three outlier flags
summarise(
    group_by(win_sfs_df_filt, fst_outlier, tp_outlier, td_outlier),
    n = n()
)

In [None]:
# Label each window by the sign of the two difference columns:
# both negative -> 'Urban sel', both positive -> 'Rural sel', anything else -> 'Weird'
win_sfs_df_filt <- mutate(
    win_sfs_df_filt,
    direction = case_when(
        delta_td_ur < 0 & delta_tp_ur < 0 ~ 'Urban sel',
        delta_td_ur > 0 & delta_tp_ur > 0 ~ 'Rural sel',
        TRUE ~ 'Weird'
    )
)

In [None]:
# Count windows flagged by all three statistics, per direction
summarise(
    group_by(filter(win_sfs_df_filt, all_outlier == 1), direction),
    n = n()
)

In [None]:
# Reduce a table of genomic intervals to its three coordinate columns.
#
# df: data frame with `Chr`, `start` and `end` columns.
# Returns `df` restricted to those columns, in that order.
get_positions <- function(df){
    dplyr::select(df, Chr, start, end)
}

# Coordinate-only tables for the top XP-nSL regions, the Fst outlier windows,
# and the windows flagged by all three statistics
xpnsl_top10_regions_pos <- xpnsl_top10_urban_rural %>%
    get_positions()
fst_outliers_windows_pos <- win_sfs_df_filt %>%
    filter(fst_outlier == 1) %>%
    get_positions()
fst_pi_td_outliers_windows_pos <- win_sfs_df_filt %>%
    filter(all_outlier == 1) %>%
    get_positions()

In [None]:
# Count how many Fst outlier windows overlap each XP-nSL region.
#
# xpnsl_pos: three-column table (Chr, start, end) of XP-nSL regions.
# fst_pos:   three-column table (Chr, start, end) of Fst outlier windows.
# Returns one row per XP-nSL region with columns Chr, xpnsl_win_start,
# xpnsl_win_end and num_fst_outliers (the overlap count).
get_overlapping_fst_windows <- function(xpnsl_pos, fst_pos){

    # Count number of overlapping Fst outlier windows in each XP-nSL window.
    # TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
    df_out <- bt.intersect(xpnsl_pos, fst_pos, c = TRUE)
    names(df_out) <- c('Chr', 'xpnsl_win_start', 'xpnsl_win_end', 'num_fst_outliers')
    return(df_out)
}

# Overlap counts between the top XP-nSL regions and windows flagged by all three statistics
get_overlapping_fst_windows(
    xpnsl_pos = xpnsl_top10_regions_pos,
    fst_pos = fst_pi_td_outliers_windows_pos
)

In [None]:
# Count Fst outlier windows overlapping each top XP-nSL region and attach the
# count to the region table as `num_fst_outliers`.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
xpnsl_fst_overlap <- bt.intersect(xpnsl_top10_urban_rural %>% dplyr::select(Chr, start, end),
                                  win_sfs_df_filt %>% dplyr::select(-chrom_pos) %>% filter(fst_outlier == 1),
                                  c = TRUE)
names(xpnsl_fst_overlap) <- c('Chr', 'start', 'end', 'num_fst_outliers')
xpnsl_top10_urban_rural <- left_join(xpnsl_top10_urban_rural, xpnsl_fst_overlap, by = c('Chr', 'start', 'end'))
xpnsl_top10_urban_rural

In [None]:
# For each direction separately, count windows that are flagged by all three
# statistics AND carry the same direction label, overlapping each top XP-nSL
# region of that direction. The counts are attached as `num_fst_pi_td_outliers`.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
urban_tmp <- bt.intersect(xpnsl_top10_urban_rural %>% filter(direction == 'Urban sel') %>% dplyr::select(Chr, start, end),
                          win_sfs_df_filt %>% dplyr::select(-chrom_pos) %>% filter(all_outlier == 1 & direction == 'Urban sel'),
                          c = TRUE)
rural_tmp <- bt.intersect(xpnsl_top10_urban_rural %>% filter(direction == 'Rural sel') %>% dplyr::select(Chr, start, end),
                          win_sfs_df_filt %>% dplyr::select(-chrom_pos) %>% filter(all_outlier == 1 & direction == 'Rural sel'),
                          c = TRUE)
xpnsl_fst_pi_td_overlap <- bind_rows(urban_tmp, rural_tmp)
names(xpnsl_fst_pi_td_overlap) <- c('Chr', 'start', 'end', 'num_fst_pi_td_outliers')
xpnsl_top10_urban_rural <- left_join(xpnsl_top10_urban_rural, xpnsl_fst_pi_td_overlap, by = c('Chr', 'start', 'end'))
xpnsl_top10_urban_rural

In [None]:
# Write the annotated top-region table, tab-delimited, to the Snakemake 'sel_tbl' output
write_delim(xpnsl_top10_urban_rural, snakemake@output[["sel_tbl"]], delim = "\t")