# Setup

In [None]:
# Load required packages
library(tidyverse)
library(bedtoolsr)

In [None]:
# Read one tab-delimited Snakemake input, looked up by its name in snakemake@input
read_tsv_input <- function(key) {
    read_delim(snakemake@input[[key]], delim = '\t')
}

# Windowed sweep statistics
win_sfs_df <- read_tsv_input('fst')
win_xpnsl_df <- read_tsv_input('xpnsl')

# Per-window percentiles from the permuted distributions
urb_perc <- read_tsv_input('urb_perc')
rur_perc <- read_tsv_input('rur_perc')

# Genome annotation (GFF3), with the coordinate columns moved to the front
gff <- dplyr::select(
    ape::read.gff(snakemake@input[['gff']], GFF3 = TRUE),
    seqid, start, end, everything()
)

In [None]:
# Attach the urban and rural permutation percentiles to the XP-nSL window stats,
# matching windows on chromosome, window ID and coordinates
win_xpnsl_df <- left_join(
    win_xpnsl_df,
    bind_rows(urb_perc, rur_perc),
    by = c("Chr", "winID", "start", "end", "winCenter")
)

In [None]:
# Preview the first rows of the windowed statistics loaded from the 'fst' input
win_sfs_df %>% head()

In [None]:
# Preview the first rows of the XP-nSL window table (now including permutation percentiles)
win_xpnsl_df %>% head()

# Analysis of selective sweeps 

## XP-nSL

- I'll start by looking at genome-wide XP-xSL values, estimated in 50 kb windows
- In each window, I estimated the mean XP-nSL score and the proportion of scores that are either above 2 or less than -2.
- Positive outlier windows are those that are in the top 1% of the genome-wide mean XP-nSL distribution AND the top 1% of distribution of scores > 2. There windows represent positive selection in urban habitats
- Negative outlier windows are those that are in the bottom 1% of the genome-wide XP-xSL distribution AND the top 1% of distribution of scores < -2. There windows represent positive selection in rural habitats.
- I'll look at the top 10 windows under selection for urban and rural habitats separately.
- I'll merge outlier windows within 50Kb of one another into larger outlier regions

In [None]:
# Rank XP-nSL windows by their proportion of outlier scores.
#
# df: data frame with a `prop_outlier` column.
#
# Returns `df` sorted by decreasing `prop_outlier`, with an added integer
# column `xpnsl_rank` (1 = largest proportion of outlier scores).
add_xpnsl_ranks <- function(df){

    df %>% 
        arrange(desc(prop_outlier)) %>% 
        # row_number() instead of 1:n(): 1:n() evaluates to c(1, 0) for a
        # zero-row input, which makes mutate() fail on a length mismatch
        mutate(xpnsl_rank = row_number())
}

# Keep only windows flagged as urban- or rural-selected, then rank them
# separately within each direction and stack the results
win_xpnsl_df_outliers_with_ranks <- purrr::map_dfr(
    group_split(
        filter(win_xpnsl_df, direction %in% c('Urban sel', 'Rural sel')),
        direction
    ),
    add_xpnsl_ranks
)

In [None]:
# List the column names; merge_xpnsl_windows() addresses columns by position
colnames(win_xpnsl_df_outliers_with_ranks)

In [None]:
# Merge consecutive XP-nSL outlier windows of one selection direction into
# larger outlier regions using bedtools merge.
#
# df:       outlier windows for exactly one `direction` ('Urban sel' or
#           'Rural sel'), with columns Chr, start, end, direction, prop_outlier,
#           mean_perm_perc and prop_perm_perc. bedtools is given column
#           *positions*: after Chr/start/end are moved to the front, the columns
#           at positions 6-8 and 17-21 must be the ones summarised below.
#           NOTE(review): positions 6/7/8 are presumably the window's mean, max
#           and min XP-nSL score, and 17-21 direction, prop_outlier,
#           mean_perm_perc, prop_perm_perc and xpnsl_rank -- confirm against
#           names(win_xpnsl_df_outliers_with_ranks).
# win_dist: passed to bedtools merge as `d` (maximum distance between windows
#           that are still merged). Default 0.
#
# Returns a data frame with one row per merged region, the columns listed in
# `col_names`, plus `win_size` (end - start). Prints a one-line summary of the
# number of windows before and after merging.
merge_xpnsl_windows <- function(df, win_dist = 0){

    dir <- df %>% pull(direction) %>% unique()
    # The column/operation lists below are chosen per direction, so the input
    # must hold windows from one direction only
    if(length(dir) != 1){
        stop('merge_xpnsl_windows() expects windows from exactly one direction', call. = FALSE)
    }

    df_sorted <- df %>% 
        dplyr::select(Chr, start, end, everything()) %>% 
        arrange(Chr, start) %>% 
        mutate(prop_outlier = round(prop_outlier, 3),
               mean_perm_perc = round(mean_perm_perc, 3),
               # Round the column itself; it was previously overwritten with
               # the rounded prop_outlier values
               prop_perm_perc = round(prop_perm_perc, 3))

    col_names <- c('Chr', 'start', 'end', 'mean_xpnsl', 'min_max', 'direction', 'mean_prop_outlier', 'all_prop_outlier', 'mean_percentile_permuted', 'prop_percentile_permuted','min_xpnsl_rank', 'all_xpnsl_ranks')
    if(dir == 'Urban sel'){
        # Urban selection: report the maximum of column 7 as `min_max`
        cols <- c('6,7,17,18,18,19,20,21,21')
        operation <- c('mean,max,distinct,mean,collapse,collapse,collapse,min,collapse')
    }else if(dir == 'Rural sel'){
        # Rural selection: report the minimum of column 8 as `min_max`
        cols <- c('6,8,17,18,18,19,20,21,21')
        operation <- c('mean,min,distinct,mean,collapse,collapse,collapse,min,collapse')
    }else{
        stop(sprintf("Unknown direction '%s'; expected 'Urban sel' or 'Rural sel'", dir), call. = FALSE)
    }
    
    df_merged  <- bt.merge(i = df_sorted, c = cols, o = operation, d = win_dist)
    names(df_merged) <- col_names
    df_merged <- df_merged %>% 
        mutate(win_size = end - start)

    print(sprintf('%s: There were %s XP-nSL outlier windows prior to merging. There are %s outlier regions after merging consecutive outlier windows', dir, nrow(df_sorted), nrow(df_merged)))
    df_merged
}

# Merge outlier windows into regions, one selection direction at a time, and
# stack the per-direction results
win_xpnsl_df_outliers_with_ranks_merged <- purrr::map_dfr(
    group_split(win_xpnsl_df_outliers_with_ranks, direction),
    merge_xpnsl_windows
)

In [None]:
# Merged regions whose best-ranked window is within the top 10 of its
# direction, ordered by direction and then by best rank
xpnsl_top10_urban_rural <- win_xpnsl_df_outliers_with_ranks_merged %>% 
    filter(min_xpnsl_rank <= 10) %>% 
    dplyr::select(
        Chr, start, end, win_size, direction,
        mean_xpnsl, min_max,
        all_prop_outlier, mean_prop_outlier,
        mean_percentile_permuted, prop_percentile_permuted,
        all_xpnsl_ranks, min_xpnsl_rank
    ) %>% 
    group_by(direction) %>% 
    arrange(min_xpnsl_rank, .by_group = TRUE) %>% 
    ungroup()

xpnsl_top10_urban_rural

## Fst

- I estimated Fst and the urban-rural difference in pi and Tajima's D in 50 kb windows across the genome
- I'll consider windows in the top 1% of the genome-wide empirical Fst distributions as outliers, and further support these outliers if these windows are also in the top 1% of the genome-wide difference in pi and Td distributions
- I'll look at overlap between the Fst and XP-nSL outlier windows

In [None]:
# Number of windows for every combination of the Fst, pi and Tajima's D
# outlier flags
summarise(
    group_by(win_sfs_df, fst_outlier, tp_outlier, td_outlier),
    n = n()
)

In [None]:
# Number of windows flagged as outliers for all statistics, per direction
win_sfs_df %>% 
    filter(all_outlier == 1) %>% 
    count(direction)

In [None]:
# Display the table of windowed statistics loaded from the 'fst' input
win_sfs_df

In [None]:
# Reduce a table of genomic intervals to its BED-style coordinate columns.
#
# df: data frame with `Chr`, `start` and `end` columns.
#
# Returns only those three columns, in that order.
get_positions <- function(df){
    dplyr::select(df, Chr, start, end)
}

# Coordinates of the top-10 XP-nSL regions
xpnsl_top10_regions_pos <- xpnsl_top10_urban_rural %>% 
    get_positions()

# Coordinates of the Fst outlier windows
fst_outliers_windows_pos <- win_sfs_df %>% 
    filter(fst_outlier == 1) %>% 
    get_positions()

# Coordinates of the windows flagged by `all_outlier`
fst_pi_td_outliers_windows_pos <- win_sfs_df %>% 
    filter(all_outlier == 1) %>% 
    get_positions()

In [None]:
# Count, for every XP-nSL region, how many Fst outlier windows overlap it.
#
# xpnsl_pos: data frame of XP-nSL regions with three columns (Chr, start, end).
# fst_pos:   data frame of Fst outlier windows in the same three-column layout.
#
# Returns a data frame with one row per XP-nSL region and the columns
# Chr, xpnsl_win_start, xpnsl_win_end and num_fst_outliers.
get_overlapping_fst_windows <- function(xpnsl_pos, fst_pos){
    
    # c = TRUE asks bedtools intersect for a per-region overlap count.
    # TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
    df_out <- bt.intersect(xpnsl_pos, fst_pos, c = TRUE)
    names(df_out) <- c('Chr', 'xpnsl_win_start', 'xpnsl_win_end', 'num_fst_outliers')
    df_out
} 

# Overlap of the top-10 XP-nSL regions with windows flagged by `all_outlier`
get_overlapping_fst_windows(
    xpnsl_pos = xpnsl_top10_regions_pos,
    fst_pos = fst_pi_td_outliers_windows_pos
)

In [None]:
# Count the Fst outlier windows overlapping each top-10 XP-nSL region and add
# the count to the top-10 table as `num_fst_outliers`.
# (TRUE is spelled out: `T` is an ordinary, reassignable variable.)
xpnsl_fst_overlap <- bt.intersect(xpnsl_top10_urban_rural %>% dplyr::select(Chr, start, end), 
                                  win_sfs_df %>% dplyr::select(-chrom_pos) %>% filter(fst_outlier == 1), 
                                  c = TRUE)
names(xpnsl_fst_overlap) <- c('Chr', 'start', 'end', 'num_fst_outliers')
xpnsl_top10_urban_rural <- left_join(xpnsl_top10_urban_rural, xpnsl_fst_overlap, by = c('Chr', 'start', 'end'))
xpnsl_top10_urban_rural

In [None]:
# For each direction separately, count the windows flagged by `all_outlier`
# (in that same direction) that overlap each top-10 XP-nSL region.
# (TRUE is spelled out: `T` is an ordinary, reassignable variable.)
urban_tmp <- bt.intersect(xpnsl_top10_urban_rural %>% filter(direction == 'Urban sel') %>% dplyr::select(Chr, start, end), 
                                        win_sfs_df %>% dplyr::select(-chrom_pos) %>% filter(all_outlier == 1 & direction == 'Urban sel'), 
                                        c = TRUE)
rural_tmp <- bt.intersect(xpnsl_top10_urban_rural %>% filter(direction == 'Rural sel') %>% dplyr::select(Chr, start, end), 
                                        win_sfs_df %>% dplyr::select(-chrom_pos) %>% filter(all_outlier == 1 & direction == 'Rural sel'), 
                                        c = TRUE)

# Stack both directions and add the counts to the top-10 table as
# `num_fst_pi_td_outliers`, then order the table by direction and position
xpnsl_fst_pi_td_overlap <- bind_rows(urban_tmp, rural_tmp)
names(xpnsl_fst_pi_td_overlap) <- c('Chr', 'start', 'end', 'num_fst_pi_td_outliers')
xpnsl_top10_urban_rural <- left_join(xpnsl_top10_urban_rural, xpnsl_fst_pi_td_overlap, by = c('Chr', 'start', 'end')) %>% 
    arrange(direction, Chr, start)
xpnsl_top10_urban_rural

# Create output tables

## Dataframe with gene ID in top 10 selected regions

In [None]:
# One row per GFF 'gene' feature: coordinates plus the gene ID and gene symbol
# parsed out of the attributes column (NA where the pattern does not match)
gene_names <- gff %>% 
    filter(type == 'gene') %>% 
    mutate(
        gene_id = str_extract(attributes, '(?<=ID\\=)ACLI19_g\\d+(?=;)'),
        gene = str_extract(attributes, '(?<=gene=)\\w+(?=;)')
    ) %>% 
    dplyr::select(seqid, start, end, gene_id, gene)

# Product description and GO terms per gene, taken from the GFF 'mRNA' features
prod_go_annot <- gff %>% 
    filter(type == 'mRNA') %>% 
    # Transcript ID of the form <gene_id>.t<number>
    mutate(id = str_extract(attributes, '(?<=ID\\=)ACLI19_g\\d+\\.t\\d+(?=;)')) %>% 
    # Product text up to the next ';'
    mutate(func = str_extract(attributes, '(?<=product=)[^;]*')) %>% 
    # All GO terms that are followed by ',' or ';' (list column)
    mutate(go = str_extract_all(attributes, 'GO:\\d+(?=(,|;))')) %>% 
    # Split the transcript ID into gene ID and transcript label, and keep only
    # the transcript labelled t1
    separate(id, into = c('gene_id', 'trans'), sep = '\\.') %>% 
    filter(trans == 't1') %>% 
    dplyr::select(seqid, gene_id, func, go)

# Gene coordinates and symbols joined with their products and GO terms;
# `seqid` is renamed to `Chr` to match the window tables
genes_prods_go_df <- gene_names %>% 
    left_join(prod_go_annot, by = c('seqid', 'gene_id')) %>% 
    rename(Chr = seqid)

In [None]:
# Intersect the top-10 regions with the annotated genes: one row per
# region/gene overlap, with the gene's ID, symbol and product.
#
# Only the columns that end up in the output are handed to bedtools, so the
# V* positions below stay fixed. The previous V5/V17/V18/V19 selection assumed a
# 13-column region table, but num_fst_outliers and num_fst_pi_td_outliers are
# joined onto xpnsl_top10_urban_rural before this point, which shifted the gene
# columns and returned gene start/end/ID under the wrong names.
xpnsl_top10_urban_rural_with_genes_long <- bt.intersect(
    xpnsl_top10_urban_rural %>% dplyr::select(Chr, start, end, direction),
    genes_prods_go_df %>% dplyr::select(Chr, start, end, gene_id, gene, func),
    wa = TRUE, wb = TRUE
) %>% 
    # V1-V4: region Chr, start, end, direction
    # V5-V7: gene Chr, start, end (not reported)
    # V8-V10: gene_id, gene, func
    dplyr::select(V1, V2, V3, V4, V8, V9, V10)
names(xpnsl_top10_urban_rural_with_genes_long) <- c('Chr', 'start', 'end', 'direction', 'gene_id', 'gene_symbol', 'product')
xpnsl_top10_urban_rural_with_genes_long

In [None]:
# Save the per-gene table for the top ten urban and rural regions as a
# tab-delimited file
write_delim(
    xpnsl_top10_urban_rural_with_genes_long,
    snakemake@output[['top_ten_genes']],
    delim = '\t'
)

## Table with gene symbols, products, and Fst for top selected regions

In [None]:
# Comma-separated gene symbols per top-10 region (genes without a symbol are skipped)
symbols <- xpnsl_top10_urban_rural_with_genes_long %>% 
    dplyr::select(-gene_id, -product) %>% 
    filter(!is.na(gene_symbol)) %>% 
    group_by(Chr, start, end, direction) %>% 
    summarise(gene_symbols = toString(gene_symbol)) %>% 
    # summarise() only drops the last grouping level; remove the rest
    ungroup()

# Comma-separated gene products per top-10 region. The comparison also drops
# rows whose product is NA, in addition to 'hypothetical protein'
prods <- xpnsl_top10_urban_rural_with_genes_long %>% 
    dplyr::select(-gene_id, -gene_symbol) %>% 
    filter(product != 'hypothetical protein') %>% 
    group_by(Chr, start, end, direction) %>% 
    summarise(products = toString(product)) %>% 
    ungroup()

# Add symbols and products to the top-10 table. The join keys are stated
# explicitly so that any other column the tables might come to share cannot
# silently become part of the key
xpnsl_top10_urban_rural_with_genes <- xpnsl_top10_urban_rural %>% 
    left_join(symbols, by = c('Chr', 'start', 'end', 'direction')) %>% 
    left_join(prods, by = c('Chr', 'start', 'end', 'direction'))

xpnsl_top10_urban_rural_with_genes

In [None]:
# Save the table with gene symbols, products, and Fst overlaps for the top ten
# selected urban and rural regions as a tab-delimited file
write_delim(
    xpnsl_top10_urban_rural_with_genes,
    snakemake@output[['top_ten_tbl']],
    delim = '\t'
)

## Dataframe with gene IDs in all selected regions based on XP-nSL

In [None]:
# All XP-nSL windows whose direction is anything other than 'Not outlier'
xpnsl_outliers <- win_xpnsl_df %>% 
    filter(direction != 'Not outlier') %>% 
    dplyr::select(Chr, start, end, direction)

# Intersect the outlier windows with the gene table: one row per window/gene overlap.
# V1-V4 are the window's Chr, start, end and direction; V5-V9 are the columns
# of gene_names (seqid, start, end, gene_id, gene), so V8 is the gene ID.
# (TRUE is spelled out: `T` is an ordinary, reassignable variable.)
xpnsl_outliers_with_genes <- bt.intersect(xpnsl_outliers, gene_names, wa = TRUE, wb = TRUE) %>% 
    dplyr::select(V1, V2, V3, V4, V8)
names(xpnsl_outliers_with_genes) <- c('Chr', 'start', 'end', 'direction', 'gene_id')

# Save as a tab-delimited file
write_delim(xpnsl_outliers_with_genes, snakemake@output[['all_xpnsl_sel']], delim = '\t')

In [None]:
# Display the window/gene overlap table that was just written to disk
xpnsl_outliers_with_genes