## SETUP

In [None]:
# Load required packages
library(tidyverse)
library(broom)

In [None]:
# Reusable ggplot theme layer: transparent backgrounds, no grid lines
transp_theme <- theme(
    # Plot and panel areas: transparent fill, no outline
    plot.background = element_rect(fill = "transparent", colour = NA_character_),
    panel.background = element_rect(fill = "transparent", colour = NA_character_),
    # Grid lines are not drawn
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    # Legend backgrounds and key boxes are not drawn
    legend.key = element_blank(),
    legend.box.background = element_blank(),
    legend.background = element_blank()
)

In [None]:
# Recode the chromosome labels in column `Chr` ('Chr01_Occ' ... 'Chr08_Pall')
# as the numbers 1-16. Each chromosome number takes two consecutive codes: the
# 'Occ' label first, the 'Pall' label second. Any other value becomes NA.
remap_chr_names <- function(df){
    # Labels in code order: Chr01_Occ, Chr01_Pall, Chr02_Occ, ..., Chr08_Pall
    chr_labels <- paste0("Chr0", rep(1:8, each = 2), "_", c("Occ", "Pall"))

    # A label's position in `chr_labels` is its code; as.numeric() stores the
    # codes as doubles rather than integers
    df %>% 
        mutate(Chr = as.numeric(match(Chr, chr_labels)))
}

In [None]:
# Load allele dosages for all chromosomes.
# The `site` column is split into a chromosome label (`Chr`, recoded to a number
# by remap_chr_names()) and a position (`Pos`); `site` itself is then dropped.
dosages <- read_delim(snakemake@input[["dos"]], delim="\t") %>% 
    mutate(Chr = str_extract(site, pattern = "Chr\\d+_(Occ|Pall)")) %>% 
    # str_extract() returns text, so the position is converted to a number here.
    # Left as text, `Pos == pos` against a numeric position is compared as strings,
    # and a number such as 100000 is formatted as "1e+05", which never equals "100000".
    mutate(Pos = as.numeric(str_extract(site, pattern = "(?<=_)\\d+"))) %>% 
    dplyr::select(-site) %>% 
    remap_chr_names()

In [None]:
# Load per-site Fst values for all chromosomes, keeping only rows where
# pop1 is "Urban" and pop2 is "Rural"
fst <- read_delim(snakemake@input[["fst"]], delim="\t") %>% 
    filter(pop1 == "Urban", pop2 == "Rural") %>% 
    rename(Chr = chromosome) %>% 
    remap_chr_names() %>% 
    # Keep chromosome, position (window_pos_1, renamed to pos), and Fst
    dplyr::select(Chr, pos = window_pos_1, avg_hudson_fst)

In [None]:
# Load population metadata
pops <- read_csv(snakemake@config[["pops"]])

# Load sample metadata: one row per sample with its population's metadata
# attached, restricted to samples that have a column in `dosages`
samples <- read_delim(snakemake@config[["samples"]]) %>% 
    dplyr::select(Sample, Population) %>% 
    left_join(pops, by = "Population") %>% 
    filter(Sample %in% colnames(dosages))

In [None]:
# Load top selected regions from outlier analysis, keeping each region's
# chromosome, boundaries, and direction
top_hits <- read_delim(snakemake@input[["top_hits"]], delim="\t") %>% 
    dplyr::select(all_of(c("Chr", "start", "end", "direction")))

## Cline analysis

In [None]:
# Names of the populations represented by more than one sample
pops_multi_ind <- samples %>% 
    count(Population) %>% 
    filter(n > 1) %>% 
    pull(Population)

In [None]:
# Get dataframe with position having maximum Fst within each outlier region
#
# df: a single row describing one outlier region (columns Chr, start, end, direction).
# Reads the global `fst` table.
# Returns the `fst` row with the highest avg_hudson_fst among sites on that
# chromosome with start < pos <= end, with the region's start, end, and direction
# attached. If the region contains no site with a non-missing Fst, a warning is
# raised and zero rows are returned, so the region is absent from the combined output.
get_top_fst <- function(df){
    chrom <- df %>% pull(Chr)
    region_start <- df %>% pull(start)
    region_end <- df %>% pull(end)
    region_direction <- df %>% pull(direction)

    # Sites inside the region that have an Fst value
    region_fst <- fst %>% 
        filter(Chr == chrom & pos > region_start & pos <= region_end) %>% 
        filter(!is.na(avg_hudson_fst))

    if (nrow(region_fst) == 0) {
        # Handled explicitly: max() of an empty vector would return -Inf with an
        # uninformative warning
        warning("No Fst values found for region ", chrom, ":", region_start, "-",
                region_end, "; region dropped from output", call. = FALSE)
        top_site <- region_fst
    } else {
        top_site <- region_fst %>% 
            filter(avg_hudson_fst == max(avg_hudson_fst)) %>% 
            slice(1) # Only take the first if multiple sites have the same Fst
    }

    max_fst <- top_site %>% 
        mutate(Chr = chrom, start = region_start, end = region_end,
               direction = region_direction)
    return(max_fst)
}

# Apply to every outlier region and stack the results
max_fst_df <- top_hits %>% 
    group_split(Chr, start, end) %>% 
    purrr::map_dfr(get_top_fst)

In [None]:
# Save the max-Fst site of each outlier region as a tab-delimited file
write_delim(max_fst_df, snakemake@output[["max_fst_df"]], delim="\t")

In [None]:
# Get minor allele frequency of site with Max Fst in urban, suburban, and rural populations
#
# df: a single row giving the site (Chr, pos) and its metadata (start, end, direction).
# Reads the globals `dosages`, `samples`, `pops`, and `pops_multi_ind`.
# Returns one row per population in `pops_multi_ind` with the summed allele count
# (ac), the frequency (freq = ac / (2 * n)), the number of samples (n), the columns
# of `pops`, and the site metadata.
# Stops with an error if the site is absent from `dosages` or has missing dosages.
create_freq_df <- function(df){

    # Metadata
    chrom <- df %>% pull(Chr)
    region_start <- df %>% pull(start)
    region_end <- df %>% pull(end)
    site_pos <- df %>% pull(pos)
    site_direction <- df %>% pull(direction)

    # Dosages at this site, one row per sample. This does not depend on the
    # population, so it is computed once here rather than once per population.
    site_dosages <- dosages %>% 
        filter(Chr == chrom & Pos == site_pos) %>% 
        pivot_longer(-c(Chr, Pos), names_to = "Sample", values_to = "ac")

    if (nrow(site_dosages) == 0) {
        stop("Site ", chrom, ":", site_pos, " not found in dosages", call. = FALSE)
    }

    # Convert AF to MAF if needed (allele dosage are ALT AF by default)
    overall_af <- sum(site_dosages$ac) / (2 * nrow(site_dosages))
    if (is.na(overall_af)) {
        stop("Missing dosage values at site ", chrom, ":", site_pos, call. = FALSE)
    }
    if (overall_af > 0.5) {
        # Flip the counted allele: dosage 0 <-> 2, dosage 1 unchanged
        site_dosages <- site_dosages %>% 
            mutate(ac = case_when(ac == 0 ~ 2, ac == 1 ~ 1, ac == 2 ~ 0))
    }

    # Function to calculate frequency from dosage in a population
    get_freq <- function(pop){
        pop_samples <- samples %>% 
            filter(Population == pop) %>% 
            pull(Sample)
        n_samples <- length(pop_samples)

        # Two allele copies per sample, hence the 2 * n_samples denominator
        site_dosages %>% 
            filter(Sample %in% pop_samples) %>% 
            summarise(ac = sum(ac), freq = ac / (2 * n_samples)) %>% 
            mutate(Population = pop, n = n_samples)
    }

    # Map populations over function above to get frequency in all populations
    df_out <- pops_multi_ind %>% 
        purrr::map_dfr(get_freq) %>% 
        left_join(pops, by = "Population") %>% 
        mutate(Chr = chrom, start = region_start, end = region_end,
               direction = site_direction, pos = site_pos)

    return(df_out)
}

# Get frequency in all populations for all Max Fst sites
max_fst_freq_by_pop <- max_fst_df %>% 
    group_split(Chr, pos) %>% 
    purrr::map_dfr(create_freq_df)

In [None]:
# Per-population frequencies for 100 randomly drawn sites, built with
# create_freq_df() like the max-Fst sites. Random sites have no region
# (start and end are NA) and carry the direction label "None".
set.seed(42)  # fixed seed so the same sites are drawn on every run
random_sites <- dosages %>% 
    dplyr::select(Chr, pos = Pos) %>% 
    sample_n(100) %>% 
    mutate(start = NA, end = NA, direction = "None")

random_freq_by_pop <- random_sites %>% 
    group_split(Chr, pos) %>% 
    purrr::map_dfr(create_freq_df)

In [None]:
# Plot with allele frequency clines at sites with max Fst.
# For every site, a binomial GLM of frequency against distance (weighted by the
# number of samples per population) is drawn as one line: thin, semi-transparent
# lines for the random sites and thicker lines for the max-Fst sites.
cline_plot <- max_fst_freq_by_pop %>% 
    mutate(site_id = paste0(Chr, "_", pos)) %>% 
    ggplot(., aes(x = Distance, y = freq, weight = n)) +  
        # Random sites (drawn first, underneath the max-Fst sites)
        geom_line(data = random_freq_by_pop %>% mutate(site_id = paste0(Chr, "_", pos)),
                stat = "smooth", 
                method="glm", 
                aes(color = direction, group = site_id),
                linewidth = 0.5,  # `size` is deprecated for lines since ggplot2 3.4
                alpha = 0.75,
                show.legend = TRUE,
                method.args = list(family = "binomial")) +
        # Max-Fst sites
        geom_line(stat = "smooth", 
                method="glm", 
                aes(color = direction, group = site_id),
                linewidth = 1, 
                show.legend = TRUE,
                method.args = list(family = "binomial")) +
        # NOTE(review): create_freq_df() flips the dosages when the overall frequency
        # exceeds 0.5, so the plotted frequency is not always that of the ALT allele.
        # Confirm the y-axis label.
        xlab("Distance from the city center (Km)") + ylab("Frequency of ALT allele") + 
        scale_colour_manual(values = c("grey", "#007243", "#003876")) +
        scale_y_continuous(breaks = seq(from = 0, to = 0.7, by = 0.1)) +
        scale_x_continuous(breaks = seq(from = 0, to = 45, by = 5)) +
        # Zoom without dropping data, so the fits still use all points
        coord_cartesian(ylim = c(0, 0.7), xlim = c(0, 48)) +
        theme_classic() +
        theme(axis.text = element_text(size = 18),
              axis.title = element_text(size = 20),
              legend.position = 'top',
              legend.title = element_text(size = 16),
              legend.text = element_text(size = 14)) +
       transp_theme
# Show the plot; warnings raised while drawing are silenced (presumably the
# non-integer counts warning from the binomial fits; confirm)
suppressWarnings(print(cline_plot))

ggsave(filename = snakemake@output[["cline_plot"]], plot = cline_plot, device = 'pdf', 
       width = 8, height = 8, units = 'in', dpi = 600)

In [None]:
# Per-site binomial GLMs of allele frequency against distance for the max-Fst
# sites, weighted by the number of samples per population. Only the Distance
# term of each model is kept.
selSites_glm_df <- max_fst_freq_by_pop %>% 
    mutate(site_id = paste0(Chr, "_", pos)) %>% 
    # One row per site; that site's rows sit in the list-column `data`
    nest_by(site_id) %>%
    # Fit the model and tidy its coefficient table in a single step
    reframe(broom::tidy(glm(freq ~ Distance, data = data, weights = n, family = 'binomial'))) %>% 
    filter(term == "Distance") 

# Display the table in the notebook
selSites_glm_df

write_delim(selSites_glm_df, snakemake@output[["selSites_glm_df"]], delim="\t")

In [None]:
# Same per-site binomial GLMs for the random sites; show only the sites whose
# Distance term has p < 0.05
random_freq_by_pop %>% 
    mutate(site_id = paste0(Chr, "_", pos)) %>% 
    # One row per site; that site's rows sit in the list-column `data`
    nest_by(site_id) %>%
    reframe(broom::tidy(glm(freq ~ Distance, data = data, weights = n, family = 'binomial'))) %>% 
    filter(term == "Distance", p.value < 0.05)

In [None]:
# Display size for plots (repr.* options; presumably read by the Jupyter R kernel)
options(repr.plot.width = 12, repr.plot.height = 8)

# Similar to cline plot but broken up by habitat: for each site, the mean of the
# per-population frequencies within each habitat, one panel per direction
freq_byHab_plot <- rbind(max_fst_freq_by_pop, random_freq_by_pop) %>% 
    mutate(site_id = paste0(Chr, "_", pos)) %>%
    dplyr::select(site_id, Habitat, Population, freq, direction) %>%
    group_by(site_id, Habitat, direction) %>%
    summarise(mean_freq = mean(freq), .groups = "drop") %>%
    # Fix the habitat order on the x-axis
    mutate(Habitat = factor(Habitat, levels = c("Urban", "Suburban", "Rural"))) %>%
    ggplot(aes(x = Habitat, y = mean_freq, color = direction, group = site_id)) +
        # Same dodge width for lines and points so they stay aligned
        geom_line(linewidth = 1, position = position_dodge(width = 0.25)) +
        geom_point(size = 2, position = position_dodge(width = 0.25)) +
        facet_wrap(~direction) +
        labs(x = "Habitat", y = "Mean allele frequency") +
        scale_colour_manual(values = c("grey", "#007243", "#003876")) +
        theme_classic() +
        theme(axis.title = element_text(size = 20),
              axis.text = element_text(size = 18),
              axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
              strip.text.x = element_text(size = 14),
              legend.position = 'top',
              legend.title = element_text(size = 16),
              legend.text = element_text(size = 14)) +
       transp_theme

ggsave(filename = snakemake@output[["freq_byHab_plot"]], plot = freq_byHab_plot, device = 'pdf', 
       width = 12, height = 8, units = 'in', dpi = 600)

In [None]:
# Absolute difference in mean allele frequency between habitats, per site:
# Urban vs. Suburban and Urban vs. Rural, in long format (one row per site and
# comparison)
freq_diff_df <- rbind(max_fst_freq_by_pop, random_freq_by_pop) %>% 
    mutate(site_id = paste0(Chr, "_", pos)) %>%
    dplyr::select(site_id, Habitat, Population, freq, direction) %>%
    group_by(site_id, Habitat, direction) %>%
    summarise(mean_freq = mean(freq), .groups = "drop_last") %>%
    # One column per habitat so the habitats can be subtracted from each other
    pivot_wider(id_cols = c("site_id", "direction"), names_from = "Habitat", values_from = "mean_freq") %>%
    mutate(`Urban - Suburban` = abs(Urban - Suburban),
           `Urban - Rural` = abs(Urban - Rural)) %>%
    dplyr::select(site_id, direction, `Urban - Suburban`, `Urban - Rural`) %>%
    pivot_longer(c(`Urban - Suburban`, `Urban - Rural`), names_to = "comparison", values_to = "delta_freq") %>%
    ungroup()

# Plot with mean MAF difference: mean +/- standard error across sites for each
# direction and habitat comparison
freq_diff_plot <- freq_diff_df %>%
    group_by(direction, comparison) %>%
    summarise(mean = mean(delta_freq),
              n = n(),
              sd = sd(delta_freq),
              se = sd / sqrt(n),
              .groups = "drop") %>%
    ggplot(aes(x = comparison, y = mean, color = direction)) +
        geom_point(size = 6) +
        geom_errorbar(aes(ymin = mean - se , ymax = mean + se), width = 0.15) +
        facet_wrap(~direction) +
        labs(x = "Habitat comparison", y = "|Allele frequency difference|") +
        scale_colour_manual(values = c("grey", "#007243", "#003876")) +
        theme_classic() +
        theme(axis.title = element_text(size = 20),
              axis.text = element_text(size = 18),
              axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
              strip.text.x = element_text(size = 14),
              legend.position = 'top',
              legend.title = element_text(size = 16),
              legend.text = element_text(size = 14)) +
       transp_theme

ggsave(filename = snakemake@output[["freq_diff_plot"]], plot = freq_diff_plot, device = 'pdf', 
       width = 12, height = 8, units = 'in', dpi = 600)