In [2]:
library(tidyverse)
library(data.table)
library(ggridges)
library(wesanderson)

# HCN Loci Fst outlier test

- Use per-city estimates of Ac and Li deletion frequencies to estimate Hudson's Fst
- Compare Fst at the HCN loci to the distribution of Fst values at 4-fold degenerate sites

## Estimate Fst at Ac and Li

- Load in genotype likelihoods for both loci and estimate allele frequencies from them
- Estimate Hudson's Fst
- Return a single dataframe with Fst estimates for both loci and for all cities

In [88]:
# Sample sheet; load_gls() joins it onto the genotype likelihoods by `sample`
habitat_info <- suppressMessages(
    read_delim('../../sequencing-prep/resources/low1_sampleSheet.txt', delim = '\t')
)

# Per-city logistic regression coefficients; estimate_p() joins them by `city`
betaLog <- suppressMessages(
    read_csv('../../phenotypic-analyses/analysis/supplementary-tables/allCities_logisticReg_coefs.csv')
)

# Per-city summary statistics, reduced to the city name and mean HCN frequency
cyan <- suppressMessages(
    read_csv('../../phenotypic-analyses/analysis/supplementary-tables/allCities_stats.csv')
) %>%
    dplyr::select(city, meanHCN)

In [111]:
# Read a tab-delimited table of per-sample genotype likelihoods.
#
# inpath: path to the genotype likelihood file
#
# Keeps `sample` plus every column whose name ends in 'norm', removes the
# excluded samples listed below, and attaches the columns of the global
# `habitat_info` table by `sample`.
load_gls <- function(inpath){

    # Samples dropped from every table (the call site mentions a high
    # alignment error rate for these five individuals)
    excluded <- c('Calgary_25_13', 'Linkoping_8_8', 'Melbourne_2_3', 'Paris_15_14', 'Paris_3_9')

    gls <- suppressMessages(read_delim(inpath, delim = '\t'))
    gls <- dplyr::select(gls, sample, ends_with('norm'))
    gls <- filter(gls, !(sample %in% excluded))

    left_join(gls, habitat_info, by = 'sample')
}

# Hudson's Fst between two populations (vectorised over its inputs).
#
# p_u, p_r: allele frequency in the urban and rural population
# n_u, n_r: sample size in the urban and rural population
#
# Returns the ratio of the sample-size-corrected squared frequency difference
# to p_u * (1 - p_r) + p_r * (1 - p_u). The result is NaN when both the
# numerator and denominator are 0 (e.g. both frequencies are 0).
hudson_fst <- function(p_u, p_r, n_u, n_r){

    # Finite-sample correction terms, one per population
    corr_u <- (p_u * (1 - p_u)) / (n_u - 1)
    corr_r <- (p_r * (1 - p_r)) / (n_r - 1)

    numerator <- (p_u - p_r)^2 - corr_u - corr_r
    denominator <- p_u * (1 - p_r) + p_r * (1 - p_u)

    numerator / denominator
}

# Estimate the frequency of the `a` allele from normalised genotype likelihoods.
#
# df:      table as returned by load_gls(); must contain `city`, `site`,
#          `l_aa_norm`, `l_Aa_norm` and `l_AA_norm`
# by_site: if TRUE (default), estimate frequencies separately for each `site`
#          within a city and add Hudson's Fst between them; otherwise estimate
#          one frequency per city
#
# Returns an ungrouped tibble:
#   by_site = TRUE  -> one row per city with n_<site> and p_<site> columns,
#                      `fst`, and the columns of the global `betaLog` table
#                      (joined by `city`). `site` must take the values 'u' and
#                      'r' so that p_u, p_r, n_u and n_r exist.
#   by_site = FALSE -> one row per city with columns `city` and `p`.
estimate_p <- function(df, by_site = TRUE){

    # Shared by both branches: sum the likelihoods within each group and turn
    # them into an allele frequency (aa total plus half the Aa total, over the
    # grand total).
    summarise_freq <- function(grouped_df){
        grouped_df %>%
            summarise(num_aa = sum(l_aa_norm),
                      num_Aa = sum(l_Aa_norm),
                      num_AA = sum(l_AA_norm),
                      # Two allele copies per row; assumes one row per diploid
                      # individual -- TODO confirm
                      n = 2 * n(),
                      p = (num_aa + (0.5 * num_Aa)) / sum(num_aa, num_Aa, num_AA)) %>%
            # summarise() only peels off the last grouping level; drop the rest
            # so callers never receive a silently grouped tibble
            ungroup() %>%
            dplyr::select(-starts_with('num_'))
    }

    if (isTRUE(by_site)) {
        df_out <- df %>%
            group_by(city, site) %>%
            summarise_freq() %>%
            pivot_wider(names_from = site, values_from = c('n', 'p')) %>%
            mutate(fst = hudson_fst(p_u, p_r, n_u, n_r)) %>%
            left_join(betaLog, by = 'city')
    } else {
        df_out <- df %>%
            group_by(city) %>%
            summarise_freq() %>%
            # The allele-copy count is only needed for Fst
            dplyr::select(-n)
    }

    df_out
}


In [112]:
# Genotype likelihoods for all retained samples at the Ac and Li loci.
# load_gls() already removes the five excluded individuals.
ac_gls <- '../results/hcn_genotyping/ac_GLs.txt' %>% load_gls()
li_gls <- '../results/hcn_genotyping/li_GLs.txt' %>% load_gls()

In [113]:
# Per-city urban and rural allele frequencies (with Fst) at each locus,
# tagged with the locus they came from
ac_freq <- ac_gls %>%
    estimate_p(by_site = TRUE) %>%
    mutate(locus = 'ac')
li_freq <- li_gls %>%
    estimate_p(by_site = TRUE) %>%
    mutate(locus = 'li')

In [114]:
# Stack the Ac and Li tables and order the rows by city
all_freqs <- arrange(bind_rows(ac_freq, li_freq), city)
head(all_freqs)

## Correlation between Fst and strength of cline

In [139]:
# Regress cline slope (betaLog) on Fst separately for each locus and return
# the tidied coefficient table. group_modify() replaces the superseded do().
all_freqs %>%
    group_by(locus) %>%
    group_modify(function(locus_df, key) {
        broom::tidy(lm(betaLog ~ fst, data = locus_df))
    })

In [135]:
# Cline slope against Fst, one panel per locus, with a least-squares line.
# ggplot() ignores dplyr grouping, so no group_by() is needed before plotting;
# facet_wrap() does the per-locus split.
all_freqs %>%
    ggplot(aes(x = fst, y = betaLog)) +
    geom_point(size = 3, color = 'black') +
    geom_smooth(method = 'lm', se = FALSE, color = 'black') +
    facet_wrap(~locus) +
    labs(x = 'Fst', y = 'Slope of cline (logOdds)') +
    theme_classic()

## Correlation in mean HCN frequencies

- Correlate the city-wide HCN frequencies from the phenotyping with the estimated HCN frequencies from the genotype likelihoods at both loci. 

In [131]:
# City-wide allele frequencies at both loci, alongside the phenotyped mean
# HCN frequency for the same city
ac_freq_wholeCity <- estimate_p(ac_gls, by_site = FALSE) %>% mutate(locus = 'ac')
li_freq_wholeCity <- estimate_p(li_gls, by_site = FALSE) %>% mutate(locus = 'li')
all_freqs_wholeCity <- bind_rows(ac_freq_wholeCity, li_freq_wholeCity) %>%
    # One row per city with the two frequencies in columns `ac` and `li`
    pivot_wider(names_from = locus, values_from = p) %>%
    # `city` is the only column shared with `cyan`; name the key explicitly so
    # the join cannot silently pick up extra columns
    left_join(cyan, by = 'city') %>%
    # Expanded form of (1 - ac^2) * (1 - li^2).
    # NOTE(review): presumably the expected HCN+ frequency under Hardy-Weinberg
    # proportions and independence between loci -- confirm
    mutate(hcn_est = 1 - (ac^2 + li^2 - (ac^2 * li^2)))
all_freqs_wholeCity

In [132]:
# Phenotyped mean HCN frequency against the frequency derived from the
# genotype likelihoods, one point per city, with a least-squares line
ggplot(all_freqs_wholeCity, aes(x = meanHCN, y = hcn_est)) +
    geom_point(size = 3, color = 'black') +
    geom_smooth(method = 'lm', se = FALSE, color = 'black', size = 1.5) +
    labs(x = 'Mean HCN frequency from phenotyping',
         y = 'Estimated HCN frequency from allele frequencies') +
    theme_classic()

# Correlation coefficient between the two estimates
print(with(all_freqs_wholeCity, cor(meanHCN, hcn_est)))

## Fst outlier analysis

- Compare Fst at Ac and Li loci to distribution of Fst values along same chromosomes

In [125]:
# Read one headerless, tab-separated per-site Fst file.
#
# path: file path relative to the global `inpath`; its directory part is used
#       as the city label
#
# Returns the table with columns chrom, pos, num, denom, plus `fst`
# (num / denom after capping negative numerators at 0) and `city`.
load_obs_fst <- function(path){

    # City label comes from the directory component of the relative path
    city_label <- dirname(path)

    site_cols <- c('chrom', 'pos', 'num', 'denom')
    sites <- suppressMessages(
        fread(paste0(inpath, path), sep = '\t', header = FALSE, col.names = site_cols)
    )

    # Cap numerators at 0 if negative
    # https://github.com/ANGSD/angsd/issues/309 # Does not affect overall pattern
    sites %>%
        mutate(num = ifelse(num < 0, 0, num),
               fst = num / denom,
               city = city_label)
}

In [126]:
# Root directory of the per-city Fst files; load_obs_fst() reads this global
inpath <- '../results/angsd/summary_stats/fst/fst1/'

# Find every matching file beneath `inpath` (searching subdirectories) and
# row-bind the loaded tables into one data frame
fst_df <- map_dfr(
    list.files(inpath, pattern = '.*_r_u_fst1_readable.fst', recursive = TRUE),
    load_obs_fst
)

In [131]:
# Ridgeline densities of per-site urban-rural Fst, one ridge per city
pal <- wes_palette('Zissou1', 26, type = 'continuous') 
fst_df %>%
    # The pipe after filter() was missing, so ggplot() never received the data.
    # NOTE(review): the cutoff `0.` looks truncated. load_obs_fst() caps
    # numerators at 0, so `fst < 0.` is expected to keep almost no rows --
    # confirm the intended threshold.
    filter(fst < 0.) %>%
    ggplot(., aes(x = fst, y = city)) +
    geom_density_ridges(scale = 1, show.legend = FALSE) +
#     geom_segment(data = obs_fst_df, aes(x = fst, xend = fst, y = as.numeric(as.factor(city)),
#                                              yend = as.numeric(as.factor(city)) + 0.9),
#                 color = 'black', size = 0.9, linetype = 'dotted') +
    ylab('City') + xlab('Urban-rural Fst') +
    # NOTE(review): no `fill` aesthetic is mapped above, so this scale has no
    # visible effect as written
    scale_fill_manual(values = pal) +
#     coord_cartesian(xlim = c(0, 0.22)) +
#     scale_x_continuous(breaks = seq(from = 0, to = 0.2, by = 0.05)) +
    theme_classic() + theme(axis.line.x = element_line(color="black",size=1),
          axis.line.y = element_line(color="black",size=1),
          axis.ticks=element_line(color="black"),
          axis.text=element_text(color="black",size=15),
          axis.title=element_text(color="black",size=1),
          axis.title.y=element_text(vjust=2,size=17),
          axis.title.x=element_text(vjust=0.1,size=17),
          axis.text.x=element_text(size=13),
          axis.text.y=element_text(size=13),
          strip.text.x = element_text(size = 10, colour = "black",face = "bold"))

In [151]:
# Per-site Fst against position for the Toronto rows
ggplot(filter(fst_df, city == 'Toronto'), aes(x = pos, y = fst)) +
    geom_point(size = 0.5, alpha = 0.5) +
    theme_classic()

In [136]:
# Toronto sites whose Fst exceeds 0.5
fst_df %>%
    filter(city == 'Toronto', fst > 0.5)

In [147]:
# 2.5% and 97.5% quantiles of the non-zero, non-missing Fst values in Toronto
fst_df %>%
    filter(city == 'Toronto' & fst != 0) %>%
    filter(!(is.na(fst))) %>%
    pull(fst) %>%
    quantile(probs = c(0.025, 0.975))