# Divergence of HCN relative to neutral expectation

- For each city, test whether urban-rural divergence in HCN is greater than expected based on null distribution of randomly selected 4fold degenerate sites with a similar genetic architecture as HCN. 

In [5]:
library(data.table)
library(tidyverse)

In [6]:
load_maf <- function(path){
    base <- basename(path)
    city = str_split(base, '_[u|r]_', simplify = TRUE)[1]
    habitat = str_extract(base, pattern = '(?<=_)[r|u]')
    
    df <- suppressMessages(read_delim(path, delim = '\t')) %>%
        filter(nInd >= 10) %>%
        mutate(city = city, 
               habitat = habitat,
               pos = paste(chromo, position, sep = '_')) %>%
    dplyr::select(city, habitat, pos, everything(), -'pK-EM', -ref, -anc, -chromo, -position) %>%
    return(df)
}

intersect_positions <- function(df){
    
    df_out <- df %>%
        filter(pos %in% (df %>% 
                         group_split(habitat) %>% 
                         map(~pull(., pos)) %>% 
                         reduce(intersect)))
    return(df_out)
}

estimate_mock_hcn <- function(pA, pB){
    qA <- 1 - pA
    qB <- 1 - pB
    acn <- qA^2 + qB^2 - (qA^2 * qB^2)
    hcn <- 1 - acn
    return(hcn)
}

hudson_fst <- function(p_u, p_r, n_u, n_r){
    
    # Numerator
    a <- (p_u - p_r)^2
    b <- (p_u * (1 - p_u)) / (n_u - 1)
    c <- (p_r * (1 - p_r)) / (n_r - 1)
    num <- a - b - c
    
    # Denominator
    d <- p_u * (1 - p_r)
    e <- p_r * (1 - p_u)
    denom <- d + e
    
    fst <- num / denom
    
    return(fst)
}

dxy <- function(p_u, p_r){
    
    
    dxy <- (p_u * (1 - p_r)) + (p_r * (1 - p_u))
    return(dxy)
}

simulate_null <- function(df, stat, nreps = 1000){
    
    city <- df %>% pull(city) %>% unique()
    stats_out <- c()
    
    for(i in 1:nreps){
        # Randomly sample 2 sites
        random_sites <- sample_n(df, size = 2)

        # Get mock "Ac" and "Li" allele frequencies in each habitat
        # Get sample sizes too
        Ac_r <- random_sites[1, 'knownEM_r'] %>% pull()
        Li_r <- random_sites[2, 'knownEM_r'] %>% pull()
        Ac_u <- random_sites[1, 'knownEM_u'] %>% pull()
        Li_u <- random_sites[2, 'knownEM_u'] %>% pull()

        # Habitat sample size will be mean of mock Ac and Li sample sizes
        r_ss <- mean(random_sites[1, 'nInd_r']  %>% pull(), random_sites[2, 'nInd_r'] %>% pull())
        u_ss <- mean(random_sites[1, 'nInd_u']  %>% pull(), random_sites[2, 'nInd_u'] %>% pull())

        # Estimate mock HCN frequency in each habitat
        r_hcn <- estimate_mock_hcn(Ac_r, Li_r)
        u_hcn <- estimate_mock_hcn(Ac_u, Li_u)

        # Estimate differentiation stats as measure of differentiation in mock HCN
        fst <- hudson_fst(u_hcn, r_hcn, u_ss, r_ss)
        dxy <- dxy(u_hcn, r_hcn)
        diff <- u_hcn - r_hcn

        if(stat == 'fst'){
            stat = fst
        }else if(stat == 'dxy'){
            stat = dxy
        }else{
            stat = diff
        }
        stats_out[i] <- stat
    }
    
    out <- tibble(city = city, rep = 1:nreps, stat = stats_out)
    return(out)
}

load_plant_data <- function(city){
    inpath <- '../../phenotypic-analyses/data/clean/individualPlant_allCities/'
    full_path <- paste0(inpath, city, '.csv')
    df <- suppressMessages(read_csv(full_path)) %>%
        dplyr::select(city, population, plant, hcn_result) %>%
        mutate(population = as.character(population),
               plant = as.character(plant))
    if(city == 'Armidale'){
        df <- separate(df, plant, sep = '-', into = c('population', 'plant'))
    }
    return(df) 
}

 # Function to estimate P-values
permutation_pval <- function(df){
    city <- df %>% pull(city) %>% unique()
    obs_val <- df %>% filter(is.na(rep)) %>% pull('stat')
    perm_val <- df %>% filter(!(is.na(rep))) %>% pull('stat')
    pval <- sum(abs(perm_val) >= abs(obs_val)) / length(perm_val)
    df_out <- data.frame(city = city, pval = pval)
    return(df_out)
}

In [3]:
# Load all 4fold allele frequencies as single dataframe
inpath <- '../../../results/angsd/afs/by_city/'
df_afs <- list.files(inpath, full.names = TRUE, recursive = TRUE, pattern = '*.mafs.gz') %>%
    map_dfr(., load_maf)

In [7]:
# Get sites common to urban and rural habitats, seaprately for each city
# Pivot data for easier random site selection
# Use only sites with same major and minor alleles
df_afs_inter <- df_afs %>%
    group_split(city) %>%
    map_dfr(., intersect_positions) %>%
    pivot_wider(names_from = 'habitat', values_from = c('knownEM', 'nInd', 'minor')) %>%
    filter(minor_u == minor_r)  
head(df_afs_inter)

city,pos,major,knownEM_r,knownEM_u,nInd_r,nInd_u,minor_r,minor_u
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
Albuquerque,CM019101.1_17063,A,0.522677,0.633789,13,10,C,C
Albuquerque,CM019101.1_17462,T,0.144811,0.157838,13,13,A,A
Albuquerque,CM019101.1_19481,C,0.741929,0.661749,13,13,A,A
Albuquerque,CM019101.1_19520,G,0.746196,0.806513,13,13,A,A
Albuquerque,CM019101.1_19553,A,0.638462,0.828019,12,10,T,T
Albuquerque,CM019101.1_47203,C,0.880891,0.965221,23,19,G,G


In [33]:
# Load in sample sheet
# Load binomial regression model summaries and predict urban and rural HCN
sample_sheet <- suppressMessages(read_delim('../resources/glue_pc_sampleSheet.txt', delim = '\t'))
cline_models <- suppressMessages(read_csv('../../phenotypic-analyses/analysis/supplementary-tables/allCities_logisticReg_coefs.csv')) %>% 
    filter(city %in% sample_sheet$city) %>% 
    mutate(hcn_u = exp(yint_Dist + betaLog_Dist*0) / (1 + (exp(yint_Dist + betaLog_Dist*0))),
           hcn_r = exp(yint_Dist + betaLog_Dist*1) / (1 + (exp(yint_Dist + betaLog_Dist*1)))) %>% 
    mutate(diff = hcn_u - hcn_r,
           fst = hudson_fst(hcn_u, hcn_r, 10, 10)) %>% 
    mutate(fst = ifelse(fst < 0, 0, fst)) %>% 
    mutate(sig = ifelse(pvalLog_Dist < 0.05, 'Yes', 'No')) %>% 
    dplyr::select(city, sig, hcn_u, hcn_r, diff, fst)

In [34]:
head(cline_models)

city,sig,hcn_u,hcn_r,diff,fst
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Albuquerque,Yes,0.30470333,0.1592258,0.145477556,0.0
Antwerp,Yes,0.09673856,0.4790123,-0.382273783,0.22500845
Armidale,Yes,0.5535439,0.8457958,-0.292251863,0.09387276
Athens,Yes,0.35755317,0.7269097,-0.369356584,0.15734517
Bogota,Yes,0.29629576,0.4935004,-0.197204604,0.0
Buenos_Aires,No,0.75565431,0.7584133,-0.002758981,0.0


In [35]:
 # Data frame with indivudual plant phenotype data for sequenced cities
cities <- df_afs %>% pull(city) %>% unique()
df_allPlants <- map_dfr(cities, load_plant_data) %>%
    mutate(city = case_when(city == 'Toronto' ~ 's',
                            city == 'Buenos_Aires' ~ 'Buen_Air',
                            city == 'Thessaloniki' ~ 'Thessa', 
                            city == 'Christchurch' ~ 'Chrchurch', TRUE ~ city)) %>%
    mutate(sample = paste(city, population, plant, sep = '_')) %>%
    dplyr::select(sample, hcn_result)

In [12]:
set.seed(42)
nulls <- df_afs_inter %>% 
    group_split(city) %>% 
    map_dfr(~simulate_null(., stat = 'fst', nreps = 1000))

In [23]:
combine_stats_sf <- cline_models %>% 
    rename('stat' = 'fst') %>% 
    dplyr::select(city, stat) %>% 
    bind_rows(nulls)

In [36]:
perm_pval_df <- combine_stats_sf %>% 
    group_split(city) %>% 
    purrr::map_dfr(., permutation_pval) %>% 
    left_join(cline_models %>% dplyr::select(city, sig)) %>% 
    mutate(sig_perm = ifelse(pval < 0.05, 'Yes', 'No')) %>% 
    mutate(match_expect = ifelse(sig == sig_perm, 'Yes', 'No'))

Joining, by = "city"


In [37]:
perm_pval_df

city,pval,sig,sig_perm,match_expect
<chr>,<dbl>,<chr>,<chr>,<chr>
Albuquerque,0.647,Yes,No,No
Antwerp,0.031,Yes,Yes,Yes
Armidale,0.16,Yes,No,No
Athens,0.078,Yes,No,No
Bogota,0.632,Yes,No,No
Buenos_Aires,0.752,No,No,Yes
Calgary,0.466,Yes,No,No
Canberra,0.81,No,No,Yes
Cape_Town,0.8,No,No,Yes
Christchurch,0.038,Yes,Yes,Yes


In [38]:
chisq.test(table(perm_pval_df$sig, perm_pval_df$sig_perm))

“Chi-squared approximation may be incorrect”


	Pearson's Chi-squared test with Yates' continuity correction

data:  table(perm_pval_df$sig, perm_pval_df$sig_perm)
X-squared = 2.3805, df = 1, p-value = 0.1229


In [175]:
?chisq.test