In [2]:
library(MASS)
library(tidyverse)
library(car)
library(emmeans)
library(wesanderson)
library(ggrepel)
library(fs)
library(ggridges)
library(data.table)

# Pairwise urban-rural pi and Fst within cities

In this notebook, we'll examine urban-rural differences in diversity (theta_pi) and urban-rural Fst across all 26 cities. 

## Description of the data

- Low coverage individuals have been removed from these analyses, so we're using the samples that are part of the *finalSamples_lowCovRemoved* sample set from the previous analyses. 
- All analyses were performed using genome-wide 4-fold degenerate sites.

The basic workflow is as follows:

1. Generate the Site Allele Frequency (SAF) likelihood distribution for each habitat within each city. 

    - Used the same filters as previously, the most important of which is that sites are only included if 50% of individuals have data. Remember, this is 50% of individuals _within_ a habitat; this would correspond to 5 individuals if none have been removed due to low coverage. 

2. To estimate diversity, generate the folded, one-dimensional SFS from the SAF file in step 1 and estimate diversity separately in urban and rural habitats
3. To estimate Fst, generate the folded, two-dimensional joint SFS of urban-rural habitats and estimate Fst. This uses only sites that are shared between both populations (i.e., the intersection of the two SAF files). 

    - For comparison, I estimated Fst using both Weir and Cockerham (1984) and Hudson (1992).
    
4. We also performed a permutation test for pi and Fst in each city by randomly permuting the urban and rural individuals and repeating steps 1 to 3 above.

In [3]:
# Shared ggplot2 theme added to several figures below: blank panel,
# solid black axis lines and ticks, large axis text, and a vertical legend on top
ng1 <- theme(
    # Panel
    aspect.ratio = 0.7,
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    # Axis lines and ticks
    axis.line.x = element_line(color = "black", size = 1),
    axis.line.y = element_line(color = "black", size = 1),
    axis.ticks = element_line(size = 1, color = "black"),
    axis.ticks.length = unit(0.25, 'cm'),
    # Axis text and titles
    axis.text = element_text(color = "black", size = 15),
    axis.title = element_text(color = "black", size = 1),
    axis.title.y = element_text(vjust = 2, size = 17),
    axis.title.x = element_text(vjust = 0.1, size = 17),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    # Facet strips
    strip.text.x = element_text(size = 10, colour = "black", face = "bold"),
    strip.background = element_rect(colour = "black"),
    # Legend
    legend.position = "top",
    legend.direction = "vertical",
    legend.text = element_text(size = 17),
    legend.key = element_rect(fill = "white"),
    legend.title = element_text(size = 17),
    legend.key.size = unit(1.0, "cm")
)

## Pairwise diversity

### Load diversity data

#### Observed

In [3]:
# Load observed diversity estimates for one city/habitat file.
#
# Args:
#   path:     Path to a `*_[ru]_4fold.thetas.idx.pestPG` file, relative to
#             `base_dir`. The name of its parent directory is used as the city.
#   base_dir: Prefix pasted in front of `path` (no separator is added). Defaults
#             to the global `inpath`, looked up when the function is called.
#
# Returns:
#   Data frame with columns city, habitat ('r' or 'u'), tp_scaled (tP / nSites),
#   tw_scaled (tW / nSites) and nSites.
load_obs_pairwise_diversity <- function(path, base_dir = inpath){
    
    # City is the enclosing directory. Habitat is the single letter sitting
    # between an underscore and `_4fold`, so underscores elsewhere in the file
    # name (e.g. inside a city name) cannot be mistaken for the habitat code
    city <- dirname(path)
    site <- str_extract(basename(path), pattern = '(?<=_)[ru](?=_4fold)')
    
    full_path <- paste0(base_dir, path)
    df <- suppressMessages(read_delim(full_path, delim = '\t')) %>% 
        # Scale the theta estimates by the number of sites
        mutate(tp_scaled = tP / nSites,
               tw_scaled = tW / nSites,
               city = city,
               habitat = site) %>% 
    dplyr::select(city, habitat, tp_scaled, tw_scaled, nSites)

    return(df)
    
}

In [4]:
# List the observed per-habitat theta files under `inpath` (searched
# recursively) and row-bind their diversity estimates into one data frame.
# Dots in the pattern are escaped and the pattern is anchored at the end of the
# file name so that only `*_u_4fold.thetas.idx.pestPG` and
# `*_r_4fold.thetas.idx.pestPG` files are picked up
inpath <- '../results/angsd/summary_stats/thetas/by_city/'
obs_div_df <- list.files(inpath, pattern = '_[ur]_4fold\\.thetas\\.idx\\.pestPG$', recursive = TRUE) %>% 
    map_dfr(., load_obs_pairwise_diversity) 

In [5]:
# Preview the first rows of the observed diversity estimates
head(obs_div_df)

In [6]:
# Mean and range of the number of sites across the observed diversity rows
print(mean(obs_div_df$nSites))
print(range(obs_div_df$nSites))

In [7]:
# Rows with the fewest and the most sites
obs_div_df %>% filter(nSites == min(nSites) | nSites == max(nSites))

In [10]:
# Coefficients of the per-city HCN logistic regressions
betaLog_df <- read_csv('../../phenotypic-analyses/analysis/supplementary-tables/allCities_logisticReg_coefs.csv')

# Spread habitats into columns, take the urban - rural difference in pi,
# and attach the cline coefficients
obs_div_df_wide <- obs_div_df %>% 
    pivot_wider(names_from = habitat,
                values_from = c(tp_scaled, tw_scaled, nSites)) %>% 
    mutate(tp_diff = tp_scaled_u - tp_scaled_r) %>% 
    left_join(betaLog_df, by = 'city') %>% 
    as.data.frame() %>% 
    # Order cities by cline slope
    mutate(city = fct_reorder(as.factor(city), betaLog)) %>% 
    # Classify clines by sign and significance (P < 0.05)
    mutate(
        sig = case_when(
            betaLog > 0 & pvalLog < 0.05 ~ 'Significantly positive',
            betaLog < 0 & pvalLog < 0.05 ~ 'Significantly negative',
            TRUE ~ 'Not significant'
        ),
        sigLog = ifelse(pvalLog < 0.05, 'Yes', 'No')
    )
head(obs_div_df_wide)

#### Figure 4A

- Mean urban and rural pi with standard errors

In [10]:
# Colors for the urban/rural plots: entries 4 and 2 of the 'Darjeeling1'
# palette are used for urban and rural, respectively
pal <- wes_palette('Darjeeling1', 5, type = 'discrete')
urban_col <- pal[4]
rural_col <- pal[2]
# Stored in urban, rural order; the plots below pass rev(cols)
cols <- c(urban_col, rural_col)

In [11]:
# Mean pi per habitat (+/- one standard error across the rows of obs_div_df),
# drawn as points with error bars
pi_by_habitat <- obs_div_df %>% 
    group_by(habitat) %>% 
    summarise(
        mean = mean(tp_scaled),
        sd = sd(tp_scaled),
        se = sd / sqrt(n())
    ) %>% 
    mutate(habitat = fct_recode(habitat, 'Rural' = 'r', 'Urban' = 'u')) %>% 
    ggplot(aes(x = habitat, y = mean, fill = habitat)) +
        geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.15, size = 1) +
        geom_point(shape = 21, size = 8) +
        labs(x = 'Habitat', y = 'Pairwise nucleotide diversity') +
        scale_fill_manual(values = rev(cols), labels = c('Rural', 'Urban')) +
        ng1
pi_by_habitat

In [12]:
# Save Figure 4A. The output directory is created first (silently, if it
# already exists) so that ggsave() does not fail on a fresh checkout
outpath <- '../results/figures/main_text/figure_4/pi_observed_byHabitat_histogram_allCities.pdf'
dir.create(dirname(outpath), recursive = TRUE, showWarnings = FALSE)
ggsave(filename = outpath, plot = pi_by_habitat, device = 'pdf', width = 8, height = 10, units = 'in',
      useDingbats = FALSE, dpi = 600)

In [16]:
# Model testing for difference in pi between urban and rural habitats,
# with city included as an additive term
pi_mod <- aov(tp_scaled ~ city + habitat, data = obs_div_df)
summary(pi_mod)

In [93]:
# Least-squares (estimated marginal) means of pi in each habitat
emmeans(pi_mod, specs = 'habitat')

#### Figure 4B

- Slope of HCN cline vs. difference in pi

In [13]:
# Fill colours keyed by cline category. Naming the entries keeps each category
# on the same colour even when one category is absent from the plotted data
pal <- c('Not significant' = "#909090",
         'Significantly negative' = "#FF0000",
         'Significantly positive' = "#046C9A")

In [14]:
# Figure 4B: slope of the HCN cline against the urban-rural difference in pi,
# with a linear fit; points are filled by cline category
slope_by_PIdiff <- ggplot(obs_div_df_wide, aes(x = tp_diff, y = betaLog)) +
    geom_point(size = 5, aes(fill = sig), shape = 21) +
    geom_smooth(method = 'lm', color = 'black', size = 1.5) +
    scale_fill_manual(values = pal) +
    ylab('Slope of HCN clines (log-odds)') + xlab('Urban-rural difference in pairwise nucleotide diversity') +
    ng1
slope_by_PIdiff

In [15]:
# Save Figure 4B, creating the output directory first if it does not exist
outpath <- '../results/figures/main_text/figure_4/betaLog_vs_pi.pdf'
dir.create(dirname(outpath), recursive = TRUE, showWarnings = FALSE)
ggsave(filename = outpath, plot = slope_by_PIdiff, device = 'pdf', width = 8, height = 10, units = 'in',
      useDingbats = FALSE, dpi = 600)

In [18]:
# Model testing relationship between the slope of clines and the urban-rural difference in pi
# (one-predictor model fitted with aov)
div_mod <- aov(betaLog ~ tp_diff, data = obs_div_df_wide)
summary(div_mod)

In [40]:
# Model testing whether the urban-rural difference in pi varies by whether there is a cline or not
# (sigLog is 'Yes' when pvalLog < 0.05 and 'No' otherwise)
tpDiff_by_sig_mod <- aov(tp_diff ~ sigLog, data = obs_div_df_wide)
summary(tpDiff_by_sig_mod)

In [41]:
# Estimated marginal means of the urban-rural difference in pi for each sigLog group
emmeans(tpDiff_by_sig_mod, specs = 'sigLog')

#### Figure 4C

- Permuted distribution of difference in pi by city with observed values overlaid

In [10]:
# Load permuted diversity estimates for one city/habitat/seed file.
#
# Args:
#   path:     Path to a `*_[ru]_4fold_seed<N>*.thetas.idx.pestPG` file, relative
#             to `base_dir`. The city is taken from the directory two levels up.
#   base_dir: Prefix pasted in front of `path` (no separator is added). Defaults
#             to the global `inpath`, looked up when the function is called.
#
# Returns:
#   Data frame grouped by city and habitat, with columns city, habitat
#   ('r' or 'u'), seed (digits following `_seed`, as a string), tp_scaled
#   (tP / nSites), tw_scaled (tW / nSites) and nSites.
load_permuted_pairwise_diversity <- function(path, base_dir = inpath){
    
    # Habitat is the single letter between an underscore and `_4fold`, so
    # underscores elsewhere in the file name cannot be mistaken for it
    city <- dirname(dirname(path))
    site <- str_extract(basename(path), pattern = '(?<=_)[ru](?=_4fold)')
    seed <- str_extract(basename(path), pattern = '(?<=_seed)\\d+')

    full_path <- paste0(base_dir, path)
    df <- suppressMessages(read_delim(full_path, delim = '\t')) %>% 
        # Scale the theta estimates by the number of sites
        mutate(tp_scaled = tP / nSites,
               tw_scaled = tW / nSites,
               city = city,
               habitat = site,
               seed = seed) %>% 
    dplyr::select(city, habitat, seed, tp_scaled, tw_scaled, nSites) %>% 
    # Grouping is kept so the returned object is unchanged for existing callers
    group_by(city, habitat)

    return(df)
    
}

In [11]:
# List the permuted per-habitat theta files under `inpath` (searched
# recursively) and row-bind them. Dots in the pattern are escaped and the
# pattern is anchored at the end of the file name
perm_div_df <- list.files(inpath, pattern = '_[ur]_4fold_seed.*\\.thetas\\.idx\\.pestPG$', recursive = TRUE) %>% 
    map_dfr(., load_permuted_pairwise_diversity) 

In [12]:
# Preview the first rows of the permuted diversity estimates
head(perm_div_df)

In [13]:
# Same reshaping as for the observed data, applied to the permutations:
# spread habitats into columns, take the urban - rural difference in pi,
# and attach the cline coefficients
perm_div_df_wide <- perm_div_df %>% 
    pivot_wider(names_from = habitat,
                values_from = c(tp_scaled, tw_scaled, nSites)) %>% 
    mutate(tp_diff = tp_scaled_u - tp_scaled_r) %>% 
    left_join(betaLog_df, by = 'city')  %>% 
    as.data.frame() %>% 
    # Order cities by cline slope
    mutate(city = fct_reorder(as.factor(city), betaLog)) %>% 
    # Classify clines by sign and significance (P < 0.05)
    mutate(
        sig = case_when(
            betaLog > 0 & pvalLog < 0.05 ~ 'Significantly positive',
            betaLog < 0 & pvalLog < 0.05 ~ 'Significantly negative',
            TRUE ~ 'Not significant'
        )
    )
head(perm_div_df_wide)

### Plot differences in diversity

In [14]:
# Fill colours keyed by cline category, so a category keeps its colour even
# when another category is absent from the data
pal <- c('Not significant' = "#909090",
         'Significantly negative' = "#FF0000",
         'Significantly positive' = "#046C9A")

# Figure 4C: per-city ridgelines of the permuted urban-rural difference in pi,
# with the observed difference overlaid as a dotted vertical segment.
# The segment's y position is the city's index among the levels of the permuted
# data's `city` factor, so the segment lines up with the matching ridgeline even
# if the observed and permuted factors were to differ in their levels
piDiff_perm_byCity <- perm_div_df_wide %>% 
    ggplot(., aes(x = tp_diff, y = city, fill = sig)) +
    geom_density_ridges(scale = 1, show.legend = FALSE) +
    geom_segment(data = obs_div_df_wide, 
                 aes(x = tp_diff, xend = tp_diff, 
                     y = as.numeric(factor(city, levels = levels(perm_div_df_wide$city))), 
                     yend = as.numeric(factor(city, levels = levels(perm_div_df_wide$city))) + 0.9),
                color = 'black', size = 0.9, linetype = 'dotted') +
    ylab('City') + xlab('Urban-rural difference in pairwise nucleotide diversity') +
    scale_fill_manual(values = pal) +
    scale_x_continuous(breaks = seq(from = -0.004, to = 0.004, by = 0.002)) +
    theme_classic() + theme(axis.line.x = element_line(color="black",size=1),
          axis.line.y = element_line(color="black",size=1),
          axis.ticks=element_line(color="black"),
          axis.text=element_text(color="black",size=15),
          axis.title=element_text(color="black",size=1),
          axis.title.y=element_text(vjust=2,size=17),
          axis.title.x=element_text(vjust=0.1,size=17),
          axis.text.x=element_text(size=13),
          axis.text.y=element_text(size=13),
          strip.text.x = element_text(size = 10, colour = "black",face = "bold"))
piDiff_perm_byCity

In [24]:
# Save Figure 4C. The directory is created recursively and without a warning
# when it already exists, so the cell can be re-run safely
outpath <- '../results/figures/main_text/figure_4/pi-diff_permutation_byCity.pdf'
dir.create(dirname(outpath), recursive = TRUE, showWarnings = FALSE)
ggsave(filename = outpath, plot = piDiff_perm_byCity, device = 'pdf', 
       width = 12, height = 12, units = 'in', dpi = 600, useDingbats = FALSE)

In [26]:
# Two-tailed permutation P-value for a single city.
#
# Args:
#   df:  Data frame for one city. Rows where `seed` is NA are treated as the
#        observed estimate; all other rows are treated as permutations.
#   var: Name (as a string) of the column holding the statistic.
#
# Returns:
#   Data frame with columns city and pval, where pval is the proportion of
#   permuted values whose absolute value is at least as large as the absolute
#   observed value. pval is NA (with a warning) when there is not exactly one
#   observed value or when there are no permutations.
permutation_pval <- function(df, var){
    
    city <- unique(df$city)
    
    is_obs <- is.na(df$seed)
    obs_val <- df[[var]][is_obs]
    perm_val <- df[[var]][!is_obs]

    if (length(obs_val) != 1 || length(perm_val) == 0) {
        warning('Cannot compute a permutation P-value for ', 
                paste(city, collapse = ', '), 
                ': expected exactly one observed value and at least one permutation', 
                call. = FALSE)
        pval <- NA_real_
    } else {
        # Divide by the number of permutations actually present rather than
        # assuming a fixed number of them
        pval <- sum(abs(perm_val) >= abs(obs_val)) / length(perm_val)
    }
    
    data.frame(city = city, pval = pval)
}

In [27]:
# Stack observed and permuted differences in pi, then compute one
# permutation P-value per city
div_df <- bind_rows(obs_div_df_wide, perm_div_df_wide)
div_pval_df <- purrr::map_dfr(
    group_split(div_df, city),
    permutation_pval,
    var = 'tp_diff'
)

In [28]:
# Write the per-city P-values for the difference in pi, creating the output
# directory first if it does not exist
div_pval_path <- '../results/figures/tables/piDiff_permutations_pvalues.csv'
dir.create(dirname(div_pval_path), recursive = TRUE, showWarnings = FALSE)
write_csv(div_pval_df, div_pval_path)

## Fst

### Load Fst dataframe

#### Observed

In [11]:
# Load the observed per-site Fst file for one city and reduce it to a single
# Fst estimate.
#
# Args:
#   path:     Path to a headerless, tab-separated per-site Fst file with four
#             columns (read as chrom, pos, num, denom), relative to `base_dir`.
#             The name of its parent directory is used as the city.
#   base_dir: Prefix pasted in front of `path` (no separator is added). Defaults
#             to the global `inpath`, looked up when the function is called.
#
# Returns:
#   One-row data frame with num_sum, denom_sum, fst (num_sum / denom_sum),
#   nSites (number of rows in the file) and city.
load_obs_fst <- function(path, base_dir = inpath){
    
    # Get city from the directory holding the file
    city <- dirname(path)
    
    full_path <- paste0(base_dir, path)
    fst_cols <- c('chrom', 'pos', 'num', 'denom')
    df <- suppressMessages(fread(full_path, sep = '\t', header = FALSE, col.names = fst_cols)) %>% 
        
        # Cap numerators at 0 if negative 
        # https://github.com/ANGSD/angsd/issues/309
        # Does not affect overall pattern
        # NOTE(review): capping per-site values before summing can only raise
        # the summed numerator; confirm this is the intended estimator
        mutate(num = pmax(num, 0)) %>% 
        
        # Estimate weighted Fst as ratio of averages
        # https://github.com/ANGSD/angsd/issues/61
        summarise(num_sum = sum(num),
                  denom_sum = sum(denom),
                  fst = num_sum / denom_sum,
                  nSites = n()) %>% 
        mutate(city = city)
    
    return(df)
    
}

In [12]:
# Merge Fst dataframes. Only using Hudson's Fst
# Files are listed recursively under `inpath`; the dot in the pattern is
# escaped and the pattern is anchored at the end of the file name. One Fst
# value is computed per file and the cline coefficients are joined on by city
inpath <- '../results/angsd/summary_stats/fst/fst1/'
obs_fst_df <- list.files(inpath, pattern = '_4fold_r_u_fst1_readable\\.fst$', recursive = TRUE) %>% 
    map_dfr(., load_obs_fst) %>% 
    left_join(., betaLog_df, by = 'city') %>% 
    as.data.frame() %>% 
    # Order cities by cline slope and classify clines by sign and significance
    mutate(city = fct_reorder(as.factor(city), betaLog),
           sig = case_when(betaLog > 0 & pvalLog < 0.05 ~ 'Significantly positive',
                           betaLog < 0 & pvalLog < 0.05 ~ 'Significantly negative',
                           TRUE ~ 'Not significant')) %>% 
    filter(!(is.na(yint)))  # Remove Ac and Li locus estimates

In [13]:
# Preview the first rows of the observed Fst estimates
head(obs_fst_df)

In [14]:
# Mean Fst across the rows of obs_fst_df, with its standard deviation and
# standard error (sd / sqrt(number of rows))
obs_fst_df %>% 
    summarise(mean = mean(fst),
              sd = sd(fst),
              se = sd / sqrt(n()))

In [15]:
# Mean and range of the number of sites used for the Fst estimates
print(mean(obs_fst_df$nSites))
print(range(obs_fst_df$nSites))

In [16]:
# Rows with the fewest and the most sites
obs_fst_df %>% filter(nSites == min(nSites) | nSites == max(nSites))

In [45]:
# Overlaid, semi-transparent histograms of urban-rural Fst, one layer per
# cline category (drawn in the order listed in `sig_levels`)
alpha <- 0.6
sig_levels <- c('Not significant', 'Significantly negative', 'Significantly positive')
fst_by_cline <- ggplot(obs_fst_df, aes(x = fst)) +
    lapply(sig_levels, function(lev){
        geom_histogram(data = obs_fst_df %>% filter(sig == lev),
                       bins = 25,
                       aes(fill = sig),
                       color = 'black',
                       alpha = alpha)
    }) +
    scale_fill_manual(values = pal) +
    coord_cartesian(xlim = c(0, 0.22)) +
    scale_x_continuous(breaks = seq(from = 0, to = 0.2, by = 0.05), expand = c(0, 0)) +
    scale_y_continuous(breaks = seq(from = 0, to = 6, by = 1), expand = c(0, 0)) +
    labs(x = 'Urban-rural Fst', y = 'Number of cities') +
    ng1
fst_by_cline

In [36]:
# Save the Fst histogram, creating the output directory first if it does not exist
outpath <- '../results/figures/main_text/figure_4/fst_observed_histogram_allCities.pdf'
dir.create(dirname(outpath), recursive = TRUE, showWarnings = FALSE)
ggsave(filename = outpath, plot = fst_by_cline, device = 'pdf', width = 8, height = 10, units = 'in',
      useDingbats = FALSE, dpi = 600)

In [50]:
# Model testing whether Fst differs between cities with and without clines.
# `sigLog` ('Yes' when pvalLog < 0.05, otherwise 'No') is derived here from
# obs_fst_df, because no `df_allStats` object is created in this notebook and
# obs_fst_df does not carry a `sigLog` column.
# NOTE(review): confirm obs_fst_df holds the same cities the model was meant to use
obs_fst_sig_df <- obs_fst_df %>% 
    mutate(sigLog = ifelse(pvalLog < 0.05, 'Yes', 'No'))
fst_mod <- aov(fst ~ sigLog, data = obs_fst_sig_df)
summary(fst_mod)

In [51]:
# Estimated marginal means of Fst for each sigLog group.
# Uses whichever model `fst_mod` currently holds; the name is reassigned further down
emmeans(fst_mod, specs = 'sigLog')

#### Figure 5C

- Slope of clines vs. Fst

In [52]:
# Slope of the HCN cline against urban-rural Fst, with a linear fit;
# points are filled by cline category
slope_by_fst <- ggplot(obs_fst_df, aes(x = fst, y = betaLog)) +
    geom_point(aes(fill = sig), shape = 21, size = 5) +
    geom_smooth(method = 'lm', size = 1.5, color = 'black') +
    labs(x = "Hudson's Fst", y = 'Slope of HCN clines (log-odds)') +
    scale_y_continuous(breaks = seq(from = -4, to = 4, by = 1)) +
    scale_fill_manual(values = pal) +
    ng1
slope_by_fst

In [53]:
# Save the cline slope vs. Fst plot, creating the output directory first if it
# does not exist.
# NOTE(review): the section heading calls this Figure 5C but the file goes to
# the figure_4 directory; confirm the intended location
outpath <- '../results/figures/main_text/figure_4/betaLog_by_fst.pdf'
dir.create(dirname(outpath), recursive = TRUE, showWarnings = FALSE)
ggsave(filename = outpath, plot = slope_by_fst, device = 'pdf', width = 8, height = 10, units = 'in',
      useDingbats = FALSE, dpi = 600)

In [54]:
# Model testing whether Fst predicts the strength of clines.
# Note: this reassigns `fst_mod`, replacing the Fst ~ sigLog model fitted earlier
fst_mod <- aov(betaLog ~ fst, data = obs_fst_df)
summary(fst_mod)

#### Permuted

In [40]:
# Load the permuted per-site Fst file for one city/seed and reduce it to a
# single Fst estimate.
#
# Args:
#   path:     Path to a headerless, tab-separated per-site Fst file with four
#             columns (read as chrom, pos, num, denom), relative to `base_dir`.
#             The city is taken from the directory two levels up.
#   base_dir: Prefix pasted in front of `path` (no separator is added). Defaults
#             to the global `inpath`, looked up when the function is called.
#
# Returns:
#   One-row data frame with num_sum, denom_sum, fst (num_sum / denom_sum),
#   nSites (number of rows in the file), city and seed (digits following
#   `_seed` in the file name, as a string).
load_perm_fst <- function(path, base_dir = inpath){
    
    # Get city and permutation seed from the file path
    city <- dirname(dirname(path))
    seed <- str_extract(basename(path), pattern = '(?<=_seed)\\d+')
    
    full_path <- paste0(base_dir, path)
    fst_cols <- c('chrom', 'pos', 'num', 'denom')
    df <- suppressMessages(fread(full_path, sep = '\t', header = FALSE, col.names = fst_cols)) %>% 
        
        # Cap numerators at 0 if negative 
        # https://github.com/ANGSD/angsd/issues/309
        # Does not affect overall pattern
        # NOTE(review): capping per-site values before summing can only raise
        # the summed numerator; confirm this is the intended estimator
        mutate(num = pmax(num, 0)) %>% 
        
        # Estimate weighted Fst as ratio of averages
        # https://github.com/ANGSD/angsd/issues/61
        summarise(num_sum = sum(num),
                  denom_sum = sum(denom),
                  fst = num_sum / denom_sum,
                  nSites = n()) %>% 
        mutate(city = city,
               seed = seed)
    
    return(df)
    
}

In [41]:
# Merge Fst dataframes. Only using Hudson's Fst
# Permuted files are listed recursively under `inpath`; the dot in the pattern
# is escaped and the pattern is anchored at the end of the file name
inpath <- '../results/angsd/summary_stats/fst/fst1/'
perm_fst_df <- list.files(inpath, pattern = 'seed\\d+_r_u_readable\\.fst$', recursive = TRUE) %>% 
    map_dfr(., load_perm_fst) %>% 
    left_join(., betaLog_df, by = 'city') %>% 
    as.data.frame() %>% 
    # Order cities by cline slope and classify clines by sign and significance
    mutate(city = fct_reorder(as.factor(city), betaLog),
           sig = case_when(betaLog > 0 & pvalLog < 0.05 ~ 'Significantly positive',
                           betaLog < 0 & pvalLog < 0.05 ~ 'Significantly negative',
                           TRUE ~ 'Not significant'))

In [116]:
# Per-city ridgelines of permuted urban-rural Fst, with the observed Fst
# overlaid as a dotted vertical segment.
# The segment's y position is the city's index among the levels of the permuted
# data's `city` factor, so the segment lines up with the matching ridgeline even
# if the observed and permuted factors were to differ in their levels
fst_perm_byCity <- perm_fst_df %>% 
    ggplot(., aes(x = fst, y = city, fill = sig)) +
    geom_density_ridges(scale = 1, show.legend = FALSE) +
    geom_segment(data = obs_fst_df, 
                 aes(x = fst, xend = fst, 
                     y = as.numeric(factor(city, levels = levels(perm_fst_df$city))), 
                     yend = as.numeric(factor(city, levels = levels(perm_fst_df$city))) + 0.9),
                color = 'black', size = 0.9, linetype = 'dotted') +
    ylab('City') + xlab('Urban-rural Fst') +
    scale_fill_manual(values = pal) +
    coord_cartesian(xlim = c(0, 0.22)) +
    scale_x_continuous(breaks = seq(from = 0, to = 0.2, by = 0.05)) +
    theme_classic() + theme(axis.line.x = element_line(color="black",size=1),
          axis.line.y = element_line(color="black",size=1),
          axis.ticks=element_line(color="black"),
          axis.text=element_text(color="black",size=15),
          axis.title=element_text(color="black",size=1),
          axis.title.y=element_text(vjust=2,size=17),
          axis.title.x=element_text(vjust=0.1,size=17),
          axis.text.x=element_text(size=13),
          axis.text.y=element_text(size=13),
          strip.text.x = element_text(size = 10, colour = "black",face = "bold"))
fst_perm_byCity

In [114]:
# Single-city check: permuted Fst distribution for Toronto with the observed
# Toronto Fst drawn as a dotted vertical line
perm_fst_df %>% 
    filter(city == 'Toronto') %>% 
    ggplot(., aes(x = fst, y = city)) +
    geom_density_ridges(scale = 1, show.legend = FALSE) +
    geom_vline(xintercept = obs_fst_df %>% filter(city == 'Toronto') %>% pull(fst), linetype = 'dotted') +
    theme_classic()

In [75]:
# Save the Fst permutation ridge plot. The directory is created recursively and
# without a warning when it already exists, so the cell can be re-run safely
outpath <- '../results/figures/main_text/figure_5/fst_permutation_byCity.pdf'
dir.create(dirname(outpath), recursive = TRUE, showWarnings = FALSE)
ggsave(filename = outpath, plot = fst_perm_byCity, device = 'pdf', 
       width = 12, height = 12, units = 'in', dpi = 600, useDingbats = FALSE)

In [76]:
# Stack observed and permuted Fst estimates, then compute one permutation
# P-value per city
fst_df <- bind_rows(obs_fst_df, perm_fst_df)
fst_pval_df <- purrr::map_dfr(
    group_split(fst_df, city),
    permutation_pval,
    var = 'fst'
)

In [77]:
# Write the per-city Fst permutation P-values, creating the output directory
# first if it does not exist
fst_pval_path <- '../results/figures/tables/fst_permutation_pvalues.csv'
dir.create(dirname(fst_pval_path), recursive = TRUE, showWarnings = FALSE)
write_csv(fst_pval_df, fst_pval_path)

## Euclidean distance from PCA

- Estimate Euclidean distance between urban and rural centroids by city

In [26]:
# Euclidean distance between the points (x1, y1) and (x2, y2).
# Works element-wise when the arguments are vectors.
euclidean <- function(x1, y1, x2, y2){
    
    dx <- x1 - x2
    dy <- y1 - y2
    sqrt(dx^2 + dy^2)
}

In [27]:
# Load data with habitat info
# Reads the tab-delimited sample sheet (messages suppressed) and keeps only
# the sample-level metadata columns used below
habitat_info <- suppressMessages(
    read_delim(
        '../../sequencing-prep/resources/low1_sampleSheet.txt', 
                           delim = '\t')) %>% 
    dplyr::select(continent, range, city, pop, individual, site, sample)

In [28]:
# Load covariance matrix from PCAngsd
# The file is read as space-delimited with no header and converted to a matrix
cov_mat <- suppressMessages(
    read_delim(
        '../results/population_structure/pcangsd/highErrorRemoved_4fold_maf0.05_pcangsd.cov', 
                      col_names = FALSE, delim = ' ')) %>% 
      as.matrix()

# Combine continent and habitat data with sample order from ANGSD
# The single column of the order file is renamed to `sample` and used as the join key.
# NOTE(review): presumably the row order here matches the rows/columns of
# cov_mat, since the two are later combined by position; confirm
samples <- suppressMessages(
    read_table(
        '../results/program_resources/angsd_highErrorRemoved_order.txt', col_names = FALSE) %>% 
  rename('sample' = 'X1')) %>%
  left_join(., habitat_info, by = 'sample')

In [29]:
# Eigendecomposition of the covariance matrix. The first two eigenvectors are
# kept as PC1 and PC2 and combined column-wise (by position) with the sample
# metadata
eigenvectors <- eigen(cov_mat)
eigen_df <- as.data.frame(eigenvectors$vectors) %>% 
    dplyr::select(PC1 = V1, PC2 = V2) %>% 
    bind_cols(samples) %>% 
    mutate(sample_set = 'highErrorRemoved')

In [30]:
# Urban-rural distance in PC space for each city:
# 1. average PC1 and PC2 within each city/site combination (centroids),
# 2. spread sites into columns (x_u, x_r, y_u, y_r; relies on the site codes
#    being 'u' and 'r'),
# 3. take the Euclidean distance between the urban and rural centroids
euc_dist_df <- eigen_df %>% 
    group_by(city, site) %>% 
    summarise(x = mean(PC1),
              y = mean(PC2)) %>% 
    pivot_wider(names_from = site, values_from = c(x, y)) %>% 
    mutate(distance = euclidean(x_u, y_u, x_r, y_r)) %>% 
    dplyr::select(city, distance)

In [83]:
# Preview the first rows of the per-city urban-rural distances
head(euc_dist_df)

### Does euclidean distance predict HCN?

In [55]:
# Model testing whether the urban-rural distance in PC space predicts the
# slope of clines. The distances are joined to the cline coefficients here,
# because no `df_allStats` object is created in this notebook.
# NOTE(review): confirm this join reproduces the cities the model was meant to use
dist_betaLog_df <- euc_dist_df %>% 
    ungroup() %>% 
    left_join(., betaLog_df, by = 'city')
dist_mod <- aov(betaLog ~ distance, data = dist_betaLog_df)
summary(dist_mod)

## Figure SX 

- PCA figure with urban/rural centroids

In [56]:
# Centroids (mean PC1 and PC2) for each city/site combination, kept in long
# format for plotting
euc_dist_df_forPlot <- eigen_df %>% 
    group_by(city, site) %>% 
    summarise(x = mean(PC1),
              y = mean(PC2))
head(euc_dist_df_forPlot)

In [57]:
# PCA centroid plot: one point per city/site centroid, with a thin line joining
# the centroids of the same city and a label attached to each centroid whose
# site is 'r'
pca_centroids <- ggplot(euc_dist_df_forPlot, aes(x = x, y = y, fill = site, shape = site)) +
    geom_line(aes(group = city), size = 0.35, alpha = 0.4) +
    geom_point(size = 3.5) +
    geom_label_repel(
        data = euc_dist_df_forPlot %>% filter(site == 'r'),
        aes(label = city),
        color = 'black',
        fill = 'white', 
        size = 3,
        arrow = arrow(length = unit(0.03, "npc"), type = "closed", ends = "last"),
        nudge_y = 0.1,
        segment.size  = 0.3
    ) +
    scale_fill_manual(values = rev(cols), labels = c('Rural', 'Urban')) +
    scale_shape_manual(values = c(21, 24), labels = c('Rural', 'Urban')) +
    scale_x_continuous(breaks = seq(from = -0.08, to = 0.06, by = 0.02)) +
    scale_y_continuous(breaks = seq(from = -0.05, to = 0.15, by = 0.03)) +
    labs(x = "PC1 (18%)", y = 'PC2 (7%)') + 
    ng1
pca_centroids

In [58]:
# Create the first output file of the `snakemake` object
# (presumably a completion flag for the workflow rule running this notebook; confirm)
file.create(snakemake@output[[1]])