In [14]:
library(MASS)
library(tidyverse)
library(car)
library(emmeans)
library(wesanderson)
library(ggrepel)
library(fs)
library(ggridges)
library(data.table)

# Pairwise urban-rural pi and Fst within cities

In this notebook, we'll examine urban-rural differences in diversity (theta_pi) and urban-rural Fst across all 26 cities. 

## Description of the data

- Low coverage individuals have been removed from these analyses, so we're using the samples that are part of the *finalSamples_lowCovRemoved* sample set from the previous analyses. 
- All analyses were performed using genome-wide 4-fold degenerate sites.

The basic workflow is as follows:

1. Generate the Site Allele Frequency (SAF) likelihood distribution for each habitat within each city. 

    - Used the same filters as previously, the most important of which is that sites are only included if 50% of individuals have data. Remember, this is 50% of individuals _within_ a habitat; this would correspond to 5 individuals if none have been removed due to low coverage. 

2. To estimate diversity, generate the folded, one-dimensional SFS from the SAF file in step 1 and estimate diversity separately in urban and rural habitats
3. To estimate Fst, generate the folded, two-dimensional joint SFS of urban-rural habitats and estimate Fst. This uses only sites that are shared between both populations (i.e., the intersection of the two SAF files). 

    - For comparison, I estimated Fst using both Weir and Cockerham (1984) and Hudson (1992).
    
4. We also performed a permutation test for pi and Fst in each city by randomly permuting the urban and rural individuals and repeating steps 1 to 3 above

## Pairwise diversity

### Load diversity data

#### Observed

In [3]:
# Load observed diversity estimates for a single city/habitat combination.
#
# Args:
#   path: File path relative to the global `inpath`. The parent directory is
#         used as the city name, and the file name is expected to contain
#         '_r_4fold' or '_u_4fold'.
#
# Returns:
#   Dataframe with columns city, habitat ('r' or 'u'), tp_scaled, tw_scaled
#   and nSites, where the scaled columns are tP and tW divided by nSites.
#
# NOTE: reads the global `inpath`, which is reassigned further down in this
# notebook for the Fst files; re-set it before re-running this function.
load_obs_pairwise_diversity <- function(path){
    
    # Get city and site names from file
    city <- dirname(path)
    # '[ru]' matches only the habitat codes ('[r|u]' would also match a literal
    # '|'). The lookahead ties the match to the habitat field that precedes
    # '_4fold' rather than to the first '_r'/'_u' found in the file name.
    site <- str_extract(basename(path), pattern = '(?<=_)[ru](?=_4fold)')
    
    full_path <- paste0(inpath, path)
    df <- suppressMessages(read_delim(full_path, delim = '\t')) %>% 
        # Scale thetas by the number of sites to get per-site estimates
        mutate(tp_scaled = tP / nSites,
               tw_scaled = tW / nSites,
               city = city,
               habitat = site) %>% 
    dplyr::select(city, habitat, tp_scaled, tw_scaled, nSites)

    return(df)
    
}

In [4]:
# Load observed diversity estimates for every city/habitat combination and
# row-bind them into a single dataframe
inpath <- '../results/angsd/summary_stats/thetas/by_city/'
# '[ur]' matches only the habitat codes ('[u|r]' would also accept a literal
# '|'); dots are escaped so that they only match a literal '.'
obs_div_df <- list.files(inpath, pattern = '.*_[ur]_4fold\\.thetas\\.idx\\.pestPG', recursive = TRUE) %>% 
    map_dfr(., load_obs_pairwise_diversity) 

In [5]:
obs_div_df %>% head()

In [6]:
# Mean and range of the number of sites across city/habitat combinations
obs_div_df %>% pull(nSites) %>% mean() %>% print()
obs_div_df %>% pull(nSites) %>% range() %>% print()

In [7]:
# City/habitat combinations with the fewest and the most sites
filter(obs_div_df, nSites == min(nSites) | nSites == max(nSites))

In [8]:
# One row per city: spread the habitat-specific estimates into '_r'/'_u'
# columns, then take the urban minus rural difference in theta_pi
obs_div_df_wide <- pivot_wider(obs_div_df,
                               names_from = habitat,
                               values_from = c(tp_scaled, tw_scaled, nSites))
obs_div_df_wide <- obs_div_df_wide %>% 
    mutate(tp_diff = tp_scaled_u - tp_scaled_r)
obs_div_df_wide %>% head()

#### Permuted

In [9]:
# Load permuted diversity estimates for a single city/habitat/seed combination.
#
# Args:
#   path: File path relative to the global `inpath`. The grandparent directory
#         is used as the city name, and the file name is expected to contain
#         '_r_4fold' or '_u_4fold' as well as '_seed<digits>'.
#
# Returns:
#   Dataframe grouped by city and habitat with columns city, habitat, seed
#   (character), tp_scaled, tw_scaled and nSites, where the scaled columns are
#   tP and tW divided by nSites.
#
# NOTE: reads the global `inpath`, which is reassigned further down in this
# notebook for the Fst files; re-set it before re-running this function.
load_permuted_pairwise_diversity <- function(path){
    
    # Get city and site names from file
    city <- dirname(dirname(path))
    # '[ru]' matches only the habitat codes ('[r|u]' would also match a literal
    # '|'). The lookahead ties the match to the habitat field that precedes
    # '_4fold' rather than to the first '_r'/'_u' found in the file name.
    site <- str_extract(basename(path), pattern = '(?<=_)[ru](?=_4fold)')
    # Seed identifying the permutation
    seed <- str_extract(basename(path), pattern = '(?<=_seed)\\d+')

    
    full_path <- paste0(inpath, path)
    df <- suppressMessages(read_delim(full_path, delim = '\t')) %>% 
        # Scale thetas by the number of sites to get per-site estimates
        mutate(tp_scaled = tP / nSites,
               tw_scaled = tW / nSites,
               city = city,
               habitat = site,
               seed = seed) %>% 
    dplyr::select(city, habitat, seed, tp_scaled, tw_scaled, nSites) %>% 
    group_by(city, habitat)

    return(df)
    
}

In [10]:
# Load permuted diversity estimates for every city/habitat/seed combination.
# '[ur]' matches only the habitat codes ('[u|r]' would also accept a literal
# '|'); dots in the extension are escaped so that they only match a literal '.'
perm_div_df <- list.files(inpath, pattern = '.*_[ur]_4fold_seed.*\\.thetas\\.idx\\.pestPG', recursive = TRUE) %>% 
    map_dfr(., load_permuted_pairwise_diversity) 

In [11]:
perm_div_df %>% head()

In [12]:
# One row per city and seed: spread the habitat-specific estimates into
# '_r'/'_u' columns, then take the urban minus rural difference in theta_pi
perm_div_df_wide <- pivot_wider(perm_div_df,
                                names_from = habitat,
                                values_from = c(tp_scaled, tw_scaled, nSites))
perm_div_df_wide <- perm_div_df_wide %>% 
    mutate(tp_diff = tp_scaled_u - tp_scaled_r)
perm_div_df_wide %>% head()

### Plot differences in diversity

In [56]:
# Ridgeline plot of the permuted urban-rural differences in pi for each city.
# The dotted vertical segment marks the observed difference for that city.
pal <- wes_palette("Zissou1", 26, type = "continuous")
ggplot(perm_div_df_wide, aes(x = tp_diff, y = city, fill = city)) +
    geom_density_ridges(scale = 1.1, show.legend = FALSE) +
    geom_segment(
        data = obs_div_df_wide,
        aes(x = tp_diff,
            xend = tp_diff,
            y = as.numeric(as.factor(city)),
            yend = as.numeric(as.factor(city)) + 0.8),
        color = 'black',
        size = 0.5,
        linetype = 'dotted'
    ) +
    scale_fill_manual(values = pal) +
    labs(x = 'Urban-rural difference in pairwise nucleotide diversity',
         y = 'City') +
    theme_classic()

In [165]:
# Two-sided permutation P-value for a single city.
#
# Args:
#   df:  Dataframe for one city with columns `city`, `seed` and the column
#        named by `var`. The observed estimate is the row where `seed` is NA;
#        all other rows are treated as permutations.
#   var: Name (string) of the column holding the test statistic.
#
# Returns:
#   Dataframe with columns `city` and `pval`, where `pval` is the proportion
#   of permuted values whose absolute value is at least as large as the
#   absolute observed value. `pval` is NA (with a warning) if `df` does not
#   contain exactly one observed row.
permutation_pval <- function(df, var){
    
    city <- unique(df[['city']])
    
    # Observed estimates have no seed; permuted estimates do
    is_observed <- is.na(df[['seed']])
    obs_val <- df[[var]][is_observed]
    perm_val <- df[[var]][!is_observed]

    if (length(obs_val) != 1) {
        # Without a single observed value the comparison below would either be
        # empty (P-value of 0) or be recycled against the permutations
        warning('Expected exactly one observed value for city: ',
                paste(city, collapse = ', '), call. = FALSE)
        pval <- NA_real_
    } else {
        # Divide by the actual number of permutations instead of assuming 100
        pval <- sum(abs(perm_val) >= abs(obs_val)) / length(perm_val)
    }
    df_out <- data.frame(city = city, pval = pval)
    return(df_out)
}

In [167]:
# Stack observed and permuted estimates, then compute the permutation P-value
# of the urban-rural difference in pi separately for each city
div_df <- bind_rows(obs_div_df_wide, perm_div_df_wide)
div_pval_df <- purrr::map_dfr(group_split(div_df, city),
                              permutation_pval,
                              var = 'tp_diff')

## Fst

### Load Fst dataframe

#### Observed

In [84]:
# Load the per-site Fst numerators and denominators for one city and collapse
# them into a single estimate. `path` is relative to the global `inpath`.
# Returns a one-row dataframe with num_sum, denom_sum, fst, nSites and city.
load_obs_fst <- function(path){
    
    # The city is the directory holding the file
    city <- dirname(path)
    
    fst_file <- paste0(inpath, path)
    fst_cols <- c('chrom', 'pos', 'num', 'denom')
    per_site <- suppressMessages(fread(fst_file, sep = '\t', header = FALSE, col.names = fst_cols))
    
    df <- per_site %>% 
        
        # Negative numerators are set to 0
        # https://github.com/ANGSD/angsd/issues/309
        # Does not affect overall pattern
        mutate(num = ifelse(num >= 0, num, 0)) %>% 
        
        # Weighted Fst: ratio of the summed numerators to the summed denominators
        # https://github.com/ANGSD/angsd/issues/61
        summarise(num_sum = sum(num),
                  denom_sum = sum(denom),
                  fst = num_sum / denom_sum,
                  nSites = n()) %>% 
        mutate(city = city)
    
    return(df)
    
}

In [90]:
# Merge Fst dataframes. Only using Hudson's Fst
inpath <- '../results/angsd/summary_stats/fst/fst1/'
# The dot is escaped so that it only matches the literal '.' of the extension
obs_fst_df <- list.files(inpath, pattern = '.*_r_u_fst1_readable\\.fst', recursive = TRUE) %>% 
    map_dfr(., load_obs_fst)

In [86]:
obs_fst_df %>% head()

In [91]:
# Drop Kyoto, which has only a single rural sample
obs_fst_df_noKyoto <- filter(obs_fst_df, city != 'Kyoto')

In [92]:
# Mean number of sites for Hudson's Fst (without Kyoto)
mean(obs_fst_df_noKyoto$nSites)

In [93]:
# Cities with the fewest and the most sites (without Kyoto)
obs_fst_df_noKyoto %>% 
    ungroup() %>% 
    filter(nSites == max(nSites) | nSites == min(nSites))

#### Permuted

In [97]:
# Load the per-site Fst numerators and denominators for one permutation of one
# city and collapse them into a single estimate. `path` is relative to the
# global `inpath`. Returns a one-row dataframe with num_sum, denom_sum, fst,
# nSites, city and seed.
load_perm_fst <- function(path){
    
    # The city is two directory levels above the file; the seed identifying the
    # permutation is taken from the file name
    city <- dirname(dirname(path))
    seed <- str_extract(basename(path), pattern = '(?<=_seed)\\d+')
    
    fst_file <- paste0(inpath, path)
    fst_cols <- c('chrom', 'pos', 'num', 'denom')
    per_site <- suppressMessages(fread(fst_file, sep = '\t', header = FALSE, col.names = fst_cols))
    
    df <- per_site %>% 
        
        # Negative numerators are set to 0
        # https://github.com/ANGSD/angsd/issues/309
        # Does not affect overall pattern
        mutate(num = ifelse(num >= 0, num, 0)) %>% 
        
        # Weighted Fst: ratio of the summed numerators to the summed denominators
        # https://github.com/ANGSD/angsd/issues/61
        summarise(num_sum = sum(num),
                  denom_sum = sum(denom),
                  fst = num_sum / denom_sum,
                  nSites = n()) %>% 
        mutate(city = city,
               seed = seed)
    
    return(df)
    
}

In [98]:
# Merge permuted Fst dataframes. Only using Hudson's Fst
inpath <- '../results/angsd/summary_stats/fst/fst1/'
# The dot is escaped so that it only matches the literal '.' of the extension
perm_fst_df <- list.files(inpath, pattern = '.*seed\\d+_r_u_readable\\.fst', recursive = TRUE) %>% 
    map_dfr(., load_perm_fst)

In [172]:
# Ridgeline plot of the permuted urban-rural Fst for each city. The dotted
# vertical segment marks the observed Fst for that city.
pal <- wes_palette("Zissou1", 26, type = "continuous")
ggplot(perm_fst_df, aes(x = fst, y = city, fill = city)) +
    geom_density_ridges(scale = 1.1, show.legend = FALSE) +
    geom_segment(
        data = obs_fst_df,
        aes(x = fst,
            xend = fst,
            y = as.numeric(as.factor(city)),
            yend = as.numeric(as.factor(city)) + 0.8),
        color = 'black',
        size = 0.5,
        linetype = 'dotted'
    ) +
    scale_fill_manual(values = pal) +
    labs(x = 'Urban-rural Fst', y = 'City') +
    theme_classic()

In [169]:
# Stack observed and permuted Fst, then compute the permutation P-value
# separately for each city
fst_df <- bind_rows(obs_fst_df, perm_fst_df)
fst_pval_df <- purrr::map_dfr(group_split(fst_df, city),
                              permutation_pval,
                              var = 'fst')

In [170]:
fst_pval_df

## Euclidean distance from PCA

- Estimate Euclidean distance between urban and rural centroids by city

In [173]:
# Euclidean distance between the points (x1, y1) and (x2, y2).
# Works element-wise when the arguments are vectors.
euclidean <- function(x1, y1, x2, y2){
    
    dx <- x1 - x2
    dy <- y1 - y2
    sqrt(dx^2 + dy^2)
}

In [174]:
# Sample sheet; keep only the columns describing where each sample comes from
habitat_info <- suppressMessages(
    read_delim('../../sequencing-prep/resources/low1_sampleSheet.txt', delim = '\t')
) %>% 
    dplyr::select(continent, range, city, pop, individual, site, sample)

In [175]:
# Covariance matrix from PCAngsd, read as a plain matrix
cov_mat <- as.matrix(
    suppressMessages(
        read_delim('../results/population_structure/pcangsd/highErrorRemoved_4fold_maf0.05_pcangsd.cov', 
                   col_names = FALSE, delim = ' ')))

# Sample order from ANGSD, annotated with continent and habitat data
samples <- suppressMessages(
    rename(read_table('../results/program_resources/angsd_highErrorRemoved_order.txt', col_names = FALSE),
           'sample' = 'X1'))
samples <- left_join(samples, habitat_info, by = 'sample')

In [176]:
# Eigendecomposition of the covariance matrix; keep the first two eigenvectors
# as PC1/PC2 and attach the sample information column-wise
eigenvectors <- eigen(cov_mat)
eigen_df <- as.data.frame(eigenvectors$vectors) %>% 
    dplyr::select('PC1' = 'V1', 'PC2' = 'V2') %>% 
    bind_cols(., samples) %>% 
    mutate(sample_set = 'highErrorRemoved')

In [177]:
# Urban and rural centroids (mean PC1/PC2) for each city, followed by the
# Euclidean distance between the two centroids
euc_dist_df <- eigen_df %>% 
    group_by(city, site) %>% 
    summarise(x = mean(PC1), y = mean(PC2)) %>% 
    pivot_wider(names_from = site,
                values_from = c(x, y)) %>% 
    mutate(distance = euclidean(x1 = x_u, y1 = y_u, x2 = x_r, y2 = y_r)) %>% 
    dplyr::select(city, distance)

In [178]:
euc_dist_df %>% head()

## Fst vs. Euclidean distance

In [181]:
# Pair Fst and Euclidean distance by city instead of by row position: the two
# dataframes are built independently (file listing order vs. grouped summary),
# so their rows are not guaranteed to line up
fst_dist_df <- obs_fst_df %>% 
    as_tibble() %>% 
    dplyr::select(city, fst) %>% 
    inner_join(., ungroup(euc_dist_df), by = 'city')
fst <- fst_dist_df %>% pull(fst)
dist <- fst_dist_df %>% pull(distance)

In [185]:
# Display the Fst values (moved after the definition so the notebook runs top to bottom)
fst

In [183]:
# Scatterplot of Hudson's Fst against the Euclidean distance between urban and
# rural centroids. `ggplot()` replaces `qplot()`, which is deprecated as of
# ggplot2 3.4.0 and adds an empty default layer when called without arguments.
fst_by_eucl <- ggplot() + 
    geom_point(aes(x = fst, y = dist), size = 2, alpha = 0.5) +
#     geom_abline(slope = 1, intercept = 0) +
    xlab("Hudson's Fst") + ylab('Euclidean distance') +
    theme_classic()
fst_by_eucl

In [186]:
# Pearson correlation between Fst and Euclidean distance across all cities
cor(x = fst, y = dist, method = 'pearson')

In [187]:
# Keep only the cities with Fst below 0.1
low_fst <- fst < 0.1
fst_highDrop <- fst[low_fst]
dist_highDrop <- dist[low_fst]

In [188]:
# Same scatterplot restricted to cities with Fst below 0.1. `ggplot()` replaces
# `qplot()`, which is deprecated as of ggplot2 3.4.0 and adds an empty default
# layer when called without arguments.
fst_by_eucl_highDrop <- ggplot() + 
    geom_point(aes(x = fst_highDrop, y = dist_highDrop), size = 2, alpha = 0.5) +
#     geom_abline(slope = 1, intercept = 0) +
    xlab("Hudson's Fst") + ylab('Euclidean distance') +
    theme_classic()
fst_by_eucl_highDrop

In [189]:
# Correlation when large Fst outliers are removed
cor.test(fst_highDrop, dist_highDrop)

## Models

In [190]:
# Slope (betaLog) and P-value (pvalLog) of the cline for each city
betaLog <- suppressMessages(
    read_csv('../../phenotypic-analyses/analysis/supplementary-tables/allCities_logisticReg_coefs.csv')
) %>% 
    dplyr::select(city, betaLog, pvalLog)
betaLog %>% head()

In [194]:
# Combine distance, Fst, diversity and cline statistics by city, then classify
# each cline by significance (P < 0.05) and by the sign of its slope
df_allStats <- euc_dist_df %>% 
    left_join(obs_fst_df, by = 'city') %>% 
    left_join(obs_div_df_wide, by = 'city') %>% 
    left_join(betaLog, by = 'city') %>% 
    mutate(sigLog = ifelse(pvalLog < 0.05, 'Yes', 'No')) %>% 
    mutate(significance = case_when(
        sigLog == 'Yes' & betaLog > 0 ~ 'Significantly positive',
        sigLog == 'Yes' & betaLog < 0 ~ 'Significantly negative',
        TRUE ~ 'Not significant'
    ))

### Does pi differ by habitat or city?

In [196]:
# ANOVA of pi with city and habitat as predictors
pi_mod <- aov(tp_scaled ~ city + habitat, data = obs_div_df)
pi_mod %>% summary()

In [197]:
# Least-squares means of pi in each habitat
emmeans(pi_mod, 'habitat')

In [199]:
# Standard errors from data instead of model
obs_div_df %>% 
    group_by(habitat) %>% 
    summarise(mean = round(mean(tp_scaled), digits = 4),
              n = n(),
              se = round(sd(tp_scaled) / sqrt(n), digits = 6))

### Pi by habitat and clines (sig vs. ns)

In [200]:
# Add cline statistics to the diversity estimates and flag cities with a
# significant cline (P < 0.05)
div_df_mod <- left_join(obs_div_df, betaLog, by = 'city') %>% 
    mutate(sig = ifelse(pvalLog < 0.05, 'Yes', 'No'))

In [201]:
# Model to get least-squares means
pi_mod_sig <- aov(tp_scaled ~ habitat + sig, data = div_df_mod)
pi_mod_sig %>% summary()

In [202]:
# Get least-squares means
emmeans(pi_mod_sig, 'habitat', by = 'sig')

In [203]:
# Standard errors from data instead of model
div_df_mod %>% 
    group_by(habitat, sig) %>% 
    summarise(mean = round(mean(tp_scaled), digits = 4),
              n = n(),
              se = round(sd(tp_scaled) / sqrt(n), digits = 6))

### Does the strength of clines predict mean diversity?

- Model above suggests diversity is higher in cities with clines?
- Is this a real result?
- Does mean diversity across cities vary with the strength of clines?

In [204]:
# Mean pi per city (averaged over habitats) joined to the cline statistics
div_df_mean <- obs_div_df %>% 
    group_by(city) %>% 
    summarise(tp_scaled = mean(tp_scaled)) %>% 
    left_join(betaLog, by = 'city')

In [205]:
# Linear regression of mean pi on the slope of the cline
lm(tp_scaled ~ betaLog, data = div_df_mean) %>% summary()

### Does difference in neutral diversity predict HCN clines?

In [209]:
# Slope of the cline as a function of the urban minus rural difference in pi
div_mod <- aov(betaLog ~ tp_diff, data = df_allStats)
div_mod %>% summary()

### Does difference in pi differ between cities with and without clines?

In [210]:
# Urban minus rural difference in pi as a function of cline significance
tpDiff_by_sig_mod <- aov(tp_diff ~ sigLog, data = df_allStats)
tpDiff_by_sig_mod %>% summary()

In [211]:
emmeans(tpDiff_by_sig_mod, 'sigLog')

### Does Fst predict HCN?

In [212]:
# Does Fst predict HCN?
# Remove Kyoto since Fst likely biased by low sample size (N = 1)
df_allStats_noKyoto <- filter(df_allStats, city != 'Kyoto')
fst_mod <- aov(betaLog ~ fst, data = df_allStats_noKyoto)
fst_mod %>% summary()

### Does Fst differ between cities with and without clines?

In [213]:
# Mean Fst and its standard error across all cities (without Kyoto)
summarise(ungroup(df_allStats_noKyoto),
          meanFst = mean(fst),
          n = n(),
          se = sd(fst) / sqrt(n))

In [214]:
# Fst as a function of cline significance (overwrites the previous `fst_mod`)
fst_mod <- aov(fst ~ sigLog, data = df_allStats_noKyoto)
fst_mod %>% summary()

In [215]:
emmeans(fst_mod, 'sigLog')

In [216]:
# Standard errors from data instead of model
df_allStats_noKyoto %>% 
    group_by(sigLog) %>% 
    summarise(mean = round(mean(fst), digits = 4),
              n = n(),
              se = round(sd(fst) / sqrt(n), digits = 4))

### Does euclidean distance predict HCN?

In [217]:
# Slope of the cline as a function of the urban-rural Euclidean distance
dist_mod <- aov(betaLog ~ distance, data = df_allStats)
dist_mod %>% summary()

## Figure 4 for main text

- Figure 4 will illustrate some of the (null) results from the models above

In [218]:
# Theme shared by the figures below
ng1 <- theme(
    # Panel: fixed aspect ratio, no background, grid or border
    aspect.ratio = 0.7,
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    # Axes: black lines, ticks and text
    axis.line.x = element_line(color = "black", size = 1),
    axis.line.y = element_line(color = "black", size = 1),
    axis.ticks = element_line(color = "black"),
    axis.text = element_text(color = "black", size = 15),
    axis.title = element_text(color = "black", size = 1),
    axis.title.y = element_text(vjust = 2, size = 17),
    axis.title.x = element_text(vjust = 0.1, size = 17),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    # Facet strips
    strip.text.x = element_text(size = 10, colour = "black", face = "bold"),
    strip.background = element_rect(colour = "black"),
    # Legend: placed on top with entries stacked vertically
    legend.position = "top",
    legend.direction = "vertical",
    legend.text = element_text(size = 17),
    legend.key = element_rect(fill = "white"),
    legend.title = element_text(size = 17),
    legend.key.size = unit(1.0, "cm")
)

### Figure 4A

- Histogram showing urban and rural theta pi

In [219]:
# Habitat colours for the histograms: urban first, rural second
pal <- wes_palette('Darjeeling1', 5, type = 'discrete')
cols <- pal[c(4, 2)]
urban_col <- cols[1]
rural_col <- cols[2]

In [221]:
# Figure 4A: overlaid histograms of pi across cities, one per habitat, with a
# dashed vertical line at the mean of each habitat
rural_div_df <- obs_div_df %>% filter(habitat == 'r')
urban_div_df <- obs_div_df %>% filter(habitat == 'u')
alpha <- 0.75
pi_by_habitat <- ggplot() +
    geom_histogram(data = rural_div_df, 
                   bins = 26, 
                   aes(x = tp_scaled, fill = habitat), 
                   color = 'black',
                  alpha = alpha) +
    geom_histogram(data = urban_div_df, 
                   bins = 26, 
                   aes(x = tp_scaled, fill = habitat), 
                   color = 'black', 
                   alpha = alpha) +
    # rev(cols) puts the rural colour first, matching the habitat order 'r', 'u'
    scale_fill_manual(values = rev(cols), labels = c('Rural', 'Urban')) +
    geom_vline(xintercept = mean(rural_div_df %>% pull(tp_scaled)), color = rural_col, linetype = 'dashed') +
    geom_vline(xintercept = mean(urban_div_df %>% pull(tp_scaled)), color = urban_col, linetype = 'dashed') +
    ylab('Number of cities') + xlab('Pairwise nucleotide diversity') +
    scale_y_continuous(breaks = seq(from = 0, to = 7, by = 1)) +
    # Breaks cover the displayed range only (the lower bound was previously
    # negative, which looks like a sign typo given the xlim below)
    scale_x_continuous(breaks = seq(from = 0.017, to = 0.023, by = 0.001)) +
    coord_cartesian(xlim = c(0.017, 0.023)) +
    ng1
pi_by_habitat

### Figure 4B

- Slope of HCN cline vs. difference in pi

In [222]:
# Fill colours for the cline significance classes. The vector is named so that
# every class keeps its colour even if one class is absent from the data.
pal <- c('Not significant' = "#909090",
         'Significantly negative' = "#FF0000",
         'Significantly positive' = "#046C9A")

In [223]:
# Figure 4B: slope of the cline against the difference in pi, with a linear fit.
# Note that tp_diff is computed as urban minus rural.
slope_by_PIdiff <- ggplot(df_allStats, aes(x = tp_diff, y = betaLog)) +
    geom_point(size = 5, aes(fill = significance), shape = 21) +
    geom_smooth(method = 'lm', color = 'black', size = 1.5) +
    scale_fill_manual(values = pal) +
    ylab('Slope of HCN clines (log-odds)') + xlab('Rural-urban difference in pairwise nucleotide diversity') +
    ng1
slope_by_PIdiff

### Figure 4C

- Histogram of Fst by significant cline

In [224]:
# Figure 4C: overlaid histograms of Fst, one layer per cline significance
# class, each built from the cities belonging to that class only
alpha <- 0.6
cline_classes <- c('Not significant', 'Significantly negative', 'Significantly positive')
hist_layers <- lapply(cline_classes, function(cline_class){
    geom_histogram(data = df_allStats_noKyoto %>% filter(significance == cline_class),
                   bins = 25, 
                   aes(fill = significance), 
                   color = 'black', 
                   alpha = alpha)
})
fst_by_cline <- ggplot(df_allStats_noKyoto, aes(x = fst)) +
    hist_layers +
    scale_fill_manual(values = pal) +
    scale_x_continuous(breaks = seq(from = 0.01, to = 0.13, by = 0.02)) +
    coord_cartesian(xlim = c(0.01, 0.13)) +
    labs(x = 'Urban-rural Fst', y = 'Number of cities') +
#     scale_y_continuous(breaks = seq(from = 0, to = 7, by = 1)) +
    ng1
fst_by_cline

### Figure 4D

- Slope of clines vs. Fst

In [225]:
# Figure 4D: slope of the cline against Fst (without Kyoto), with a linear fit
slope_by_fst <- ggplot(data = df_allStats_noKyoto,
                       mapping = aes(x = fst, y = betaLog)) +
    geom_point(aes(fill = significance), shape = 21, size = 5) +
    geom_smooth(method = 'lm', color = 'black', size = 1.5) +
    scale_x_continuous(breaks = seq(from = 0.01, to = 0.13, by = 0.02)) +
    scale_fill_manual(values = pal) +
    labs(x = "Hudson's Fst", y = 'Slope of HCN clines (log-odds)') +
    ng1
slope_by_fst

## Figure SX 

- PCA figure with urban/rural centroids

In [226]:
# Urban and rural centroids (mean PC1/PC2) for each city, in long format
euc_dist_df_forPlot <- summarise(group_by(eigen_df, city, site),
                                 x = mean(PC1),
                                 y = mean(PC2))
euc_dist_df_forPlot %>% head()

In [227]:
# PCA with urban and rural centroids: a line joins the two centroids of each
# city, and the rural centroid carries the city label
pca_centroids <- ggplot(data = euc_dist_df_forPlot,
                        mapping = aes(x = x, y = y, fill = site, shape = site)) +
    geom_line(aes(group = city), size = 0.35, alpha = 0.4) +
    geom_point(size = 3.5) +
    geom_label_repel(data = euc_dist_df_forPlot %>% filter(site == 'r'),
                     aes(label = city),
                     color = 'black',
                     fill = 'white', 
                     size = 3,
                     arrow = arrow(length = unit(0.03, "npc"), 
                                   type = "closed", ends = "last"),
                     nudge_y = 0.1,
                     segment.size  = 0.3) +
    scale_fill_manual(values = rev(cols), labels = c('Rural', 'Urban')) +
    scale_shape_manual(values = c(21, 24), labels = c('Rural', 'Urban')) +
    scale_x_continuous(breaks = seq(from = -0.08, to = 0.06, by = 0.02)) +
    scale_y_continuous(breaks = seq(from = -0.05, to = 0.15, by = 0.03)) +
    labs(x = "PC1 (18%)", y = 'PC2 (7%)') +
    ng1
pca_centroids

In [228]:
# Create the first output file declared by Snakemake
# (presumably an empty flag marking the notebook as finished)
out_flag <- snakemake@output[[1]]
file.create(out_flag)