# Population structure and diversity

## Setup

In [None]:
# Load required packages
library(tidyverse)
library(wesanderson)
library(vegan)
library(umap)

In [None]:
# Read the ANGSD sample-order file (tab-delimited, no header row, so the
# first column is named `X1`) and preview it
sample_order <- suppressMessages(read_delim(
    file = snakemake@input[["order"]],
    delim = '\t',
    col_names = FALSE
))
head(sample_order)

In [None]:
# Load the sample sheet and keep only the samples listed in `sample_order`.
# Rows are then sorted into the order given by `sample_order$X1`, because the
# sample sheet is later attached to the PCA / UMAP / admixture results with
# positional `bind_cols()`; filtering alone would keep the sample sheet's own
# row order.
# NOTE(review): assumes the rows of the ANGSD-derived matrices follow the
# order of the `order` input file — confirm.
samples <- suppressMessages(
        read_delim(snakemake@config[["samples"]], col_names = TRUE, delim = '\t')
    ) %>%
    filter(Sample %in% sample_order$X1) %>%
    arrange(match(Sample, sample_order$X1))
head(samples)

### Functions

## Population structure

All population structure analyses were performed using 199,624 4fold SNPs from across the genome. SNPs had MAF > 0.05 and were pruned for LD such that no pairwise $r^2$ was greater than 0.2.

### Principal Components Analysis (PCA)

- Results from a PCA performed in `pcangsd` using genotype likelihoods estimated across all samples in `ANGSD`

In [None]:
# Read the covariance matrix (space-delimited, no header row) and preview it
covMat <- suppressMessages(read_delim(
    file = snakemake@input[["cov"]],
    delim = ' ',
    col_names = FALSE
))
head(covMat)

In [None]:
# Eigen-decompose the covariance matrix, keep the first four eigenvectors as
# PC1-PC4, and attach the sample sheet column-wise (rows are paired by
# position) so the points can be coloured by habitat when plotting
eigenvectors <- eigen(covMat)
eigen_df <- as.data.frame(eigenvectors$vectors) %>%
    dplyr::select(PC1 = V1, PC2 = V2, PC3 = V3, PC4 = V4) %>%
    bind_cols(samples)

In [None]:
# Build a compact importance table from an object carrying component standard
# deviations in `x$sdev` (e.g. a `princomp()` fit or its summary).
# Returns a 3-row matrix (standard deviation, proportion of variance,
# cumulative proportion) with one column per component; used instead of
# printing the lengthy `princomp()` summary.
pca_importance <- function(x) {
  std_dev <- x$sdev
  prop_var <- std_dev^2
  prop_var <- prop_var / sum(prop_var)
  rbind(
    `Standard deviation` = std_dev,
    `Proportion of Variance` = prop_var,
    `Cumulative Proportion` = cumsum(prop_var)
  )
}

# Importance table (standard deviation, proportion and cumulative proportion
# of variance) restricted to the first 4 components
princomp(covMat) %>%
    summary() %>%
    pca_importance() %>%
    as.data.frame() %>%
    rownames_to_column('var') %>%
    dplyr::select(var, Comp.1:Comp.4)

In [None]:
# Habitat colours (also used by the UMAP plot)
cols_hab <- c("#007243", "#914205", "#003876")

# PC1 vs PC2 scatter; habitat is encoded by both colour and point shape
pca_plot <- ggplot(eigen_df, aes(x = PC1, y = PC2)) +
    geom_point(aes(color = Habitat, shape = Habitat), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    # NOTE(review): the percentages in the axis labels are hard-coded —
    # confirm they still match the importance table when the inputs change
    labs(x = 'PC1 (3.4%)', y = 'PC2 (2.0%)') +
    # scale_x_continuous(breaks = seq(-0.10, 0.10, 0.10)) +
    theme_classic() +
    theme(
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 20),
        axis.text = element_text(size = 18)
    )

# Square 8 x 8 inline figure in the notebook
options(repr.plot.width = 8, repr.plot.height = 8)
pca_plot

In [None]:
# Write the PCA scatter plot to the PDF path supplied by Snakemake
# (8 x 8 inches). The stray trailing comma, which passed an empty argument,
# has been removed.
ggsave(filename = snakemake@output[["pca"]], plot = pca_plot, device = 'pdf',
       width = 8, height = 8, units = 'in', dpi = 600)

In [None]:
# Which samples make up the outlier clusters?
# Selected with hard-coded PC1 / PC2 thresholds
dplyr::filter(
    eigen_df,
    (PC1 > 0.1 & PC2 < -0.1) | (PC1 < 0 & PC2 < -0.15)
)

### Uniform Manifold Approximation Projection (UMAP)

- UMAP for same set of samples
- Uses a number of UMAP components equal to the number of significant PCs from the PCA above, assessed using the broken stick model
- Takes as input a distance matrix derived from the covariance matrix of sample allele frequencies

In [None]:
# Scree plot of the first 41 components with the broken-stick overlay enabled
scree <- princomp(covMat) %>%
    screeplot(bstick = TRUE, npcs = 41)

In [None]:
# Distances between rows of the covariance matrix (`dist()` defaults),
# handed to UMAP as a precomputed distance matrix (`input = "dist"`)
distMat <- as.matrix(dist(covMat))
gwsd_umap <- umap(distMat, random_state = 42, input="dist", n_neighbors = 15, min_dist = 0.1, n_components = 41)

# Convert the layout matrix via `as.data.frame()` first so that columns get
# explicit V1, V2, ... names when the matrix has none; calling `as_tibble()`
# directly on an unnamed matrix relies on tibble's deprecated compatibility
# name repair and emits a warning. The sample sheet is attached by row position.
gwsd_umap_layout <- gwsd_umap$layout %>%
    as.data.frame() %>%
    as_tibble() %>%
    bind_cols(., samples)

In [None]:
# First two UMAP components; habitat is encoded by both colour and point shape
umap_plot <- ggplot(gwsd_umap_layout, aes(x = V1, y = V2)) +
    geom_point(aes(color = Habitat, shape = Habitat), size = 7, alpha = 0.75) +
    scale_color_manual(values = cols_hab) +
    labs(x = 'UMAP1', y = 'UMAP2') +
    theme_classic() +
    theme(
        legend.position = 'top',
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 20),
        axis.text = element_text(size = 18)
    )
umap_plot

In [None]:
# Write the UMAP scatter plot to the PDF path supplied by Snakemake
# (8 x 8 inches). The stray trailing comma, which passed an empty argument,
# has been removed.
ggsave(filename = snakemake@output[["umap"]], plot = umap_plot, device = 'pdf',
       width = 8, height = 8, units = 'in', dpi = 600)

In [None]:
# Which samples make up the outlier clusters?
# Selected with hard-coded thresholds on the first two UMAP components
dplyr::filter(
    gwsd_umap_layout,
    (V1 > 0.3 & V2 < -0.4) | ( V1 < 0.25 & V2 > 0.5)
)

### Admixture

- I estimated admixture proportions using `NGSadmix` with genotype likelihoods across all samples estimated in `ANGSD`
    - I ran each value of K from 2 to 10 ten times, each run with a different random seed
    - Uses same 4fold SNPs as above
- I used Evanno's delta-K method implemented in `CLUMPAK` to estimate the upper-most level of structure (i.e., "optimal" K)

In [None]:
# Parse the optimal K from the CLUMPAK log (the line containing "Optimal K").
# The whole trailing integer is extracted, optionally followed by whitespace:
# keeping only the last character of the line would turn K = 10 into 0.
clumpak_log_path <- paste0(snakemake@input[["evanno"]], '/output.log')
clumpak_log <- readLines(clumpak_log_path)
optimal_K <- as.numeric(
    str_extract(clumpak_log[grep('Optimal K', clumpak_log)], '[0-9]+(?=\\s*$)')
)
optimal_K

In [None]:
# Show the Delta-K-by-K graph from the CLUMPAK output directory inline
# (peak in Delta K looks good)
library("IRdisplay")
snakemake@input[["evanno"]] %>%
    paste0('/Best_K_By_Evanno-DeltaKByKGraph.png') %>%
    display_png(file = .)

#### Optimal K

In [None]:
# Parse one NGSadmix log file into a one-row data frame.
#
# path: log file whose name contains `_K<k>_seed<seed>.log`
#
# Returns a data.frame with columns `seed` and `k` (parsed from the file
# name) and `like` (the negative decimal number following "like=" on line 9
# of the log; NA if that line does not contain one).
load_ngsadmix_log <- function(path){
    log_name <- basename(path)
    like_line <- readLines(path)[9]  ## line 9 is expected to hold the likelihood
    data.frame(
        seed = as.numeric(str_extract(log_name, pattern = '(?<=seed)[0-9]+(?=\\.log)')),
        k = as.numeric(str_extract(log_name, pattern = '(?<=_K)[0-9]+(?=_seed)')),
        like = as.numeric(str_extract(like_line, pattern = '(?<=like=)-[0-9]+\\.[0-9]+'))
    )
}

# One row (seed, k, like) per NGSadmix log file, stacked into one data frame
like_df <- snakemake@input[["admix_log"]] %>%
    map_dfr(load_ngsadmix_log)
head(like_df)

In [None]:
# Seed of the run with the lowest likelihood at the optimal K (used for plotting).
# `slice_min(..., with_ties = FALSE)` guarantees exactly one seed even when
# several runs share the same likelihood; more than one seed would break the
# scalar seed comparison inside `load_ngsadmix_qopt()`.
# NOTE(review): the run with the *highest* likelihood is usually the one
# reported — confirm that taking the minimum is intended.
optim_k_min_seed <- like_df %>%
    filter(k == optimal_K) %>%
    slice_min(like, n = 1, with_ties = FALSE) %>%
    pull(seed)
optim_k_min_seed

In [None]:
# Load the admixture proportions (`.qopt`) of one NGSadmix run in long format.
#
# path: `.qopt` file whose name contains `_K<k>_seed<seed>.qopt`
# K:    number of clusters of the wanted run (single, non-missing value)
# s:    seed of the wanted run (single, non-missing value)
#
# Returns NULL when the file belongs to a different K / seed, so mapping this
# function over all `.qopt` files with `map_dfr()` keeps only the requested
# run. Otherwise returns one row per sample x cluster, with the cluster
# column name (X1..XK) in `name` and the proportion, rounded to 5 decimals,
# in `Probs`. Uses the global `samples` sheet, attached by row position.
load_ngsadmix_qopt <- function(path, K, s){
    stopifnot(length(K) == 1, length(s) == 1, !is.na(K), !is.na(s))

    seed <- as.numeric(str_extract(basename(path), pattern = '(?<=seed)[0-9]+(?=\\.qopt)'))
    k <- as.numeric(str_extract(basename(path), pattern = '(?<=_K)[0-9]+(?=_seed)'))
    if (is.na(seed) || is.na(k)) {
        # Fail with a readable message instead of "missing value where TRUE/FALSE needed"
        stop("Cannot parse K / seed from file name: ", basename(path), call. = FALSE)
    }

    # Scalar test: short-circuit `||` rather than element-wise `&` inside `if`
    if (seed != s || k != K) {
        return(NULL)
    }

    suppressMessages(read_delim(path, col_names = FALSE, delim = " ")) %>%
        # Drop column K + 1 (presumably an empty column produced by a
        # trailing delimiter on each line — TODO confirm)
        dplyr::select(-all_of(sprintf("X%s", K + 1))) %>%
        bind_cols(., samples) %>%
        # Columns X1..XK hold the per-cluster proportions
        pivot_longer(all_of(sprintf("X%s", seq_len(K))), values_to = 'Probs') %>%
        mutate(Probs = round(Probs, 5))
}

# Keep only the admixture proportions of the optimal-K run whose seed is
# stored in `optim_k_min_seed` (seed with lowest log likelihood); all other
# files yield NULL and are dropped
admix_optimal <- snakemake@input[["admix_qopt"]] %>%
    purrr::map_dfr(load_ngsadmix_qopt, K = optimal_K, s = optim_k_min_seed)
admix_optimal

In [None]:
# One colour per ancestry cluster, interpolated from the Darjeeling1 palette
cols_admix <- wes_palette("Darjeeling1", n = optimal_K, type = 'continuous')

# Stacked ancestry bars (one bar per sample), faceted by habitat in the order
# Rural, Suburban, Urban; fill and outline share the same colours
admix_plot_optimal <- admix_optimal %>%
  ggplot(aes(x = factor(Sample), y = Probs, fill = factor(name), color = factor(name))) +
  geom_col(width = 1) +
  facet_grid(~fct_relevel(Habitat,'Rural', 'Suburban', 'Urban'), switch = "x", scales = "free", space = "free") +
  scale_x_discrete(expand = expansion(add = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_color_manual(values = cols_admix) +
  scale_fill_manual(values = cols_admix) +
  labs(title = sprintf("K=%s (Best K)", optimal_K), x = "", y = "Ancestry") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 23, face = 'bold'),
    strip.text.x = element_text(size = 16),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 16),
    # Tick labels are hidden on both axes
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.spacing.x = unit(0.1, "lines"),
    legend.position = 'none'
  )
admix_plot_optimal

In [None]:
# Save the best-K admixture barplot as a 20 x 3 inch PDF
ggsave(
    filename = snakemake@output[["admix_optimal"]],
    plot = admix_plot_optimal,
    device = 'pdf',
    width = 20,
    height = 3,
    units = 'in',
    dpi = 600
)

#### Optimal K minus 1

In [None]:
# Seed of the run with the lowest likelihood at optimal K - 1 (used for plotting).
# `slice_min(..., with_ties = FALSE)` guarantees exactly one seed even when
# several runs share the same likelihood; more than one seed would break the
# scalar seed comparison inside `load_ngsadmix_qopt()`.
# NOTE(review): the run with the *highest* likelihood is usually the one
# reported — confirm that taking the minimum is intended.
optim__minus_k_min_seed <- like_df %>%
    filter(k == optimal_K - 1) %>%
    slice_min(like, n = 1, with_ties = FALSE) %>%
    pull(seed)
optim__minus_k_min_seed

In [None]:
# Keep only the admixture proportions of the (optimal K - 1) run whose seed
# is stored in `optim__minus_k_min_seed` (seed with lowest log likelihood)
admix_optimal_minus <- snakemake@input[["admix_qopt"]] %>%
    purrr::map_dfr(load_ngsadmix_qopt, K = optimal_K - 1, s = optim__minus_k_min_seed)
admix_optimal_minus

In [None]:
# One colour per ancestry cluster for K = optimal K - 1
cols_admix <- wes_palette("Darjeeling1", n = optimal_K - 1, type = 'continuous')

# Stacked ancestry bars (one bar per sample), faceted by habitat in the order
# Rural, Suburban, Urban; fill and outline share the same colours
admix_plot_optimal_minus <- admix_optimal_minus %>%
  ggplot(aes(x = factor(Sample), y = Probs, fill = factor(name), color = factor(name))) +
  geom_col(width = 1) +
  facet_grid(~fct_relevel(Habitat,'Rural', 'Suburban', 'Urban'), switch = "x", scales = "free", space = "free") +
  scale_x_discrete(expand = expansion(add = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_color_manual(values = cols_admix) +
  scale_fill_manual(values = cols_admix) +
  labs(title = sprintf("K=%s", optimal_K - 1), x = "", y = "Ancestry") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 23, face = 'bold'),
    strip.text.x = element_text(size = 16),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 16),
    # Tick labels are hidden on both axes
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.spacing.x = unit(0.1, "lines"),
    legend.position = 'none'
  )
admix_plot_optimal_minus

In [None]:
# Save the (optimal K - 1) admixture barplot as a 20 x 3 inch PDF.
# The stray trailing comma, which passed an empty argument, has been removed.
ggsave(filename = snakemake@output[["admix_optimal_minus"]], plot = admix_plot_optimal_minus, device = 'pdf',
       width = 20, height = 3, units = 'in', dpi = 600)

#### Optimal K plus 1

In [None]:
# Seed of the run with the lowest likelihood at optimal K + 1 (used for plotting).
# `slice_min(..., with_ties = FALSE)` guarantees exactly one seed even when
# several runs share the same likelihood; more than one seed would break the
# scalar seed comparison inside `load_ngsadmix_qopt()`.
# NOTE(review): the run with the *highest* likelihood is usually the one
# reported — confirm that taking the minimum is intended.
optim_plus_k_min_seed <- like_df %>%
    filter(k == optimal_K + 1) %>%
    slice_min(like, n = 1, with_ties = FALSE) %>%
    pull(seed)
optim_plus_k_min_seed

In [None]:
# Keep only the admixture proportions of the (optimal K + 1) run whose seed
# is stored in `optim_plus_k_min_seed` (seed with lowest log likelihood)
admix_optimal_plus <- snakemake@input[["admix_qopt"]] %>%
    purrr::map_dfr(load_ngsadmix_qopt, K = optimal_K + 1, s = optim_plus_k_min_seed)
admix_optimal_plus

In [None]:
# One colour per ancestry cluster for K = optimal K + 1
cols_admix <- wes_palette("Darjeeling1", n = optimal_K + 1, type = 'continuous')

# Stacked ancestry bars (one bar per sample), faceted by habitat in the order
# Rural, Suburban, Urban; fill and outline share the same colours
admix_plot_optimal_plus <- admix_optimal_plus %>%
  ggplot(aes(x = factor(Sample), y = Probs, fill = factor(name), color = factor(name))) +
  geom_col(width = 1) +
  facet_grid(~fct_relevel(Habitat,'Rural', 'Suburban', 'Urban'), switch = "x", scales = "free", space = "free") +
  scale_x_discrete(expand = expansion(add = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_color_manual(values = cols_admix) +
  scale_fill_manual(values = cols_admix) +
  labs(title = sprintf("K=%s", optimal_K + 1), x = "", y = "Ancestry") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 23, face = 'bold'),
    strip.text.x = element_text(size = 16),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 16),
    # Tick labels are hidden on both axes
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.spacing.x = unit(0.1, "lines"),
    legend.position = 'none'
  )
admix_plot_optimal_plus

In [None]:
# Save the (optimal K + 1) admixture barplot as a 20 x 3 inch PDF.
# The stray trailing comma, which passed an empty argument, has been removed.
ggsave(filename = snakemake@output[["admix_optimal_plus"]], plot = admix_plot_optimal_plus, device = 'pdf',
       width = 20, height = 3, units = 'in', dpi = 600)

### Pi and Fst

- Pi and Fst were estimated in `ANGSD` using approximately 9.7 million 4fold sites across the genome

#### By Habitat

- Pi in each of 3 habitats
- Pairwise Fst across all 3 habitat types
- TODO: Consider amending this to estimate all 3 pairwise comparisons at once so that `ANGSD` also estimates the population branch statistic

In [None]:
# Read one tab-delimited thetas file and tag every row with the habitat
# parsed from the file name (the text between "4fold_" and ".thetas").
#
# path: path to the thetas file
# Returns the file contents with an added `habitat` column.
load_pi <- function(path){
    habitat <- basename(path) %>%
        str_extract(pattern = "(?<=4fold_)\\w+(?=\\.thetas)") %>%
        as.character()
    suppressMessages(read_delim(path, delim = '\t')) %>%
        mutate(habitat = habitat)
}

# Read the thetas file of every habitat and stack them into one data frame
pi_byHab <- snakemake@input[["pi_byHab"]] %>%
    purrr::map_dfr(load_pi)
head(pi_byHab)

In [None]:
# Per-habitat summary: sum of `tP` divided by the sum of `nSites`, written
# as a tab-delimited table.
# `write_delim()` is used because `write_csv()` has no `delim` argument and
# rejects it as an unused argument.
pi_byHab %>%
    group_by(habitat) %>%
    summarize(tp_scaled = sum(tP) / sum(nSites)) %>%
    write_delim(snakemake@output[["pi_byHab_df"]], delim = "\t")

In [None]:
# Read one per-site Fst file (tab-delimited, no header; columns are named
# chrom, pos, num, denom here) and collapse it to a single summary row.
#
# path: file whose name contains `4fold_<comparison>_readable`
# Returns one row with `num_sum`, `denom_sum`, `fst`, `nSites` (number of rows
# read) and `hab_comb` (the comparison label parsed from the file name).
# Prints `path` as a progress message.
load_fst <- function(path){
    print(path)
    hab_comb <- as.character(str_extract(basename(path), pattern = "(?<=4fold_)\\w+(?=\\_readable)"))
    fst_cols <- c('chrom', 'pos', 'num', 'denom')
    per_site <- suppressMessages(read_delim(path, delim = '\t', col_names = fst_cols))

    per_site %>%
        # Negative numerators are set to 0
        # https://github.com/ANGSD/angsd/issues/309
        # Does not affect overall pattern
        mutate(num = ifelse(num < 0, 0, num)) %>%
        # Weighted Fst as a ratio of sums (ratio of averages)
        # https://github.com/ANGSD/angsd/issues/61
        summarise(
            num_sum = sum(num),
            denom_sum = sum(denom),
            fst = num_sum / denom_sum,
            nSites = n()
        ) %>%
        mutate(hab_comb = hab_comb)
}
# One summary row per habitat-comparison Fst file
fst_byHab <- snakemake@input[["fst_byHab"]] %>%
    purrr::map_dfr(load_fst)

In [None]:
# Write the Fst summary as a tab-delimited table.
# `write_delim()` is used because `write_csv()` has no `delim` argument and
# rejects it as an unused argument.
write_delim(fst_byHab, snakemake@output[["fst_byHab_df"]], delim = "\t")

### Relatedness

- I estimated pairwise relatedness across all samples using genotype likelihoods estimated in `ANGSD`
- Uses the same LD-pruned 4fold SNPs as were used above for population structure

In [None]:
# Read one tab-delimited NGSrelate result file for a single chromosome.
#
# path: file whose name starts with the chromosome name followed by `_4fold`
# Returns the file contents without the columns whose names start with "J",
# and with the chromosome name added as the first column (`chrom`).
load_relatedness <- function(path){
    chrom <- str_extract(basename(path), '^(.+)(?=_4fold)')
    ngsrelate <- suppressMessages(read_delim(path, delim = '\t'))

    ngsrelate %>%
        mutate(chrom = chrom) %>%
        dplyr::select(chrom, everything(), -starts_with('J'))
}

# Read the NGSrelate results of every chromosome and stack them into one DF
relate_df <- snakemake@input[["relate"]] %>%
    purrr::map_dfr(load_relatedness)
head(relate_df)

In [None]:
# Get the order of samples that were used as input to NGSrelate
# Uses all high quality samples (N = 115)
# The sample ID (`s_<pop>_<ind>`) is parsed from each BAM file name and split
# into population and individual; `idx` is the 0-based position in the list,
# which is joined against the `a` / `b` columns of the NGSrelate output.
# `seq_len(n()) - 1L` is used so that an empty list yields zero indices
# (`seq(0, n() - 1)` would produce c(0, -1)).
relate_sample_list <- suppressMessages(read_table(snakemake@input[["bl"]], col_names = 'bam')) %>%
    mutate(sample = str_extract(basename(bam), '(s_\\d+_\\d+)(?=_4fold)')) %>%
    separate(sample, into = c('tmp', 'pop', 'ind'), sep = '_', remove = FALSE) %>%
    dplyr::select(-tmp, -bam) %>%
    mutate(idx = seq_len(n()) - 1L)
head(relate_sample_list)

In [None]:
# Population -> habitat lookup (one row per distinct pair), with the
# population stored as character so it can be joined on `pop`
pops_habitat <- samples %>%
    transmute(pop = as.character(Population), Habitat) %>%
    distinct()

# Attach sample name, population, individual and habitat to both members of
# every pair: reshape so each member (`a` / `b`) gets its own row, join the
# metadata on the sample index, then spread back to one row per pair with
# `_a` / `_b` suffixed columns for later comparisons among habitats
relate_df_withHab <- relate_df %>%
    pivot_longer(cols = c('a', 'b'), values_to = 'idx') %>%
    left_join(relate_sample_list, by = 'idx') %>%
    dplyr::select(-idx) %>%
    left_join(pops_habitat, by = 'pop') %>%
    pivot_wider(values_from = c('sample', 'pop', 'ind', 'Habitat'))
head(relate_df_withHab)

In [None]:
# Mean `rab` over chromosomes for every sample pair (`n` = number of rows
# averaged), with population and habitat of both samples re-attached
mean_relate_acrossChroms <- relate_df_withHab %>%
    group_by(sample_a, sample_b) %>%
    summarise(
        mean_rab_acrossChroms = mean(rab),
        n = n()
    ) %>%
    left_join(
        distinct(dplyr::select(relate_df_withHab,
                               sample_a, sample_b, pop_a, pop_b, Habitat_a, Habitat_b)),
        by = c('sample_a', 'sample_b')
    ) %>%
    ungroup()
head(mean_relate_acrossChroms)

In [None]:
# Mean, SD and SE of pairwise relatedness for each habitat comparison.
# Each pair is labelled from the habitats of its two samples; between-habitat
# pairs get the same label regardless of which sample is `a` or `b`. Pairs
# matching none of the conditions get an NA label.
mean_relate_acrossHabs <- mean_relate_acrossChroms %>%
    mutate(comparison = case_when(Habitat_a == 'Urban' & Habitat_b == 'Urban' ~ 'Within Urban',
                                  Habitat_a == 'Suburban' & Habitat_b == 'Suburban' ~ 'Within Suburban',
                                  Habitat_a == 'Rural' & Habitat_b == 'Rural' ~ 'Within Rural',

                                  Habitat_a == 'Urban' & Habitat_b == 'Suburban' ~ 'Urban-Suburban',
                                  Habitat_a == 'Suburban' & Habitat_b == 'Urban' ~ 'Urban-Suburban',

                                  Habitat_a == 'Urban' & Habitat_b == 'Rural' ~ 'Urban-Rural',
                                  Habitat_a == 'Rural' & Habitat_b == 'Urban' ~ 'Urban-Rural',

                                  Habitat_a == 'Suburban' & Habitat_b == 'Rural' ~ 'Suburban-Rural',
                                  Habitat_a == 'Rural' & Habitat_b == 'Suburban' ~ 'Suburban-Rural')) %>%
    group_by(comparison) %>%
    summarise(mean_rab = mean(mean_rab_acrossChroms),
              sd_rab = sd(mean_rab_acrossChroms),
              se_rab = sd_rab / sqrt(n()))
head(mean_relate_acrossHabs)
# `write_delim()` is used because `write_csv()` has no `delim` argument and
# rejects it as an unused argument
write_delim(mean_relate_acrossHabs, snakemake@output[["relate_byHabComb_df"]], delim = "\t")

In [None]:
# Plot mean relatedness (+/- 1 SE) for each habitat comparison.
# The former `scale_fill_manual()` layer was dropped: no layer maps a `fill`
# aesthetic, so the scale had no effect on the plot.
mean_relate_acrossHabs_plot <- mean_relate_acrossHabs %>%
    ggplot(., aes(x = comparison, y = mean_rab)) +
    geom_errorbar(aes(ymin = mean_rab - se_rab, ymax = mean_rab + se_rab), width = 0.15) +
    geom_point(size = 6) +
    xlab('Comparison') + ylab('Mean relatedness') +
    theme_classic() +
    theme(axis.text = element_text(size = 18),
          axis.text.x = element_text(angle = 45, hjust = 1),
          axis.title = element_text(size = 20))

# Save as an 8 x 8 inch PDF. The stray trailing comma, which passed an empty
# argument, has been removed.
ggsave(filename = snakemake@output[["relate_byHabComb"]], plot = mean_relate_acrossHabs_plot, device = 'pdf',
       width = 8, height = 8, units = 'in', dpi = 600)

In [None]:
# Write the 10 sample pairs with the highest mean relatedness as a
# tab-delimited table.
# `write_delim()` is used because `write_csv()` has no `delim` argument and
# rejects it as an unused argument.
mean_relate_acrossChroms %>%
    arrange(desc(mean_rab_acrossChroms)) %>%
    head(n = 10) %>%
    write_delim(., snakemake@output[["relate_bySampleComb"]], delim = '\t')