# Population structure and diversity

## Setup

In [None]:
# Load required packages
library(tidyverse)
library(wesanderson)
library(vegan)
library(ggheatmap)

In [None]:
# Sample order from ANGSD. The file is read without a header row, so its
# first column is auto-named `X1`.
sample_order <- suppressMessages(
  read_delim(
    file = snakemake@input[["order"]],
    delim = "\t",
    col_names = FALSE
  )
)
head(sample_order)

In [None]:
# Load sample sheet, keep only the samples listed in the ANGSD sample order,
# and arrange the rows in that same order.
# Later cells attach this table to other outputs with `bind_cols()`, which
# pairs rows purely by position, so the row order of the sheet itself must not
# be relied upon (those outputs are presumably in ANGSD sample order -- confirm).
samples <- suppressMessages(
  read_delim(snakemake@config[["samples"]], col_names = TRUE, delim = "\t")
) %>%
  filter(Sample %in% sample_order$X1) %>%
  arrange(match(Sample, sample_order$X1))
head(samples)

In [None]:
# Load the comma-delimited population sheet (first row is the header)
populations <- suppressMessages(
  read_delim(snakemake@config[["pops"]], delim = ",", col_names = TRUE)
)

# Urban populations get a missing transect
populations <- populations %>%
  mutate(Transect = ifelse(Habitat == "Urban", NA, Transect))
head(populations)

In [None]:
# ggplot theme add-on: transparent panel, plot and legend backgrounds, and no
# grid lines. The two element definitions are built once inside `local()` so
# no helper objects are left in the global environment.
transp_theme <- local({
  clear_borderless <- element_rect(fill = "transparent", colour = NA_character_)
  clear_fill <- element_rect(fill = "transparent")

  theme(
    panel.background = clear_borderless,
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = clear_borderless,
    legend.background = clear_fill,
    legend.box.background = clear_fill,
    legend.key = clear_fill
  )
})

## Principal Components Analysis (PCA)

In [None]:
# Load covariance matrix (space-delimited, no header row)
covMat <- suppressMessages(
  read_delim(
    file = snakemake@input[["cov"]],
    delim = " ",
    col_names = FALSE
  )
)
head(covMat)

In [None]:
# Eigen decomposition of the covariance matrix
eigenvectors <- eigen(covMat)

# Keep the first four eigenvectors as PC1-PC4 and append the sample sheet so
# points can be coloured by habitat when plotting.
# NOTE(review): `bind_cols()` pairs rows by position, so this assumes `samples`
# is in the same row order as `covMat` -- confirm.
eigen_df <- as.data.frame(eigenvectors$vectors) %>%
  dplyr::select(PC1 = V1, PC2 = V2, PC3 = V3, PC4 = V4) %>%
  bind_cols(samples)

In [None]:
# Summarise the importance of principal components.
# Used to build a compact table instead of printing the lengthy `princomp()`
# summary.
#
# x: an object with an `sdev` element (component standard deviations)
#
# Returns a 3-row matrix: standard deviation, proportion of variance and
# cumulative proportion of variance, one column per component.
pca_importance <- function(x) {
  std_dev <- x$sdev
  var_share <- std_dev^2
  var_share <- var_share / sum(var_share)
  rbind(
    `Standard deviation` = std_dev,
    `Proportion of Variance` = var_share,
    `Cumulative Proportion` = cumsum(var_share)
  )
}

# Percent variance explained by the first 4 components.
# NOTE(review): this table comes from `princomp()` run on `covMat` itself,
# whereas the plotted PCs come from `eigen(covMat)` -- confirm both describe
# the same decomposition before quoting these percentages on the plot axes.
covMat %>%
  princomp() %>%
  summary() %>%
  pca_importance() %>%
  as.data.frame() %>%
  rownames_to_column("var") %>%
  dplyr::select(var, Comp.1:Comp.4)

In [None]:
# Colours used for the Habitat levels
cols_hab <- c("#007243", "#914205", "#003876")

# PC1 vs. PC2 scatter plot, coloured and shaped by habitat.
# NOTE(review): the percentages in the axis titles are hard-coded -- confirm
# they still match the variance table whenever the input data change.
pca_plot <- ggplot(eigen_df, aes(x = PC1, y = PC2)) +
  geom_point(aes(colour = Habitat, shape = Habitat), size = 7, alpha = 0.75) +
  scale_color_manual(values = cols_hab) +
  labs(x = 'PC1 (3.4%)', y = 'PC2 (2.0%)') +
  # scale_x_continuous(breaks = seq(-0.10, 0.10, 0.10)) +
  theme_classic() +
  transp_theme +
  theme(
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 18),
    legend.position = 'top',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14),
    # Drop the legend boxes entirely (overrides the transparent rectangles)
    legend.background = element_blank(),
    legend.box.background = element_blank(),
    legend.key = element_blank()
  )

# Notebook display size (inches), then render
options(repr.plot.width = 8, repr.plot.height = 8)
pca_plot

In [None]:
# Save the PCA plot as an 8 x 8 inch PDF.
# (The stray trailing comma was removed: it forwarded an empty argument
# through `...` into the graphics device call.)
ggsave(filename = snakemake@output[["pca"]], plot = pca_plot, device = 'pdf',
       width = 8, height = 8, units = 'in', dpi = 600)

In [None]:
# Which samples make up the outlying clusters on the PC1/PC2 plot?
eigen_df %>%
  filter(
    (PC1 < 0 & PC2 < -0.15) |
      (PC1 > 0.1 & PC2 < -0.1)
  )

## Admixture

In [None]:
# Read the best K (Evanno method) from the CLUMPAK log instead of hard-coding it
clumpak_log_path <- paste0(snakemake@input[["evanno"]], '/output.log')
clumpak_log <- readLines(clumpak_log_path)
# Take ALL trailing digits of the "Optimal K" line (tolerating trailing
# whitespace). Keeping only the last character would turn K = 12 into 2.
optimal_K <- as.numeric(
  str_extract(clumpak_log[grep('Optimal K', clumpak_log)], '[0-9]+(?=\\s*$)')
)
optimal_K

In [None]:
# Show the Delta K graph stored next to the Evanno log.
# Peak in Delta K looks good.
library("IRdisplay")
display_png(
  file = paste0(
    snakemake@input[["evanno"]],
    '/Best_K_By_Evanno-DeltaKByKGraph.png'
  )
)

### Optimal K

In [None]:
# Load one NGSadmix log file.
#
# path: path to a log file named like `..._K<k>_seed<seed>.log`
#
# Returns a one-row data frame with the seed and K parsed from the file name
# and the likelihood parsed from the log contents (`NA` when not found).
load_ngsadmix_log <- function(path){
    file_name <- basename(path)
    seed <- as.numeric(str_extract(file_name, pattern = '(?<=seed)[0-9]+(?=\\.log)'))
    k <- as.numeric(str_extract(file_name, pattern = '(?<=_K)[0-9]+(?=_seed)'))

    like_pattern <- '(?<=like=)-[0-9]+\\.[0-9]+'
    log_lines <- readLines(path)

    # The likelihood is expected on line 9 of the log
    like <- as.numeric(str_extract(log_lines[9], pattern = like_pattern))

    # If line 9 does not hold it (shorter log, different header length), fall
    # back to scanning the whole file rather than silently returning NA.
    if (is.na(like)) {
        hits <- str_extract(log_lines, pattern = like_pattern)
        hits <- hits[!is.na(hits)]
        if (length(hits) > 0) {
            # presumably the last reported value is the final one -- confirm
            like <- as.numeric(hits[length(hits)])
        }
    }

    data.frame(seed = seed, k = k, like = like)
}

# One row per log file (every K x seed combination that was run)
like_df <- map_dfr(snakemake@input[["admix_log"]], load_ngsadmix_log)
head(like_df)

In [None]:
# Seed of the run with the lowest likelihood at the optimal K (used for plotting).
# NOTE(review): `min(like)` picks the LOWEST likelihood; if `like` is a
# log-likelihood, the best-fitting run would be the highest -- confirm intent.
optim_k_min_seed <- like_df %>%
  filter(k == optimal_K) %>%
  slice(which(like == min(like))) %>%
  pull(seed)
optim_k_min_seed

In [None]:
# Load an NGSadmix `.qopt` file, but only if its file name matches the
# requested K and seed.
#
# path: path to a file named like `..._K<k>_seed<seed>.qopt`
# K:    number of clusters to keep (single number)
# s:    seed of the run to keep (single number)
#
# Returns a long data frame with the sample sheet columns plus `name`
# (X1..XK) and `Probs`; returns NULL for non-matching files, which
# `map_dfr()` silently drops.
load_ngsadmix_qopt <- function(path, K, s){
    if (length(K) != 1 || length(s) != 1) {
        stop("`K` and `s` must each be a single value (got ", length(K),
             " and ", length(s), ")", call. = FALSE)
    }

    seed <- as.numeric(str_extract(basename(path), pattern = '(?<=seed)[0-9]+(?=\\.qopt)'))
    k <- as.numeric(str_extract(basename(path), pattern = '(?<=_K)[0-9]+(?=_seed)'))

    # Scalar `&&` inside `if`; isTRUE() turns an unparsable file name (NA)
    # into "no match" instead of an error.
    if (!isTRUE(seed == s && k == K)) {
        return(NULL)
    }

    q_cols <- sprintf("X%s", seq_len(K))
    suppressMessages(read_delim(path, col_names = FALSE, delim = " ")) %>%
        # Drop the extra column after the K ancestry columns, when present
        dplyr::select(-any_of(sprintf("X%s", K + 1))) %>%
        # NOTE(review): positional bind -- assumes `samples` rows are in the
        # same order as the rows of the qopt file; confirm.
        bind_cols(., samples) %>%
        pivot_longer(all_of(q_cols), values_to = 'Probs') %>%
        mutate(Probs = round(Probs, 5))
}

# Load admixture results for seed with lowest log likelihood
admix_optimal <- purrr::map_dfr(snakemake@input[["admix_qopt"]], load_ngsadmix_qopt, K = optimal_K ,s = optim_k_min_seed)

In [None]:
# Order of samples along the x-axis of the admixture plots: sample IDs with
# the first "s_" removed, sorted numerically by population and then by plant.
levels <- admix_optimal %>%
  distinct(Sample) %>%
  mutate(Sample = str_replace(Sample, "s_", "")) %>%
  separate(Sample, into = c('Pop', 'Plant'), sep = '_', remove = FALSE) %>%
  arrange(as.integer(Pop), as.integer(Plant)) %>%
  transmute(val = paste0(Pop, "_", Plant)) %>%
  pull(val)

In [None]:
# Notebook display size (inches) for the wide admixture bar plot
options(repr.plot.width = 20, repr.plot.height = 3)

# One colour per ancestry cluster
cols_admix <- wes_palette("Darjeeling1", n = optimal_K, type = 'continuous')

# Stacked ancestry bars for the best K: one bar per sample, ordered by
# `levels`, faceted by habitat.
admix_plot_optimal <- admix_optimal %>%
  mutate(Sample = str_replace(Sample, "s_", "")) %>%
  ggplot(aes(x = factor(Sample, levels = levels), y = Probs,
             fill = factor(name), colour = factor(name))) +
  geom_col(width = 1) +
  facet_grid(~fct_relevel(Habitat,'Rural', 'Suburban', 'Urban'),
             switch = "x", scales = "free", space = "free") +
  scale_x_discrete(expand = expansion(add = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = cols_admix) +
  scale_color_manual(values = cols_admix) +
  labs(title = sprintf("K=%s (Best K)", optimal_K), x = "", y = "Ancestry") +
  theme_minimal() +
  transp_theme +
  theme(
    plot.title = element_text(size = 23, face = 'bold'),
    strip.text.x = element_text(size = 18),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 18),
    # Tick labels are hidden on both axes
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.spacing.x = unit(0.1, "lines"),
    legend.position = 'none'
  )
admix_plot_optimal

In [None]:
# Save the best-K admixture plot as a 20 x 3 inch PDF
ggsave(
  filename = snakemake@output[["admix_optimal"]],
  plot = admix_plot_optimal,
  device = 'pdf',
  width = 20, height = 3, units = 'in',
  dpi = 600
)

### Optimal K minus 1

In [None]:
# Seed of the run with the lowest likelihood at K = optimal K - 1 (used for plotting).
# NOTE(review): `min(like)` picks the LOWEST likelihood; if `like` is a
# log-likelihood, the best-fitting run would be the highest -- confirm intent.
optim__minus_k_min_seed <- like_df %>%
  filter(k == optimal_K - 1) %>%
  slice(which(like == min(like))) %>%
  pull(seed)
optim__minus_k_min_seed

In [None]:
# Load admixture results for K = optimal K - 1, keeping only the selected seed
admix_optimal_minus <- purrr::map_dfr(
  snakemake@input[["admix_qopt"]],
  function(qopt_path) {
    load_ngsadmix_qopt(qopt_path, K = optimal_K - 1, s = optim__minus_k_min_seed)
  }
)

In [None]:
# One colour per ancestry cluster at K = optimal K - 1
cols_admix <- wes_palette("Darjeeling1", n = optimal_K - 1, type = 'continuous')

# Stacked ancestry bars for K = optimal K - 1: one bar per sample, ordered by
# `levels`, faceted by habitat.
admix_plot_optimal_minus <- admix_optimal_minus %>%
  mutate(Sample = str_replace(Sample, "s_", "")) %>%
  ggplot(aes(x = factor(Sample, levels = levels), y = Probs,
             fill = factor(name), colour = factor(name))) +
  geom_col(width = 1) +
  facet_grid(~fct_relevel(Habitat,'Rural', 'Suburban', 'Urban'),
             switch = "x", scales = "free", space = "free") +
  scale_x_discrete(expand = expansion(add = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = cols_admix) +
  scale_color_manual(values = cols_admix) +
  labs(title = sprintf("K=%s", optimal_K - 1), x = "", y = "Ancestry") +
  theme_minimal() +
  transp_theme +
  theme(
    plot.title = element_text(size = 23, face = 'bold'),
    strip.text.x = element_text(size = 18),
    axis.title = element_text(size = 20),
    # Tick labels are hidden on both axes
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.spacing.x = unit(0.1, "lines"),
    legend.position = 'none'
  )
admix_plot_optimal_minus

In [None]:
# Save the K = optimal K - 1 admixture plot as a 20 x 3 inch PDF.
# (The stray trailing comma was removed: it forwarded an empty argument
# through `...` into the graphics device call.)
ggsave(filename = snakemake@output[["admix_optimal_minus"]], plot = admix_plot_optimal_minus, device = 'pdf',
       width = 20, height = 3, units = 'in', dpi = 600)

### Optimal K plus 1

In [None]:
# Seed of the run with the lowest likelihood at K = optimal K + 1 (used for plotting).
# NOTE(review): `min(like)` picks the LOWEST likelihood; if `like` is a
# log-likelihood, the best-fitting run would be the highest -- confirm intent.
optim_plus_k_min_seed <- like_df %>%
  filter(k == optimal_K + 1) %>%
  slice(which(like == min(like))) %>%
  pull(seed)
optim_plus_k_min_seed

In [None]:
# Load admixture results for K = optimal K + 1, keeping only the selected seed
admix_optimal_plus <- purrr::map_dfr(
  snakemake@input[["admix_qopt"]],
  function(qopt_path) {
    load_ngsadmix_qopt(qopt_path, K = optimal_K + 1, s = optim_plus_k_min_seed)
  }
)

In [None]:
# One colour per ancestry cluster at K = optimal K + 1
cols_admix <- wes_palette("Darjeeling1", n = optimal_K + 1, type = 'continuous')

# Stacked ancestry bars for K = optimal K + 1: one bar per sample, ordered by
# `levels`, faceted by habitat. Unlike the other two admixture plots, this one
# shows the sample IDs (rotated) on the x-axis.
admix_plot_optimal_plus <- admix_optimal_plus %>%
  mutate(Sample = str_replace(Sample, "s_", "")) %>%
  ggplot(aes(x = factor(Sample, levels = levels), y = Probs,
             fill = factor(name), colour = factor(name))) +
  geom_col(width = 1) +
  facet_grid(~fct_relevel(Habitat,'Rural', 'Suburban', 'Urban'),
             switch = "x", scales = "free", space = "free") +
  scale_x_discrete(expand = expansion(add = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = cols_admix) +
  scale_color_manual(values = cols_admix) +
  labs(title = sprintf("K=%s", optimal_K + 1), x = "", y = "Ancestry") +
  theme_minimal() +
  transp_theme +
  theme(
    plot.title = element_text(size = 23, face = 'bold'),
    strip.text.x = element_text(size = 18),
    axis.title = element_text(size = 20),
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.spacing.x = unit(0.1, "lines"),
    legend.position = 'none'
  )
admix_plot_optimal_plus

In [None]:
# Save the K = optimal K + 1 admixture plot as a 20 x 3 inch PDF.
# (The stray trailing comma was removed: it forwarded an empty argument
# through `...` into the graphics device call.)
ggsave(filename = snakemake@output[["admix_optimal_plus"]], plot = admix_plot_optimal_plus, device = 'pdf',
       width = 20, height = 3, units = 'in', dpi = 600)

## Pi and Fst

### By Habitat

#### Pi

In [None]:
# Load the thetas table for one habitat.
#
# path: tab-delimited thetas file; the habitat name is taken from the part of
#       the file name between "4fold_" and ".thetas"
#
# Returns the table with an added `habitat` column.
load_pi <- function(path) {
  habitat <- as.character(
    str_extract(basename(path), pattern = "(?<=4fold_)\\w+(?=\\.thetas)")
  )
  thetas <- suppressMessages(read_delim(path, delim = '\t'))
  mutate(thetas, habitat = habitat)
}

# Stack the thetas of all habitats into a single data frame
pi_byHab <- purrr::map_dfr(snakemake@input[["pi_byHab"]], load_pi)
head(pi_byHab)

In [None]:
# Per-habitat summary: total `tP` divided by total number of sites, and the
# mean of the `Tajima` column
pi_byHab_sum <- pi_byHab %>%
  group_by(habitat) %>%
  summarize(
    tp_scaled = sum(tP) / sum(nSites),
    td_scaled = mean(Tajima)
  )
pi_byHab_sum

# Write the summary to disk as a tab-delimited file
write_delim(pi_byHab_sum, snakemake@output[["pi_byHab_df"]], delim = "\t")

#### Fst

In [None]:
# Load per-site Fst components for one habitat comparison and collapse them
# into a single weighted Fst estimate.
#
# path: tab-delimited file without header (chrom, pos, numerator, denominator);
#       the comparison label is the part of the file name between "4fold_" and
#       "_readable"
#
# Returns a one-row data frame: num_sum, denom_sum, fst, nSites, hab_comb.
load_fst_byHab <- function(path) {
  comparison <- as.character(
    str_extract(basename(path), pattern = "(?<=4fold_)\\w+(?=\\_readable)")
  )
  site_cols <- c('chrom', 'pos', 'num', 'denom')
  per_site <- suppressMessages(
    read_delim(path, delim = '\t', col_names = site_cols)
  )

  per_site %>%
    # Negative numerators are capped at 0
    # (https://github.com/ANGSD/angsd/issues/309); does not affect the
    # overall pattern
    mutate(num = ifelse(num < 0, 0, num)) %>%
    # Weighted Fst as a ratio of averages
    # (https://github.com/ANGSD/angsd/issues/61)
    summarise(
      num_sum = sum(num),
      denom_sum = sum(denom),
      fst = num_sum / denom_sum,
      nSites = n()
    ) %>%
    mutate(hab_comb = comparison)
}

# One row per habitat comparison
fst_byHab <- purrr::map_dfr(snakemake@input[["fst_byHab"]], load_fst_byHab)

In [None]:
# Display the habitat-level Fst table (one row per habitat comparison)
fst_byHab

In [None]:
# Write the habitat-level Fst table to disk as a tab-delimited file
write_delim(fst_byHab, snakemake@output[["fst_byHab_df"]], delim = "\t")

### By Population

#### Fst

In [None]:
# Load per-site Fst components for one pair of populations and collapse them
# into a single weighted Fst estimate.
#
# path: tab-delimited file without header (chrom, pos, numerator, denominator);
#       the file name starts with "<pop1>_<pop2>" followed by "_4fold"
#
# Returns a one-row data frame: num_sum, denom_sum, fst, nSites, pop1, pop2.
load_fst_byPop <- function(path) {
  pop_comb <- as.character(
    str_extract(basename(path), pattern = "^\\w+(?=\\_4fold)")
  )
  # Split the pair label once and take its first two pieces
  pop_ids <- str_split(pop_comb, pattern = "_", simplify = TRUE)
  first_pop <- pop_ids[1]
  second_pop <- pop_ids[2]

  site_cols <- c('chrom', 'pos', 'num', 'denom')
  per_site <- suppressMessages(
    read_delim(path, delim = '\t', col_names = site_cols)
  )

  per_site %>%
    # Negative numerators are capped at 0 before summing
    mutate(num = ifelse(num < 0, 0, num)) %>%
    # Weighted Fst: ratio of summed numerators to summed denominators
    summarise(
      num_sum = sum(num),
      denom_sum = sum(denom),
      fst = num_sum / denom_sum,
      nSites = n()
    ) %>%
    mutate(pop1 = first_pop, pop2 = second_pop)
}

# One row per population pair
fst_byPop <- purrr::map_dfr(snakemake@input[["fst_byPop"]], load_fst_byPop)

In [None]:
# Preview the first rows of the pairwise (population-by-population) Fst table
head(fst_byPop)

In [None]:
# Make a table of pairwise comparisons symmetric.
#
# df: data frame with columns `pop1`, `pop2` and `fst` (other columns allowed)
#
# Returns `df` with an extra row for every comparison in which `pop1` and
# `pop2` are swapped. The added rows carry only `pop1`, `pop2` and `fst`; any
# other column of `df` is NA for them. Exact duplicate rows are removed.
fill_symmetric_comparisons <- function(df) {
  # Create reversed version of all comparisons.
  # `dplyr::select` is namespaced, as elsewhere in this notebook, so another
  # attached package's `select()` cannot mask it.
  reversed_df <- df %>%
    dplyr::select(pop1 = pop2, pop2 = pop1, fst)

  # Combine original and reversed rows, dropping any exact duplicates
  bind_rows(df, reversed_df) %>%
    distinct()
}

fst_byPop_sym <- fill_symmetric_comparisons(fst_byPop)

In [None]:
# Square pop1 x pop2 matrix of pairwise Fst
matrix_data <- fst_byPop_sym %>%
  pivot_wider(id_cols = pop1, names_from = pop2, values_from = fst) %>%
  column_to_rownames("pop1") %>%
  as.matrix()

# Fixed display order of the populations; applied to both rows and columns
order <- c("37", "40", "41", "42", "43", "116", "23", "54", "97", "7", "83")
matrix_data <- matrix_data[order, order]

# Population metadata for the populations present in the matrix
pops <- filter(populations, Population %in% rownames(matrix_data))

# Row annotations follow the reversed order, column annotations the forward one
rows <- pops %>%
  arrange(match(Population, rev(order))) %>%
  column_to_rownames("Population") %>%
  dplyr::select(Transect, Habitat)
cols <- pops %>%
  arrange(match(Population, order)) %>%
  column_to_rownames("Population") %>%
  dplyr::select(Transect, Habitat)

# Annotation colours, named by factor level
hab_col <- c(Rural = "#007243", Suburban = "#914205", Urban = "#003876")
tran_col <- c(East = "#bfbfbf", North = "#808080", West = "#000000")
col <- list(Transect = tran_col, Habitat = hab_col)

# Row labels to print on the heatmap
text_rows <- rownames(matrix_data)

# Add theme elements to selected components of a ggheatmap object.
# Local variant of the package function of the same name (see the note on the
# assignment below for the one difference).
#
# ggheatmap: the object returned by `ggheatmap()`
# plotlist:  integer indices of the components to modify
# theme:     list of theme objects; `theme[[i]]` is added to component
#            `plotlist[i]`
#
# Returns the modified object. It is returned unchanged when `plotlist` is
# empty, or (with a message) when an index exceeds the allowed range.
ggheatmap_theme <- function(ggheatmap, plotlist, theme) {
    # Nothing to modify: avoids `max()` on an empty vector and the `1:0` loop
    if (length(plotlist) == 0) {
        return(ggheatmap)
    }
    if (max(plotlist) > length(ggheatmap[[1]])) {
        message("The plotlist should be included in 1 ~", length(ggheatmap[[1]]))
    } else {
        for (i in seq_along(plotlist)) {
            num <- plotlist[i]
            ggheatmap[[num]] <- ggheatmap[[num]] +
            theme[[i]] #### which was defined as ggheatmap[[1]][[num]] <- ggheatmap[[1]][[num]] + theme[[i]] in the package.
        }
    }
    return(ggheatmap)
}

# Heatmap of pairwise Fst with Transect/Habitat annotations on both margins
fst_heatmap <- ggheatmap(
  matrix_data,
  scale = "none",
  color = colorRampPalette(c( "#ff4a0dff","#b20000ff","#83080aff"))(100),
  legendName = "Fst",
  text_show_rows = text_rows,
  annotation_rows = rows,
  annotation_cols = cols,
  annotation_color = col,
  levels_rows = rev(order),
  levels_cols = order,
  border = "black"
)

# Axis text size: 16 for component 1, 14 for components 2-5
fst_heatmap <- ggheatmap_theme(
  fst_heatmap,
  1:5,
  theme = c(
    list(theme(axis.text = element_text(size = 16))),
    rep(list(theme(axis.text = element_text(size = 14))), 4)
  )
)
fst_heatmap

# Save as an 8 x 8 inch PDF
ggsave(
  filename = snakemake@output[["fst_byPop"]],
  plot = fst_heatmap,
  device = 'pdf',
  width = 8, height = 8, units = 'in',
  dpi = 600
)

#### Isolation by Distance

In [None]:
# Coordinates of each population, keyed by a character population ID so they
# can be joined to the `pop1`/`pop2` labels
pop_coords <- pops %>%
  transmute(Population = as.character(Population), latitude, longitude)

# Coordinates of the first population of every pair (same row order as
# `fst_byPop`)
pop1_coords <- fst_byPop %>%
  dplyr::select(Population = pop1) %>%
  left_join(pop_coords, by = "Population") %>%
  dplyr::select(pop1_lat = latitude, pop1_long = longitude)

# Coordinates of the second population of every pair
pop2_coords <- fst_byPop %>%
  dplyr::select(Population = pop2) %>%
  left_join(pop_coords, by = "Population") %>%
  dplyr::select(pop2_lat = latitude, pop2_long = longitude)


# Convert degrees to radians
deg2rad <- function(deg) return(deg * pi / 180)

# Great-circle distance between two points using the haversine formula.
#
# long1, lat1: longitude and latitude of the first point, in decimal DEGREES
# long2, lat2: longitude and latitude of the second point, in decimal DEGREES
#
# Inputs are converted to radians internally. The arithmetic is vectorised,
# so equal-length vectors give element-wise distances.
#
# Returns the distance in kilometres.
haversine <- function(long1, lat1, long2, lat2) {

  # Convert all coordinates from degrees to radians
  long1 <- deg2rad(long1)
  lat1 <- deg2rad(lat1)
  long2 <- deg2rad(long2)
  lat2 <- deg2rad(lat2)

  earth_radius_km <- 6371 # Earth mean radius [km]
  delta_long <- long2 - long1
  delta_lat <- lat2 - lat1

  # Haversine of the central angle
  a <- sin(delta_lat / 2)^2 + cos(lat1) * cos(lat2) * sin(delta_long / 2)^2
  # Rounding can push `a` marginally outside [0, 1], which would make the
  # square roots below return NaN; clamp it (NA values are preserved).
  a <- pmin(pmax(a, 0), 1)

  central_angle <- 2 * atan2(sqrt(a), sqrt(1 - a))
  earth_radius_km * central_angle # Distance in km
}

# Attach both sets of coordinates (positional column bind), then compute the
# geographic distance (km) between the two populations of each pair
fst_byPop <- bind_cols(fst_byPop, pop1_coords, pop2_coords)
fst_byPop <- mutate(
  fst_byPop,
  pop_dist = haversine(
    long1 = pop1_long, lat1 = pop1_lat,
    long2 = pop2_long, lat2 = pop2_lat
  )
)

In [None]:
# Isolation by distance: pairwise Fst against pairwise geographic distance,
# with a linear-regression trend line.
# The y-axis window and breaks are hard-coded; points outside 0.02-0.047 are
# not shown but, because `coord_cartesian()` only zooms, they still contribute
# to the fitted line.
ibd_plot <- fst_byPop %>% 
    ggplot(aes(x = pop_dist, y = fst)) +
        geom_point(size = 3) +
        geom_smooth(method = "lm", color = "black") +
        # Axis title typo fixed ("Pariwise" -> "Pairwise")
        xlab("Pairwise geographic distance (Km)") +
        ylab("Pairwise Fst") +
        coord_cartesian(ylim = c(0.02, 0.047)) +
        scale_y_continuous(breaks=seq(0.02, 0.045, 0.005)) +
        theme_classic() +
        transp_theme +
        theme(axis.text = element_text(size = 18),
              axis.title = element_text(size = 20))

# Save as an 8 x 8 inch PDF
ggsave(filename = snakemake@output[["ibd_plot"]], plot = ibd_plot, device = 'pdf', 
       width = 8, height = 8, units = 'in', dpi = 600)

In [None]:
# Linear regression of pairwise Fst on pairwise geographic distance
summary(
  lm(formula = fst ~ pop_dist, data = fst_byPop)
)