In [2]:
library(tidyverse)
library(wesanderson)

# Looking at coverage across samples

- Will take a quick look at coverage and mapping % across samples

In [3]:
# Read the tab-delimited MultiQC/Qualimap BamQC summary table and derive a short
# sample ID (the trailing 's_<digits>_<digits>' of `Sample`), placed as first column
stats_file <- '../results/qc/multiqc/multiqc_data/multiqc_qualimap_bamqc_genome_results_qualimap_bamqc.txt'
stats <- read_delim(stats_file, delim = '\t') %>% 
    mutate(sample = str_extract(Sample, pattern = 's_\\d+_\\d+$')) %>% 
    dplyr::select(-Sample) %>% 
    dplyr::relocate(sample)

## Plot and stats

### All samples

In [4]:
# Average of the per-sample mean coverage over every row of `stats`
mean_coverage_allSamples <- stats %>% 
    pull(mean_coverage) %>% 
    mean()
mean_coverage_allSamples

In [5]:
# Scatter plot of percentage aligned (y) against mean coverage (x), one point per sample
ggplot(stats, aes(mean_coverage, percentage_aligned)) +
    geom_point(size = 2) +
    labs(x = 'Mean coverage', y = 'Percentage aligned') +
    theme_classic()

In [6]:
# Histogram (100 bins) of the general error rate across samples
# 3 samples with high error rates likely Medicago
error_rate_plot <- stats %>% 
    ggplot(aes(x = general_error_rate)) +
    geom_histogram(bins = 100, fill = 'white', color = 'black') +
    labs(x = 'General error rate', y = 'Count') +
    theme_classic()
error_rate_plot

In [7]:
# Write the error-rate histogram to the first Snakemake output as an 8 x 8 in PDF
outpath <- snakemake@output[[1]]
ggsave(
    filename = outpath,
    plot = error_rate_plot,
    device = 'pdf',
    width = 8,
    height = 8,
    units = 'in',
    dpi = 600
)

### High quality samples only

- Remove 3 samples that we identified as _Medicago_
- Remove another two samples that had bad coverage, low mapping %, and weird GC profiles
- Remove one of 2 samples from each of 5 pairs of highly related individuals (likely clones)
    - Based on analysis with `ngsRelate`

In [8]:
# Label every sample as 'bad' (low quality), 'related', or 'good' (everything else).
# A sample listed in both vectors would be labelled 'bad', since that test runs first.
# NOTE(review): `related` holds 6 IDs while the notes above mention 5 pairs -- confirm
lowQual <- c('s_7_14', 's_54_9', 's_97_4', 's_23_12', 's_23_17')
related <- c('s_97_9', 's_37_18', 's_37_11', 's_42_3', 's_42_1', 's_83_11')
stas_mod <- stats %>% 
    mutate(category = if_else(sample %in% lowQual, 'bad',
                              if_else(sample %in% related, 'related', 'good')))

In [9]:
# Percentage aligned against mean coverage, with points coloured by sample category
cols <- wes_palette("Darjeeling1", n = 3)
stas_mod %>% 
    ggplot(aes(x = mean_coverage, y = percentage_aligned, colour = category)) +
    geom_point(size = 2) +
    scale_color_manual(values = cols) +
    labs(x = 'Mean coverage', y = 'Percentage aligned') +
    theme_classic()

In [10]:
# Save the category-coloured coverage vs. mapping % plot to the second Snakemake
# output as an 8 x 8 in PDF.
# Fix: this call previously re-saved `error_rate_plot`, so output 2 was a
# duplicate of output 1. The coverage plot is not bound to a name, so it is
# fetched with last_plot(), which returns the most recently created/modified
# ggplot -- keep this cell directly after the cell that builds that plot.
outpath <- snakemake@output[[2]]
ggsave(filename = outpath, plot = last_plot(), device = 'pdf', 
       width = 8, height = 8, units = 'in', dpi = 600)

In [11]:
# Mean of the per-sample mean coverage within each sample category
by_category <- group_by(stas_mod, category)
summarise(by_category, mean_cov = mean(mean_coverage))

# Samples for GLUE

We want to include 10 urban and 10 rural samples from Toronto in the first GLUE paper. 

Here I'll choose these 20 samples and export a file with their sample IDs for use in the GLUE Snakemake pipeline.

In [13]:
# IDs of the 20 samples selected for GLUE
for_glue <- c('s_37_15', 's_37_8', 's_40_10', 's_40_3', 's_41_7', 's_41_8', 's_42_9', 's_42_10', 's_43_13', 's_43_8',
             's_83_9', 's_83_5', 's_83_18', 's_83_14', 's_82_19', 's_81_3', 's_83_13', 's_79_17', 's_78_4', 's_77_18')
# Keep only the selected samples, retain ID / category / coverage, and split the
# ID on '_' into prefix, pop and plant columns (the original `sample` is kept)
samples_for_glue <- stas_mod %>% 
    filter(sample %in% for_glue) %>% 
    dplyr::select(sample, category, mean_coverage) %>% 
    separate(sample, into = c('prefix', 'pop', 'plant'), sep = '_', remove = FALSE)
samples_for_glue

In [14]:
# Export the selected samples to the third Snakemake output as a
# tab-delimited file with a header row
outpath <- snakemake@output[[3]]
write_delim(samples_for_glue, outpath, delim = '\t', col_names = TRUE)

In [15]:
# Mean coverage across the samples selected for GLUE
samples_for_glue %>% 
    pull(mean_coverage) %>% 
    mean()

In [16]:
# Minimum and maximum mean coverage among the samples selected for GLUE
samples_for_glue %>% 
    pull(mean_coverage) %>% 
    range()