In [None]:
library(data.table)
library(dtplyr)
library(dplyr, warn.conflicts = FALSE)
library(lubridate)

In [None]:
# Read the pangolin table from the Snakemake `pangolin` input (thread count
# taken from snakemake@threads) and wrap it as a lazy dtplyr table.
pangolin <- lazy_dt(
  fread(
    snakemake@input$pangolin,
    showProgress = TRUE,
    nThread = snakemake@threads,
    verbose = TRUE
  )
)

In [None]:
# Read the nextclade table from the Snakemake `nextclade` input (thread count
# taken from snakemake@threads) and wrap it as a lazy dtplyr table.
nextclade <- lazy_dt(
  fread(
    snakemake@input$nextclade,
    showProgress = TRUE,
    nThread = snakemake@threads,
    verbose = TRUE
  )
)

In [None]:
# Read the metadata table from the Snakemake `meta` input (thread count
# taken from snakemake@threads) and wrap it as a lazy dtplyr table.
meta <- lazy_dt(
  fread(
    snakemake@input$meta,
    showProgress = TRUE,
    nThread = snakemake@threads,
    verbose = TRUE
  )
)

In [None]:
# Combine the three inputs into one table keyed on the pangolin `taxon`
# column: first attach metadata (matched on `strain`), then nextclade
# results (matched on `seqName`). Both joins are left joins, so every
# pangolin row is kept. The result is materialised as a data.table and
# wrapped again as a lazy table for the steps that follow.
df <- lazy_dt(
  as.data.table(
    left_join(
      left_join(pangolin, meta, by = c("taxon" = "strain")),
      nextclade,
      by = c("taxon" = "seqName")
    )
  )
)

In [None]:
# Lineage to focus on, taken from the Snakemake rule parameter `id`.
lineage4focus <- snakemake@params$id

In [None]:
# Restrict the combined table to usable records of the focus lineage:
#   * lineage equals `lineage4focus`;
#   * nextclade QC status is not "bad" and at most 1000 missing sites;
#   * the date string is exactly 10 characters long and contains neither
#     "/" nor "X" (presumably this keeps only complete YYYY-MM-DD dates;
#     confirm against the metadata format).
# The surviving date strings are then converted with lubridate's date(),
# and a `month` column holding the date floored to its month is added.
df_focus <- df %>%
  lazy_dt() %>%
  mutate(date = as.character(date)) %>%
  filter(lineage == lineage4focus) %>%
  filter(qc.overallStatus != "bad" & totalMissing <= 1000) %>%
  filter(
    nchar(date) == 10,
    !grepl("/", date, fixed = TRUE),
    !grepl("X", date, fixed = TRUE)
  ) %>%
  mutate(date = date(date)) %>%
  mutate(month = lubridate::floor_date(date, "month")) %>%
  as.data.table()

In [None]:
# Count the focus-lineage records per month; the count is stored in column N.
df_focus_per_month <- as.data.table(
  summarise(
    group_by(lazy_dt(df_focus), month),
    N = n()
  )
)

In [None]:
# Pick the window of months to keep: the row of `df_focus_per_month` with the
# highest count N (the first such row on ties, per which.max) plus up to
# `n_to_take - 1` rows immediately before it.
# NOTE(review): this assumes the rows of df_focus_per_month are ordered by
# month, so that "rows before the peak" are the preceding months; confirm.
n_to_take <- 3

max_idx <- which.max(df_focus_per_month$N)
if (length(max_idx) == 0L) {
  # which.max() returns integer(0) for an empty table; fail with a clear
  # message instead of an obscure error from seq().
  stop(
    "No records remain after filtering; cannot select months to take.",
    call. = FALSE
  )
}

# Clamp the window start to the first row. The previous
# ifelse(min_idx < 1, min_idx, min_idx) returned min_idx unchanged, so a peak
# in the first row produced seq(-1, 1), which mixes negative and positive
# subscripts and errors.
min_idx <- max(1L, max_idx - n_to_take + 1)
months_to_take <- df_focus_per_month$month[seq(min_idx, max_idx)]

In [None]:
# Keep only the records whose month is in `months_to_take`, then write the
# resulting table to the Snakemake `data` output.
df_focus <- as.data.table(
  filter(lazy_dt(df_focus), month %in% months_to_take)
)
fwrite(df_focus, file = snakemake@output$data)

In [None]:
# Write the `taxon` column of the selected records, without a header line,
# to the Snakemake `ids` output.
ids <- as.data.table(select(df_focus, taxon))
fwrite(ids, file = snakemake@output$ids, col.names = FALSE)