In [None]:
library(data.table)
library(dplyr)
library(lubridate)
library(ggplot2)
library(dtplyr)
library(tidyr)

In [None]:
# Clean up contacts data: parse the PDB cluster assignment file.
# Each line of the file is one cluster; its space-separated tokens are the
# members of that cluster. The cluster id is the 1-based line number.
# NOTE(review): pdb_cluster_assign is not used anywhere in the visible code;
# it is kept only so that any later consumer still finds it -- confirm and drop.
pdb_cluster_assign <- fread(snakemake@input$pdb_cluster_assignments)
cluster_lines <- readLines(snakemake@input$pdb_cluster_assignments)
pdb_cluster_df <- lapply(seq_along(cluster_lines), function(i) {
    members <- strsplit(cluster_lines[[i]], split = " ")[[1]]
    # Build both columns at once so that a blank line yields a valid 0-row
    # frame instead of failing on a length-1 column assignment.
    data.frame(
        PDB = members,
        Cluster = rep(as.numeric(i), length(members))
    )
})
pdb_cluster_df <- bind_rows(pdb_cluster_df)
head(pdb_cluster_df)

In [None]:
# Load the position-by-PDB contact table and turn it into a long table with
# one row per (Position, PDB) pair that is in contact.
# "?" marks a missing value in the input; reading it as NA keeps the distance
# columns numeric, so the cutoff below is a numeric comparison rather than a
# lexicographic string comparison (where e.g. "12.3" <= "4.5" would be TRUE).
pdb_contacts <- fread(
    snakemake@input$antibody_contacts_data_pdbwise,
    na.strings = c("NA", "?")
)
# Position is the 1-based row index of the input table.
pdb_contacts$Position <- seq_len(nrow(pdb_contacts))
pdb_contacts <- pdb_contacts %>%
    # Force a common numeric type on the contact columns before stacking them.
    mutate(across(contains("AHL"), as.numeric)) %>%
    pivot_longer(
        cols = contains("AHL"),
        names_to = "PDB",
        values_to = "distance"
    ) %>%
    # A pair counts as a contact when a value is present and is <= 4.5.
    filter(!is.na(distance), distance <= 4.5) %>%
    mutate(PDB = sub("_AHL", "", PDB)) %>%
    select(-distance)
head(pdb_contacts)

In [None]:
# Load the per-sequence table. Later cells read its columns LT,
# lineage_pangolin, month_strainLT, month, N_mutations and seqname.
df <- fread(snakemake@input$df_file)

In [None]:
# Keep only rows flagged LT == 1 whose pangolin lineage equals the value in
# month_strainLT for that row.
df <- filter(df, LT == 1, lineage_pangolin == month_strainLT)

In [None]:
# Select the sequences to analyse for every month:
#   * one "control": the first sequence whose N_mutations equals the monthly
#     median of N_mutations;
#   * up to three "increased": the sequences with the highest N_mutations,
#     excluding the controls.
# The selection is written to snakemake@output$additional_data.
median_per_month <- df %>%
    group_by(month) %>%
    summarise(N_mutations_month_median = median(N_mutations)) %>%
    ungroup()
dfsel <- df %>%
    left_join(median_per_month, by = "month")

# NOTE(review): a month whose median is not attained by any sequence (e.g. a
# half-integer median of an even-sized group) gets no control here.
dfsel_median_controls <- dfsel %>%
    filter(N_mutations == N_mutations_month_median) %>%
    group_by(month) %>%
    slice_head(n = 1) %>%
    ungroup()

controls_seqnames <- unlist(dfsel_median_controls$seqname)
dfsel_tops <- dfsel %>%
    filter(!seqname %in% controls_seqnames) %>%
    group_by(month) %>%
    slice_max(order_by = N_mutations, n = 3, with_ties = FALSE) %>%
    ungroup()

# Label both subsets. The label must go on the control data frame, not on the
# vector of control names, otherwise control rows end up with a missing class.
dfsel_tops$contacts_class <- "increased"
dfsel_median_controls$contacts_class <- "control"

chosen_sequences_general_info <- bind_rows(dfsel_tops, dfsel_median_controls)
fwrite(x = chosen_sequences_general_info, file = snakemake@output$additional_data)
length(unique(chosen_sequences_general_info$seqname))

In [None]:
# Load the haplotype table; later cells read its seqname, pos, wt, sub and
# mType columns.
haplotypes_data <- fread(
    snakemake@input$haplotype,
    showProgress = TRUE
)

In [None]:
# Preparation for extracting the LT sequences only: wrap the haplotype table
# with dtplyr's lazy_dt() before the dplyr verbs in the following cells are
# applied to it.
haplotypes_data <- lazy_dt(haplotypes_data)

In [None]:
# Lookup table of the sequence names kept in df, each tagged with LT = 1,
# wrapped as a lazy_dt so it can be joined against the haplotype table.
haplotypes_data_lt <- df %>%
    select(seqname) %>%
    mutate(LT = 1) %>%
    lazy_dt()
haplotypes_data_lt

In [None]:
# Restrict the haplotype table to the sequences present in the lookup table:
# after the left join, rows without a match have LT = NA and are removed by
# the filter. The helper LT column is then dropped and the lazy pipeline is
# materialised as a plain data.frame.
haplotypes_data_lt <- haplotypes_data %>%
    left_join(haplotypes_data_lt, by = "seqname") %>%
    filter(LT == 1) %>%
    select(-LT) %>%
    as.data.frame()

In [None]:
# Haplotype rows of the chosen sequences only, ordered by sequence name and
# then by position.
haplotypes_data_lt_chosen <- as.data.frame(
    arrange(
        filter(
            haplotypes_data_lt,
            seqname %in% chosen_sequences_general_info$seqname
        ),
        seqname,
        pos
    )
)

In [None]:
# Number of distinct chosen sequences that have haplotype rows.
n_distinct(haplotypes_data_lt_chosen$seqname)

In [None]:
# Choose PDBs per sequence: for every sequence, count how many of its
# haplotype rows fall on a contact position of each PDB (N) and keep the PDBs
# with the highest counts, at most
# snakemake@params$max_sructures_to_test_per_month4antib per sequence.
# An inner join is used so that haplotype rows at positions without any
# contact are dropped; with a left join they would form a PDB = NA group that
# could be selected as if it were a structure.
pdb_choice <- haplotypes_data_lt_chosen %>%
    inner_join(
        pdb_contacts,
        by = c("pos" = "Position"),
        relationship = "many-to-many"
    ) %>%
    group_by(PDB, seqname) %>%
    summarise(N = n(), .groups = "drop") %>%
    group_by(seqname) %>%
    slice_max(
        order_by = N,
        n = snakemake@params$max_sructures_to_test_per_month4antib,
        with_ties = FALSE
    ) %>%
    ungroup()

In [None]:
# Make all sequences of one month be analysed against the same set of PDBs:
# collect the union of PDBs chosen for any sequence of a month, then assign
# that whole set to every chosen sequence of the month.
month_lineage <- select(chosen_sequences_general_info, month, seqname)

# One row per month holding the list of distinct PDBs chosen in that month.
month_pdbs <- pdb_choice %>%
    left_join(month_lineage, by = "seqname") %>%
    group_by(month) %>%
    summarise(PDB = list(unique(PDB)), .groups = "drop")

# Expand back to one row per (month, seqname, PDB) combination.
month_pdbs_lineage <- unnest(
    left_join(month_pdbs, month_lineage, by = "month"),
    cols = PDB
)
head(month_pdbs_lineage)

In [None]:
# Format haplotypes: one comma-separated mutation string per sequence.
# Each mutation is written as <wt><pos><sub>; rows with mType == "INS" are
# left out. Example of the target file layout:
# Mutations,MutationsID,Template,PDB,Notes
# "F306L,E484K,S494P,D614G,E780A,D839V,T1027I;E156-,F157-",Omicron1,YP_009724390.1 ,7LQV,
# "F306L,E484K,S494P,D614G,E780A,D839V,T1027I;E156-,F157-",Omicron2,YP_009724390.1 ,7LQV,
# NOTE(review): the example above separates a second group of mutations with
# ";", while this code joins everything with "," -- confirm which is expected.
haplotypes_4models <- haplotypes_data_lt_chosen %>%
    filter(mType != "INS") %>%
    # paste0() is vectorised, so no row-wise evaluation is needed.
    mutate(MUT = paste0(wt, pos, sub)) %>%
    group_by(seqname) %>%
    summarise(haplotype = paste0(MUT, collapse = ","))
head(haplotypes_4models)
    

In [None]:
# Assemble the modelling input table: one row per (sequence, PDB) pair with
# the sequence's mutation string, a fixed template accession and a fixed
# Notes value. Columns are written in the order
# MutationsID, Template, PDB, Mutations, Notes.
dat4mod <- month_pdbs_lineage %>%
    left_join(haplotypes_4models, by = "seqname") %>%
    mutate(Template = "YP_009724390.1", Notes = "AAAAA") %>%
    # Rename and order in one step; columns not listed (e.g. month) are dropped.
    select(
        MutationsID = seqname,
        Template,
        PDB,
        Mutations = haplotype,
        Notes
    )
fwrite(x = dat4mod, file = snakemake@output$chosen_data)
nrow(dat4mod)