In [None]:
library(dplyr)
library(data.table)
library(tidyr)
library(ggplot2)
library(patchwork)
library(ggrepel)
library(fitdistrplus)
library(truncnorm)

In [None]:
# Load the per-lineage conservation tables for the two scopes ("all", "lt").
# Every input path is parsed with fread(); both lists are then named with
# snakemake@params$lineages (this assumes the input files are listed in the
# same order as the lineages -- TODO confirm against the Snakefile).
dfall <- lapply(snakemake@input$all, fread)
dflt <- lapply(snakemake@input$lt, fread)
names(dfall) <- snakemake@params$lineages
names(dflt) <- snakemake@params$lineages

# Nested lookup used downstream as dfdata[[scope]][[lineage]].
dfdata <- list()
dfdata[["lt"]] <- dflt
dfdata[["all"]] <- dfall

In [None]:
# Read the per-lineage abundance/date table and derive two views of it:
#   dfd_date    - lineage + the remaining columns as read from the file
#   dfd_numeric - lineage + the same columns coerced with as.numeric()
dfd <- fread(snakemake@input$abundance_dates_per_lineage)

dfd_lineage <- dplyr::select(dfd, lineage)

# Drops lineage and Counts; the trailing positive `month_peak` keeps a column
# that the negative selection already retains.
# NOTE(review): possibly meant to be `-month_peak` -- confirm intent.
dfd_date <- dplyr::select(dfd, -lineage, -Counts, month_peak)
dfd_numeric <- mutate(dfd_date, across(everything(), as.numeric))

dfd_numeric <- bind_cols(dfd_lineage, dfd_numeric)
dfd_date <- bind_cols(dfd_lineage, dfd_date)

In [None]:
#get contacts
# The contacts file is streamed through sed, which rewrites every "?" to "NA"
# before fread() parses it. Both the sed script and the file path are
# shell-quoted: an unquoted "?" is a glob character and an unquoted path
# breaks on spaces or other shell metacharacters.
contacts_cmd <- paste(
    "sed",
    shQuote("s/?/NA/g"),
    shQuote(snakemake@input$contacts)
)
dfcnt <- fread(cmd = contacts_cmd)

# Columns are renamed Cluster_1..Cluster_k and rows are numbered 1..n as
# Position (seq_len() stays correct for an empty table, unlike 1:n).
colnames(dfcnt) <- paste("Cluster", seq_len(ncol(dfcnt)), sep = "_")
dfcnt$Position <- seq_len(nrow(dfcnt))

# Per position, the minimum non-missing Contact value over all clusters.
# Positions with no non-missing value get no row here and therefore an NA
# Cluster_all after the left join below.
dfcnt_tmp <- dfcnt %>%
    pivot_longer(
        cols = starts_with("Cluster"),
        names_to = "Cluster",
        values_to = "Contact"
    ) %>%
    filter(!is.na(Contact)) %>%
    group_by(Position) %>%
    summarise(Cluster_all = min(Contact, na.rm = TRUE))

# Final long table: one row per (Position, Cluster) where Cluster is one of
# Cluster_1..Cluster_k or the pooled Cluster_all.
dfcnt <- dfcnt %>%
    left_join(dfcnt_tmp, by = "Position") %>%
    dplyr::select(Position, starts_with("Cluster")) %>%
    pivot_longer(
        cols = starts_with("Cluster"),
        names_to = "Cluster",
        values_to = "Contact"
    )

In [None]:
#collect positions mostly enriched -
# For every scope x lineage x structural cluster:
#   1. flag positions whose Contact value exceeds contact_limit (Contact = 1),
#   2. test with a one-sided Wilcoxon rank-sum test whether Entropy in the
#      unflagged group (0) is lower than in the flagged group (1),
#   3. when p <= 0.05, fit a zero-mean normal truncated at 0 to the Entropy of
#      the flagged positions and keep the positions above its norm_limit
#      quantile, together with a Z-score relative to the flagged group.
# NOTE(review): positions are flagged when Contact > contact_limit; if Contact
# is a distance this selects the far positions -- confirm the intended
# direction.
# NOTE(review): options(warn = -1) silences warnings for the rest of the
# session, not only for this cell.
options(warn=-1)
contact_limit = 4.5
norm_limit = 0.95
rez = list()
ct = 0
lineages = snakemake@params$lineages
lineageslt = c("AY.4.5", "Q.1", "B.1.1.7", "B.1.177.60", "BA.2", "BA.2.9")

#lineages = unlist(strsplit("AY.4.5",split='\t')[[1]])

enriched_positions = list()
scope=c('all','lt')
ct_enriched_positions = 0

# The flagged positions depend only on the cluster, so they are computed once
# here instead of once per scope x lineage inside the loops.
cluster_ids <- unique(dfcnt$Cluster)
contacts_by_cluster <- lapply(cluster_ids, function(id) {
    dfcnt %>%
        dplyr::filter(Contact > contact_limit & Cluster == id) %>%
        mutate(Contact = 1) %>%
        mutate(Cluster = NULL)
})
names(contacts_by_cluster) <- cluster_ids

for (sco in scope) {
    #lineage = "AY.4.5"
    for (lineage in lineages) {
        dfe <- dfdata[[sco]][[lineage]]
        for (cluster_id in cluster_ids) {
            ct = ct + 1
            # print(paste("Working on",cluster_id,contact_limit ))
            contacts_df <- contacts_by_cluster[[cluster_id]]

            dftest <- dfe %>%
                left_join(contacts_df, by = "Position") %>%
                mutate(Variability = 1 - Conservation)
            # Positions absent from contacts_df are the unflagged group.
            dftest$Contact[is.na(dftest$Contact)] <- 0
            dftest$Contact <- as.factor(dftest$Contact)

            # Mean Entropy per group; only used for interactive inspection.
            df4print <- dftest %>%
                group_by(Contact) %>%
                summarise(M = mean(Entropy)) %>%
                ungroup() %>%
                pivot_wider(names_from = Contact, values_from = M)

            # wilcox.test() needs exactly two groups; skip this combination
            # instead of aborting the whole run when one group is empty.
            if (nlevels(dftest$Contact) < 2) {
                next
            }
            wct = wilcox.test(Entropy ~ Contact, dftest, alternative = "less")

            # Explicit row filter: does not rely on data.table's `[` treating
            # a logical vector as a row selector.
            data_on_contacting <- dftest %>%
                dplyr::filter(Contact == 1)

            # isTRUE() treats an NA/NaN p-value as "not significant".
            if (isTRUE(wct$p.value <= 0.05)) {
                ct_enriched_positions = ct_enriched_positions + 1
                testData = data_on_contacting$Entropy
                # Deliberate best-effort: if the fit fails the limit becomes
                # max(testData), so no position can exceed it.
                limitx = tryCatch(
                    {
                        rez_fit_runcnorm = fitdist(
                            testData, "truncnorm",
                            start = list(sd = sd(testData)),
                            fix.arg = list(a = 0, mean = 0)
                        )
                        qtruncnorm(
                            norm_limit, a = 0, b = Inf, mean = 0,
                            sd = rez_fit_runcnorm$estimate["sd"]
                        )
                    },
                    error = function(err) {
                        max(testData)
                    }
                )
                meanE = mean(data_on_contacting$Entropy)
                sdE = sd(data_on_contacting$Entropy)

                pass <- data_on_contacting %>%
                    filter(Entropy > limitx)
                pass$Scope <- sco
                pass$Cluster <- cluster_id
                pass$lineage <- lineage
                pass <- pass %>%
                    mutate(Z_score = (Entropy - meanE) / sdE)
                # Dense append keeps the list free of NULL placeholders.
                enriched_positions[[ct_enriched_positions]] = pass
            }
        }
    }
}
# ggplot(data_on_contacting) + geom_histogram(aes(x=Entropy))
# print(df4print)
# outtest = rosnerTest(data_on_contacting$Entropy, k=30)
# outtest
enriched_positions <- bind_rows(enriched_positions)
head(enriched_positions)


# LT vs ALL

In [None]:
# Dot plot of enriched positions for the lineages listed in lineageslt, one
# facet row per lineage, one y level per scope. Point size encodes in how many
# structural clusters the position was reported.
options(repr.plot.width=9, repr.plot.height=4, jupyter.plot_mimetypes = "image/svg+xml")
enriched_positions_2 <- enriched_positions %>%
    filter(lineage %in% lineageslt) %>%
    group_by(Position, Scope, lineage) %>%
    # "drop_last" is summarise()'s default grouping; stating it only silences
    # the informational message.
    summarise(N_cls = length(unique(Cluster)), .groups = "drop_last") #%>%
    #filter(N_cls > 1) #%>%
enriched_positions_lt <- ggplot(enriched_positions_2) +
    geom_point(aes(x = Position, size = N_cls, color = Scope, y = Scope)) +
    # The x aesthetic is Position; the axis was previously mislabelled "Scope".
    xlab("Sequence position") +
    facet_grid(rows = vars(lineage)) +
    theme(text = element_text(size = 12), strip.text.y = element_text(size = 6)) +
    scale_size(range = c(1, 2)) +
    labs(size = "Number of cluster")
enriched_positions_lt


In [None]:
# One row per (Position, Scope) with the number of distinct structural
# clusters that reported it, across all lineages.
enriched_positions_4ven <- enriched_positions %>%
    #filter(Scope == "all", Cluster=="Cluster_all") %>%
    group_by(Position, Scope) %>%
    summarise(N_cls = length(unique(Cluster)), .groups = "drop")

# Hierarchical clustering of the positions (hclust defaults) on their
# Euclidean distance, cut at height 10; the resulting group id is stored
# per row in `cluster`.
d <- dist(enriched_positions_4ven$Position, method = "euclidean")
clusts <- cutree(hclust(d), h = 10)
enriched_positions_4ven$cluster <- clusts

In [None]:
library(ggvenn)
options(repr.plot.width=10, repr.plot.height=5, jupyter.plot_mimetypes = "image/svg+xml")

# Positional-cluster ids seen in each scope; the Venn diagram compares these
# id sets rather than individual positions.
a <- list(
    `all` = enriched_positions_4ven$cluster[enriched_positions_4ven$Scope == "all"],
    `lt` = enriched_positions_4ven$cluster[enriched_positions_4ven$Scope == "lt"]
)

# draw two-set venn
vent <- ggvenn(
    a,
    c("all", "lt"),
    set_name_size = 4,
    text_size = 4,
    auto_scale = TRUE,
    fill_alpha = 0.2,
    stroke_alpha = 0.3,
    stroke_size = 0,
    fill_color = c("#00B6EB", "#F8766D")
)

# Side-by-side figure (dot plot | Venn), tagged a, b and written to image1.
lt_vs_all <- (enriched_positions_lt | vent) +
    plot_annotation(tag_levels = 'a') +
    plot_layout(widths = c(6, 4))
ggsave(snakemake@output$image1, plot = lt_vs_all, width = 10, height = 5)
lt_vs_all

# Overall trends

In [None]:
# Lineage -> peak month table, sorted by month_peak. month_peak_numeric is
# as.numeric(month_peak) and is used later for the rank correlations.
lineage_time_ab <- fread(snakemake@input$abundance_dates_per_lineage)
lineage_time_ab <- lineage_time_ab %>%
    dplyr::select(lineage, month_peak) %>%
    mutate(month_peak_numeric = as.numeric(month_peak)) %>%
    arrange(month_peak)
head(lineage_time_ab)

In [None]:
options(repr.plot.width=6, repr.plot.height=9, jupyter.plot_mimetypes = "image/svg+xml")

# Enriched positions of the "all" scope: one row per (Position, lineage) with
# the number of distinct structural clusters reporting it. lineageF orders
# lineages as in lineage_time_ab (sorted by month_peak).
enriched_positions_3 <- enriched_positions %>%
    filter(Scope == "all") %>%
    group_by(Position, Scope, lineage) %>%
    summarise(N_cls = length(unique(Cluster)), .groups = "drop_last") %>%
    mutate(lineageF = factor(lineage, levels = lineage_time_ab$lineage))

# add cluster
# Positional clusters (Pcl): hierarchical clustering of the positions cut at
# height 200.
d <- dist(enriched_positions_3$Position, method = "euclidean")
clusts <- cutree(hclust(d), h = 200)
enriched_positions_3$Pcl <- clusts

    #filter(N_cls > 1) #%>%
# One facet strip per lineage; colour = positional cluster, size = number of
# structural clusters. The y axis carries no information beyond the facet, so
# its text and ticks are removed.
enriched_positions_all <- ggplot(enriched_positions_3) +
    geom_point(
        aes(x = Position, size = N_cls, y = Scope, color = as.factor(Pcl))
    ) +
    ylab("Lineages") +
    xlab("Sequence position") +
    facet_wrap(vars(lineageF), strip.position = "left", ncol = 1) +
    theme(
        text = element_text(size = 12),
        strip.text.y = element_text(size = 6),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        panel.spacing = unit(0.1, "lines")
    ) +
    scale_size(range = c(1, 2)) +
    labs(size = "Number of structural \ncluster", color = "Positional cluster")
enriched_positions_all

In [None]:
# Number of rows of enriched_positions_3 per (positional cluster, lineage).
# The incoming table is still grouped, so it is ungrouped first; count() would
# otherwise add Pcl/lineage to the existing groups.
enriched_positions_34tr <- enriched_positions_3 %>%
    ungroup() %>%
    count(Pcl, lineage, name = "N")
#%>%
    #left_join(lineage_time_ab, by = "lineage")
#ggplot(enriched_positions_34tr) + geom_point(aes(x=month_peak, y = N, color = as.factor(Pcl))) + facet_grid(rows = vars(Pcl))

In [None]:
# Complete (lineage x positional cluster) grid so that combinations without
# any enriched position appear explicitly with N = 0.
Pcls4t <- unique(enriched_positions_34tr$Pcl)
lineages4t <- unique(enriched_positions_34tr$lineage)

# expand.grid() varies its first argument fastest, which reproduces the row
# order of a lineage-outer / Pcl-inner nested loop without growing lists or
# using a loop variable named `c` that shadows base::c.
df4t <- expand.grid(
    Pcl = Pcls4t,
    lineage = lineages4t,
    stringsAsFactors = FALSE,
    KEEP.OUT.ATTRS = FALSE
) %>%
    dplyr::select(lineage, Pcl) %>%
    left_join(enriched_positions_34tr, by = c("lineage", "Pcl"))

# Combinations missing from enriched_positions_34tr have no count: set to 0.
df4t$N[is.na(df4t$N)] <- 0

# Attach month_peak / month_peak_numeric for each lineage.
df4t <- df4t %>%
    left_join(lineage_time_ab, by = "lineage")

df4t_plot <- ggplot(df4t) +
    geom_point(aes(x = month_peak, y = N, color = as.factor(Pcl))) +
    facet_grid(rows = vars(Pcl))
df4t_plot

In [None]:
# Wide table: one row per lineage, one Pcl_<id> column per positional cluster
# holding N, plus month_peak_numeric.
df4t4cor <- df4t %>%
    dplyr::select(-month_peak) %>%
    mutate(Pcl = as.character(Pcl)) %>%
    pivot_wider(names_from = Pcl, values_from = N, names_prefix = "Pcl_")
df4t4cor_set1 <- unlist(df4t4cor$month_peak_numeric)
df4t4cor_set2 <- dplyr::select(df4t4cor, starts_with("Pcl_"))

# Spearman rank correlation of each Pcl_<id> column against
# month_peak_numeric; exact = FALSE requests the asymptotic p-value.
pcl_columns <- names(df4t4cor_set2)
p_values <- numeric(length(pcl_columns))
rho_values <- numeric(length(pcl_columns))
for (idx in seq_along(pcl_columns)) {
    column_name <- pcl_columns[[idx]]
    print(column_name)
    spearman <- cor.test(
        df4t4cor_set2[[column_name]],
        df4t4cor_set1,
        method = "spearman",
        exact = FALSE
    )
    p_values[idx] <- spearman$p.value
    rho_values[idx] <- spearman$estimate
}

# Summary per positional cluster: p rounded to 4 digits, r to 2 digits.
df4t4cor2 <- data.frame(
    Pcl = as.integer(sub("Pcl_", "", pcl_columns)),
    "p" = round(p_values, digits = 4),
    "r" = round(rho_values, digits = 2)
)
df4t4cor2

In [None]:
options(repr.plot.width=6, repr.plot.height=8, jupyter.plot_mimetypes = "image/svg+xml")

# N against month_peak, one facet row per positional cluster, with a dashed
# linear-model trend line and the Spearman r / p from df4t4cor2 printed at the
# top centre of each facet.
# NOTE(review): N is built upstream as a row count per (Pcl, lineage); confirm
# that the y-axis label matches what is being counted.
cor_labels <- geom_text(
    data = df4t4cor2,
    aes(label = paste0("r = ", r, " p = ", p)),
    y = max(df4t$N),
    x = mean(df4t$month_peak),
    vjust = 1
)

df4t_plot <- ggplot(df4t) +
    geom_point(aes(x = month_peak, y = N, color = as.factor(Pcl))) +
    facet_grid(rows = vars(Pcl)) +
    geom_smooth(
        method = "lm",
        aes(x = month_peak, y = N),
        alpha = 0.3,
        linetype = "dashed",
        linewidth = 0.4
    ) +
    cor_labels +
    theme(
        text = element_text(size = 12),
        strip.text.y = element_text(size = 12),
        legend.position = "none"
    ) +
    xlab("Month") +
    ylab("Number of structural clusters")

df4t_plot

In [None]:
options(repr.plot.width=12, repr.plot.height=9, jupyter.plot_mimetypes = "image/svg+xml")

# Side-by-side figure (per-lineage dot plot | trend plot) written to image2.
# The parentheses are required: `+` binds tighter than `|`, so without them
# plot_annotation() is attached to df4t_plot alone instead of the composed
# figure, and the panel tags are not applied at the top level.
global_enrich <- (enriched_positions_all | df4t_plot) +
    plot_annotation(tag_levels = 'a')
ggsave(snakemake@output$image2, plot = global_enrich, width = 12, height = 9)
global_enrich

# Additional table on cluster enrichment

In [None]:
#df4t4cor2
#intervals of intervals
# Sequence range "[min-max]" covered by each positional cluster.
pcl_ranges <- enriched_positions_3 %>%
    group_by(Pcl) %>%
    summarise(
        Interval = paste0("[", min(Position), "-", max(Position), "]")
    )

# Correlation summary per positional cluster joined with its range, with
# columns renamed for the output table, then written to data1.
out1 <- df4t4cor2 %>%
    left_join(pcl_ranges, by = "Pcl") %>%
    dplyr::select(
        `Positional cluster` = Pcl,
        r,
        p,
        `Sequence range` = Interval
    )
fwrite(x = out1, file = snakemake@output$data1)

In [None]:
#Summarizing output

In [None]:
# "lt"-scope positions reported by more than one structural cluster. Per
# (Position, Reference, lineage): the cluster names joined with "|", the
# number of reporting rows and the mean Z-score. Written to data2.
lt_positions <- filter(enriched_positions, Scope == "lt")

out2 <- lt_positions %>%
    group_by(Position, Reference, lineage) %>%
    summarise(
        Cluster = paste0(Cluster, collapse = "|"),
        N_clusters = n(),
        Z_score = mean(Z_score),
        .groups = "drop_last"
    ) %>%
    filter(N_clusters > 1)

fwrite(x = out2, file = snakemake@output$data2)