In [None]:
library(dplyr)
library(data.table)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(patchwork)
library(dtplyr)
library(clusterAnalysisUtils)
library(MASS)
library(robmixglm)
library(mvabund)
library(fitdistrplus)
library(lattice)
library(lmtest)
#require(pscl) # alternatively can use package ZIM for zero-inflated models
library(lmtest)
library(pscl)





In [None]:
# Read the lineage metadata table and keep only the columns used below.
dataid <- fread(snakemake@input$data) %>%
  dplyr::select(taxon, country, date) %>%
  as.data.table()
names(dataid)
head(dataid)

In [None]:
# Read the reference metadata table with the same three columns.
dataref <- fread(snakemake@input$data_ref) %>%
  dplyr::select(taxon, country, date) %>%
  as.data.table()


In [None]:
# Stack the lineage and reference metadata into a single table.
data <- bind_rows(dataid, dataref)

In [None]:
# Express every date as an offset from the earliest date in the table:
# `days` is the difference in days, `weeks` is days / 7 rounded to an integer.
data$date <- as.Date(data$date)
min_date <- min(data$date)
data$days <- as.numeric(data$date - min_date)
data$weeks <- round(data$days / 7, digits = 0)


In [None]:
# Read clusters: parse the UC-format cluster files for the lineage and for
# its reference set, label each table with a Sample name, and stack them.
id <- clusterAnalysisUtils::clusterDataParser$new(
  snakemake@input$id,
  format = "UC",
  remove_sizes = TRUE
)$df
id$Sample <- snakemake@wildcards$id

ref <- clusterAnalysisUtils::clusterDataParser$new(
  snakemake@input$ref,
  format = "UC",
  remove_sizes = TRUE
)$df
ref$Sample <- paste0("Ref_", snakemake@wildcards$id)

df <- bind_rows(id, ref)

In [None]:
# Attach the metadata (date, days, ...) to every cluster member by taxon name.
df <- lazy_dt(df) %>%
  left_join(data, by = c("Member" = "taxon")) %>%
  as.data.table()
    


In [None]:
# Collapse to one row per (Sample, Cluster):
#   N = number of members, D = span in days between first and last member.
df <- df %>%
  group_by(Sample, Cluster) %>%
  summarise(
    N = n(),
    D = max(days) - min(days)
  ) %>%
  ungroup() %>%
  as.data.table()

head(df)



In [None]:
# Per sample: number of clusters (N) and mean cluster span D, ignoring NA.
countdf <- df %>%
  group_by(Sample) %>%
  summarise(
    N = n(),
    D = mean(D, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  as.data.table()
countdf

In [None]:
# Keep only clusters with more than one member and tabulate per-sample
# descriptive statistics of cluster size (N) and span (D).
dfwo0 <- as.data.table(filter(df, N > 1))

data <- dfwo0
test <- data %>%
  group_by(Sample) %>%
  summarise(
    Number = n(),
    DD = mean(D, na.rm = TRUE),
    `Maximum of N` = max(N),
    `Minimum of N` = min(N),
    `Mean of N` = mean(N),
    `Median of N` = median(N),
    `Maximum of D` = max(D, na.rm = TRUE),
    `Minimum of D` = min(D, na.rm = TRUE),
    `Mean of D` = mean(D, na.rm = TRUE),
    `Median of D` = median(D, na.rm = TRUE)
  ) %>%
  mutate(`Mean of N` = round(`Mean of N`, digits = 2)) %>%
  as.data.table()
test

In [None]:
options(
  repr.plot.width = 10,
  repr.plot.height = 4,
  jupyter.plot_mimetypes = "image/svg+xml"
)
# Initial situation: density and boxplot of D per sample.
data <- dfwo0
pd <- ggplot(data, aes(x = D, color = Sample)) +
  geom_density()

# For each curve colour, find the x position where the built density peaks.
plotd <- as.data.table(ggplot_build(pd)$data) %>%
  group_by(colour) %>%
  summarise(x = max(x[which.max(y)])) %>%
  as.data.table()

# Mark those peak positions with vertical lines in the matching colour.
pd <- pd +
  geom_vline(xintercept = plotd$x, color = plotd$colour) +
  xlab("D, days")

pb <- ggplot(data, aes(x = Sample, y = D)) +
  geom_boxplot(aes(fill = Sample)) +
  guides(fill = "none") +
  ylab("D, days")

print(plotd)
(pd | pb) +
  plot_annotation(title = "Distribution identical sequence existance (D)")


In [None]:
options(
  repr.plot.width = 10,
  repr.plot.height = 4,
  jupyter.plot_mimetypes = "image/svg+xml"
)
# Density (x axis limited to 0..15) and log-scale boxplot of cluster size N.
data <- dfwo0

pd <- ggplot(data, aes(x = N, color = Sample)) +
  geom_density(adjust = 2) +
  xlim(0, 15)

# For each curve colour, find the x position where the built density peaks.
plotd <- as.data.table(ggplot_build(pd)$data) %>%
  group_by(colour) %>%
  summarise(x = max(x[which.max(y)])) %>%
  as.data.table()

pd <- pd +
  geom_vline(xintercept = plotd$x, color = plotd$colour) +
  xlab("N, number")

pb <- ggplot(data, aes(x = Sample, y = N)) +
  geom_boxplot(aes(fill = Sample)) +
  guides(fill = "none") +
  scale_y_log10() +
  ylab("N, number")

print(plotd)
(pd | pb) +
  plot_annotation(title = "Distribution of cluster sizes (N)")


# plotd <- as.data.table(ggplot_build(pd)$data) %>%
#     group_by(colour) %>%
#     summarise(x=max(x[which.max(y)])) %>%
#     as.data.table()
# pd <- pd + geom_vline(xintercept=plotd$x,color=plotd$colour) 

In [None]:
# Filter by N: take the quantiles of N in the lineage sample and keep, in
# both samples, only clusters whose size lies between the 2nd and 4th
# quantile values (inclusive).
data <- dfwo0
test <- as.data.frame(dplyr::filter(data, Sample == snakemake@wildcards$id))
q <- quantile(test$N)
print(q)
qvals <- unlist(q, use.names = FALSE)
minl <- qvals[[2]]
maxl <- qvals[[4]]

dfwoe <- dfwo0 %>%
  filter(N >= minl, N <= maxl) %>%
  as.data.table()


In [None]:
# Descriptive statistics of N and D per sample on the size-filtered data.
data <- dfwoe
test <- data %>%
  group_by(Sample) %>%
  summarise(
    Number = n(),
    `Maximum of N` = max(N),
    `Minimum of N` = min(N),
    `Mean of N` = mean(N),
    `Median of N` = median(N),
    `Maximum of D` = max(D, na.rm = TRUE),
    `Minimum of D` = min(D, na.rm = TRUE),
    `Mean of D` = mean(D, na.rm = TRUE),
    `Median of D` = median(D, na.rm = TRUE)
  ) %>%
  mutate(`Mean of N` = round(`Mean of N`, digits = 2)) %>%
  as.data.table()
test

In [None]:
options(
  repr.plot.width = 10,
  repr.plot.height = 4,
  jupyter.plot_mimetypes = "image/svg+xml"
)
# Histogram (one facet row per sample) and boxplot of N after size filtering.
data <- dfwoe

pd <- ggplot(data, aes(x = N, fill = Sample)) +
  geom_histogram(binwidth = 1) +
  facet_grid(rows = vars(Sample), scales = "free") +
  xlab("N, number")

pb <- ggplot(data, aes(x = Sample, y = N)) +
  geom_boxplot(aes(fill = Sample)) +
  guides(fill = "none") +
  ylab("N, number")

(pd | pb) +
  plot_annotation(title = "Distribution of cluster sizes (N)  after cluster size match")


In [None]:
options(
  repr.plot.width = 10,
  repr.plot.height = 4,
  jupyter.plot_mimetypes = "image/svg+xml"
)
# Density and boxplot of D per sample on the size-filtered data.
data <- dfwoe
pd <- ggplot(data, aes(x = D, color = Sample)) +
  geom_density()

# For each curve colour, find the x position where the built density peaks.
plotd <- as.data.table(ggplot_build(pd)$data) %>%
  group_by(colour) %>%
  summarise(x = max(x[which.max(y)])) %>%
  as.data.table()

pd <- pd +
  geom_vline(xintercept = plotd$x, color = plotd$colour) +
  xlab("D, days")

pb <- ggplot(data, aes(x = Sample, y = D)) +
  geom_boxplot(aes(fill = Sample)) +
  guides(fill = "none") +
  ylab("D, days")

print(plotd)
(pd | pb) +
  plot_annotation(title = "Distribution identical sequence existance (D) after cluster size match")

In [None]:
# Statistical analysis

In [None]:
# Pull the D values of the lineage sample (d1) and of its reference (d2),
# then turn Sample into a factor for the models below.
data <- dfwoe
d1 <- as.data.table(
  filter(data, Sample == snakemake@wildcards$id)
)$D
d2 <- as.data.table(
  filter(data, Sample == paste0("Ref_", snakemake@wildcards$id))
)$D
data$Sample <- factor(data$Sample)
levels(data$Sample)


In [None]:
options(
  repr.plot.width = 6,
  repr.plot.height = 4,
  jupyter.plot_mimetypes = "image/svg+xml"
)

# Fit a negative binomial distribution to the D values of each sample.
fitd1 <- fitdist(d1, "nbinom")
fitd2 <- fitdist(d2, "nbinom")
colors <- c("#F8766D", "#00BFC4")
ids <- c(
  snakemake@wildcards$id,
  paste0("Ref_", snakemake@wildcards$id)
)

# Expected counts under each fit: the fitted pmf over the observed range of
# D, multiplied by the number of observations.
fx1 <- min(d1):max(d1)
fx2 <- min(d2):max(d2)
fy1 <- length(d1) * dnbinom(
  fx1,
  size = fitd1$estimate[[1]],
  mu = fitd1$estimate[[2]]
)
fy2 <- length(d2) * dnbinom(
  fx2,
  size = fitd2$estimate[[1]],
  mu = fitd2$estimate[[2]]
)

# Observed histogram with the fitted curve overlaid, one panel per sample.
px1 <- ggplot(data = data.frame(D = d1), aes(x = D)) +
  geom_histogram(fill = colors[[1]], alpha = 0.5, binwidth = 1) +
  geom_line(
    data = data.frame(x = fx1, y = fy1),
    aes(x, y),
    color = "black",
    size = 0.5
  ) +
  xlab("Identical sequence existance, days") +
  ylab("Count") +
  annotate(
    "text",
    x = mean(fx1),
    y = Inf,
    label = ids[[1]],
    hjust = 1,
    vjust = 2,
    size = 4,
    fontface = "bold"
  )

px2 <- ggplot(data = data.frame(D = d2), aes(x = D)) +
  geom_histogram(fill = colors[[2]], alpha = 0.5, binwidth = 1) +
  geom_line(
    data = data.frame(x = fx2, y = fy2),
    aes(x, y),
    color = "black",
    size = 0.5
  ) +
  xlab("Identical sequence existance, days") +
  ylab("Count") +
  ggtitle(ids[[2]]) +
  annotate(
    "text",
    x = mean(fx2),
    y = Inf,
    label = ids[[2]],
    hjust = 1,
    vjust = 2,
    size = 4,
    fontface = "bold"
  )

px1 / px2



In [None]:
## Check distribution

In [None]:
# Collector for one (model name, P, dispersion) entry per candidate model.
outondist <- list()

In [None]:
## Poisson GLM

In [None]:

# Poisson GLM of D on Sample; P is the p-value in row 2, column 4 of the
# coefficient table.
M1 <- glm(D ~ Sample, family = "poisson", data = data)

M1sum <- summary(M1)
P <- M1sum$coefficients[[2, 4]]
P

In [None]:
## Over/underdispersion check for the Poisson model: sum of squared Pearson
## residuals divided by (observations - number of coefficients).
E2 <- resid(M1, type = "pearson")
N <- nrow(data)
p <- length(coef(M1))
Res <- sum(E2^2) / (N - p)
#add_row(outondist, Distribution="Poisson GLM", P=p,`Residual mean deviance`=Res )
Res

In [None]:
# Record the Poisson GLM result as the first entry.
outondist[[1]] <- list("Poison GLM", P, Res)

In [None]:
## Negative Binomial GLM

In [None]:
# Negative binomial GLM of D on Sample; P is the p-value in row 2, column 4
# of the coefficient table.
M2 <- glm.nb(D ~ Sample, data = data)
M2sum <- summary(M2)
P <- M2sum$coefficients[[2, 4]]



In [None]:
# Dispersion statistic for the negative binomial GLM.
E2 <- resid(M2, type = "pearson")
N <- nrow(data)
# One extra parameter is counted for the NB variance parameter.
p <- length(coef(M2)) + 1
Res <- sum(E2^2) / (N - p)

In [None]:
# Record the negative binomial GLM result as the second entry.
outondist[[2]] <- list("Negative Binomial GLM", P, Res)

In [None]:
## Zero-Inflated Poisson GLM

In [None]:
# Zero-inflated Poisson model: Sample is the predictor both for the count
# part (left of `|`) and for the zero-inflation part (right of `|`).
M3 <- zeroinfl(
  D ~ Sample | Sample,
  dist = "poisson",
  data = data
)

M3sum <- summary(M3)
# p-value in row 2, column 4 of the count-model coefficient table.
P <- M3sum$coefficients$count[[2, 4]]
P
M3sum

In [None]:
# Dispersion statistic for the zero-inflated Poisson model: sum of squared
# Pearson residuals divided by (observations - number of coefficients).
E3 <- resid(M3, type = "pearson")
N  <- nrow(data)
p  <- length(coef(M3))
# Use this model's own residuals (E3). The previous code summed E2, which
# still held the residuals of an earlier model.
Res <- sum(E3^2) / (N - p)
outondist[[3]] <- list("Zero-Inflated Poisson GLM", P, Res)

In [None]:
## Zero-Inflated Negative Binomial GLM

In [None]:
# Zero-inflated negative binomial model with Sample as predictor in both the
# count part and the zero-inflation part.
M4 <- zeroinfl(
  D ~ Sample | Sample,
  dist = "negbin",
  data = data
)
M4sum <- summary(M4)
# p-value in row 2, column 4 of the count-model coefficient table.
P <- M4sum$coefficients$count[[2, 4]]
M4sum

In [None]:
# Dispersion statistic for the zero-inflated negative binomial model.
E2 <- resid(M4, type = "pearson")
N <- nrow(data)
# One extra parameter is counted for theta.
p <- length(coef(M4)) + 1

Res <- sum(E2^2) / (N - p)
outondist[[4]] <- list("Zero-Inflated Negative Binomial GLM", P, Res)

In [None]:
# Bind the collected model entries row-wise into a table for display.
outondist4show <- as.data.frame(do.call(rbind, outondist))
names(outondist4show) <- c(
  "Distribution",
  "P",
  "Residual mean deviance"
)
outondist4show

In [None]:
# Mock data for experimenting: sample "A" has n Poisson(1) draws, sample "B"
# has n / 10 Poisson(0.5) draws.
n <- 1000
d1t <- rpois(n, 1)
da <- data.table(D = d1t, Sample = "A")
d2t <- rpois(n / 10, 0.5)
db <- data.table(D = d2t, Sample = "B")
dftest <- bind_rows(da, db)

In [None]:
## Choose data set for analysis and output collection

In [None]:
# Select the data set analysed below: `dfwoe` is the real data; swap in
# `dftest` to develop against the mock data instead.
data <- dfwoe
data$Sample <- factor(data$Sample)
print("Sample levels")
print(levels(data$Sample))

In [None]:
## Medians and averages

In [None]:
# Median and mean of D per sample, reshaped to a single wide row.
# NA values of D are ignored, as in the earlier summary tables of D.
out <- data %>%
    group_by(Sample) %>%
    summarise(
        Median = median(D, na.rm = TRUE),
        Mean = mean(D, na.rm = TRUE)
    ) %>%
    as.data.table()
out <- out %>%
    pivot_wider(names_from = Sample, values_from = c("Median", "Mean")) %>%
    as.data.table()
# Replace the lineage id inside the column names with the generic word
# "Lineage". The id is matched as a literal string, not as a regular
# expression, so ids containing "." or other metacharacters are handled
# correctly.
names(out) <- str_replace_all(
    names(out),
    pattern = stringr::fixed(snakemake@wildcards$id),
    replacement = "Lineage"
)
out$Lineage <- snakemake@wildcards$id
# Put the Lineage column first, followed by the Median and Mean columns.
out <- out %>%
    dplyr::select(Lineage, starts_with("Median"), starts_with("Mean")) %>%
    as.data.table()
out

In [None]:
## Non parametric testing


In [None]:
# Wilcoxon rank-sum test of D between the two samples, using the normal
# approximation (exact = FALSE): two-sided, then each one-sided alternative.
res <- wilcox.test(D ~ Sample, data = data, exact = FALSE)
out$Wilcox_twotailed_P <- res$p.value

res <- wilcox.test(
  D ~ Sample,
  data = data,
  exact = FALSE,
  alternative = "less"
)
out$Wilcox_less_P <- res$p.value

res <- wilcox.test(
  D ~ Sample,
  data = data,
  exact = FALSE,
  alternative = "greater"
)
out$Wilcox_greater_P <- res$p.value
out

In [None]:
## Parametric testing

In [None]:
# Zero-inflated Poisson model of D: Sample predicts both the count process
# (left of `|`) and the zero-inflation process (right of `|`).
zip <- zeroinfl(D ~ Sample | Sample,
               dist = 'poisson',
               data = data)
zip_summary <- summary(zip)
# Take Pearson residuals from the fitted model itself rather than from its
# summary object, so the requested residual type is actually honoured.
E2 <- resid(zip, type = "pearson")
N  <- nrow(data)
p  <- length(coef(zip)) # Poisson: no extra dispersion parameter to count
# Dispersion statistic: squared Pearson residuals over residual d.o.f.
Res <- sum(E2^2) / (N - p)
# Row 2 of the count-model coefficient table: column 4 is stored as P,
# column 3 as the Wald value.
out$Anova_zip_P <- zip_summary$coefficients$count[[2,4]]
out$Anova_zip_coef <- zip$coefficients$count[[2]]
out$Anova_zip_wald_value <- zip_summary$coefficients$count[[2,3]]
out$Anova_zip_Residual_mean_deviance <- Res


In [None]:
# Zero-inflated negative binomial model of D: Sample predicts both the count
# process (left of `|`) and the zero-inflation process (right of `|`).
zinb <- zeroinfl(D ~ Sample | Sample,
               dist = 'negbin',
               data = data)
zinb_summary <- summary(zinb)
# Take Pearson residuals from the fitted model. The previous call passed the
# summary object together with the misspelled type "person".
E2 <- resid(zinb, type = "pearson")
N  <- nrow(data)
p  <- length(coef(zinb)) + 1 # '+1' is due to theta
# Dispersion statistic: squared Pearson residuals over residual d.o.f.
Res <- sum(E2^2) / (N - p)
# Row 2 of the count-model coefficient table: column 4 is stored as P,
# column 3 as the Wald value.
out$Anova_zinb_P <- zinb_summary$coefficients$count[[2,4]]
out$Anova_zinb_coef <- zinb$coefficients$count[[2]]
out$Anova_zinb_wald_value <- zinb_summary$coefficients$count[[2,3]]
out$Anova_zinb_Residual_mean_deviance <- Res


In [None]:
out

In [None]:
# Negative binomial model of D on Sample fitted with mvabund::manyglm.
manyglmt <- manyglm(
  D ~ Sample,
  family = "negative.binomial",
  data = data,
  pairwise.comp = c("Sample")
)
manyglmt_summary <- summary(manyglmt)
manyglmt_summary

In [None]:
# Dispersion statistic for the manyglm fit.
# NOTE(review): confirm that the residuals method for manyglm objects
# honours type = "pearson"; if it returns another residual type, Res is not
# a Pearson dispersion statistic.
E2 <- resid(manyglmt, type = "pearson")
N <- nrow(data)
# One extra parameter is counted for theta.
p <- length(coef(manyglmt)) + 1
Res <- sum(E2^2) / (N - p)
print(coef(manyglmt))

In [None]:
# Store the manyglm results: row 2 of the summary coefficient table supplies
# the Wald value (column 1) and the P value (column 2).
out$Anova_mavabubd_P <- manyglmt_summary$coefficients[2, 2]
out$Anova_mavabubd_coef <- manyglmt$coefficients[[2]]
out$Anova_mavabubd_wald_value <- manyglmt_summary$coefficients[2, 1]
out$Anova_mavabubd_Residual_mean_deviance <- Res


In [None]:
# robmixglmdt <- robmixglm(D~Sample, family = "nbinom", data=data,cores=16)
# robmixglmdt_summary <- summary(robmixglmdt)

In [None]:
# out$Anova_robmixglm_P <- robmixglmdt_summary$coefficients[2,4]
# out$Anova_robmixglm_outlietp <- robmixglmdt_summary$coefficients[3,1]
# out$Anova_robmixglm_coef <- robmixglmdt_summary$coefficients[2,1]
# out$Anova_robmixglm_zval <- robmixglmdt_summary$coefficients[2,3]


In [None]:
# Condensed view for display: lineage, medians, means, P values, coefficients.
outrep <- out %>%
  dplyr::select(
    Lineage,
    starts_with("Median"),
    starts_with("Mean"),
    ends_with("_P"),
    ends_with("coef")
  ) %>%
  as.data.table()
outrep
# NOTE(review): the full `out` table is written, not the condensed `outrep`
# — confirm this is intended.
fwrite(file = snakemake@output$ref, x = out)