In [36]:
library(dplyr)
library(data.table)
library(dtplyr)
library(tidyr)
library(Rfast)

In [55]:
nextclade="./audines11/rez/lineages/BA.2/data.csv"
out="./audines11/rez/lineages/BA.2/sprotein/reference_based_mutation_frequencies.csv"
out_details="./audines11/rez/lineages/BA.2/sprotein/reference_based_mutation_details.csv"

In [31]:
nextclade_df = fread(nextclade)

In [32]:
#get the strain specific
nextclade_df <- nextclade_df %>%
    lazy_dt() %>%
    filter(qc.overallStatus != "bad") %>%
    mutate(LT = (grepl(pattern = "Lithuania", x = country))) %>%
    as.data.table() 


In [33]:
N_LT = length(nextclade_df$LT[nextclade_df$LT])
N_NONLT = nrow(nextclade_df) -N_LT 
nextclade_df$LT <- NULL

In [45]:
names = nextclade_df$taxon
substitutions = nextclade_df$aaSubstitutions
deletions = nextclade_df$aaDeletions
insertions = nextclade_df$aaInsertions

In [46]:
out_subs=list()
ct = 0
WT = list()
MUT = list()
POS = list()
SEQ = list()
n_rows = nrow(nextclade_df)
print(n_rows)
for (ctrow in seq(1,n_rows))
{
    if (ctrow %% 10000 ==0){
        print(paste(ctrow/n_rows," part is done"))
        flush.console()
    }
    s = substitutions[[ctrow]]  
    muts = strsplit(split=",",x=s,fixed=T)[[1]]
    aa = grepl(pattern = "S:",muts)
    muts = muts[aa]
    muts = sub(pattern = "S:", replacement = "", x=muts)
    for (m in muts){
        mlength = nchar(m)
        ct = ct + 1
        WT[[ct]]=substr(m,1,1)
        MUT[[ct]]=substr(m,mlength,mlength)
        POS[[ct]]=strtoi(substr(m,2,mlength-1))
        SEQ[[ct]]=names[[ctrow]]        
        }
}
print("Transfering to data table")
flush.console()
out_subs_df = data.table(
    WT=unlist(WT),
    MUT=unlist(MUT),
    POS=unlist(POS),
    SEQ=unlist(SEQ))
out_subs_df$TYPE="SUB"


[1] 615033
[1] "0.0162592901519105  part is done"
[1] "0.0325185803038211  part is done"
[1] "0.0487778704557316  part is done"
[1] "0.0650371606076422  part is done"
[1] "0.0812964507595527  part is done"
[1] "0.0975557409114633  part is done"
[1] "0.113815031063374  part is done"
[1] "0.130074321215284  part is done"
[1] "0.146333611367195  part is done"
[1] "0.162592901519105  part is done"
[1] "0.178852191671016  part is done"
[1] "0.195111481822927  part is done"
[1] "0.211370771974837  part is done"
[1] "0.227630062126748  part is done"
[1] "0.243889352278658  part is done"
[1] "0.260148642430569  part is done"
[1] "0.276407932582479  part is done"
[1] "0.29266722273439  part is done"
[1] "0.3089265128863  part is done"
[1] "0.325185803038211  part is done"
[1] "0.341445093190122  part is done"
[1] "0.357704383342032  part is done"
[1] "0.373963673493943  part is done"
[1] "0.390222963645853  part is done"
[1] "0.406482253797764  part is done"
[1] "0.422741543949674  part is done

In [47]:
out_deletion=list()
ct = 0
WT = list()
MUT = list()
POS = list()
SEQ = list()
n_rows = nrow(nextclade_df)
for (ctrow in seq(1,n_rows))
{
    if (ctrow %% 10000 ==0){
        print(paste(ctrow/n_rows," part is done"))
        flush.console()
    }
    s = deletions[[ctrow]]  
    muts = strsplit(split=",",x=s,fixed=T)[[1]]
    aa = grepl(pattern = "S:",muts)
    muts = muts[aa]
    muts = sub(pattern = "S:", replacement = "", x=muts)
    for (m in muts){
        mlength = nchar(m)
        ct = ct + 1           
        WT[[ct]]=substr(m,1,1)
        MUT[[ct]]=substr(m,mlength,mlength)
        POS[[ct]]=strtoi(substr(m,2,mlength-1))
        SEQ[[ct]]=names[[ctrow]]        
        }
    
}
print("Transfering to data table")
flush.console()
out_deletion_df = data.table(
    WT=unlist(WT),
    MUT=unlist(MUT),
    POS=unlist(POS),
    SEQ=unlist(SEQ))
out_deletion_df$TYPE="DEL"


[1] "0.0162592901519105  part is done"
[1] "0.0325185803038211  part is done"
[1] "0.0487778704557316  part is done"
[1] "0.0650371606076422  part is done"
[1] "0.0812964507595527  part is done"
[1] "0.0975557409114633  part is done"
[1] "0.113815031063374  part is done"
[1] "0.130074321215284  part is done"
[1] "0.146333611367195  part is done"
[1] "0.162592901519105  part is done"
[1] "0.178852191671016  part is done"
[1] "0.195111481822927  part is done"
[1] "0.211370771974837  part is done"
[1] "0.227630062126748  part is done"
[1] "0.243889352278658  part is done"
[1] "0.260148642430569  part is done"
[1] "0.276407932582479  part is done"
[1] "0.29266722273439  part is done"
[1] "0.3089265128863  part is done"
[1] "0.325185803038211  part is done"
[1] "0.341445093190122  part is done"
[1] "0.357704383342032  part is done"
[1] "0.373963673493943  part is done"
[1] "0.390222963645853  part is done"
[1] "0.406482253797764  part is done"
[1] "0.422741543949674  part is done"
[1] "0.43

In [48]:
out_insertion=list()
ct = 0
WT = list()
MUT = list()
POS = list()
SEQ = list()
n_rows = nrow(nextclade_df)
for (ctrow in seq(1,n_rows))
{
    if (ctrow %% 10000 ==0){
        print(paste(ctrow/n_rows," part is done"))
        flush.console()
    }
    s = insertions[[ctrow]]  
    muts = strsplit(split=",",x=s,fixed=T)[[1]]
    aa = grepl(pattern = "S:",muts)
    muts = muts[aa]
    muts = sub(pattern = "S:", replacement = "", x=muts)    
    for (m in muts){
        mlength = nchar(m)
        ct = ct + 1
        parts= strsplit(split = ":",fixed = T, x = m)[[1]] 
        WT[[ct]]=NA
        MUT[[ct]]=parts[[2]]
        POS[[ct]]=strtoi(parts[[1]])
        SEQ[[ct]]=names[[ctrow]]        
        }
    
}
out_insertion_df = data.table(
    WT=unlist(WT),
    MUT=unlist(MUT),
    POS=unlist(POS),
    SEQ=unlist(SEQ))
out_insertion_df$TYPE="INS"

[1] "0.0162592901519105  part is done"
[1] "0.0325185803038211  part is done"
[1] "0.0487778704557316  part is done"
[1] "0.0650371606076422  part is done"
[1] "0.0812964507595527  part is done"
[1] "0.0975557409114633  part is done"
[1] "0.113815031063374  part is done"
[1] "0.130074321215284  part is done"
[1] "0.146333611367195  part is done"
[1] "0.162592901519105  part is done"
[1] "0.178852191671016  part is done"
[1] "0.195111481822927  part is done"
[1] "0.211370771974837  part is done"
[1] "0.227630062126748  part is done"
[1] "0.243889352278658  part is done"
[1] "0.260148642430569  part is done"
[1] "0.276407932582479  part is done"
[1] "0.29266722273439  part is done"
[1] "0.3089265128863  part is done"
[1] "0.325185803038211  part is done"
[1] "0.341445093190122  part is done"
[1] "0.357704383342032  part is done"
[1] "0.373963673493943  part is done"
[1] "0.390222963645853  part is done"
[1] "0.406482253797764  part is done"
[1] "0.422741543949674  part is done"
[1] "0.43

In [49]:
all = bind_rows(out_insertion_df,out_subs_df,out_deletion_df) %>%
    lazy_dt() %>%
    mutate(LT=ifelse(grepl(pattern = "Lithuania",SEQ),"LT","NONLT")) %>%
    as.data.table()
head(all)



WT,MUT,POS,SEQ,TYPE,LT
<chr>,<chr>,<int>,<chr>,<chr>,<chr>
,SGR,212,Denmark/DCGC-455294/2022,INS,NONLT
,SGR,212,Denmark/DCGC-368227/2022,INS,NONLT
,SGR,212,Denmark/DCGC-455303/2022,INS,NONLT
,SGR,212,Denmark/DCGC-455311/2022,INS,NONLT
,SGR,212,Denmark/DCGC-455323/2022,INS,NONLT
,SGR,212,Denmark/DCGC-456780/2022,INS,NONLT


WT,MUT,POS,SEQ,TYPE,LT
<chr>,<chr>,<int>,<chr>,<chr>,<chr>
T,I,19,Lithuania/S22C453/2022,SUB,LT
A,S,27,Lithuania/S22C453/2022,SUB,LT
G,D,142,Lithuania/S22C453/2022,SUB,LT
V,G,213,Lithuania/S22C453/2022,SUB,LT
G,D,339,Lithuania/S22C453/2022,SUB,LT
S,F,371,Lithuania/S22C453/2022,SUB,LT


In [50]:
pos_sum = all %>%
    group_by(POS,TYPE,LT) %>%
    summarise(N=as.double(n())) %>%
    ungroup() %>%
    as.data.table()
if (N_LT > 0) {
    pos_sum[LT=="LT"]$N = pos_sum[LT=="LT"]$N/N_LT
    }
pos_sum[LT=="NONLT"]$N = pos_sum[LT=="NONLT"]$N/N_NONLT
N_ALL = nrow(nextclade_df)

`summarise()` has grouped output by 'POS', 'TYPE'. You can override using the `.groups` argument.


In [51]:
position_wise = dcast(pos_sum,POS ~ TYPE + LT,value.var = "N")
#check if Lithuanian dat exits
for (cl in list("DEL_LT","INS_LT","SUB_LT","DEL_NONLT","INS_NONLT","SUB_NONLT")) {
    if (!(cl %in% names(position_wise))) {
        position_wise[,eval(cl)] <- as.double(NA)
    }
}
head(position_wise)

POS,DEL_LT,DEL_NONLT,INS_NONLT,SUB_LT,SUB_NONLT,INS_LT
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,,,,,1.630239e-06,
2,,,,,0.0001108563,
3,,8.151195e-06,3.260478e-06,,7.173051e-05,
4,,,,,1.304191e-05,
5,,2.445358e-05,,0.006765068,0.01036343,
6,,1.630239e-06,,,0.0001565029,


In [52]:
position_wise[is.na(position_wise)] <- 0
position_wise[, 
              SUM_NONLT := sum(DEL_NONLT,INS_NONLT,SUB_NONLT,na.rm = T),
              by=1:nrow(position_wise)]
              
position_wise[, 
              SUM_LT := sum(DEL_LT,INS_LT,SUB_LT,na.rm = T),
              by=1:nrow(position_wise)]
position_wise[, 
              DSUM_LT := SUM_LT -  SUM_NONLT]       
head(position_wise)

POS,DEL_LT,DEL_NONLT,INS_NONLT,SUB_LT,SUB_NONLT,INS_LT,SUM_NONLT,SUM_LT,DSUM_LT
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0.0,0.0,0.0,1.630239e-06,0,1.630239e-06,0.0,-1.630239e-06
2,0,0.0,0.0,0.0,0.0001108563,0,0.0001108563,0.0,-0.0001108563
3,0,8.151195e-06,3.260478e-06,0.0,7.173051e-05,0,8.314219e-05,0.0,-8.314219e-05
4,0,0.0,0.0,0.0,1.304191e-05,0,1.304191e-05,0.0,-1.304191e-05
5,0,2.445358e-05,0.0,0.006765068,0.01036343,0,0.01038788,0.006765068,-0.003622815
6,0,1.630239e-06,0.0,0.0,0.0001565029,0,0.0001581332,0.0,-0.0001581332


In [56]:
fwrite(x = position_wise,file = out)
fwrite(x = all,file = out_details)