In [8]:
import pandas as pd
from Bio import SeqIO, Seq
import plotly.express as px

In [13]:
# Comparing wgsim simulations with 0% and 3% error rates: load each run's
# k-mer summary table and tag it with the error rate of that run.
df_0 = pd.read_csv("../testing_jelly/sim0/kmer_summation.tsv", sep="\t").assign(err=0)
df_003 = pd.read_csv("../testing_jelly/sim003/kmer_summation.tsv", sep="\t").assign(err=0.03)

# Stack both runs. `err_str` is the error rate as a string; `corrected` rescales
# the median depth by 1 / (1 - err)**21 (21 is presumably the k-mer length —
# TODO confirm).
df_tot = pd.concat([df_0, df_003]).assign(
    err_str=lambda frame: frame["err"].astype(str),
    corrected=lambda frame: frame["Depth median"] / ((1 - frame["err"]) ** 21),
)

{0.5274805126389446, 1.0}

In [14]:
# Display the combined summary table (both error-rate runs) for inspection.
df_tot

Unnamed: 0,Contig,count,Depth avg,std,min,25%,Depth median,75%,max,count_nonzero,count_unique,count_unique_nonzero,err,err_str,corrected
0,NC_014328.1.region003,20725.0,9.421472,4.270547,1.0,7.0,9.0,11.0,42.0,20725,20725,20725,0.0,0.0,9.0
1,NZ_CP053893.1.region002,19844.0,19.584408,53.26197,0.0,1.0,3.0,7.0,414.0,17976,19844,17976,0.0,0.0,3.0
2,NZ_CP020566.1.region001,21701.0,45.706004,7.803356,21.0,41.0,46.0,51.0,92.0,21701,21701,21701,0.0,0.0,46.0
3,NZ_LT906470.1.region002,32242.0,14.138019,9.450253,1.0,10.0,13.0,16.0,133.0,32242,32242,32242,0.0,0.0,13.0
4,NZ_CP053893.1.region005,65902.0,2.827152,1.868469,0.0,1.0,3.0,4.0,18.0,60810,65902,60810,0.0,0.0,3.0
5,NZ_CP053893.1.region004,19578.0,2.810706,2.125914,0.0,1.0,2.0,4.0,28.0,17548,19578,17548,0.0,0.0,2.0
6,NZ_CP053893.1.region006,20740.0,3.626905,2.476691,0.0,2.0,3.0,5.0,15.0,19593,20740,19593,0.0,0.0,3.0
7,NZ_LT906470.1.region003,21739.0,12.901007,6.963051,3.0,10.0,12.0,15.0,80.0,21739,21739,21739,0.0,0.0,12.0
8,NC_014328.1.region005,21570.0,10.633565,17.513982,1.0,7.0,9.0,11.0,400.0,21570,21570,21570,0.0,0.0,9.0
9,NC_014328.1.region004,22066.0,9.870117,16.906533,2.0,7.0,8.0,10.75,400.0,22066,22066,22066,0.0,0.0,8.0


In [15]:
# Long format: one row per (contig, error rate, estimate), where the estimate is
# either the raw "Depth median" or the "corrected" value.
df_plot = pd.melt(
    df_tot,
    id_vars=["Contig", "err_str"],
    value_vars=["Depth median", "corrected"],
)

# Reference depth drawn on the plots.
# NOTE(review): presumably total simulated bases (0.2 Gb) divided by the combined
# reference length in bp — confirm where 16859816 comes from.
sim_bases = 0.2 * 10**9
ref_length = 16859816
expected = sim_bases / ref_length

In [20]:
# Scatter of the per-contig estimates: colour encodes the error rate, the marker
# symbol encodes which estimate is shown (raw median vs. corrected).
fig = px.scatter(
    df_plot,
    x="Contig",
    y="value",
    color="err_str",
    symbol="variable",
    # The original title ended with a stray ">" character.
    title="Comparing estimates at different error rates",
)
# Horizontal reference line at the expected depth; as the last expression of the
# cell, the returned figure is also what gets rendered.
fig.add_hline(y=expected)

In [31]:
import glob
from pathlib import Path

In [33]:
# Sanity check: the dataset name is the directory two levels above a count file.
Path("../testing_jelly/sim003/counts/NC_014328.1.region003.counted").parents[1].name

'sim003'

In [57]:
# Let's look at the histograms of counts: read every count file of every
# matching simulation directory into one long table.
def _read_counts(path):
    """Load one space-separated `mer count` file, tagged with contig and dataset.

    `contig` is the file name without its last suffix; `dataset` is the name of
    the directory two levels above the file.
    """
    path = Path(path)
    return pd.read_csv(path, sep=" ", names=["mer", "count"]).assign(
        contig=path.stem,
        dataset=path.parent.parent.name,
    )


dfs = [_read_counts(p) for p in glob.glob("../testing_jelly/sim0*/counts/*")]
df_counts = pd.concat(dfs)

In [60]:
# Error rate of each dataset, keyed by dataset (directory) name.
err_dict = {"sim003":0.03, "sim0":0}
# Exponent of the correction below; the `mer` strings in the table are 21 bases
# long, so this is presumably the k-mer length — TODO confirm.
KMER_SIZE = 21

# Fail loudly (as the previous per-row dict lookup did) if a dataset picked up
# from disk has no configured error rate, instead of silently producing NaN.
unknown_datasets = set(df_counts["dataset"].unique()) - set(err_dict)
if unknown_datasets:
    raise KeyError(f"no error rate configured for dataset(s): {sorted(unknown_datasets)}")

# Vectorised lookup instead of a Python-level loop over every row; to_numpy()
# keeps the assignment positional, matching the list it replaces.
df_counts["err"] = df_counts["dataset"].map(err_dict).to_numpy()
# Rescale each count by 1 / (1 - err)**KMER_SIZE.
df_counts["counts_corrected"] = df_counts["count"] / ((1 - df_counts["err"]) ** KMER_SIZE)
df_counts

Unnamed: 0,mer,count,contig,dataset,err,corrected,counts_corrected
0,CATACCCTTTACCTCGTCCCA,1,NC_014328.1.region003,sim003,0.03,1.895805,1.895805
1,TAAGCCTCATCAAATTTTCCA,3,NC_014328.1.region003,sim003,0.03,5.687414,5.687414
2,GTGTGGAACAATGCTGACAAA,3,NC_014328.1.region003,sim003,0.03,5.687414,5.687414
3,ATTACTATTTCAATTTGTTCA,3,NC_014328.1.region003,sim003,0.03,5.687414,5.687414
4,TTCACCAATACATATTTAGAA,0,NC_014328.1.region003,sim003,0.03,0.000000,0.000000
...,...,...,...,...,...,...,...
18081,ATAATTTTTTTTGTTTTCATG,14,NC_014328.1.region001,sim0,0.00,14.000000,14.000000
18082,TAGCTATGGTAACTTTAATAA,12,NC_014328.1.region001,sim0,0.00,12.000000,12.000000
18083,CACCTGAATAGACTTTTTATG,10,NC_014328.1.region001,sim0,0.00,10.000000,10.000000
18084,TAATCAAATAATTTATCGTCA,10,NC_014328.1.region001,sim0,0.00,10.000000,10.000000


In [59]:
# Distribution of raw counts for a single contig, one trace per dataset,
# normalised to probabilities, with a violin plot in the margin.
region003_counts = df_counts.query("contig == 'NC_014328.1.region003'")
fig = px.histogram(
    region003_counts,
    x="count",
    color="dataset",
    histnorm="probability",
    marginal="violin",
)
# Optional zoom, currently disabled: fig.update_layout(xaxis_range=[0,200])
# Vertical reference line at the expected depth.
fig.add_vline(x=expected)

In [65]:
# Same contig as the raw-count histogram, but plotting the error-corrected
# counts in 40 bins.
region003_counts = df_counts.query("contig == 'NC_014328.1.region003'")
fig = px.histogram(
    region003_counts,
    x="counts_corrected",
    color="dataset",
    histnorm="probability",
    marginal="violin",
    nbins=40,
)
# Optional zoom, currently disabled: fig.update_layout(xaxis_range=[0,200])
# Vertical reference line at the expected depth.
fig.add_vline(x=expected)

In [55]:
# Load the "true" coverage table: tab-separated, column names supplied here.
true_cov_columns = ["contig", "pos", "cov"]
df_true = pd.read_csv(
    "../testing_trueCount/genome1.cov",
    sep="\t",
    names=true_cov_columns,
)

In [51]:
# Peek at the first five rows (five is the default for head()).
df_true.head()

Unnamed: 0,contig,pos,cov
0,NC_014328.1,1,0
1,NC_014328.1,2,0
2,NC_014328.1,3,0
3,NC_014328.1,4,0
4,NC_014328.1,5,0


![image.png](attachment:image.png)![image.png](attachment:image.png)

In [56]:
# Summary statistics for a positional row window of the coverage table.
# NOTE(review): the bounds presumably delimit one region of interest on the
# contig — confirm which region they correspond to.
region_rows = slice(1186650, 1204801)
df_true.iloc[region_rows].describe()

Unnamed: 0,pos,cov
count,18151.0,18151.0
mean,1195726.0,12.53088
std,5239.887,3.809965
min,1186651.0,3.0
25%,1191188.0,10.0
50%,1195726.0,12.0
75%,1200264.0,15.0
max,1204801.0,26.0


In [57]:
# Summary statistics over the whole coverage table, for comparison with the
# row-window summary.
df_true.describe()

Unnamed: 0,pos,cov
count,4630065.0,4630065.0
mean,2315033.0,11.86248
std,1336585.0,3.777545
min,1.0,0.0
25%,1157517.0,9.0
50%,2315033.0,12.0
75%,3472549.0,14.0
max,4630065.0,33.0


In [53]:
# Scratch cell: evaluates a literal written in scientific notation so it is
# displayed in plain decimal form.
1.016319e+01

10.16319

In [49]:
# Show the per-sequence statistics table (presumably produced by `kat sect` —
# TODO confirm).
kat_stats_path = "../testing_jelly/kat-sect-stats.tsv"
pd.read_csv(kat_stats_path, sep="\t")

Unnamed: 0,seq_name,median,mean,gc%,seq_length,kmers_in_seq,invalid_kmers,%_invalid,non_zero_kmers,%_non_zero,%_non_zero_corrected
0,NC_014328.1.region004 ['cyclic-lactone-autoind...,6,7.02037,0.29712,22109,22089,0,0.0,22033,99.74648,99.74648
1,NZ_CP053893.1.region001 ['ranthipeptide'] NZ_C...,5,5.55638,0.29122,21561,21541,0,0.0,21396,99.32687,99.32687
2,NZ_LT906470.1.region001 ['betalactone'] NZ_LT9...,6,6.26375,0.40413,26333,26313,0,0.0,26273,99.84798,99.84798
3,NZ_CP053893.1.region004 ['cyclic-lactone-autoi...,6,6.55813,0.28421,19598,19578,0,0.0,19497,99.58627,99.58627
4,NZ_CP053893.1.region003 ['cyclic-lactone-autoi...,6,6.14273,0.28401,20394,20374,0,0.0,20302,99.64661,99.64661
5,NZ_CP053893.1.region002 ['cyclic-lactone-autoi...,7,27.65061,0.33288,19866,19846,0,0.0,19812,99.82868,99.82868
6,NZ_CP053893.1.region005 ['cyclic-lactone-autoi...,6,5.85124,0.2968,66040,66020,0,0.0,65874,99.77885,99.77885
7,NC_014328.1.region003 ['cyclic-lactone-autoind...,6,6.4523,0.27843,20752,20732,0,0.0,20668,99.6913,99.6913
8,NZ_CP053893.1.region006 ['cyclic-lactone-autoi...,6,7.16302,0.27278,20760,20740,0,0.0,20646,99.54677,99.54677
9,NC_014328.1.region001 ['cyclic-lactone-autoind...,6,6.49167,0.2738,18152,18132,0,0.0,18120,99.93382,99.93382


In [6]:
# "simulate" reads - first as fasta.

# Lazy iterator over the records of the combined FASTA file.
# Pass the path rather than an open() handle: the handle created by open() was
# never closed, whereas SeqIO.parse opens and closes the file itself when it is
# given a file name.
comb_fasta = SeqIO.parse("../testing_jelly/combined.fa", "fasta")

In [11]:
# Pull the next record from the iterator and show its sequence as a plain string.
# Note: every run of this cell consumes one more record from `comb_fasta`.
str(next(comb_fasta).seq)

'TATATATTTTTATAAGTTTTTTTCGGATAGTTTAGAAAAGTTATCCACATTTTTTCTCTTTTGACCTGTGAATAGTATAAACTCAGTATTTATAAGGTTGGAAGGGAATTTTATTTTCTCTTATTCACAAGTTGTGCACAGGTTATTCACAGGCATTTGTGGTTTTTCCACAGTTTTAGAATAAGTTATGCACATAATAACGAAAGTTATACACATTTTTAGTAAAAGTTATACACAAGAGGAACAAAATGCAAGCAGCTGATATTAATGATATATGGGAACGTATATTACAAGTAGCAAAACAAACACTGCCGCCTGCGATTTATTCAAGTTTGAGTACTTCTCTCATACCGATGAGTATAGATAATAATTCTATACACATTGGGGTTATGCAGAGTTTTATAAAAAGTGTCATCGAATCTCAGCAAACGGTAAGTAAGTTGTTAACAGATGCCATCAAGCAAGTGACAGGCAAAGAATTAAATATGGTTTTATTGGATCTGTTTCCACAAAAGGATGATATTCCAACAGCACCGCAGGTAGCAGATACTTTCACAGAGAATACTGCAGAAAATACTAGTACGAACACTATAGAAAATACGGTGGTAAGTGACGCGGACAACATCGATAAAAAATCCTCAAAACCTGCAAAGCAGGAGGTTCCATATCAAGAGGAATTTTACACACCGGTGTACGCAGATCCTGTGTATATCCAGCGTAAAGAGGTGGACGAGTTAATTCCAGATGAACCGATGTTTCCTGTAGAGCAGCCACAAATGGTGCAAACGTTGCAATCTACGGATATTCCTATCGATTTGTCATCGTCACAGCTAAACAGTGGTTATCGATTTGATAATTACATTACAGGCAATGCGAACCGCATTCCGTTTGGAGCAGCCCAAAATGTGTCGGAGCATCCTGGCGGCGATTATAATCCGCTTTTTATCTACGGTCCATCGGGCCTCGGTAAAACGCATTTGATGCACGCCATCGGCAATG

In [43]:
# Scratch check: the remainder of 4 divided by 2 is 0, so this evaluates to True.
4 % 2 == 0

True

In [39]:
# Reverse-complement a test sequence and show the result as a plain string.
forward_seq = Seq.Seq("ATGCAAAGTTTTATAAAAAACTTGATTGATCAGCAGCCCGTCATCAGTAATGCATTACAAAATGCGATTACCACCGTGCTCGGATCCCAT")
str(forward_seq.reverse_complement())

'ATGGGATCCGAGCACGGTGGTAATCGCATTTTGTAATGCATTACTGATGACGGGCTGCTGATCAATCAAGTTTTTTATAAAACTTTGCAT'

In [38]:
# NOTE(review): `c` is not assigned in any cell visible in this part of the
# notebook; this cell presumably relies on kernel state from a removed or
# edited cell and would fail on a fresh kernel — define `c` explicitly or drop
# this cell.
c

'ATGGGATCCGAGCACGGTGGTAATCGCATTTTGTAATGCATTACTGATGACGGGCTGCTGATCAATCAAGTTTTTTATAAAACTTTGCAT'

In [27]:
# Count table for a single region: space-separated, column names supplied here.
df_count = pd.read_csv(
    "../data/simulated_data/quantification_kmer/0_2GB/counts/NC_014328.1.region001.counted",
    sep=" ",
    names=["mer", "mercount"],
)

In [20]:
import plotly.express as px

In [28]:
# Histogram of the raw counts in this region.
px.histogram(df_count, x="mercount")

In [25]:
# Trailing-window smoothing of the counts: for each row, sum the counts of that
# row and up to k-1 preceding rows, then divide by k. The first k-1 rows have a
# shorter window but are still divided by k, exactly as before.
k = 18
# rolling().sum() replaces a Python loop that re-sliced the column for every row
# (O(n*k) work); min_periods=1 keeps the partial windows at the start.
# to_numpy() makes the assignment positional, like the list it replaces.
# NOTE: rolling() skips NaN values inside a window instead of propagating them;
# this only matters if `mercount` contains missing values.
df_count["rolling_cov"] = (
    df_count["mercount"].rolling(window=k, min_periods=1).sum().to_numpy() / k
)

In [26]:
# Histogram of the smoothed (rolling-window) counts.
px.histogram(df_count, x="rolling_cov")