In [14]:
import os
__file__ = "somefile.py"
def gen_qsub_args(working_dir, **kwargs):
    scriptname = os.path.basename(__file__).split(".")[0]
    qsub_args = dict(
        directory = working_dir,
        modules = "tools antismash/6.1.1",
        runtime = 120,
        cores = 30,
        ram=100,
        group="dtu_00009",
        jobname=os.path.basename(__file__),
        output = os.path.join(working_dir, "logs", scriptname+ "_stdout"),
        error = os.path.join(working_dir, "logs", scriptname+ "_stderr")
    )
    qsub_args.update(kwargs)
    return qsub_args

In [1]:
jobs = {"a":123}

In [3]:
[jobs[x] for x in "ab" if x in jobs]

[123]

In [None]:
# theoretical count:
# total bp / sum of input genome lengths.

In [14]:
#input:
file = "/mnt/computerome/home/projects/dtu_00009/people/henspi/git/AntibioticaScreening/project/data/simulated_data/input_genomes/combined.fa"

In [18]:
total_length = 0
for line in open(file, "rb"):
    if line[0] == 62: #bit for '>'
        print(total_length)
        print(line)
        continue
    total_length += len(line.strip())
print(total_length)

0
b'>NZ_LT906445.1 Veillonella parvula strain NCTC11810 chromosome 1, complete sequence\n'
2132142
b'>NZ_CP053893.1 Clostridium beijerinckii strain ASCUSDY20 chromosome, complete genome\n'
8116509
b'>NC_014328.1 Clostridium ljungdahlii DSM 13528, complete sequence\n'
12746574
b'>NZ_CP020566.1 Veillonella atypica strain OK5 chromosome, complete genome\n'
14818526
b'>NZ_LT906470.1 Veillonella rodentium strain NCTC12018 chromosome 1, complete sequence\n'
16859816


In [94]:
import glob
import pandas as pd
import re
import sys
import numpy as np
def detect_quantifications(directory:str="../data/simulated_data/quantification/"):
    #find runs
    summary_files = glob.glob(directory+"/*/cmseq_summation.tsv")
    print(f"Found ({len(summary_files)}) summary files", file=sys.stderr)
    dataframes = []
    for file in summary_files:
        label = re.search(r"quantification/(.+)/", file).group(1)
        readGB = float(label[:-2].replace("_", "."))
        df = pd.read_csv(file, sep="\t")
        df["readGB"] = readGB
        dataframes.append(df)
    
    #Ensure the runs have same length.
    #assert(all([len(dataframes[0]) == len(df) for df in dataframes]), "summary files doesn't have the same length, check inputfiles")
    df_summary = pd.concat(dataframes)
    df_summary["expectedCount"] = (df_summary["readGB"].values*10**9) / total_length
    df_summary["abserr_avg"] = np.abs(df_summary["expectedCount"].values-df_summary["Depth avg"])
    df_summary["abserr_med"] = np.abs(df_summary["expectedCount"].values-df_summary["Depth median"])
    
    df_summary["RAE_avg"] = df_summary["abserr_avg"] / df_summary["expectedCount"]
    df_summary["RAE_med"] = df_summary["abserr_med"] / df_summary["expectedCount"]
    
    return df_summary.sort_values("readGB")

In [95]:
df_summary = detect_quantifications()
df_summary

Found (8) summary files


Unnamed: 0,Contig,Breadth,Depth avg,Depth median,readGB,expectedCount,abserr_avg,abserr_med,RAE_avg,RAE_med
1,NZ_CP053893.1.region001,0.072770,1.033142,1.0,0.001,0.059313,0.973830,0.940687,16.418586,15.859816
0,NC_014328.1.region004,0.037315,1.012121,1.0,0.001,0.059313,0.952809,0.940687,16.064177,15.859816
2,NZ_LT906470.1.region001,0.091900,1.099174,1.0,0.001,0.059313,1.039861,0.940687,17.531864,15.859816
3,NZ_CP053893.1.region004,0.085366,1.069337,1.0,0.001,0.059313,1.010024,0.940687,17.028817,15.859816
4,NZ_CP053893.1.region003,0.023683,1.055901,1.0,0.001,0.059313,0.996588,0.940687,16.802290,15.859816
...,...,...,...,...,...,...,...,...,...,...
3,NZ_CP053893.1.region004,1.000000,111.522349,100.0,2.000,118.625257,7.102907,18.625257,0.059877,0.157009
2,NZ_LT906470.1.region001,1.000000,104.669578,101.0,2.000,118.625257,13.955679,17.625257,0.117645,0.148579
1,NZ_CP053893.1.region001,1.000000,99.210936,99.0,2.000,118.625257,19.414320,19.625257,0.163661,0.165439
7,NC_014328.1.region003,1.000000,103.123024,100.0,2.000,118.625257,15.502232,18.625257,0.130682,0.157009


In [96]:
px.line(df_summary, x="readGB",y="RAE_avg", color="Contig")

In [98]:
px.line(df_summary, x="readGB",y="RAE_med", color="Contig")

In [88]:
df = df_summary.groupby("readGB").agg(
    expected = pd.NamedAgg(column="expectedCount", aggfunc="mean"),
    Depth_median = pd.NamedAgg(column="Depth median", aggfunc="mean"),
    Depth_avg = pd.NamedAgg(column="Depth avg", aggfunc="mean"),
    MSE_median = pd.NamedAgg(column="SQErr_med", aggfunc="mean"),
    MSE_avg = pd.NamedAgg(column="SQErr_avg", aggfunc="mean"),
)
df["RAE_median"] = df["MSE_avg"] / df["expected"]
df["RAE_avg"] = df["MSE_median"] / df["expected"]
df.reset_index(inplace=True)
df

Unnamed: 0,readGB,expected,Depth_median,Depth_avg,MSE_median,MSE_avg,RAE_median,RAE_avg
0,0.001,0.059313,1.0,1.090025,0.884893,1.065897,17.970828,14.919129
1,0.002,0.118625,1.0,1.157392,0.776821,1.125197,9.485308,6.548533
2,0.01,0.593126,1.0625,1.573637,0.278905,1.663066,2.803899,0.470229
3,0.02,1.186253,1.25,2.130282,0.191564,2.937434,2.47623,0.161486
4,0.1,5.931263,5.125,6.752666,0.759435,31.549487,5.319185,0.128039
5,0.2,11.862526,10.0625,13.30177,3.548686,124.758115,10.516994,0.299151
6,1.0,59.312628,50.3125,66.729243,82.217154,3146.714798,53.053032,1.386166
7,2.0,118.625257,100.9375,134.243185,317.165329,13034.348628,109.87836,2.673675


In [51]:

#add expected count:
df_summary["expectedCount"] = (df_summary["readGB"].values*10**9) / total_length

In [76]:
df_long = pd.melt(df_summary, id_vars=["Contig","readGB"], value_vars=["Breadth","Depth avg", "Depth median","expectedCount", "SQErr_avg","SQErr_med"])

In [65]:
import plotly.express as px
px.line(df_summary.sort_values("readGB"), x="readGB", y="Breadth", color="Contig")

In [67]:
df_long.head(5)

Unnamed: 0,Contig,readGB,variable,value
0,NC_014328.1.region004,0.01,Breadth,0.412592
1,NZ_CP053893.1.region001,0.01,Breadth,0.288391
2,NZ_LT906470.1.region001,0.01,Breadth,0.25618
3,NZ_CP053893.1.region004,0.01,Breadth,0.347893
4,NZ_CP053893.1.region003,0.01,Breadth,0.452192


In [78]:
fig = px.line(df_long.sort_values("readGB"),
        x="readGB",
        y="value",
        facet_row="variable",
        color="Contig"
       )
fig.update_layout(height=1800)
fig.update_yaxes(matches=None)

In [13]:
import plotly

280

In [2]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--gbk", required=True, help="input .gbk file")
parser.add_argument("--fa", help="output .file")
args = parser.parse_args(["--gbk", "out.gbk"])
args

Namespace(gbk='out.gbk', fa=None)

In [2]:
ncbi_id_file = "../config/ids_simulation_genomes.txt"

In [15]:
import os, sys

ncbi_ids = [entry.strip() for line in open(ncbi_id_file, "r").readlines() for entry in line.split()]
print(f"Found {len(ncbi_ids)} ids in file", file=sys.stderr)

expected_output = [f"../data/simulated_data/input_genomes/{x.replace('.', '_')}.fa" for x in ncbi_ids+["combined"]]

Found 5 ids in file


In [13]:
expected_output

['../data/simulated_data/input_genomes/NZ_LT906445_1.fa',
 '../data/simulated_data/input_genomes/NZ_CP053893_1.fa',
 '../data/simulated_data/input_genomes/NC_014328_1.fa',
 '../data/simulated_data/input_genomes/NZ_CP020566_1.fa',
 '../data/simulated_data/input_genomes/NZ_LT906470_1.fa',
 '../data/simulated_data/input_genomes/combined_fasta.fa']

In [14]:
[os.path.isfile(x) for x in expected_output]

[True, True, True, True, True, False]

In [3]:
str(number).replace(".","_")

'0_03'

In [16]:
gen_qsub_args(working_dir="home", runtime=1)

{'directory': 'home',
 'modules': 'tools antismash/6.1.1',
 'runtime': 1,
 'cores': 30,
 'ram': 100,
 'group': 'dtu_00009',
 'jobname': 'somefile.py',
 'output': 'home/logs/somefile_stdout',
 'error': 'home/logs/somefile_stderr'}

In [1]:
import re

In [23]:
import os
import glob
import sys


def output_exists(reads_gb: int) -> bool:
    """Checks if the output from the run already exists.
    Can be used as a check to see if the job ran succesfully,
    or if the job doesn't need to run.
    """
    reads_dir = f"../data/simulated_data/camisim/{gen_prefix(reads_gb)}/*_sample_0/reads/*"
    file_names = [x.rsplit("/",1)[1] for x in glob.glob(reads_dir)]
    n_output_fasta_files = len(re.findall("(Genome[1-5][1,2])", "".join(file_names)))
    print(f"Found ({n_output_fasta_files}/10) outputfiles", file=sys.stderr)
    return n_output_fasta_files == 10 #TODO: probably shouldn't be hardcoded.
def gen_prefix(reads_gb:int) -> str:
    return str(reads_gb).replace(".","_")+"GB"

In [24]:
output_exists(0.1)

Found (10/10) outputfiles


True

In [27]:
assemblies = f"../data/simulated_data/camisim/*GB/*_sample_0/contigs/gsa.fasta.gz"

In [29]:
camisim_dataset_GB_range = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

In [31]:
for gb in camisim_dataset_GB_range:
    antismash_html = f"../data/simulated_data/antismash/{gen_prefix(gb)}/index.html"
    print(gb, os.path.isfile(antismash_html))

1 False
0.1 False
0.01 False
0.001 False
0.0001 False
1e-05 False


In [30]:
for gb in camisim_dataset_GB_range:
    f = glob.glob(f"../data/simulated_data/camisim/{gen_prefix(gb)}/*_sample_0/contigs/gsa.fasta.gz")
    print(gb, f)

1 []
0.1 ['../data/simulated_data/camisim/0_1GB/2022.09.07_14.33.06_sample_0/contigs/gsa.fasta.gz']
0.01 []
0.001 []
0.0001 []
1e-05 []
