## Download the data

### Main genomes.
Veillonella Parvula type strain (refseq reference): https://www.ncbi.nlm.nih.gov/nuccore/NZ_LT906445.1/
NZ_LT906445.1  

Clostridium beijerinckii: https://www.ncbi.nlm.nih.gov/nuccore/NZ_CP053893.1  
NZ_CP053893.1  
Clostridium ljungdahlii: https://www.ncbi.nlm.nih.gov/nuccore/NC_014328.1/  
NC_014328.1  

### Noise

Veillonella atypica: https://www.ncbi.nlm.nih.gov/nuccore/NZ_CP020566.1  
NZ_CP020566.1  
Veillonella rodentium: https://www.ncbi.nlm.nih.gov/nuccore/NZ_LT906470.1  
NZ_LT906470.1 


### Script: See bin/fetch_ncbi.py

In [5]:
import os
os.path.dirname("a/b/")

'a/b'

In [1]:
id_list = [
    "NZ_LT906445.1",
    "NZ_CP053893.1",
    "NC_014328.1",
    "NZ_CP020566.1",
    "NZ_LT906470.1"
]

In [4]:
ncbi_id = "NZ_LT906470.1"
entrez_api_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={ncbi_id}&rettype=genbank&retmode=text'
entrez_api_url

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=NZ_LT906470.1&rettype=genbank&retmode=text'

In [58]:
import requests
import os, sys
from typing import List


def fetch_fasta_from_ncbi(ncbi_id, outdir = None, overwrite: bool=False, verbose: bool=False) -> str:
    
    
    entrez_api_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={ncbi_id}&rettype=fasta&retmode=text'
    local_filename = ncbi_id.replace(".","_")+".fasta"
    
    if outdir:
        if not os.path.isdir(outdir):
            print("Creating output folder: " +outdir)
            os.mkdir(outdir)
        local_filehandle = os.path.join(outdir, local_filename)
    else:
        local_filehandle = local_filename
   
    #Check if file already exists:
    if not overwrite and os.path.isfile(local_filehandle):
        if verbose:
            print(f"{ncbi_id} already in outdir -> Skipping", file=sys.stderr)
        return os.path.abspath(local_filehandle)
        
    
    # NOTE the stream=True parameter below
    print("Pulling file with id: " + ncbi_id, file=sys.stderr)
    with requests.get(entrez_api_url, stream=True) as r:
        r.raise_for_status()
        with open(local_filehandle, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)
    return os.path.abspath(local_filehandle)

def fetch_fastas_async(ncbi_ids: List[str], outdir: str=None, overwrite: bool=False, verbose: bool=False) -> List[str]:
    loaded_files = []
    for ncbi_id in ncbi_ids: #todo: run as async group
        try:
            filepath = fetch_fasta_from_ncbi(ncbi_id = ncbi_id, outdir = outdir, overwrite=overwrite, verbose=verbose)
            loaded_files.append(filepath)
        except Exception as err:
            print(err, file=sys.sterr)
    return loaded_files

In [29]:
#!/usr/bin/env python3
import os, sys

# Qsub pipeline intro:

from lib.general_functions import submit2

code_dir = "/home/people/henspi/git/AntibioticaScreening/project" # TODO: Should be argument
working_dir = "/home/people/henspi/AntibioticaScreeningWD"        # TODO: Should be argument

id_list_fp = "data/simulated_data/id_list.txt"

if not os.path.isdir(working_dir):
    print("Creating working dir -> "+working_dir, file=sys.stderr)
    os.mkdir(working_dir)


run_command = f"""\
cat {os.path.join(working_dir, id_list_fp)} |\
{os.path.join(code_dir, 'bin/ncbi_fetch.py')}\ 
--outdir {os.path.join(working_dir, 'simulated_data/input_genomes')}\
"""



print(run_command, file=sys.stderr)


Creating working dir -> /home/people/henspi/AntibioticaScreeningWD


FileNotFoundError: [Errno 2] No such file or directory: '/home/people/henspi/AntibioticaScreeningWD'

In [10]:
metadata_str = """\
genome_ID,OTU,NCBI_ID,novelty_category
Genome1,x,748727,known_species
Genome2,x,39777,known_species
Genome3,x,1520,known_species
Genome4,x,29466,known_species
Genome5,x,248315,known_species
"""

In [8]:
from io import StringIO

In [12]:
import pandas as pd
df = pd.read_csv(StringIO(metadata_str), sep=",")

In [23]:
output = open("test", "r").readlines()
output_cleaned = [x.strip() for x in "".join(output).replace("\n\t", "").split("\n")]
output_cleaned
output_dict = {k.strip(): v.strip() for line in output_cleaned for k,v in line.split("=")}
output_dict

ValueError: too many values to unpack (expected 2)

In [35]:
output_dict = {}
for line in output_cleaned[1::]:
    if not line or "=" not in line:
        continue
    k, v = line.split(" = ")
    if k in ["Error_Path", "Output_Path"]:
        v = v.split(":")[1]
    output_dict[k] = v
    

In [37]:
output_dict['job_state']

'R'

In [38]:
output_dict

{'Job_Name': 'camisim',
 'Job_Owner': 'henspi@g-12-l0002.eth.cla',
 'resources_used.cput': '00:09:03',
 'resources_used.energy_used': '0',
 'resources_used.mem': '1790832kb',
 'resources_used.vmem': '90528556kb',
 'resources_used.walltime': '00:05:37',
 'job_state': 'R',
 'queue': 'batch',
 'server': 'moab.eth.cla',
 'Account_Name': 'dtu_00009',
 'Checkpoint': 'u',
 'ctime': 'Wed Aug 31 13:21:18 2022',
 'Error_Path': '/home/projects/dtu_00009/people/henspi/git/AntibioticaScreening/project/logs/camisim_stderr.txt',
 'exec_host': 'g-01-c0070/0-34',
 'group_list': 'dtu_00009',
 'Hold_Types': 'n',
 'Join_Path': 'n',
 'Keep_Files': 'n',
 'Mail_Points': 'a',
 'mtime': 'Wed Aug 31 13:21:57 2022',
 'Output_Path': '/home/projects/dtu_00009/people/henspi/git/AntibioticaScreening/project/logs/camisim_output.txt',
 'Priority': '0',
 'qtime': 'Wed Aug 31 13:21:18 2022',
 'Rerunable': 'True',
 'Resource_List.nodes': '1:ppn=35',
 'Resource_List.mem': '185gb',
 'Resource_List.walltime': '02:40:00',
 '