In [1]:
import os
import json
from Bio import Phylo
from jw_utils import ncbi_datasets_fxs as ndf
from ete3 import NCBITaxa
ncbi = NCBITaxa()

In [2]:
def append_json_dicts(taxids):
    p_dir = 'Tree_json_summaries'
    summary_dicts = {}
    for taxid in taxids:
        with open(f'{p_dir}/{taxid}_summary.json', 'r') as f:
            s = json.load(f)
            if s.get('assemblies'):
                summary_dicts[taxid] = ndf.make_summary_dict(s)
    return summary_dicts


def get_refseq_summaries(taxids):
    p_dir = 'Tree_json_summaries'
    os.makedirs(p_dir)
    for taxid in taxids:
        filepath = f'{p_dir}/{taxid}_summary.json'
        !datasets summary genome taxon $taxid --reference --annotated --assembly-source refseq > $filepath
    return append_json_dicts(taxids)


def get_summaries(taxids, limit = 500):
    p_dir = 'Tree_json_summaries'
    for taxon in taxids:
        filepath = f'{p_dir}/{taxon}_summary.json'
        !datasets summary genome taxon $taxon --annotated --assembly-source refseq --limit $limit > $filepath
    return append_json_dicts(taxids)
        
    
def get_taxids_empty_jsons(taxids):
    no_summary = []
    for taxid in taxids:
        with open(f'{p_dir}/{taxid}_summary.json', 'r') as f:
            s = json.load(f)
            if not s.get('assemblies'):
                no_summary.append(taxid)
    return no_summary


def get_assembly_highestn50(summary_dict):
    max_n50 = {}
    for taxid, d in summary_dict.items():
        n50 = 0
        for acc, tax_d in d.items():
            new_n50 = tax_d['contig_n50']
            if new_n50>n50:
                max_n50[taxid] = tax_d
            else:
                n50 = new_n50
    return max_n50


def get_lower_rankID(taxID, rank, null_return='rank_not_available'):
    """Return lower rank NCBI taxoniomic ID from a given higher resolution ID"""
    
    if type(taxID)!= int:
        raise TypeError(f'taxID {taxID} is a {type(taxID)}, but needs to be of type "int"')  
    available_ranks = ['domain', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    if rank not in available_ranks :
        raise ValueError(f'"{rank}" not in available ranks: {available_ranks}')
    lineage = ncbi.get_rank(ncbi.get_lineage(taxID))
    if rank not in lineage.values():
        return null_return
    else:
        lineage_d = {value:key for key, value in lineage.items()}
        return lineage_d[rank]
#!datasets summary  genome taxon $taxon --help

### Get species-level taxonomic ID corresponding to the leaves in the itol TOL. 

In [122]:
tree = Phylo.read('./itol_TOL.nwk', format='nexus') # node names are taxonomic IDs
tree_cl = tree.common_ancestor('217992', '119072') # return the clade of interest as a tree
taxids = [cl.name for cl in tree_cl.get_terminals()]
species_ranks = []
for taxid in taxids:
    rank_taxid = get_lower_rankID(int(taxid), 'species', null_return='rank_not_available')
    species_ranks.append(rank_taxid)
species_rank_taxids =  list(set(species_ranks)) 

### Get one reference assembly, if one exists, for each species-level taxonomic ID in the itol tree clade of interest

In [112]:
refseq_summ_dict  = get_refseq_summaries(species_rank_taxids)
refseq_summ_dict = get_assembly_highestn50(refseq_summ_dict)

New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) availabl

### Get full chromosome assemblies for those leafs that don't have a reference assembly in ncbi datasets

In [115]:
no_refseq = set(species_rank_taxids).difference(set(refseq_summ_dict.keys()))
no_refseq_summ_dict = get_summaries(no_refseq, limit = 100)
no_refseq_summ_dict = get_assembly_highestn50(no_refseq_summ_dict)

New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets
New version of client (14.20.0) availabl

### Merge dictionaries

In [117]:
summary_dict = {}
missing_taxids = []
for taxid in species_rank_taxids:
    a = no_refseq_summ_dict.get(taxid)
    b = refseq_summ_dict.get(taxid)
    if a:
        summary_dict[taxid] = a
    elif b:
        summary_dict[taxid] = b
    else:
        print(a, b)
        missing_taxids.append(taxid)
        
ndf.sci_strain_names(summary_dict)

None None
None None
None None
None None


{518: 'Bordetella bronchiseptica NCTC10543',
 519: 'Bordetella parapertussis A005',
 520: 'Bordetella pertussis H640',
 9: 'Buchnera aphidicola (Schizaphis graminum)',
 536: 'Chromobacterium violaceum FDAARGOS_1273',
 51229: 'Wigglesworthia glossinidia endosymbiont of Glossina morsitans morsitans (Yale colony)',
 2096: 'Mycoplasmoides gallisepticum VA94_7994-1-7P',
 2097: 'Mycoplasmoides genitalium G-37',
 562: 'Escherichia coli K-12 substr. MG1655',
 1076: 'Rhodopseudomonas palustris RCB100',
 2102: 'Mycoplasma mycoides GM12',
 1590: 'Lactiplantibacillus plantarum SRCM100442',
 2104: 'Mycoplasmoides pneumoniae NCTC10119',
 2107: 'Mycoplasmopsis pulmonis NCTC10139',
 28227: 'Malacoplasma penetrans HF-2',
 2118: 'Mesomycoplasma mobile 163K',
 1176649: 'Agrobacterium fabrum 1D132',
 1097: 'Chlorobaculum tepidum TLS',
 83557: 'Chlamydia caviae GPIC',
 83558: 'Chlamydia pneumoniae TW-183',
 1639: 'Listeria monocytogenes EGD-e',
 83560: 'Chlamydia muridarum Nigg',
 1642: 'Listeria innocua C

In [118]:
accessions_to_add = ['GCF_001077675.1', 'GCF_000770605.1', 'GCF_000046845.1', 'GCF_000413935.1', 'GCF_001682515.1', 'GCF_005281455.1', 'GCA_000691605.1', 
                      'GCA_001592755.1', 'GCA_001592745.1', 'GCA_000196175.1', 'GCA_001592735.1', 'GCA_002208115.1', 'GCA_000317895.1', 'GCA_000525675.1', 
                      'GCA_000348725.1', 'GCF_000014625.1', 'GCF_000517305.1', 'GCF_000026105.1', 'GCF_000733715.2', 'GCF_000297075.2', 'GCF_000412675.1', 
                      'GCF_000761155.1', 'GCF_000934565.1', 'GCF_000498975.2', 'GCF_000007805.1', 'GCF_001294575.1',]

In [119]:
all_accessions = []
for taxid, summ_d in summary_dict.items():
    all_accessions.append(summ_d['assembly_accession'])
for acc in missing_accessions:
    all_accessions.append(acc)
all_accessions = set(all_accessions)
with open('./accessions.txt', 'w') as f:
    for acc in all_accessions:
        f.write(acc + '\n')


### Download genome assemblies and annotations from NCBI datasets

In [124]:
#!datasets download genome accession --dehydrated --exclude-rna --inputfile accessions.txt 
!datasets rehydrate --directory ncbi_dataset/

Found 544 files for rehydration
Completed 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [------------------------------------------------]   0%
[1A[2KCompleted 0 of 544 [-----------------------------------

In [133]:
data_dir = './dash_app_bdelivibrio_full/ncbi_dataset/ncbi_dataset/data/'
files = [f for f in os.listdir(data_dir) if f.startswith('GC')]