In [1]:
import datetime
import gzip
import pandas as pd
import shutil
import time
import urllib.request
from Bio import SeqIO
from tqdm.notebook import tqdm
from pyscripts.config import path2
from pyscripts.datasets import DatasetDownloader

In [2]:
# Directory that the per-genome `.gbff.gz` downloads are written to.
genome_dir = path2.pubdata / 'genomic_gbff'

# Run date; it stamps the assembly-summary snapshot filename.
today = datetime.date.today()
print(today)

2021-05-20


In [3]:
# Download a dated snapshot of the NCBI RefSeq bacteria assembly summary
# and load it as a table.
ftp_source = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
file_target = path2.metadata / f'NCBI_refseq_bacteria_assembly_summary_{today.strftime("%y%m%d")}.txt'

# Stream the remote file straight to disk (the URL is opened before the local file).
with urllib.request.urlopen(ftp_source) as remote:
    with open(file_target, 'wb') as local:
        shutil.copyfileobj(remote, local)

assembly_summary = pd.read_csv(
    file_target,
    sep='\t',
    skiprows=1,  # presumably a banner line precedes the header row -- confirm against the file
    index_col=0,
    # Read this column through `str` so it can be compared against '' when filtering.
    converters={'excluded_from_refseq': str},
)


In [4]:
# Filter 1: based on assembly status.
# Each criterion is a named boolean mask; a row must satisfy all of them.
is_ref_or_representative = assembly_summary['refseq_category'].isin(
    {'representative genome', 'reference genome'}
)
is_latest_version = assembly_summary['version_status'] == 'latest'
is_full_representation = assembly_summary['genome_rep'] == 'Full'
is_complete_genome = assembly_summary['assembly_level'] == 'Complete Genome'
has_no_exclusion_note = assembly_summary['excluded_from_refseq'] == ''

asm_filtered = assembly_summary[
    is_ref_or_representative
    & is_latest_version
    & is_full_representation
    & is_complete_genome
    & has_no_exclusion_note
]

In [5]:
# Download the GTDB r202 species-cluster table and load it,
# indexed by its first column.
ftp_source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/sp_clusters_r202.tsv'
file_target = path2.metadata / f'GTDB_sp_clusters_r202.tsv'

# Stream the remote file straight to disk (the URL is opened before the local file).
with urllib.request.urlopen(ftp_source) as remote:
    with open(file_target, 'wb') as local:
        shutil.copyfileobj(remote, local)

sp_clst = pd.read_csv(file_target, sep='\t', index_col=0)


In [6]:
# Filter 2: Phylogenetic classification is described in GTDB

# Reverse index built once: prefixed accession -> (cluster position, cluster label).
# It replaces a scan over every species cluster for every RefSeq genome.
# `setdefault` keeps the first cluster that lists an accession, so the lookup
# below still returns the first matching cluster in `sp_clst` order.
accession2cluster = {}
for position, (gtdb_rep, members) in enumerate(sp_clst['Clustered genomes'].items()):
    for accession in members.split(','):
        accession2cluster.setdefault(accession, (position, gtdb_rep))

# Map each filtered RefSeq assembly to the label of the first cluster listing
# either its own accession (prefixed 'RS_') or its paired accession (prefixed
# 'GB_'); assemblies found in no cluster map to None.
paired_accessions = asm_filtered['gbrs_paired_asm']
rep_by_refseq = {}
for gcf, gca in tqdm(paired_accessions.items(), total=len(paired_accessions)):
    hits = [
        accession2cluster[key]
        for key in (f'RS_{gcf}', f'GB_{gca}')
        if key in accession2cluster
    ]
    # Lowest cluster position wins when the two accessions sit in different clusters.
    rep_by_refseq[gcf] = min(hits, key=lambda hit: hit[0])[1] if hits else None
refseq2gtdbrep = pd.Series(rep_by_refseq)
del accession2cluster, rep_by_refseq

# Invert the mapping to one RefSeq assembly per cluster label. Assemblies
# mapped to None form no group because pandas groupby drops missing keys by default.
# Preference: the assembly whose accession tail (from the 4th character) equals the
# label's tail (from the 7th character) -- presumably the representative itself with
# its 'RS_GCF'/'GB_GCA' vs 'GCF' prefix stripped; otherwise the first assembly in the group.
refseq_by_rep = {}
for gtdb_rep, ncbi_genomes in refseq2gtdbrep.groupby(refseq2gtdbrep):
    candidates = ncbi_genomes.index
    refseq_by_rep[gtdb_rep] = next(
        (gcf for gcf in candidates if gtdb_rep[6:] == gcf[3:]),
        candidates[0],
    )
gtdbrep2refseq = pd.Series(refseq_by_rep, dtype=object)
del refseq_by_rep


0it [00:00, ?it/s]

In [7]:
# Downloader helper imported from pyscripts.datasets; its `fetch_NCBI_genome`
# method is what the download loop calls for each selected genome.
dd = DatasetDownloader()

In [8]:
# Filter 3: genetic code
def has_genetic_code_11(file_to_save):
    """Check whether every CDS in a gzipped GenBank file uses translation table 11.

    Parameters
    ----------
    file_to_save : path-like
        Gzip-compressed GenBank flat file. It is opened in text mode and
        parsed record by record with ``SeqIO.parse(..., 'gb')``.

    Returns
    -------
    bool
        ``True`` when the ``transl_table`` qualifier of each CDS feature
        equals ``['11']``; ``False`` as soon as one CDS carries another
        value or lacks the qualifier. A file without any CDS feature
        yields ``True`` (vacuous truth of ``all``).
    """
    with gzip.open(file_to_save, 'rt') as genome_decomp:
        # A generator (not a list) lets `all` stop at the first failing CDS,
        # so the remainder of the file is never parsed. It must be consumed
        # here, while the file handle is still open.
        return all(
            feat.qualifiers.get('transl_table') == ['11']
            for rec in SeqIO.parse(genome_decomp, 'gb')
            for feat in rec.features
            if feat.type == 'CDS'
        )

In [9]:
# Download the selected genome of every GTDB cluster and keep only those whose
# CDS features all use translation table 11; rejected downloads are deleted.
selected_ftp_paths = asm_filtered.loc[gtdbrep2refseq.values, 'ftp_path']
included_genomes = []

for gcf, ftp_path in tqdm(selected_ftp_paths.items()):
    file_to_save = genome_dir / f'{gcf}.gbff.gz'
    dd.fetch_NCBI_genome(ftp_path, file_to_save)

    if not has_genetic_code_11(file_to_save):
        file_to_save.unlink()
    else:
        included_genomes.append(gcf)

    # Pause between consecutive requests.
    time.sleep(0.5)


0it [00:00, ?it/s]

In [38]:
# Sanity check: for the genomes that were kept, the RefSeq -> GTDB mapping
# and the inverted GTDB -> RefSeq mapping must describe the same pairs.
forward_map = refseq2gtdbrep.loc[included_genomes].to_dict()
kept_reps = gtdbrep2refseq[gtdbrep2refseq.isin(included_genomes)]
inverse_map = {refseq: gtdb_rep for gtdb_rep, refseq in kept_reps.items()}
assert forward_map == inverse_map

print(len(included_genomes))


2624


In [44]:
# Persist the final selection as three bz2-compressed pickles.

# 1) RefSeq accession -> GTDB cluster label, as a two-column table.
mapping_inuse = (
    refseq2gtdbrep.loc[included_genomes]
    .rename_axis('refseq')
    .rename('gtdbrep')
    .reset_index()
)
mapping_inuse.to_pickle(path2.metadata/'refseq_gtdbrep_mappings.pkl.bz2')

# 2) GTDB species-cluster rows for the clusters of the kept genomes.
gtdb_inuse = sp_clst.loc[refseq2gtdbrep[included_genomes]]
gtdb_inuse.to_pickle(path2.metadata/'GTDB_taxonomy_inuse.pkl.bz2')

# 3) NCBI assembly-summary rows of the kept genomes.
ncbi_inuse = asm_filtered.loc[included_genomes]
ncbi_inuse.to_pickle(path2.metadata/'NCBI_asm_inuse.pkl.bz2')
