diff --git a/autometa/taxonomy/gtdb.py b/autometa/taxonomy/gtdb.py index d81862da8..36590b9b0 100644 --- a/autometa/taxonomy/gtdb.py +++ b/autometa/taxonomy/gtdb.py @@ -5,16 +5,15 @@ File containing definition of the GTDB class and containing functions useful for handling GTDB taxonomy databases """ + import gzip import logging import os import re -import string import tarfile import glob -from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import Dict, Set, Tuple from itertools import chain from tqdm import tqdm from typing import Dict @@ -25,7 +24,6 @@ from autometa.common.utilities import file_length, is_gz_file from autometa.common.external import diamond from autometa.taxonomy.database import TaxonomyDatabase -from autometa.common.exceptions import DatabaseOutOfSyncError logger = logging.getLogger(__name__) @@ -59,7 +57,9 @@ def create_gtdb_db(reps_faa: str, dbdir: str) -> str: reps_faa = dbdir genome_protein_faa_filepaths = glob.glob( - os.path.join(reps_faa, "**", "*_protein.faa"), recursive=True + os.path.join(reps_faa, "**", "*_protein.faa*"), + recursive=True + # To find *_protein.faa and *_protein.faa.gz files ) faa_index: Dict[str, str] = {} @@ -79,12 +79,15 @@ def create_gtdb_db(reps_faa: str, dbdir: str) -> str: combined_faa = os.path.join(dbdir, "gtdb.faa") with open(combined_faa, "w") as f_out: for faa_file, acc in faa_index.items(): - with open(faa_file) as f_in: + with gzip.open(faa_file, "rb") as f_in: for line in f_in: + line = line.decode("utf-8") if line.startswith(">"): - seqheader = line.lstrip(">") - line = f"\n>{acc} {seqheader}" - f_out.write(line) + seqheader = line.lstrip(">").strip() + outline = f">{acc} {seqheader}\n" + else: + outline = line + f_out.write(outline) logger.debug(f"Combined GTDB faa file written to {combined_faa}") return combined_faa